Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::Wordrec Class Reference

#include <wordrec.h>

Inheritance diagram for tesseract::Wordrec:
tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Tesseract

Public Member Functions

 Wordrec ()
 
virtual ~Wordrec ()
 
void CopyCharChoices (const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to)
 
bool ChoiceIsCorrect (const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void update_ratings (const BLOB_CHOICE_LIST_VECTOR &new_choices, const CHUNKS_RECORD *chunks_record, const SEARCH_STATE search_state)
 
void SegSearch (CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, WERD_RES *word_res, inT32 *blob_number, bool italic_blob, SEAMS seam_list)
 
void junk_worst_seam (SEAM_QUEUE seams, SEAM *new_seam, float new_priority)
 
void choose_best_seam (SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob)
 
void combine_seam (SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam)
 
inT16 constrained_split (SPLIT *split, TBLOB *blob)
 
void delete_seam_pile (SEAM_PILE seam_pile)
 
SEAM * pick_good_seam (TBLOB *blob)
 
PRIORITY seam_priority (SEAM *seam, inT16 xmin, inT16 xmax)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY full_split_priority (SPLIT *split, inT16 xmin, inT16 xmax)
 
PRIORITY grade_center_of_blob (register BOUNDS_RECT rect)
 
PRIORITY grade_overlap (register BOUNDS_RECT rect)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
PRIORITY grade_width_change (register BOUNDS_RECT rect)
 
void set_outline_bounds (register EDGEPT *point1, register EDGEPT *point2, BOUNDS_RECT rect)
 
int crosses_outline (EDGEPT *p0, EDGEPT *p1, EDGEPT *outline)
 
int is_crossed (TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1)
 
int is_same_edgept (EDGEPT *p1, EDGEPT *p2)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
void reverse_outline (EDGEPT *outline)
 
virtual BLOB_CHOICE_LIST * classify_piece (TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
BLOB_CHOICE_LIST * get_piece_rating (MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
 
TBOX * record_blob_bounds (TBLOB *blobs)
 
MATRIX * record_piece_ratings (TBLOB *blobs)
 
WIDTH_RECORD * state_char_widths (WIDTH_RECORD *chunk_widths, STATE *state, int num_joints)
 
FLOAT32 get_width_variance (WIDTH_RECORD *wrec, float norm_height)
 
FLOAT32 get_gap_variance (WIDTH_RECORD *wrec, float norm_height)
 
FLOAT32 prioritize_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
 
FLOAT32 width_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
 
FLOAT32 seamcut_priority (SEAMS seams, STATE *state, int num_joints)
 
FLOAT32 rating_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
 
program_editup

Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models.

void program_editup (const char *textbase, bool init_classifier, bool init_permute)
 
cc_recog

Recognize a word.

BLOB_CHOICE_LIST_VECTOR * cc_recog (WERD_RES *word)
 
program_editdown

This function holds any necessary post processing for the Wise Owl program.

void program_editdown (inT32 elasped_time)
 
set_pass1

Get ready to do some pass 1 stuff.

void set_pass1 ()
 
set_pass2

Get ready to do some pass 2 stuff.

void set_pass2 ()
 
end_recog

Cleanup and exit the recog program.

int end_recog ()
 
call_matcher

Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification.

BLOB_CHOICE_LIST * call_matcher (const DENORM *denorm, TBLOB *blob)
 
dict_word()

Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary.

int dict_word (const WERD_CHOICE &word)
 
classify_blob

Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.

Parameters
blob: Current blob
string: The string to display in ScrollView
color: The colour to use when displayed with ScrollView
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
BLOB_CHOICE_LIST * fake_classify_blob (UNICHAR_ID class_id, float rating, float certainty)
 
update_blob_classifications

For each blob in the given word, update match_table with the corresponding BLOB_CHOICE_LIST from choices.

void update_blob_classifications (TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices)
 
best_first_search

Find the best segmentation by doing a best first search of the solution space.

BLOB_CHOICE_LIST_VECTOR * evaluate_chunks (CHUNKS_RECORD *chunks_record, SEARCH_STATE search_state, BlamerBundle *blamer_bundle)
 
void best_first_search (CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_RES *word, STATE *state, DANGERR *fixpt, STATE *best_state)
 
void delete_search (SEARCH_RECORD *the_search)
 
evaluate_state

Evaluate the segmentation that is represented by this state in the best first search. Add this state to the "states_seen" list.

inT16 evaluate_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle)
 
BLOB_CHOICE_LIST_VECTOR * rebuild_current_state (WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *char_choices, MATRIX *ratings)
 
new_search

Create and initialize a new search record.

SEARCH_RECORD * new_search (CHUNKS_RECORD *chunks_record, int num_joints, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice, STATE *state)
 
expand_node

Create the states that are attached to this one. Check to see that each one has not already been visited. If not add it to the priority queue.

void expand_node (FLOAT32 worst_priority, CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
 
replace_char_widths

Replace the value of the char_width field in the chunks_record with the updated width measurements from the last_segmentation.

void replace_char_widths (CHUNKS_RECORD *chunks_record, SEARCH_STATE state)
 
BLOB_CHOICE * rebuild_fragments (const char *unichar, const char *expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices)
 
BLOB_CHOICE_LIST * join_blobs_and_classify (WERD_RES *word, int x, int y, int choice_index, MATRIX *ratings, BLOB_CHOICE_LIST_VECTOR *old_choices)
 
pop_queue

Get this state from the priority queue. It should be the state that has the greatest urgency to be evaluated.

STATE * pop_queue (HEAP *queue)
 
push_queue

Add this state into the priority queue.

void push_queue (HEAP *queue, STATE *state, FLOAT32 worst_priority, FLOAT32 priority, bool debug)
 
point_priority

Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT.

PRIORITY point_priority (EDGEPT *point)
 
add_point_to_list

Add an edge point to a POINT_GROUP containing a list of other points.

void add_point_to_list (POINT_GROUP point_list, EDGEPT *point)
 
angle_change

Return the change in angle (degrees) of the line segments between points one and two, and two and three.

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
is_little_chunk

Return TRUE if one of the pieces resulting from this split would have fewer than some number of edge points.

int is_little_chunk (EDGEPT *point1, EDGEPT *point2)
 
is_small_area

Test the area defined by a split across this outline.

int is_small_area (EDGEPT *point1, EDGEPT *point2)
 
pick_close_point

Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point.

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
prioritize_points

Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order.

void prioritize_points (TESSLINE *outline, POINT_GROUP points)
 
new_min_point

Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to NULL.

void new_min_point (EDGEPT *local_min, POINT_GROUP points)
 
new_max_point

Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to NULL.

void new_max_point (EDGEPT *local_max, POINT_GROUP points)
 
vertical_projection_point

For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list.

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
improve_one_blob

Start with the current word of blobs and its classification. Find the worst blob and try to divide it up to improve the ratings.

bool improve_one_blob (WERD_RES *word_res, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, DANGERR *fixpt, bool split_next_to_fragment, BlamerBundle *blamer_bundle)
 
modify_blob_choice

Takes a blob and its chop index, converts that chop index to a unichar_id, and stores the chop index in place of the blob's original unichar_id.

void modify_blob_choice (BLOB_CHOICE_LIST *answer, int chop_index)
 
chop_one_blob

Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing chopper.

bool chop_one_blob (TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index)
 
bool chop_one_blob2 (const GenericVector< TBOX > &boxes, WERD_RES *word_res, SEAMS *seam_list)
 
chop_word_main

Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. Return the word level ratings.

BLOB_CHOICE_LIST_VECTOR * chop_word_main (WERD_RES *word)
 
improve_by_chopping

Start with the current word of blobs and its classification. Find the worst blobs and try to divide them up to improve the ratings, continuing as long as ratings are produced by the new blob splitting. When all the splitting has been accomplished all the ratings memory is reclaimed.

void improve_by_chopping (WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *updated_best_choice)
 
MATRIX * word_associator (bool only_create_ratings_matrtix, WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, STATE *best_state)
 
inT16 select_blob_to_split (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment)
 
inT16 select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
void set_chopper_blame (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, CP_RESULT_STRUCT *results)
 
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *filename, const char *rejmap, WERD_RES *word)
 
void LearnPieces (const char *filename, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (bool load_pre_trained_templates)
 
void InitAdaptedClass (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AdaptToPunc (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
 
void AmbigClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, ADAPT_RESULTS *Results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int num_classes, const TBOX &blob_box, CLASS_PRUNER_RESULTS results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, const uinT8 *cn_factors, INT_RESULT_STRUCT &int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (ADAPT_RESULTS *results, CLASS_ID class_id, int shape_id, FLOAT32 rating, bool adapted, int config, int fontinfo_id, int fontinfo_id2)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
 
void GetAdaptThresholds (TWERD *Word, const DENORM &denorm, const WERD_CHOICE &BestChoice, const WERD_CHOICE &BestRawChoice, FLOAT32 Thresholds[])
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, const DENORM &denorm, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (FILE *File, ADAPT_RESULTS *Results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int shape_id, BOOL8 AdaptiveOn, BOOL8 PreTrainedOn, ADAPT_RESULTS *Results)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormTrainingSample (bool pruner_only, const TrainingSample &sample, GenericVector< ShapeRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, const DENORM &denorm, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
 
void DisplayAdaptedChar (TBLOB *blob, const DENORM &denorm, INT_CLASS_STRUCT *int_class)
 
int AdaptableWord (TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
 
void EndAdaptiveClassifier ()
 
void PrintAdaptiveStatistics (FILE *File)
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
int GetBaselineFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *CharNormArray, inT32 *BlobLength)
 
int GetCharNormFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *PrunerNormArray, uinT8 *CharNormArray, inT32 *BlobLength, inT32 *FeatureOutlineIndex)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, const DENORM &denorm, TBLOB *Blob)
 
void ResetFeaturesHaveBeenExtracted ()
 
bool AdaptiveClassifierIsFull ()
 
bool LooksLikeGarbage (const DENORM &denorm, TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (FILE *File)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
void ReadClassFile ()
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 
ParamsVectors * params ()
 

Public Attributes

bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
int wordrec_num_seg_states = 30
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
bool use_new_state_cost = FALSE
 
double heuristic_segcost_rating_base = 1.25
 
double heuristic_weight_rating = 1
 
double heuristic_weight_width = 0
 
double heuristic_weight_seamcut = 0
 
double heuristic_max_char_wh_ratio = 2.0
 
int wordrec_debug_level = 0
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
bool enable_new_segsearch = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
double segsearch_max_fixed_pitch_char_wh_ratio = 2.0
 
bool save_alt_choices = false
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
int pass2_seg_states
 
int num_joints
 
int num_pushed
 
int num_popped
 
BlobMatchTable blob_match_table
 
EVALUATION_ARRAY last_segmentation
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_great_threshold = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 30
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 14
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR PrunedProtos
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllProtosOff
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
TessdataManager tessdata_manager
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Protected Member Functions

bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle)
 
void InitBlamerForSegSearch (const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
void FinishBlamerForSegSearch (const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 
- Protected Attributes inherited from tesseract::CCStruct
Image image_
 

Detailed Description

Definition at line 91 of file wordrec.h.

Constructor & Destructor Documentation

tesseract::Wordrec::Wordrec ( )

Definition at line 26 of file wordrec.cpp.

26  :
27  // control parameters
29  "Merge the fragments in the ratings matrix and delete them"
30  " after merging", params()),
31  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
32  params()),
33  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
34  params()),
36  "force associator to run regardless of what enable_assoc is."
37  "This is used for CJK where component grouping is necessary.",
38  CCUtil::params()),
39  INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states",
40  CCUtil::params()),
41  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
42  params()),
44  "Use information from fragments to guide chopping process",
45  params()),
46  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
47  params()),
48  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
49  params()),
50  INT_MEMBER(chop_debug, 0, "Chop debug",
51  params()),
52  BOOL_MEMBER(chop_enable, 1, "Chop enable",
53  params()),
54  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
55  params()),
56  INT_MEMBER(chop_split_length, 10000, "Split Length",
57  params()),
58  INT_MEMBER(chop_same_distance, 2, "Same distance",
59  params()),
60  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
61  params()),
62  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
63  params()),
64  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
65  params()),
66  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
67  params()),
68  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
69  params()),
70  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
71  params()),
72  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
73  params()),
74  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
75  params()),
76  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
77  params()),
78  double_MEMBER(chop_good_split, 50.0, "Good split limit",
79  params()),
80  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
81  params()),
82  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
83  params()),
85  "include fixed-pitch heuristics in char segmentation",
86  params()),
88  "use new state cost heuristics for segmentation state evaluation",
89  params()),
91  "base factor for adding segmentation cost into word rating."
92  "It's a multiplying factor, the larger the value above 1, "
93  "the bigger the effect of segmentation cost.",
94  params()),
96  "weight associated with char rating in combined cost of state",
97  params()),
99  "weight associated with width evidence in combined cost of"
100  " state", params()),
102  "weight associated with seam cut in combined cost of state",
103  params()),
105  "max char width-to-height ratio allowed in segmentation",
106  params()),
108  "Debug level for wordrec", params()),
110  "Print blamer debug messages", params()),
112  "Try to set the blame for errors", params()),
114  "Enable new segmentation search path.", params()),
116  "SegSearch debug level", params()),
118  "Maximum number of pain points stored in the queue",
119  params()),
121  "Maximum number of pain point classifications per word that"
122  "did not result in finding a better word choice.",
123  params()),
125  "Maximum character width-to-height ratio", params()),
127  "Maximum character width-to-height ratio for"
128  " fixed-pitch fonts",
129  params()),
131  "Save alternative paths found during chopping"
132  " and segmentation search",
133  params()) {
135  language_model_ = new LanguageModel(&get_fontinfo_table(),
136  &(getDict()));
137  pass2_seg_states = 0;
138  num_joints = 0;
139  num_pushed = 0;
140  num_popped = 0;
142 }
double chop_overlap_knob
Definition: wordrec.h:117
int segment_adjust_debug
Definition: wordrec.h:124
double segsearch_max_fixed_pitch_char_wh_ratio
Definition: wordrec.h:155
double heuristic_segcost_rating_base
Definition: wordrec.h:132
double segsearch_max_char_wh_ratio
Definition: wordrec.h:152
double tessedit_certainty_threshold
Definition: wordrec.h:107
#define NULL
Definition: host.h:144
int wordrec_debug_level
Definition: wordrec.h:141
double heuristic_weight_rating
Definition: wordrec.h:134
bool enable_new_segsearch
Definition: wordrec.h:145
double wordrec_worst_state
Definition: wordrec.h:103
bool wordrec_no_block
Definition: wordrec.h:97
double chop_ok_split
Definition: wordrec.h:121
#define FALSE
Definition: capi.h:28
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:272
bool fragments_guide_chopper
Definition: wordrec.h:105
int chop_min_outline_points
Definition: wordrec.h:113
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:126
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:506
int segsearch_max_pain_points
Definition: wordrec.h:148
Dict & getDict()
Definition: classify.h:62
double chop_sharpness_knob
Definition: wordrec.h:119
bool force_word_assoc
Definition: wordrec.h:101
int segsearch_max_futile_classifications
Definition: wordrec.h:150
double chop_good_split
Definition: wordrec.h:122
bool wordrec_run_blamer
Definition: wordrec.h:143
int wordrec_num_seg_states
Definition: wordrec.h:102
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:336
int chop_min_outline_area
Definition: wordrec.h:115
bool use_new_state_cost
Definition: wordrec.h:128
double chop_split_dist_knob
Definition: wordrec.h:116
bool wordrec_debug_blamer
Definition: wordrec.h:142
bool save_alt_choices
Definition: wordrec.h:158
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:510
LanguageModel * language_model_
Definition: wordrec.h:495
bool wordrec_enable_assoc
Definition: wordrec.h:98
double heuristic_weight_seamcut
Definition: wordrec.h:138
int segsearch_debug_level
Definition: wordrec.h:146
double chop_center_knob
Definition: wordrec.h:118
bool chop_vertical_creep
Definition: wordrec.h:110
int chop_same_distance
Definition: wordrec.h:112
double heuristic_weight_width
Definition: wordrec.h:136
ParamsVectors * params()
Definition: ccutil.h:65
double heuristic_max_char_wh_ratio
Definition: wordrec.h:140
double chop_width_change_knob
Definition: wordrec.h:120
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:275
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:281
#define TRUE
Definition: capi.h:27
int repair_unchopped_blobs
Definition: wordrec.h:106
bool merge_fragments_in_matrix
Definition: wordrec.h:96
tesseract::Wordrec::~Wordrec ( )
virtual

Definition at line 144 of file wordrec.cpp.

// Destructor: frees the LanguageModel that the constructor allocated with
// `new` and stored in language_model_.
144  {
145  delete language_model_;
146 }
LanguageModel * language_model_
Definition: wordrec.h:495

Member Function Documentation

void tesseract::Wordrec::add_point_to_list ( POINT_GROUP  point_list,
EDGEPT point 
)

Definition at line 65 of file chop.cpp.

// Adds `point` to the heap `point_list`, keyed by point_priority(point).
// The point is silently dropped once the heap already holds
// MAX_NUM_POINTS - 2 or more entries.
65  {
66  HEAPENTRY data;
67 
// Only store while the heap is below its size limit.
68  if (SizeOfHeap (point_list) < MAX_NUM_POINTS - 2) {
69  data.Data = (char *) point;
70  data.Key = point_priority (point);
71  HeapStore(point_list, &data);
72  }
73 
// Debug display only; note the point is marked even when it was not stored.
74 #ifndef GRAPHICS_DISABLED
75  if (chop_debug > 2)
76  mark_outline(point);
77 #endif
78 }
void mark_outline(EDGEPT *edgept)
Definition: plotedges.cpp:95
#define MAX_NUM_POINTS
Definition: chop.h:38
#define SizeOfHeap(H)
Definition: oldheap.h:48
void HeapStore(HEAP *Heap, HEAPENTRY *Entry)
Definition: oldheap.cpp:234
void * Data
Definition: oldheap.h:34
FLOAT32 Key
Definition: oldheap.h:33
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:55
int tesseract::Wordrec::angle_change ( EDGEPT point1,
EDGEPT point2,
EDGEPT point3 
)

Definition at line 87 of file chop.cpp.

// Returns the signed turn, in whole degrees, between the segment
// point1->point2 and the segment point2->point3. The sign follows the sign
// of CROSS(vector1, vector2). Returns 0 when the computed length truncates
// to 0.
87  {
88  VECTOR vector1;
89  VECTOR vector2;
90 
91  int angle;
92  float length;
93 
94  /* Compute angle */
95  vector1.x = point2->pos.x - point1->pos.x;
96  vector1.y = point2->pos.y - point1->pos.y;
97  vector2.x = point3->pos.x - point2->pos.x;
98  vector2.y = point3->pos.y - point2->pos.y;
99  /* Use cross product */
// presumably LENGTH() yields a squared length, so the sqrt of the product is
// |vector1| * |vector2| - confirm in vecfuncs.h.
100  length = (float)sqrt((float)LENGTH(vector1) * LENGTH(vector2));
// The cast truncates, so any length below 1.0 is treated as zero here.
101  if ((int) length == 0)
102  return (0);
// asin gives radians; convert to degrees and round to nearest via floor(x + 0.5).
103  angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) /
104  length) / PI * 180.0 + 0.5));
105 
106  /* Use dot product */
// asin alone cannot tell acute from obtuse turns; a negative dot product
// selects the obtuse solution.
107  if (SCALAR (vector1, vector2) < 0)
108  angle = 180 - angle;
109  /* Adjust angle */
110  if (angle > 180)
111  angle -= 360;
// NOTE(review): since asin is limited to [-90, 90] degrees, angle appears to
// stay above -180 at this point, so this branch looks unreachable - confirm.
112  if (angle <= -180)
113  angle += 360;
114  return (angle);
115 }
#define PI
Definition: const.h:19
#define SCALAR(a, b)
Definition: vecfuncs.h:63
#define LENGTH(a)
Definition: vecfuncs.h:72
inT16 y
Definition: blobs.h:68
inT16 x
Definition: blobs.h:67
Definition: blobs.h:53
#define CROSS(a, b)
Definition: vecfuncs.h:54
TPOINT pos
Definition: blobs.h:100
SEAM * tesseract::Wordrec::attempt_blob_chop ( TWERD word,
TBLOB blob,
inT32  blob_number,
bool  italic_blob,
SEAMS  seam_list 
)

Definition at line 146 of file chopper.cpp.

// Tries to split `blob` into two blobs. A new, empty TBLOB is linked in right
// after `blob` to receive the second half, a seam is chosen and applied, and
// the result is validated. Returns the seam on success. On any failure the
// blob list is relinked to its original successor and NULL is returned.
// NOTE(review): this listing omits source lines 152-153 and 211-212, so some
// statements of the real function are not shown here.
147  {
148  TBLOB *next_blob = blob->next;
149  TBLOB *other_blob;
150  SEAM *seam;
151 
154  other_blob = new TBLOB; /* Make new blob */
155  other_blob->next = blob->next;
156  other_blob->outlines = NULL;
157  blob->next = other_blob;
158 
159  seam = NULL;
// Seam selection order: (1) simple division first when prioritize_division
// is set, (2) pick_good_seam, (3) simple division as a fallback for
// latin-script words.
160  if (prioritize_division) {
161  TPOINT location;
162  if (divisible_blob(blob, italic_blob, &location)) {
163  seam = new_seam(0.0f, location, NULL, NULL, NULL);
164  }
165  }
166  if (seam == NULL)
167  seam = pick_good_seam(blob);
168  if (seam == NULL && word->latin_script) {
169  // If the blob can simply be divided into outlines, then do that.
170  TPOINT location;
171  if (divisible_blob(blob, italic_blob, &location)) {
172  seam = new_seam(0.0f, location, NULL, NULL, NULL);
173  }
174  }
175  if (chop_debug) {
176  if (seam != NULL) {
177  print_seam ("Good seam picked=", seam);
178  }
179  else
180  cprintf ("\n** no seam picked *** \n");
181  }
182  if (seam) {
183  apply_seam(blob, other_blob, italic_blob, seam);
184  }
185 
// Reject the chop if there is no seam, either half ended up without outlines,
// or any of the containment / blob / seam-order / shared-point / insertion
// checks fails. Short-circuit evaluation means `seam` is non-NULL by the time
// the later checks dereference or pass it.
186  if ((seam == NULL) ||
187  (blob->outlines == NULL) ||
188  (other_blob->outlines == NULL) ||
189  total_containment (blob, other_blob) ||
190  check_blob (other_blob) ||
191  !(check_seam_order (blob, seam) &&
192  check_seam_order (other_blob, seam)) ||
193  any_shared_split_points (seam_list, seam) ||
194  !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
195 
196  blob->next = next_blob;
197  if (seam) {
// other_blob is not deleted on this path in the lines shown; presumably
// undo_seam() disposes of it - confirm in makechop.cpp.
198  undo_seam(blob, other_blob, seam);
199  delete_seam(seam);
200 #ifndef GRAPHICS_DISABLED
201  if (chop_debug) {
202  if (chop_debug >2)
203  display_blob(blob, Red);
204  cprintf ("\n** seam being removed ** \n");
205  }
206 #endif
207  } else {
// No seam was ever applied, so the still-empty second blob is simply freed.
208  delete other_blob;
209  }
210 
213  return (NULL);
214  }
215  return (seam);
216 }
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:129
inT16 check_seam_order(TBLOB *blob, SEAM *seam)
Definition: chopper.cpp:539
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:596
#define NULL
Definition: host.h:144
#define f(xc, yc)
Definition: imgscale.cpp:39
TBLOB * blobs
Definition: blobs.h:274
TESSLINE * outlines
Definition: blobs.h:227
int check_blob(TBLOB *blob)
Definition: chopper.cpp:306
bool latin_script
Definition: blobs.h:275
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:81
Definition: callcpp.h:35
Definition: blobs.h:53
Definition: blobs.h:174
inT16 total_containment(TBLOB *blob1, TBLOB *blob2)
Definition: chopper.cpp:1050
bool test_insert_seam(SEAMS seam_list, int index, TBLOB *left_blob, TBLOB *first_blob)
Definition: seam.cpp:213
void delete_seam(void *arg)
Definition: seam.cpp:154
void undo_seam(TBLOB *blob, TBLOB *other_blob, SEAM *seam)
Definition: makechop.cpp:168
void apply_seam(TBLOB *blob, TBLOB *other_blob, bool italic_blob, SEAM *seam)
Definition: makechop.cpp:52
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:64
SEAM * new_seam(PRIORITY priority, const TPOINT &location, SPLIT *split1, SPLIT *split2, SPLIT *split3)
Definition: seam.cpp:421
int any_shared_split_points(SEAMS seam_list, SEAM *seam)
Definition: chopper.cpp:289
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:380
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
bool prioritize_division
Definition: classify.h:354
void print_seam(const char *label, SEAM *seam)
Definition: seam.cpp:458
TBLOB * next
Definition: blobs.h:228
int repair_unchopped_blobs
Definition: wordrec.h:106
void tesseract::Wordrec::best_first_search ( CHUNKS_RECORD chunks_record,
BLOB_CHOICE_LIST_VECTOR best_char_choices,
WERD_RES word,
STATE state,
DANGERR fixpt,
STATE best_state 
)

Definition at line 88 of file bestfirst.cpp.

// Best-first search over segmentation states. Starting from `state`, it
// repeatedly evaluates the current state (unless it is already in the closed
// set), expands it into the open queue, and pops the next state until the
// queue is empty, evaluate_state() asks to stop, or more than
// wordrec_num_seg_states states have been seen. On exit the two parts of the
// search's best_state are copied into *state.
// NOTE(review): this listing omits source line 106 (between the comment block
// ending at 105 and the first evaluate_state call).
93  {
94  SEARCH_RECORD *the_search;
95  inT16 keep_going;
// guided_state is assigned below but never read in the lines shown.
96  STATE guided_state; // not used
97 
// This local num_joints is distinct from the `num_joints` the constructor
// zeroes; presumably that one is a class member being shadowed here - confirm.
98  int num_joints = chunks_record->ratings->dimension() - 1;
99  the_search = new_search(chunks_record, num_joints, best_char_choices,
100  word->best_choice, word->raw_choice, state);
101 
102  // The default state is initialized as the best choice. In order to apply
103  // segmentation adjustment, or any other contextual processing in permute,
104  // we give the best choice a poor rating to force the processed raw choice
105  // to be promoted to best choice.
107  evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle);
108  if (wordrec_debug_level > 1) {
109  tprintf("\n\n\n =========== BestFirstSearch ==============\n");
110  word->best_choice->print("**Initial BestChoice**");
111  }
112 
// Initial admission threshold: twice the priority of the start state, but
// never below wordrec_worst_state.
113  FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record, the_search);
114  if (worst_priority < wordrec_worst_state)
115  worst_priority = wordrec_worst_state;
116  if (wordrec_debug_level > 1) {
117  log_state("BestFirstSearch", num_joints, best_state);
118  }
119 
120  guided_state = *state;
121  do {
122  /* Look for answer */
// Copy kept only so the debug log below can print the state after it is freed.
123  STATE orig_state = *the_search->this_state;
124  if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
125  guided_state = *(the_search->this_state);
126  keep_going = evaluate_state(chunks_record, the_search, fixpt,
127  word->blamer_bundle);
128  hash_add (the_search->closed_states, the_search->this_state);
129 
130  if (!keep_going ||
131  (the_search->num_states > wordrec_num_seg_states)) {
132  if (wordrec_debug_level > 1)
133  tprintf("Breaking best_first_search on keep_going %s numstates %d\n",
134  ((keep_going) ? "T" :"F"), the_search->num_states);
135  free_state (the_search->this_state);
136  break;
137  }
138 
139  FLOAT32 new_worst_priority = 2.0f * prioritize_state(chunks_record,
140  the_search);
141  if (new_worst_priority < worst_priority) {
142  if (wordrec_debug_level > 1)
143  tprintf("Lowering WorstPriority %f --> %f\n",
144  worst_priority, new_worst_priority);
145  // Tighten the threshold for admitting new paths as better search
146  // candidates are found. After lowering this threshold, we can safely
147  // popout everything that is worse than this score also.
148  worst_priority = new_worst_priority;
149  }
150  expand_node(worst_priority, chunks_record, the_search);
151  }
152 
153  if (wordrec_debug_level > 1) {
154  log_state("Done with", the_search->num_joints, &orig_state);
155  }
// Release the state just processed and move on to the next open state.
156  free_state (the_search->this_state);
157  num_popped++;
158  the_search->this_state = pop_queue (the_search->open_states);
// NOTE(review): "evalaute" is a typo for "evaluate" in this log message.
159  if (wordrec_debug_level > 1 && !the_search->this_state)
160  tprintf("No more states to evalaute after %d evals", num_popped);
161  } while (the_search->this_state);
162 
// Report the best segmentation found back through the in/out `state` argument.
163  state->part1 = the_search->best_state->part1;
164  state->part2 = the_search->best_state->part2;
165  if (wordrec_debug_level > 1) {
166  tprintf("\n\n\n =========== BestFirstSearch ==============\n");
167  // best_choice->debug_string().string());
168  word->best_choice->print("**Final BestChoice**");
169  }
170  // save the best_state stats
171  delete_search(the_search);
172 }
SEARCH_RECORD * new_search(CHUNKS_RECORD *chunks_record, int num_joints, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice, STATE *state)
Definition: bestfirst.cpp:568
int hash_add(HASH_TABLE state_table, STATE *state)
Definition: closed.cpp:50
void set_rating(float new_val)
Definition: ratngs.h:255
WERD_CHOICE * best_choice
Definition: bestfirst.h:56
void expand_node(FLOAT32 worst_priority, CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
Definition: bestfirst.cpp:499
STATE * best_state
Definition: bestfirst.h:51
void free_state(STATE *)
int wordrec_debug_level
Definition: wordrec.h:141
double wordrec_worst_state
Definition: wordrec.h:103
uinT32 part1
Definition: states.h:41
float FLOAT32
Definition: host.h:111
int num_joints
Definition: bestfirst.h:52
int dimension() const
Definition: matrix.h:190
int wordrec_num_seg_states
Definition: wordrec.h:102
inT16 evaluate_state(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle)
Definition: bestfirst.cpp:256
const void print() const
Definition: ratngs.h:406
FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
Definition: heuristic.cpp:289
static const float kBadRating
Definition: ratngs.h:188
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
long num_states
Definition: bestfirst.h:53
void delete_search(SEARCH_RECORD *the_search)
Definition: bestfirst.cpp:179
HEAP * open_states
Definition: bestfirst.h:47
WERD_CHOICE * raw_choice
Definition: pageres.h:360
short inT16
Definition: host.h:100
STATE * this_state
Definition: bestfirst.h:49
uinT32 part2
Definition: states.h:42
int hash_lookup(HASH_TABLE state_table, STATE *state)
Definition: closed.cpp:86
MATRIX * ratings
Definition: associate.h:52
STATE * pop_queue(HEAP *queue)
Definition: bestfirst.cpp:607
Definition: states.h:39
HASH_TABLE closed_states
Definition: bestfirst.h:48
BlamerBundle * blamer_bundle
Definition: pageres.h:367
WERD_CHOICE * best_choice
Definition: pageres.h:359
BLOB_CHOICE_LIST * tesseract::Wordrec::call_matcher ( const DENORM denorm,
TBLOB blob 
)

Definition at line 143 of file tface.cpp.

// Classifies one blob with AdaptiveClassifier and returns a newly allocated
// BLOB_CHOICE_LIST holding the result.
// NOTE(review): the rendered signature names the parameters `denorm` and
// `blob`, but this body uses `tessblob` and treats `denorm` as a pointer
// (`&denorm`, `*denorm`, `delete denorm`); the definition's parameter list
// evidently differs from the declaration shown.
143  {
144  // Rotate the blob for classification if necessary.
// ClassifyNormalizeIfNeeded takes the address of `denorm`, so it may repoint
// it; a NULL return means no rotated copy was made.
145  TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded(&denorm);
146  if (rotated_blob == NULL) {
147  rotated_blob = tessblob;
148  }
149  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result
150  AdaptiveClassifier(rotated_blob, *denorm, ratings, NULL);
// Only when a rotated copy was made are the copy and `denorm` freed here;
// presumably ClassifyNormalizeIfNeeded allocated that DENORM alongside the
// rotated blob - confirm in blobs.cpp.
151  if (rotated_blob != tessblob) {
152  delete rotated_blob;
153  delete denorm;
154  }
155  return ratings;
156 }
void AdaptiveClassifier(TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
Definition: adaptmatch.cpp:178
#define NULL
Definition: host.h:144
Definition: blobs.h:174
TBLOB * ClassifyNormalizeIfNeeded(const DENORM **denorm) const
Definition: blobs.cpp:281
void tesseract::Wordrec::CallFillLattice ( const MATRIX ratings,
const LIST best_choices,
const UNICHARSET unicharset,
BlamerBundle blamer_bundle 
)
inline

Definition at line 183 of file wordrec.h.

// Forwards all four arguments to the member function that fill_lattice_
// points to, invoked on this object. There is no NULL check here;
// chop_word_main tests fill_lattice_ != NULL before calling.
184  {
185  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
186  }
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:510
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::cc_recog ( WERD_RES word)

Definition at line 117 of file tface.cpp.

// Recognizes `word` by delegating to chop_word_main and returns its result.
// NOTE(review): this listing omits source lines 118-120 and 122, so any setup
// or teardown around the call is not shown here.
117  {
121  BLOB_CHOICE_LIST_VECTOR *results = chop_word_main(word);
123  return results;
124 }
BlobMatchTable blob_match_table
Definition: wordrec.h:501
void InitChoiceAccum()
Definition: stopper.cpp:435
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
Dict & getDict()
Definition: classify.h:62
WERD * word
Definition: pageres.h:334
BLOB_CHOICE_LIST_VECTOR * chop_word_main(WERD_RES *word)
Definition: chopper.cpp:583
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
void DebugWordChoices()
Prints the current choices for this word to stdout.
Definition: stopper.cpp:330
Definition: werd.h:36
bool tesseract::Wordrec::ChoiceIsCorrect ( const UNICHARSET uni_set,
const WERD_CHOICE choice,
const GenericVector< STRING > &  truth_text 
)

Definition at line 159 of file wordrec.cpp.

// Returns true when the concatenation of all strings in truth_text equals the
// concatenation of the normalized unichars of every unichar id in `choice`.
// A NULL choice is never correct.
161  {
162  if (choice == NULL) return false;
163  int i;
164  STRING truth_str;
// Join the truth pieces into one string with no separator.
165  for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i];
166  STRING normed_choice_str;
// Build the choice's text from the normalized form of each unichar id.
167  for (i = 0; i < choice->length(); ++i) {
168  normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i));
169  }
170  return (truth_str == normed_choice_str);
171 }
int length() const
Definition: ratngs.h:214
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
#define NULL
Definition: host.h:144
Definition: strngs.h:40
int length() const
Definition: genericvector.h:63
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
void tesseract::Wordrec::choose_best_seam ( SEAM_QUEUE  seam_queue,
SEAM_PILE seam_pile,
SPLIT split,
PRIORITY  priority,
SEAM **  seam_result,
TBLOB blob 
)

Definition at line 178 of file findseam.cpp.

// Drives the seam search for `blob`. When `split` is non-NULL, a seam is
// built from it (located at the midpoint of its two end points) and queued.
// Seams are then popped from seam_queue one at a time: each gets its full
// priority from seam_priority(), the best acceptable one is cloned into
// *seam_result, and the popped seam is combined with the pile and pushed onto
// *seam_pile while the pile has fewer than MAX_NUM_SEAMS entries. The
// function returns as soon as a good enough seam is found or the best queued
// priority exceeds the thresholds.
183  {
184  SEAM *seam;
185  char str[80];
186  float my_priority;
187  /* Add seam of split */
188  my_priority = priority;
189  if (split != NULL) {
// Seam location = average of the split's two end points.
190  TPOINT split_point = split->point1->pos;
191  split_point += split->point2->pos;
192  split_point /= 2;
193  seam = new_seam(my_priority, split_point, split, NULL, NULL);
194  if (chop_debug > 1)
195  print_seam ("Partial priority ", seam);
196  add_seam_to_queue (seam_queue, seam, (float) my_priority);
197 
// The new seam stays queued, but no further work is done if its partial
// priority already exceeds chop_good_split.
198  if (my_priority > chop_good_split)
199  return;
200  }
201 
202  TBOX bbox = blob->bounding_box();
203  /* Queue loop */
// pop_next_seam is a macro; presumably it assigns both `seam` and
// `my_priority` - confirm in findseam.cpp.
204  while (pop_next_seam (seam_queue, seam, my_priority)) {
205  /* Set full priority */
206  my_priority = seam_priority (seam, bbox.left(), bbox.right());
207  if (chop_debug) {
208  sprintf (str, "Full my_priority %0.0f, ", my_priority);
209  print_seam(str, seam);
210  }
211 
// Candidate replaces the current answer only if it beats it (or there is
// none yet) and is below the chop_ok_split threshold.
212  if ((*seam_result == NULL || /* Replace answer */
213  (*seam_result)->priority > my_priority) && my_priority < chop_ok_split) {
214  /* No crossing */
215  if (constrained_split (seam->split1, blob)) {
216  delete_seam(*seam_result);
217  clone_seam(*seam_result, seam);
218  (*seam_result)->priority = my_priority;
219  }
220  else {
// The seam fails the constraint: discard it and mark the priority bad so
// it cannot trigger the "good answer" exit below.
221  delete_seam(seam);
222  seam = NULL;
223  my_priority = BAD_PRIORITY;
224  }
225  }
226 
227  if (my_priority < chop_good_split) {
// The popped seam is freed; *seam_result was filled via clone_seam above.
228  if (seam)
229  delete_seam(seam);
230  return; /* Made good answer */
231  }
232 
233  if (seam) {
234  /* Combine with others */
235  if (array_count (*seam_pile) < MAX_NUM_SEAMS
236  /*|| tessedit_truncate_chopper==0 */ ) {
237  combine_seam(seam_queue, *seam_pile, seam);
// array_push returns the array, so the caller's pile pointer is refreshed.
238  *seam_pile = array_push (*seam_pile, seam);
239  }
240  else
241  delete_seam(seam);
242  }
243 
// Stop when the best remaining queued seam is already too poor; the tighter
// chop_good_split bound applies when an explicit split was supplied.
244  my_priority = best_seam_priority (seam_queue);
245  if ((my_priority > chop_ok_split) ||
246  (my_priority > chop_good_split && split))
247  return;
248  }
249 }
void combine_seam(SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam)
Definition: findseam.cpp:259
ARRAY array_push(ARRAY array, void *value)
Definition: tessarray.cpp:98
#define pop_next_seam(seams, seam, priority)
Definition: findseam.cpp:122
#define NULL
Definition: host.h:144
inT16 constrained_split(SPLIT *split, TBLOB *blob)
Definition: findseam.cpp:343
inT16 left() const
Definition: rect.h:67
#define BAD_PRIORITY
Definition: findseam.cpp:52
double chop_ok_split
Definition: wordrec.h:121
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
#define best_seam_priority(seam_queue)
Definition: findseam.cpp:79
SPLIT * split1
Definition: seam.h:46
double chop_good_split
Definition: wordrec.h:122
Definition: blobs.h:53
void delete_seam(void *arg)
Definition: seam.cpp:154
#define add_seam_to_queue(seams, seam, priority)
Definition: findseam.cpp:64
#define clone_seam(dest, source)
Definition: seam.h:64
TBOX bounding_box() const
Definition: blobs.cpp:384
#define MAX_NUM_SEAMS
Definition: findseam.cpp:47
PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax)
Definition: findseam.cpp:474
EDGEPT * point1
Definition: split.h:39
TPOINT pos
Definition: blobs.h:100
EDGEPT * point2
Definition: split.h:40
SEAM * new_seam(PRIORITY priority, const TPOINT &location, SPLIT *split1, SPLIT *split2, SPLIT *split3)
Definition: seam.cpp:421
#define array_count(a)
Definition: tessarray.h:74
void print_seam(const char *label, SEAM *seam)
Definition: seam.cpp:458
SEAM * tesseract::Wordrec::chop_numbered_blob ( TWERD word,
inT32  blob_number,
bool  italic_blob,
SEAMS  seam_list 
)

Definition at line 219 of file chopper.cpp.

// Walks to the blob at index blob_number in word->blobs and returns the
// result of attempt_blob_chop on it.
220  {
221  TBLOB *blob;
222  inT16 x;
223 
224  blob = word->blobs;
// No NULL check while advancing: blob_number must be a valid index into the
// list. Note the counter is inT16 while blob_number is inT32.
225  for (x = 0; x < blob_number; x++)
226  blob = blob->next;
227 
228  return attempt_blob_chop(word, blob, blob_number,
229  italic_blob, seam_list);
230 }
TBLOB * blobs
Definition: blobs.h:274
Definition: blobs.h:174
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list)
Definition: chopper.cpp:146
short inT16
Definition: host.h:100
TBLOB * next
Definition: blobs.h:228
bool tesseract::Wordrec::chop_one_blob ( TWERD word,
BLOB_CHOICE_LIST_VECTOR char_choices,
inT32 blob_number,
SEAMS seam_list,
int *  right_chop_index 
)

Definition at line 441 of file chopper.cpp.

// Selects a blob with select_blob_to_split and tries to chop it, retrying
// with a lower rating ceiling until a chop succeeds or no candidate remains.
// On success the seam is inserted into *seam_list, the chopped blob's choice
// list is replaced by two fake classifications (left and right piece), and
// true is returned. Returns false when no blob can be selected or the
// selected blob has no choice list. *blob_number receives the selected index
// (-1 when none).
445  {
446  TBLOB *blob;
447  inT16 x = 0;
448  float rating_ceiling = MAX_FLOAT32;
449  BLOB_CHOICE_LIST *answer;
450  BLOB_CHOICE_IT answer_it;
451  SEAM *seam;
452  UNICHAR_ID unichar_id = 0;
453  int left_chop_index = 0;
454 
455  do {
456  *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false);
457  if (chop_debug)
458  cprintf("blob_number = %d\n", *blob_number);
459  if (*blob_number == -1)
460  return false;
461  seam = chop_numbered_blob(word, *blob_number, true, *seam_list);
462  if (seam != NULL)
463  break;
464  /* Must split null blobs */
465  answer = char_choices->get(*blob_number);
466  if (answer == NULL)
467  return false;
// The chop failed: use this blob's first rating as the new ceiling so the
// next selection picks a different blob.
468  answer_it.set_to_list(answer);
469  rating_ceiling = answer_it.data()->rating(); // try a different blob
470  } while (true);
471  /* Split OK */
// x was initialized to 0 at its declaration; walk to the chopped blob.
472  for (blob = word->blobs; x < *blob_number; x++) {
473  blob = blob->next;
474  }
475  if (chop_debug) {
476  tprintf("Chop made blob1:");
477  blob->bounding_box().print();
478  tprintf("and blob2:");
479  blob->next->bounding_box().print();
480  }
481  *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);
482 
483  answer = char_choices->get(*blob_number);
484  answer_it.set_to_list(answer);
485  unichar_id = answer_it.data()->unichar_id();
// Both pieces get a rating derived from the original one, scaled by 1/e.
486  float rating = answer_it.data()->rating() / exp(1.0);
// presumably the unichar text is a numeric chop index written earlier by
// modify_blob_choice - confirm against the caller.
487  left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));
488 
489  delete char_choices->get(*blob_number);
490  // combine confidence w/ serial #
491  answer = fake_classify_blob(0, rating, -rating);
492  modify_blob_choice(answer, left_chop_index);
// presumably insert() shifts the old (already deleted) entry to
// *blob_number + 1, where the set() below overwrites it - confirm against
// GenericVector.
493  char_choices->insert(answer, *blob_number);
494 
495  answer = fake_classify_blob(0, rating - 0.125f, -rating);
// Pre-increments the caller's right_chop_index and uses the new value.
496  modify_blob_choice(answer, ++*right_chop_index);
497  char_choices->set(answer, *blob_number + 1);
498  return true;
499 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
void set(T t, int index)
#define NULL
Definition: host.h:144
SEAMS insert_seam(SEAMS seam_list, int index, SEAM *seam, TBLOB *left_blob, TBLOB *first_blob)
Definition: seam.cpp:250
T & get(int index) const
inT16 select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:801
#define f(xc, yc)
Definition: imgscale.cpp:39
TBLOB * blobs
Definition: blobs.h:274
void insert(T t, int index)
BLOB_CHOICE_LIST * fake_classify_blob(UNICHAR_ID class_id, float rating, float certainty)
Definition: wordclass.cpp:136
SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list)
Definition: chopper.cpp:219
Definition: blobs.h:174
#define MAX_FLOAT32
Definition: host.h:124
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TBOX bounding_box() const
Definition: blobs.cpp:384
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
void modify_blob_choice(BLOB_CHOICE_LIST *answer, int chop_index)
Definition: chopper.cpp:403
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
void print() const
Definition: rect.h:263
TBLOB * next
Definition: blobs.h:228
bool tesseract::Wordrec::chop_one_blob2 ( const GenericVector< TBOX > &  boxes,
WERD_RES word_res,
SEAMS seam_list 
)

Definition at line 502 of file chopper.cpp.

// Chops one blob of word_res->chopped_word using chop_overlapping_blob and,
// on success, inserts the resulting seam into *seam_list. Returns false when
// no seam was produced, true otherwise.
504  {
505  inT32 blob_number;
506  inT16 x = 0;
507  TBLOB *blob;
508  SEAM *seam;
509 
// blob_number is an output of chop_overlapping_blob: the index of the blob
// that was chopped.
510  seam = chop_overlapping_blob(boxes, word_res, &blob_number,
511  true, *seam_list);
512  if (seam == NULL)
513  return false;
514 
515  /* Split OK */
// Walk to the chopped blob so it can be passed to insert_seam.
516  for (blob = word_res->chopped_word->blobs; x < blob_number; x++) {
517  blob = blob->next;
518  }
519  if (chop_debug) {
520  tprintf("Chop made blob1:");
521  blob->bounding_box().print();
522  tprintf("and blob2:");
523  blob->next->bounding_box().print();
524  }
525  *seam_list = insert_seam(*seam_list, blob_number, seam, blob,
526  word_res->chopped_word->blobs);
527  return true;
528 }
#define NULL
Definition: host.h:144
SEAMS insert_seam(SEAMS seam_list, int index, SEAM *seam, TBLOB *left_blob, TBLOB *first_blob)
Definition: seam.cpp:250
int inT32
Definition: host.h:102
TBLOB * blobs
Definition: blobs.h:274
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, WERD_RES *word_res, inT32 *blob_number, bool italic_blob, SEAMS seam_list)
Definition: chopper.cpp:233
Definition: blobs.h:174
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TBOX bounding_box() const
Definition: blobs.cpp:384
short inT16
Definition: host.h:100
TWERD * chopped_word
Definition: pageres.h:357
void print() const
Definition: rect.h:263
TBLOB * next
Definition: blobs.h:228
SEAM * tesseract::Wordrec::chop_overlapping_blob ( const GenericVector< TBOX > &  boxes,
WERD_RES word_res,
inT32 blob_number,
bool  italic_blob,
SEAMS  seam_list 
)

Definition at line 233 of file chopper.cpp.

// Scans the blobs of word_res->chopped_word in order and attempts to chop the
// first one that is either divisible (divisible_blob) or overlaps more than
// one of `boxes` without closely matching any single box. Returns the seam of
// the first successful chop with *blob_number set to that blob's index;
// returns NULL with *blob_number == -1 when no blob could be chopped.
235  {
236  TWERD *word = word_res->chopped_word;
237  TBLOB *blob;
238 
239  *blob_number = 0;
240  blob = word->blobs;
241  while (blob != NULL) {
242  TPOINT topleft, botright;
243  topleft.x = blob->bounding_box().left();
244  topleft.y = blob->bounding_box().top();
245  botright.x = blob->bounding_box().right();
246  botright.y = blob->bounding_box().bottom();
247 
// Map the blob's box corners through the word's DENORM so they can be
// compared with `boxes`.
248  TPOINT original_topleft, original_botright;
249  word_res->denorm.DenormTransform(topleft, &original_topleft);
250  word_res->denorm.DenormTransform(botright, &original_botright);
251 
252  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
253  original_botright.x, original_topleft.y);
254 
// Count boxes overlapped by more than 0.125 (overlap_fraction) and note
// whether any box matches almost exactly (tolerance 3).
255  bool almost_equal_box = false;
256  int num_overlap = 0;
257  for (int i = 0; i < boxes.size(); i++) {
258  if (original_box.overlap_fraction(boxes[i]) > 0.125)
259  num_overlap++;
260  if (original_box.almost_equal(boxes[i], 3))
261  almost_equal_box = true;
262  }
263 
// `location` is only an output slot for divisible_blob; it is not used here.
264  TPOINT location;
265  if (divisible_blob(blob, italic_blob, &location) ||
266  (!almost_equal_box && num_overlap > 1)) {
267  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
268  italic_blob, seam_list);
269  if (seam != NULL)
270  return seam;
271  }
272 
273  *blob_number = *blob_number + 1;
274  blob = blob->next;
275  }
276 
277  *blob_number = -1;
278  return NULL;
279 }
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:596
#define NULL
Definition: host.h:144
Definition: blobs.h:233
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
TBLOB * blobs
Definition: blobs.h:274
void DenormTransform(const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:233
inT16 right() const
Definition: rect.h:74
inT16 y
Definition: blobs.h:68
inT16 x
Definition: blobs.h:67
Definition: blobs.h:53
Definition: blobs.h:174
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list)
Definition: chopper.cpp:146
inT16 top() const
Definition: rect.h:53
DENORM denorm
Definition: pageres.h:346
TBOX bounding_box() const
Definition: blobs.cpp:384
int size() const
Definition: genericvector.h:59
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:249
TWERD * chopped_word
Definition: pageres.h:357
TBLOB * next
Definition: blobs.h:228
inT16 bottom() const
Definition: rect.h:60
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::chop_word_main ( WERD_RES word)

Definition at line 583 of file chopper.cpp.

// Top-level word recognition by chopping. Classifies every blob of
// word->chopped_word, lets the dictionary permute the choices, and if the
// result is not acceptable runs improve_by_chopping (when chop_enable is set)
// and, depending on the flags, word_associator. It then rebuilds the current
// state into best_char_choices, assigns blame and fills the lattice when a
// blamer bundle is present, frees the ratings matrix and char_choices, and
// returns best_char_choices.
// NOTE(review): this listing omits source lines 658, 663, 665, 680, 686,
// 694-695 and 714, so several conditions and statements below appear
// truncated; consult chopper.cpp for the full text.
583  {
584  TBLOB *blob;
585  int index;
586  int did_chopping;
587  STATE state;
588  BLOB_CHOICE_LIST *match_result;
589  MATRIX *ratings = NULL;
590  DANGERR fixpt; /*dangerous ambig */
591  inT32 bit_count; //no of bits
592 
593  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
594  BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();
595 
596  did_chopping = 0;
// Classify each blob once and collect one choice list per blob.
597  for (blob = word->chopped_word->blobs, index = 0;
598  blob != NULL; blob = blob->next, index++) {
599  match_result = classify_blob(blob, word->denorm, "chop_word:", Green,
600  word->blamer_bundle);
// A NULL result is only logged; it is still appended to char_choices.
601  if (match_result == NULL)
602  cprintf("Null classifier output!\n");
603  *char_choices += match_result;
604  }
// bit_count and did_chopping are assigned but not read in the lines shown.
605  bit_count = index - 1;
606  set_n_ones(&state, char_choices->length() - 1);
607  bool acceptable = false;
608  bool replaced = false;
609  bool best_choice_updated =
610  getDict().permute_characters(*char_choices, word->best_choice,
611  word->raw_choice);
612  if (best_choice_updated &&
613  getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
614  CHOPPER_CALLER, &replaced)) {
615  acceptable = true;
616  }
617  if (replaced)
618  update_blob_classifications(word->chopped_word, *char_choices);
619  CopyCharChoices(*char_choices, best_char_choices);
620  if (!acceptable) { // do more work to find a better choice
621  did_chopping = 1;
622 
623  bool best_choice_acceptable = false;
624  if (chop_enable)
625  improve_by_chopping(word,
626  char_choices,
627  &state,
628  best_char_choices,
629  &fixpt,
630  &best_choice_acceptable);
631  if (chop_debug)
632  print_seams ("Final seam list:", word->seam_array);
633 
634  if (word->blamer_bundle != NULL &&
635  !ChoiceIsCorrect(*word->uch_set, word->best_choice,
636  word->blamer_bundle->truth_text)) {
637  set_chopper_blame(word);
638  }
639 
640  // The force_word_assoc is almost redundant to enable_assoc. However,
641  // it is not conditioned on the dict behavior. For CJK, we need to force
642  // the associator to be invoked. When we figure out the exact behavior
643  // of dict on CJK, we can remove the flag if it turns out to be redundant.
644  if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) {
// `state` is passed as both the input state and the best_state output.
645  ratings = word_associator(false, word, &state, best_char_choices,
646  &fixpt, &state);
647  }
648  }
649  best_char_choices = rebuild_current_state(word, &state, best_char_choices,
650  ratings);
651 
652  // If after running only the chopper best_choice is incorrect and no blame
653  // has been yet set, blame the classifier if best_choice is classifier's
654  // top choice and is a dictionary word (i.e. language model could not have
655  // helped). Otherwise blame the tradeoff between the classifier and
656  // the old language model (permuters).
657  if (word->blamer_bundle != NULL &&
659  ratings == NULL && // only the chopper was run
660  !ChoiceIsCorrect(*word->uch_set, word->best_choice,
661  word->blamer_bundle->truth_text)) {
662  if (word->best_choice != NULL &&
664  // Find out whether best choice is a top choice.
666  for (int i = 0; i < word->best_choice->length(); ++i) {
667  BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i));
668  ASSERT_HOST(!blob_choice_it.empty());
669  BLOB_CHOICE *first_choice = NULL;
670  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
671  blob_choice_it.forward()) { // find first non-fragment choice
672  if (!(getDict().getUnicharset().get_fragment(
673  blob_choice_it.data()->unichar_id()))) {
674  first_choice = blob_choice_it.data();
675  break;
676  }
677  }
678  ASSERT_HOST(first_choice != NULL);
679  if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
681  break;
682  }
683  }
684  }
685  STRING debug;
687  debug = "Best choice is: incorrect, top choice, dictionary word";
688  debug += " with permuter ";
689  debug += word->best_choice->permuter_name();
690  } else {
691  debug = "Classifier/Old LM tradeoff is to blame";
692  }
693  word->blamer_bundle->SetBlame(
696  debug, word->best_choice, wordrec_debug_blamer);
697  }
698 
699  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
// The lattice needs a ratings matrix; when the associator did not run, one
// is created here by calling word_associator with its first flag set to true.
700  if (ratings == NULL) {
701  ratings = word_associator(true, word, NULL, NULL, NULL, NULL);
702  }
703  CallFillLattice(*ratings, getDict().getBestChoices(),
704  *word->uch_set, word->blamer_bundle);
705  }
706  if (ratings != NULL) {
707  if (wordrec_debug_level > 0) {
708  tprintf("Final Ratings Matrix:\n");
709  ratings->print(getDict().getUnicharset());
710  }
// The matrix's entries are released first, then the matrix itself.
711  ratings->delete_matrix_pointers();
712  delete ratings;
713  }
715  // TODO(antonova, eger): check that FilterWordChoices() does not filter
716  // out anything useful for word bigram or phrase search.
717  // TODO(antonova, eger): when implementing word bigram and phrase search
718  // we will need to think carefully about how to replace a word with its
719  // alternative choice.
720  // In particular it might be required to save the segmentation state
721  // associated with the word, so that best_char_choices could be updated
722  // by rebuild_current_state() correctly.
723  if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word);
// char_choices is purely local: free its lists and the vector before returning.
724  char_choices->delete_data_pointers();
725  delete char_choices;
726 
727  return best_char_choices;
728 }
int length() const
Definition: ratngs.h:214
void delete_data_pointers()
bool ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
Definition: wordrec.cpp:159
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
void set_chopper_blame(WERD_RES *word)
Definition: chopper.cpp:917
void SaveAltChoices(const LIST &best_choices, WERD_RES *word)
Definition: wordrec.cpp:173
bool best_choice_is_dict_and_top_choice
Definition: pageres.h:194
MATRIX * word_associator(bool only_create_ratings_matrtix, WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, STATE *best_state)
Definition: chopper.cpp:984
#define NULL
Definition: host.h:144
void CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to)
Definition: wordrec.cpp:148
int wordrec_debug_level
Definition: wordrec.h:141
T & get(int index) const
int inT32
Definition: host.h:102
void print_seams(const char *label, SEAMS seams)
Definition: seam.cpp:485
TBLOB * blobs
Definition: blobs.h:274
GenericVector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:449
void CallFillLattice(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:183
SEAMS seam_array
Definition: pageres.h:358
void set_n_ones(STATE *state, int n)
Definition: states.cpp:263
Dict & getDict()
Definition: classify.h:62
bool force_word_assoc
Definition: wordrec.h:101
Definition: callcpp.h:37
const UNICHARSET * uch_set
Definition: pageres.h:348
uinT8 permuter() const
Definition: ratngs.h:237
GenericVector< STRING > truth_text
Definition: pageres.h:174
bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
Definition: permute.cpp:765
Definition: blobs.h:174
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
void improve_by_chopping(WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *updated_best_choice)
Definition: chopper.cpp:741
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void delete_matrix_pointers()
Definition: matrix.h:134
bool wordrec_debug_blamer
Definition: wordrec.h:142
DENORM denorm
Definition: pageres.h:346
bool save_alt_choices
Definition: wordrec.h:158
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:36
WERD_CHOICE * raw_choice
Definition: pageres.h:360
Definition: strngs.h:40
BLOB_CHOICE_LIST_VECTOR * rebuild_current_state(WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *char_choices, MATRIX *ratings)
Definition: bestfirst.cpp:332
const char * permuter_name() const
Definition: ratngs.cpp:174
int length() const
Definition: genericvector.h:63
Definition: matrix.h:193
void FilterWordChoices()
Definition: stopper.cpp:375
TWERD * chopped_word
Definition: pageres.h:357
void update_blob_classifications(TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices)
Definition: wordclass.cpp:152
bool wordrec_enable_assoc
Definition: wordrec.h:98
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: states.h:39
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:679
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
BlamerBundle * blamer_bundle
Definition: pageres.h:367
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:62
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob ( TBLOB blob,
const DENORM denorm,
const char *  string,
C_COL  color,
BlamerBundle blamer_bundle 
)

Definition at line 62 of file wordclass.cpp.

64  {
65  fflush(stdout);
66  BLOB_CHOICE_LIST *choices = NULL;
67 #ifndef GRAPHICS_DISABLED
69  display_blob(blob, color);
70 #endif
71  choices = blob_match_table.get_match(blob);
72  if (choices == NULL) {
73  choices = call_matcher(&denorm, blob);
74  blob_match_table.put_match(blob, choices);
75  // If a blob with the same bounding box as one of the truth character
76  // bounding boxes is not classified as the corresponding truth character
77  // blame character classifier for incorrect answer.
78  if (blamer_bundle != NULL && blamer_bundle->truth_has_char_boxes &&
79  blamer_bundle->incorrect_result_reason == IRR_CORRECT) {
80  for (int b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
81  const TBOX &truth_box = blamer_bundle->norm_truth_word.BlobBox(b);
82  const TBOX &blob_box = blob->bounding_box();
83  // Note that we are more strict on the bounding box boundaries here
84  // than in other places (chopper, segmentation search), since we do
85  // not have the ability to check the previous and next bounding box.
86  if (blob_box.x_almost_equal(truth_box,
87  blamer_bundle->norm_box_tolerance/2)) {
88  BLOB_CHOICE_IT choices_it(choices);
89  bool found = false;
90  bool incorrect_adapted = false;
91  UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
92  const char *truth_str = blamer_bundle->truth_text[b].string();
93  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
94  choices_it.forward()) {
95  if (strcmp(truth_str, getDict().getUnicharset().get_normed_unichar(
96  choices_it.data()->unichar_id())) == 0) {
97  found = true;
98  break;
99  } else if (choices_it.data()->adapted()) {
100  incorrect_adapted = true;
101  incorrect_adapted_id = choices_it.data()->unichar_id();
102  }
103  } // end choices_it for loop
104  if (!found) {
105  STRING debug = "unichar ";
106  debug += truth_str;
107  debug += " not found in classification list";
108  blamer_bundle->SetBlame(IRR_CLASSIFIER, debug,
110  } else if (incorrect_adapted) {
111  STRING debug = "better rating for adapted ";
112  debug += getDict().getUnicharset().id_to_unichar(
113  incorrect_adapted_id);
114  debug += " than for correct ";
115  debug += truth_str;
116  blamer_bundle->SetBlame(IRR_ADAPTION, debug,
118  }
119  break;
120  }
121  } // end iterating over blamer_bundle->norm_truth_word
122  }
123  }
124 #ifndef GRAPHICS_DISABLED
125  if (classify_debug_level && string)
126  print_ratings_list(string, choices, getDict().getUnicharset());
127 
128  if (wordrec_blob_pause)
130 #endif
131 
132  return (choices);
133 }
bool wordrec_blob_pause
Definition: render.cpp:53
bool wordrec_display_all_blobs
Definition: render.cpp:49
const int length() const
Definition: boxword.h:99
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
bool x_almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:244
void put_match(TBLOB *blob, BLOB_CHOICE_LIST *ratings)
Definition: matchtab.cpp:87
ScrollView * blob_window
Definition: render.cpp:43
BlobMatchTable blob_match_table
Definition: wordrec.h:501
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
Definition: rect.h:29
char window_wait(ScrollView *win)
Definition: callcpp.cpp:112
Dict & getDict()
Definition: classify.h:62
tesseract::BoxWord norm_truth_word
Definition: pageres.h:170
BLOB_CHOICE_LIST * call_matcher(const DENORM *denorm, TBLOB *blob)
Definition: tface.cpp:143
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
GenericVector< STRING > truth_text
Definition: pageres.h:174
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
bool wordrec_debug_blamer
Definition: wordrec.h:142
TBOX bounding_box() const
Definition: blobs.cpp:384
Definition: strngs.h:40
bool truth_has_char_boxes
Definition: pageres.h:164
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
BLOB_CHOICE_LIST * get_match(TBLOB *blob)
Definition: matchtab.cpp:118
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:64
int norm_box_tolerance
Definition: pageres.h:172
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_piece ( TBLOB pieces,
const DENORM denorm,
SEAMS  seams,
inT16  start,
inT16  end,
BlamerBundle blamer_bundle 
)
virtual

Definition at line 75 of file pieces.cpp.

80  {
81  BLOB_CHOICE_LIST *choices;
82  TBLOB *blob;
83  inT16 x;
84 
85  join_pieces(pieces, seams, start, end);
86  for (blob = pieces, x = 0; x < start; x++) {
87  blob = blob->next;
88  }
89  choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle);
90 
91  break_pieces(blob, seams, start, end);
92 #ifndef GRAPHICS_DISABLED
94  STATE current_state;
95  SEARCH_STATE chunk_groups;
96  set_n_ones (&current_state, array_count(seams));
97  chunk_groups = bin_to_chunks(&current_state, array_count(seams));
98  display_segmentation(pieces, chunk_groups);
100  memfree(chunk_groups);
101  }
102 #endif
103 
104  return (choices);
105 }
void memfree(void *element)
Definition: freelist.cpp:30
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end)
Definition: seam.cpp:535
Definition: callcpp.h:34
int wordrec_display_segmentations
Definition: plotseg.cpp:48
char window_wait(ScrollView *win)
Definition: callcpp.cpp:112
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end)
Definition: seam.cpp:564
SEARCH_STATE bin_to_chunks(STATE *state, int num_joints)
Definition: states.cpp:49
void set_n_ones(STATE *state, int n)
Definition: states.cpp:263
void display_segmentation(TBLOB *chunks, SEARCH_STATE segmentation)
Definition: plotseg.cpp:58
ScrollView * segm_window
Definition: plotseg.cpp:46
Definition: blobs.h:174
short inT16
Definition: host.h:100
#define array_count(a)
Definition: tessarray.h:74
Definition: states.h:39
TBLOB * next
Definition: blobs.h:228
int * SEARCH_STATE
Definition: states.h:46
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:62
void tesseract::Wordrec::combine_seam ( SEAM_QUEUE  seam_queue,
SEAM_PILE  seam_pile,
SEAM seam 
)

tessedit_fix_sideways_chops ||

Definition at line 259 of file findseam.cpp.

// Tries to pair the given seam with every seam already in seam_pile and
// queues each acceptable combination on seam_queue.
//   seam_queue - queue that receives the joined seams (add_seam_to_queue).
//   seam_pile  - array of previously found seams to combine with.
//   seam       - the new seam; seam->split1 is dereferenced unconditionally,
//                seam->split2 may be NULL.
260  {
261  register inT16 x;
262  register inT16 dist;
263  inT16 bottom1, top1;
264  inT16 bottom2, top2;
265 
266  SEAM *new_one;
267  SEAM *this_one;
268 
// Vertical extent [bottom1, top1] of seam->split1, taken from the y values
// of its two end points (swapped if point2 is the lower one).
269  bottom1 = seam->split1->point1->pos.y;
270  if (seam->split1->point2->pos.y >= bottom1)
271  top1 = seam->split1->point2->pos.y;
272  else {
273  top1 = bottom1;
274  bottom1 = seam->split1->point2->pos.y;
275  }
// Same for seam->split2 when present; otherwise reuse the split1 extent so
// the tests below against [bottom2, top2] repeat the split1 tests.
276  if (seam->split2 != NULL) {
277  bottom2 = seam->split2->point1->pos.y;
278  if (seam->split2->point2->pos.y >= bottom2)
279  top2 = seam->split2->point2->pos.y;
280  else {
281  top2 = bottom2;
282  bottom2 = seam->split2->point2->pos.y;
283  }
284  }
285  else {
286  bottom2 = bottom1;
287  top2 = top1;
288  }
289  array_loop(seam_pile, x) {
290  this_one = (SEAM *) array_value (seam_pile, x);
291  dist = seam->location.x - this_one->location.x;
// Candidate only if the two seams are horizontally within SPLIT_CLOSENESS
// of each other and their summed priority is below chop_ok_split.
292  if (-SPLIT_CLOSENESS < dist &&
293  dist < SPLIT_CLOSENESS &&
294  seam->priority + this_one->priority < chop_ok_split) {
295  inT16 split1_point1_y = this_one->split1->point1->pos.y;
296  inT16 split1_point2_y = this_one->split1->point2->pos.y;
297  inT16 split2_point1_y = 0;
298  inT16 split2_point2_y = 0;
299  if (this_one->split2) {
300  split2_point1_y = this_one->split2->point1->pos.y;
301  split2_point2_y = this_one->split2->point2->pos.y;
302  }
// Accept the pair only when each split of this_one has both end points
// at/above the top or both at/below the bottom of seam's split1 extent, and
// likewise for seam's split2 extent. The split2 half of the test is skipped
// when this_one has no second split.
303  if (
305  (
306  /* this_one->split1 always exists */
307  (
308  ((split1_point1_y >= top1 && split1_point2_y >= top1) ||
309  (split1_point1_y <= bottom1 && split1_point2_y <= bottom1))
310  &&
311  ((split1_point1_y >= top2 && split1_point2_y >= top2) ||
312  (split1_point1_y <= bottom2 && split1_point2_y <= bottom2))
313  )
314  )
315  &&
316  (
317  this_one->split2 == NULL ||
318  (
319  ((split2_point1_y >= top1 && split2_point2_y >= top1) ||
320  (split2_point1_y <= bottom1 && split2_point2_y <= bottom1))
321  &&
322  ((split2_point1_y >= top2 && split2_point2_y >= top2) ||
323  (split2_point1_y <= bottom2 && split2_point2_y <= bottom2))
324  )
325  )
326  ) {
// Build the combined seam and queue it under its own priority.
327  new_one = join_two_seams (seam, this_one);
328  if (chop_debug > 1)
329  print_seam ("Combo priority ", new_one);
330  add_seam_to_queue (seam_queue, new_one, new_one->priority);
331  }
332  }
333  }
334 }
PRIORITY priority
Definition: seam.h:42
SPLIT * split2
Definition: seam.h:47
#define NULL
Definition: host.h:144
SEAM * join_two_seams(SEAM *seam1, SEAM *seam2)
Definition: seam.cpp:396
double chop_ok_split
Definition: wordrec.h:121
inT16 y
Definition: blobs.h:68
inT16 x
Definition: blobs.h:67
SPLIT * split1
Definition: seam.h:46
#define add_seam_to_queue(seams, seam, priority)
Definition: findseam.cpp:64
short inT16
Definition: host.h:100
EDGEPT * point1
Definition: split.h:39
TPOINT pos
Definition: blobs.h:100
EDGEPT * point2
Definition: split.h:40
TPOINT location
Definition: seam.h:45
#define SPLIT_CLOSENESS
Definition: findseam.cpp:45
#define array_value(a, i)
Definition: tessarray.h:132
void print_seam(const char *label, SEAM *seam)
Definition: seam.cpp:458
#define array_loop(a, x)
Definition: tessarray.h:114
inT16 tesseract::Wordrec::constrained_split ( SPLIT split,
TBLOB blob 
)

Definition at line 343 of file findseam.cpp.

// Decides whether a proposed split is acceptable for the given blob.
// Returns FALSE when is_little_chunk() reports true for the split's two end
// points, or when the split both overlaps the bounds of and crosses any
// outline of the blob; returns TRUE otherwise.
343  {
344  TESSLINE *outline;
345 
346  if (is_little_chunk (split->point1, split->point2))
347  return (FALSE);
348 
// Check every outline in the blob's linked list; the cheaper bounds-overlap
// test is evaluated first and short-circuits the crossing test.
349  for (outline = blob->outlines; outline; outline = outline->next) {
350  if (split_bounds_overlap (split, outline) &&
351  crosses_outline (split->point1, split->point2, outline->loop)) {
352  return (FALSE);
353  }
354  }
355  return (TRUE);
356 }
#define split_bounds_overlap(split, outline)
Definition: gradechop.h:63
TESSLINE * next
Definition: blobs.h:171
int is_little_chunk(EDGEPT *point1, EDGEPT *point2)
Definition: chop.cpp:123
#define FALSE
Definition: capi.h:28
TESSLINE * outlines
Definition: blobs.h:227
int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline)
Definition: outlines.cpp:48
EDGEPT * point1
Definition: split.h:39
EDGEPT * point2
Definition: split.h:40
EDGEPT * loop
Definition: blobs.h:170
#define TRUE
Definition: capi.h:27
void tesseract::Wordrec::CopyCharChoices ( const BLOB_CHOICE_LIST_VECTOR from,
BLOB_CHOICE_LIST_VECTOR to 
)

Definition at line 148 of file wordrec.cpp.

// Replaces the contents of *to with deep copies of every list in from.
// The existing entries of *to are first passed to delete_data_pointers() and
// the vector is cleared, so *to must not alias from.
// NOTE(review): the new lists are allocated with new and stored as raw
// pointers; presumably the owner of *to releases them later via
// delete_data_pointers() - confirm against callers.
149  {
150  to->delete_data_pointers();
151  to->clear();
152  for (int i = 0; i < from.size(); ++i) {
153  BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
154  cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy);
155  to->push_back(cc_list);
156  }
157 }
void delete_data_pointers()
virtual void clear()
int push_back(T object)
int size() const
Definition: genericvector.h:59
static BLOB_CHOICE * deep_copy(const BLOB_CHOICE *src)
Definition: ratngs.h:126
int tesseract::Wordrec::crosses_outline ( EDGEPT p0,
EDGEPT p1,
EDGEPT outline 
)

Definition at line 48 of file outlines.cpp.

// Tests whether the segment p0-p1 crosses any edge of an outline.
// Starting at outline, each edge (pt, pt->next) is passed to is_crossed()
// together with p0->pos and p1->pos; TRUE is returned on the first hit.
// The walk stops when pt comes back to outline, so the point list must be
// circular. Returns FALSE if no edge is crossed.
50  { /* Outline to check */
51  EDGEPT *pt = outline;
52  do {
53  if (is_crossed (p0->pos, p1->pos, pt->pos, pt->next->pos))
54  return (TRUE);
55  pt = pt->next;
56  }
57  while (pt != outline);
58  return (FALSE);
59 }
EDGEPT * next
Definition: blobs.h:106
#define FALSE
Definition: capi.h:28
Definition: blobs.h:72
int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1)
Definition: outlines.cpp:70
TPOINT pos
Definition: blobs.h:100
#define TRUE
Definition: capi.h:27
void tesseract::Wordrec::delete_seam_pile ( SEAM_PILE  seam_pile)

Definition at line 365 of file findseam.cpp.

// Releases a seam pile: calls delete_seam() on every element of seam_pile
// and then frees the array itself with array_free().
// seam_pile must not be used by the caller afterwards.
365  {
366  inT16 x;
367 
368  array_loop(seam_pile, x) {
369  delete_seam ((SEAM *) array_value (seam_pile, x));
370  }
371  array_free(seam_pile);
372 }
#define array_free
Definition: tessarray.h:83
void delete_seam(void *arg)
Definition: seam.cpp:154
short inT16
Definition: host.h:100
#define array_value(a, i)
Definition: tessarray.h:132
#define array_loop(a, x)
Definition: tessarray.h:114
void tesseract::Wordrec::delete_search ( SEARCH_RECORD the_search)

delete_search

Terminate the current search and free all the memory involved.

Definition at line 179 of file bestfirst.cpp.

// Tears down a SEARCH_RECORD: frees its first and best states, its closed
// hash table, its open-state heap (each entry via free_state) and finally
// the record itself with memfree().
179  {
180  float closeness;
181 
// NOTE(review): closeness (hamming distance between first_state and
// best_state divided by num_joints, or 0 when there are no joints) is
// computed here but never read in the visible code - looks like a leftover
// from removed debug output; confirm before deleting.
182  closeness = (the_search->num_joints ?
183  (hamming_distance(reinterpret_cast<uinT32*>(the_search->first_state),
184  reinterpret_cast<uinT32*>(the_search->best_state), 2) /
185  (float) the_search->num_joints) : 0.0f);
186 
187  free_state (the_search->first_state);
188  free_state (the_search->best_state);
189 
190  free_hash_table(the_search->closed_states);
191  FreeHeapData (the_search->open_states, (void_dest) free_state);
192 
193  memfree(the_search);
194 }
STATE * best_state
Definition: bestfirst.h:51
void(* void_dest)(void *)
Definition: cutil.h:72
void memfree(void *element)
Definition: freelist.cpp:30
void free_state(STATE *)
void FreeHeapData(HEAP *Heap, void_dest destructor)
Definition: oldheap.cpp:327
int num_joints
Definition: bestfirst.h:52
STATE * first_state
Definition: bestfirst.h:50
int hamming_distance(uinT32 *array1, uinT32 *array2, int length)
Definition: bitvec.cpp:81
HEAP * open_states
Definition: bestfirst.h:47
HASH_TABLE closed_states
Definition: bestfirst.h:48
#define free_hash_table(table)
Definition: closed.h:35
int tesseract::Wordrec::dict_word ( const WERD_CHOICE word)

Definition at line 133 of file tface.cpp.

// Thin wrapper: forwards word to getDict().valid_word() and returns that
// call's int result unchanged.
133  {
134  return getDict().valid_word(word);
135 }
Dict & getDict()
Definition: classify.h:62
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:807
int tesseract::Wordrec::end_recog ( )

Definition at line 67 of file tface.cpp.

// Shuts recognition down by calling program_editdown() with an elapsed time
// argument of 0. Always returns 0.
67  {
68  program_editdown (0);
69 
70  return (0);
71 }
void program_editdown(inT32 elasped_time)
Definition: tface.cpp:80
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::evaluate_chunks ( CHUNKS_RECORD chunks_record,
SEARCH_STATE  search_state,
BlamerBundle blamer_bundle 
)

evaluate_chunks

A particular word-level segmentation has been chosen. Evaluate it to find the word list that corresponds to it.

Definition at line 203 of file bestfirst.cpp.

// Builds the per-character choice lists for one segmentation of the word.
//   chunks_record - supplies the chunks, ratings matrix, splits, widths and
//                   the word's denorm used for classification.
//   search_state  - search_state[0] is read as the number of entries that
//                   follow; each entry search_state[i] is added to the
//                   current start chunk x to obtain the piece's last chunk.
//   blamer_bundle - passed through to get_piece_rating().
// Returns a newly allocated vector with one BLOB_CHOICE_LIST* per piece, or
// NULL if any piece could not be rated. As a side effect the certainty,
// match, width and gap of each piece are stored in last_segmentation.
205  {
206  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
207  BLOB_CHOICE_LIST *blob_choices;
208  BLOB_CHOICE_IT blob_choice_it;
209  int i;
210  int x = 0;
211  int y;
212 
213  // Iterate sub-paths.
214  for (i = 1; i <= search_state[0] + 1; i++) {
// The final iteration (i past the stored entries) covers everything up to
// the last chunk in the word.
215  if (i > search_state[0])
216  y = count_blobs (chunks_record->chunks) - 1;
217  else
218  y = x + search_state[i];
219 
220  // Process one square.
221 
222  // Classify if needed.
223  blob_choices = get_piece_rating(chunks_record->ratings,
224  chunks_record->chunks,
225  chunks_record->word_res->denorm,
226  chunks_record->splits,
227  x, y, blamer_bundle);
228 
// On failure only the vector is deleted, not the lists already appended to
// it. NOTE(review): presumably those lists stay owned by the ratings matrix
// - confirm.
229  if (blob_choices == NULL) {
230  delete char_choices;
231  return (NULL);
232  }
233 
234  // Add permuted ratings.
235  blob_choice_it.set_to_list(blob_choices);
236  last_segmentation[i - 1].certainty = blob_choice_it.data()->certainty();
237  last_segmentation[i - 1].match = blob_choice_it.data()->rating();
238 
239  last_segmentation[i - 1].width =
240  AssociateUtils::GetChunksWidth(chunks_record->chunk_widths, x, y);
241  last_segmentation[i - 1].gap =
242  AssociateUtils::GetChunksGap(chunks_record->chunk_widths, y);
243 
244  *char_choices += blob_choices;
// The next piece starts on the chunk after this piece's last chunk.
245  x = y + 1;
246  }
247  return (char_choices);
248 }
WERD_RES * word_res
Definition: associate.h:54
TBLOB * chunks
Definition: associate.h:53
SEAMS splits
Definition: associate.h:55
BLOB_CHOICE_LIST * get_piece_rating(MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:362
#define NULL
Definition: host.h:144
GenericVector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:449
static int GetChunksWidth(WIDTH_RECORD *width_record, int start_blob, int last_blob)
Definition: associate.cpp:144
static int GetChunksGap(WIDTH_RECORD *width_record, int last_chunk)
Definition: associate.h:132
int count_blobs(TBLOB *blobs)
Definition: blobs.cpp:581
DENORM denorm
Definition: pageres.h:346
WIDTH_RECORD * chunk_widths
Definition: associate.h:57
EVALUATION_ARRAY last_segmentation
Definition: wordrec.h:502
MATRIX * ratings
Definition: associate.h:52
inT16 tesseract::Wordrec::evaluate_state ( CHUNKS_RECORD chunks_record,
SEARCH_RECORD the_search,
DANGERR fixpt,
BlamerBundle blamer_bundle 
)

Definition at line 256 of file bestfirst.cpp.

259  {
260  BLOB_CHOICE_LIST_VECTOR *char_choices;
261  SEARCH_STATE chunk_groups;
262  float rating_limit = the_search->best_choice->rating();
263  bool keep_going = true;
264  PIECES_STATE widths;
265 
266  the_search->num_states++;
267  chunk_groups = bin_to_chunks(the_search->this_state,
268  the_search->num_joints);
269  bin_to_pieces (the_search->this_state, the_search->num_joints, widths);
270  if (wordrec_debug_level > 1) {
271  log_state("Evaluating state", the_search->num_joints,
272  the_search->this_state);
273  }
274  getDict().LogNewSegmentation(widths);
275 
276  char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle);
278  bool updated_best_choice = false;
279  if (char_choices != NULL && char_choices->length() > 0) {
280  // Compute the segmentation cost and include the cost in word rating.
281  // TODO(dsl): We should change the SEARCH_RECORD to store this cost
282  // from state evaluation and avoid recomputing it here.
283  prioritize_state(chunks_record, the_search);
285  updated_best_choice =
286  getDict().permute_characters(*char_choices,
287  the_search->best_choice,
288  the_search->raw_choice);
289  bool replaced = false;
290  if (updated_best_choice) {
291  if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
292  NULL, ASSOCIATOR_CALLER, &replaced)) {
293  keep_going = false;
294  }
295  CopyCharChoices(*char_choices, the_search->best_char_choices);
296  }
297  }
299 
300 #ifndef GRAPHICS_DISABLED
302  display_segmentation (chunks_record->chunks, chunk_groups);
305  }
306 #endif
307 
308  if (rating_limit != the_search->best_choice->rating()) {
309  ASSERT_HOST(updated_best_choice);
310  the_search->before_best = the_search->num_states;
311  the_search->best_state->part1 = the_search->this_state->part1;
312  the_search->best_state->part2 = the_search->this_state->part2;
313  replace_char_widths(chunks_record, chunk_groups);
314  } else {
315  ASSERT_HOST(!updated_best_choice);
316  if (char_choices != NULL) fixpt->clear();
317  }
318 
319  if (char_choices != NULL) delete char_choices;
320  memfree(chunk_groups);
321 
322  return (keep_going);
323 }
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:726
TBLOB * chunks
Definition: associate.h:53
WERD_CHOICE * best_choice
Definition: bestfirst.h:56
STATE * best_state
Definition: bestfirst.h:51
void memfree(void *element)
Definition: freelist.cpp:30
virtual void clear()
long before_best
Definition: bestfirst.h:54
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state)
Definition: bestfirst.cpp:651
WERD_CHOICE * raw_choice
Definition: bestfirst.h:57
#define NULL
Definition: host.h:144
void CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to)
Definition: wordrec.cpp:148
int wordrec_debug_level
Definition: wordrec.h:141
uinT32 part1
Definition: states.h:41
int wordrec_display_segmentations
Definition: plotseg.cpp:48
#define f(xc, yc)
Definition: imgscale.cpp:39
char window_wait(ScrollView *win)
Definition: callcpp.cpp:112
int num_joints
Definition: bestfirst.h:52
SEARCH_STATE bin_to_chunks(STATE *state, int num_joints)
Definition: states.cpp:49
Dict & getDict()
Definition: classify.h:62
void display_segmentation(TBLOB *chunks, SEARCH_STATE segmentation)
Definition: plotseg.cpp:58
ScrollView * segm_window
Definition: plotseg.cpp:46
BLOB_CHOICE_LIST_VECTOR * best_char_choices
Definition: bestfirst.h:58
bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
Definition: permute.cpp:765
FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
Definition: heuristic.cpp:289
long num_states
Definition: bestfirst.h:53
STATE * this_state
Definition: bestfirst.h:49
uinT32 part2
Definition: states.h:42
int length() const
Definition: genericvector.h:63
void LogNewSegmentation(PIECES_STATE BlobWidth)
Definition: stopper.cpp:463
uinT8 PIECES_STATE[MAX_NUM_CHUNKS+2]
Definition: states.h:49
BLOB_CHOICE_LIST_VECTOR * evaluate_chunks(CHUNKS_RECORD *chunks_record, SEARCH_STATE search_state, BlamerBundle *blamer_bundle)
Definition: bestfirst.cpp:203
#define ASSERT_HOST(x)
Definition: errcode.h:84
void bin_to_pieces(STATE *state, int num_joints, PIECES_STATE pieces)
Definition: states.cpp:99
float segcost_bias
Definition: bestfirst.h:55
int * SEARCH_STATE
Definition: states.h:46
float rating() const
Definition: ratngs.h:231
void tesseract::Wordrec::expand_node ( FLOAT32  worst_priority,
CHUNKS_RECORD chunks_record,
SEARCH_RECORD the_search 
)

Definition at line 499 of file bestfirst.cpp.

501  {
502  STATE old_state;
503  int x;
504  uinT32 mask = 1 << (the_search->num_joints - 1 - 32);
505 
506  old_state.part1 = the_search->this_state->part1;
507  old_state.part2 = the_search->this_state->part2;
508 
509  // We need to expand the search more intelligently, or we get stuck
510  // with a bad starting segmentation in a long word sequence as in CJK.
511  // Expand a child node only if it is within the global bound, and no
512  // worse than 2x of its parent.
513  // TODO(dsl): There is some redudency here in recomputing the priority,
514  // and in filtering of old_merit and worst_priority.
515  the_search->this_state->part2 = old_state.part2;
516  for (x = the_search->num_joints; x > 32; x--) {
517  the_search->this_state->part1 = mask ^ old_state.part1;
518  if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
519  FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
520  if (new_merit < worst_priority) {
521  if (wordrec_debug_level > 1)
522  log_state("Pushing segstate", the_search->num_joints,
523  the_search->this_state, new_merit);
524  push_queue(the_search->open_states, the_search->this_state,
525  worst_priority, new_merit, wordrec_debug_level > 1);
526  } else {
527  if (wordrec_debug_level > 1)
528  log_state("Ignore weak segstate", the_search->num_joints,
529  the_search->this_state, new_merit);
530  }
531  }
532  mask >>= 1;
533  }
534 
535  if (the_search->num_joints > 32) {
536  mask = 1 << 31;
537  }
538  else {
539  mask = 1 << (the_search->num_joints - 1);
540  }
541 
542  the_search->this_state->part1 = old_state.part1;
543  while (x--) {
544  the_search->this_state->part2 = mask ^ old_state.part2;
545  if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
546  FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
547  if (new_merit < worst_priority) {
548  if (wordrec_debug_level > 1)
549  log_state("Pushing segstate", the_search->num_joints,
550  the_search->this_state, new_merit);
551  push_queue(the_search->open_states, the_search->this_state,
552  worst_priority, new_merit, wordrec_debug_level > 1);
553  } else {
554  if (wordrec_debug_level > 1)
555  log_state("Ignoring weak segstate", the_search->num_joints,
556  the_search->this_state, new_merit);
557  }
558  }
559  mask >>= 1;
560  }
561 }
int wordrec_debug_level
Definition: wordrec.h:141
uinT32 part1
Definition: states.h:41
float FLOAT32
Definition: host.h:111
int num_joints
Definition: bestfirst.h:52
FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
Definition: heuristic.cpp:289
void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority, FLOAT32 priority, bool debug)
Definition: bestfirst.cpp:629
HEAP * open_states
Definition: bestfirst.h:47
STATE * this_state
Definition: bestfirst.h:49
uinT32 part2
Definition: states.h:42
int hash_lookup(HASH_TABLE state_table, STATE *state)
Definition: closed.cpp:86
Definition: states.h:39
unsigned int uinT32
Definition: host.h:103
HASH_TABLE closed_states
Definition: bestfirst.h:48
BLOB_CHOICE_LIST * tesseract::Wordrec::fake_classify_blob ( UNICHAR_ID  class_id,
float  rating,
float  certainty 
)

Definition at line 136 of file wordclass.cpp.

// Builds a classification result without running the matcher: returns a
// newly allocated BLOB_CHOICE_LIST holding a single BLOB_CHOICE constructed
// from class_id, rating and certainty (the remaining constructor arguments
// are the fixed values -1, -1, 0, 0, 0, false).
// The list and its element are allocated with new and returned as a raw
// pointer.
137  {
138  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result
139  BLOB_CHOICE *choice =
140  new BLOB_CHOICE(class_id, rating, certainty, -1, -1, 0, 0, 0, false);
141  BLOB_CHOICE_IT temp_it(ratings);
142  temp_it.add_after_stay_put(choice);
143  return ratings;
144 }
void tesseract::Wordrec::fill_filtered_fragment_list ( BLOB_CHOICE_LIST *  choices,
int  fragment_pos,
int  num_frag_parts,
BLOB_CHOICE_LIST *  filtered_choices 
)

Definition at line 136 of file pieces.cpp.

// Collects from choices every entry whose unichar is a character fragment at
// position fragment_pos out of num_frag_parts parts, and appends a copy of
// each to filtered_choices with its unichar id replaced by the id of the
// whole character the fragment belongs to. filtered_choices is then sorted
// with SortByUnicharID. The input list choices is not modified.
139  {
140  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
141  BLOB_CHOICE_IT choices_it(choices);
142 
143  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
144  choices_it.forward()) {
145  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
// get_fragment() result is NULL-checked below, so non-fragment choices are
// skipped.
146  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
147 
148  if (frag != NULL && frag->get_pos() == fragment_pos &&
149  frag->get_total() == num_frag_parts) {
150  // Recover the unichar_id of the unichar that this fragment is
151  // a part of
152  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
153  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
154  b->set_unichar_id(original_unichar);
155  filtered_choices_it.add_to_end(b);
156  }
157  }
158 
159  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
160 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
int get_total() const
Definition: unicharset.h:54
#define NULL
Definition: host.h:144
int get_pos() const
Definition: unicharset.h:53
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
UNICHARSET unicharset
Definition: ccutil.h:72
const char * get_unichar() const
Definition: unicharset.h:52
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:96
void tesseract::Wordrec::FillLattice ( const MATRIX ratings,
const LIST best_choices,
const UNICHARSET unicharset,
BlamerBundle blamer_bundle 
)
void tesseract::Wordrec::FinishBlamerForSegSearch ( const WERD_CHOICE best_choice,
BlamerBundle blamer_bundle,
STRING blamer_debug 
)
protected

Definition at line 376 of file segsearch.cpp.

378  {
379  // If we are still looking for blame (i.e. best_choice is incorrect, but a
380  // path representing the correct segmentation could be constructed), we can
381  // blame segmentation search pain point prioritization if the rating of the
382  // path corresponding to the correct segmentation is better than that of
383  // best_choice (i.e. language model would have done the correct thing, but
384  // because of poor pain point prioritization the correct segmentation was
385  // never explored). Otherwise we blame the tradeoff between the language model
386  // and the classifier, since even after exploring the path corresponding to
387  // the correct segmentation incorrect best_choice would have been chosen.
388  // One special case when we blame the classifier instead is when best choice
389  // is incorrect, but it is a dictionary word and it is the classifier's top choice.
390  if (blamer_bundle != NULL && blamer_bundle->segsearch_is_looking_for_blame) {
391  blamer_bundle->segsearch_is_looking_for_blame = false;
392  if (blamer_bundle->best_choice_is_dict_and_top_choice) {
393  *blamer_debug = "Best choice is: incorrect, top choice, dictionary word";
394  *blamer_debug += " with permuter ";
395  *blamer_debug += best_choice->permuter_name();
396  blamer_bundle->SetBlame(IRR_CLASSIFIER, *blamer_debug, best_choice,
397  wordrec_debug_blamer);
398  } else if (blamer_bundle->best_correctly_segmented_rating <
399  best_choice->rating()) {
400  *blamer_debug += "Correct segmentation state was not explored";
401  blamer_bundle->SetBlame(IRR_SEGSEARCH_PP, *blamer_debug, best_choice,
402  wordrec_debug_blamer);
403  } else {
404  if (blamer_bundle->best_correctly_segmented_rating >=
405  WERD_CHOICE::kBadRating) {
406  *blamer_debug += "Correct segmentation paths were pruned by LM\n";
407  } else {
408  char debug_buffer[256];
409  *blamer_debug += "Best correct segmentation rating ";
410  sprintf(debug_buffer, "%g",
411  blamer_bundle->best_correctly_segmented_rating);
412  *blamer_debug += debug_buffer;
413  *blamer_debug += " vs. best choice rating ";
414  sprintf(debug_buffer, "%g", best_choice->rating());
415  *blamer_debug += debug_buffer;
416  }
417  blamer_bundle->SetBlame(IRR_CLASS_LM_TRADEOFF, *blamer_debug, best_choice,
418  wordrec_debug_blamer);
419  }
420  }
421 }
bool best_choice_is_dict_and_top_choice
Definition: pageres.h:194
float best_correctly_segmented_rating
Definition: pageres.h:187
#define NULL
Definition: host.h:144
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
static const float kBadRating
Definition: ratngs.h:188
bool wordrec_debug_blamer
Definition: wordrec.h:142
const char * permuter_name() const
Definition: ratngs.cpp:174
bool segsearch_is_looking_for_blame
Definition: pageres.h:184
float rating() const
Definition: ratngs.h:231
PRIORITY tesseract::Wordrec::full_split_priority ( SPLIT split,
inT16  xmin,
inT16  xmax 
)

Definition at line 74 of file gradechop.cpp.

74  {
75  BOUNDS_RECT rect;
76 
77  set_outline_bounds (split->point1, split->point2, rect);
78 
79  if (xmin < MIN (rect[0], rect[2]) && xmax > MAX (rect[1], rect[3]))
80  return (999.0);
81 
82  return (grade_overlap (rect) +
83  grade_center_of_blob (rect) + grade_width_change (rect));
84 }
inT16 BOUNDS_RECT[4]
Definition: gradechop.h:38
PRIORITY grade_center_of_blob(register BOUNDS_RECT rect)
Definition: gradechop.cpp:95
PRIORITY grade_overlap(register BOUNDS_RECT rect)
Definition: gradechop.cpp:115
EDGEPT * point1
Definition: split.h:39
EDGEPT * point2
Definition: split.h:40
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
PRIORITY grade_width_change(register BOUNDS_RECT rect)
Definition: gradechop.cpp:191
void set_outline_bounds(register EDGEPT *point1, register EDGEPT *point2, BOUNDS_RECT rect)
Definition: gradechop.cpp:213
void tesseract::Wordrec::get_fragment_lists ( inT16  current_frag,
inT16  current_row,
inT16  start,
inT16  num_frag_parts,
inT16  num_blobs,
MATRIX ratings,
BLOB_CHOICE_LIST *  choice_lists 
)

Definition at line 292 of file pieces.cpp.

295  {
296  if (current_frag == num_frag_parts) {
297  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
298  choice_lists, ratings);
299  return;
300  }
301 
302  for (inT16 x = current_row; x < num_blobs; x++) {
303  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
304  if (choices == NULL)
305  continue;
306 
307  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
308  &choice_lists[current_frag]);
309  if (!choice_lists[current_frag].empty()) {
310  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
311  num_blobs, ratings, choice_lists);
312  choice_lists[current_frag].clear();
313  }
314  }
315 }
void merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:169
#define NULL
Definition: host.h:144
T get(int column, int row) const
Definition: matrix.h:117
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:136
short inT16
Definition: host.h:100
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:292
FLOAT32 tesseract::Wordrec::get_gap_variance ( WIDTH_RECORD wrec,
float  norm_height 
)

Definition at line 111 of file heuristic.cpp.

111  {
112  MEASUREMENT ws;
113  new_measurement(ws);
114  for (int x = 0; x < wrec->num_chars - 1; x++) {
115  FLOAT32 gap_ratio = (wrec->widths[2 * x] + wrec->widths[ 2*x + 1])
116  * 1.0 / norm_height;
117  ADD_SAMPLE(ws, gap_ratio);
118  }
119  if (segment_adjust_debug > 2)
120  tprintf("Gap Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
121  return VARIANCE(ws);
122 }
#define VARIANCE(m)
Definition: measure.h:115
#define ADD_SAMPLE(m, s)
Definition: measure.h:63
int segment_adjust_debug
Definition: wordrec.h:124
float FLOAT32
Definition: host.h:111
#define new_measurement(m)
Definition: measure.h:86
int num_chars
Definition: blobs.h:49
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int widths[1]
Definition: blobs.h:50
#define MEAN(m)
Definition: measure.h:74
BLOB_CHOICE_LIST * tesseract::Wordrec::get_piece_rating ( MATRIX ratings,
TBLOB blobs,
const DENORM denorm,
SEAMS  seams,
inT16  start,
inT16  end,
BlamerBundle blamer_bundle 
)

Definition at line 362 of file pieces.cpp.

368  {
369  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
370  if (choices == NOT_CLASSIFIED) {
371  choices = classify_piece(blobs,
372  denorm,
373  seams,
374  start,
375  end,
376  blamer_bundle);
377  ratings->put(start, end, choices);
378  if (wordrec_debug_level > 1) {
379  tprintf("get_piece_rating(): updated ratings matrix\n");
380  ratings->print(getDict().getUnicharset());
381  }
382  }
383  return (choices);
384 }
int wordrec_debug_level
Definition: wordrec.h:141
T get(int column, int row) const
Definition: matrix.h:117
Dict & getDict()
Definition: classify.h:62
void put(int column, int row, const T &thing)
Definition: matrix.h:112
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:36
#define NOT_CLASSIFIED
Definition: matrix.h:31
virtual BLOB_CHOICE_LIST * classify_piece(TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:75
FLOAT32 tesseract::Wordrec::get_width_variance ( WIDTH_RECORD wrec,
float  norm_height 
)

Definition at line 96 of file heuristic.cpp.

96  {
97  MEASUREMENT ws;
98  new_measurement(ws);
99  for (int x = 0; x < wrec->num_chars; x++) {
100  FLOAT32 wh_ratio = wrec->widths[2 * x] * 1.0f / norm_height;
101  if (x == wrec->num_chars - 1 && wh_ratio > 0.3)
102  continue; // exclude trailing punctuation from stats
103  ADD_SAMPLE(ws, wh_ratio);
104  }
105  if (segment_adjust_debug > 2)
106  tprintf("Width Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws));
107  return VARIANCE(ws);
108 }
#define VARIANCE(m)
Definition: measure.h:115
#define ADD_SAMPLE(m, s)
Definition: measure.h:63
int segment_adjust_debug
Definition: wordrec.h:124
float FLOAT32
Definition: host.h:111
#define new_measurement(m)
Definition: measure.h:86
int num_chars
Definition: blobs.h:49
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int widths[1]
Definition: blobs.h:50
#define MEAN(m)
Definition: measure.h:74
PRIORITY tesseract::Wordrec::grade_center_of_blob ( register BOUNDS_RECT  rect)

Definition at line 95 of file gradechop.cpp.

95  {
96  register PRIORITY grade;
97 
98  grade = (rect[1] - rect[0]) - (rect[3] - rect[2]);
99  if (grade < 0)
100  grade = -grade;
101 
102  grade *= chop_center_knob;
103  grade = MIN (CENTER_GRADE_CAP, grade);
104  return (MAX (0.0, grade));
105 }
float PRIORITY
Definition: seam.h:38
#define CENTER_GRADE_CAP
Definition: gradechop.cpp:38
#define MIN(x, y)
Definition: ndminx.h:28
double chop_center_knob
Definition: wordrec.h:118
#define MAX(x, y)
Definition: ndminx.h:24
PRIORITY tesseract::Wordrec::grade_overlap ( register BOUNDS_RECT  rect)

Definition at line 115 of file gradechop.cpp.

115  {
116  register PRIORITY grade;
117  register inT16 width1;
118  register inT16 width2;
119  register inT16 overlap;
120 
121  width1 = rect[3] - rect[2];
122  width2 = rect[1] - rect[0];
123 
124  overlap = MIN (rect[1], rect[3]) - MAX (rect[0], rect[2]);
125  width1 = MIN (width1, width2);
126  if (overlap == width1)
127  return (100.0); /* Total overlap */
128 
129  width1 = 2 * overlap - width1; /* Extra penalty for too */
130  overlap += MAX (0, width1); /* much overlap */
131 
132  grade = overlap * chop_overlap_knob;
133 
134  return (MAX (0.0, grade));
135 }
double chop_overlap_knob
Definition: wordrec.h:117
float PRIORITY
Definition: seam.h:38
short inT16
Definition: host.h:100
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
PRIORITY tesseract::Wordrec::grade_sharpness ( register SPLIT split)

Definition at line 168 of file gradechop.cpp.

168  {
169  register PRIORITY grade;
170 
171  grade = point_priority (split->point1) + point_priority (split->point2);
172 
173  if (grade < -360.0)
174  grade = 0;
175  else
176  grade += 360.0;
177 
178  grade *= chop_sharpness_knob; /* Values 0 to -360 */
179 
180  return (grade);
181 }
float PRIORITY
Definition: seam.h:38
double chop_sharpness_knob
Definition: wordrec.h:119
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:55
PRIORITY tesseract::Wordrec::grade_split_length ( register SPLIT split)

Definition at line 145 of file gradechop.cpp.

145  {
146  register PRIORITY grade;
147  register float split_length;
148 
149  split_length = weighted_edgept_dist (split->point1, split->point2,
150  chop_x_y_weight);
151 
152  if (split_length <= 0)
153  grade = 0;
154  else
155  grade = sqrt (split_length) * chop_split_dist_knob;
156 
157  return (MAX (0.0, grade));
158 }
float PRIORITY
Definition: seam.h:38
double chop_split_dist_knob
Definition: wordrec.h:116
#define MAX(x, y)
Definition: ndminx.h:24
#define weighted_edgept_dist(p1, p2, chop_x_y_weight)
Definition: olutil.h:67
PRIORITY tesseract::Wordrec::grade_width_change ( register BOUNDS_RECT  rect)

Definition at line 191 of file gradechop.cpp.

191  {
192  register PRIORITY grade;
193  register inT32 width1;
194  register inT32 width2;
195 
196  width1 = rect[3] - rect[2];
197  width2 = rect[1] - rect[0];
198 
199  grade = 20 - (MAX (rect[1], rect[3])
200  - MIN (rect[0], rect[2]) - MAX (width1, width2));
201 
202  grade *= chop_width_change_knob;
203 
204  return (MAX (0.0, grade));
205 }
float PRIORITY
Definition: seam.h:38
int inT32
Definition: host.h:102
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
double chop_width_change_knob
Definition: wordrec.h:120
void tesseract::Wordrec::improve_by_chopping ( WERD_RES word,
BLOB_CHOICE_LIST_VECTOR char_choices,
STATE best_state,
BLOB_CHOICE_LIST_VECTOR best_char_choices,
DANGERR fixpt,
bool *  updated_best_choice 
)

Definition at line 741 of file chopper.cpp.

746  {
747  inT32 blob_number;
748  float old_best;
749  bool updated_best_choice = false;
750 
751  while (1) { // improvement loop
752  old_best = word->best_choice->rating();
753  if (improve_one_blob(word, char_choices,
754  &blob_number, &word->seam_array,
755  fixpt, (fragments_guide_chopper &&
756  word->best_choice->fragment_mark()),
757  word->blamer_bundle)) {
758  getDict().LogNewSplit(blob_number);
759  updated_best_choice =
760  getDict().permute_characters(*char_choices, word->best_choice,
761  word->raw_choice);
762 
763  if (old_best > word->best_choice->rating()) {
764  set_n_ones(best_state, char_choices->length() - 1);
765  } else {
766  insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
767  fixpt->clear();
768  }
769 
770  if (chop_debug)
771  print_state("best state = ",
772  best_state, count_blobs(word->chopped_word->blobs) - 1);
773  } else {
774  break;
775  }
776 
777  // Check if we should break from the loop.
778  bool done = false;
779  bool replaced = false;
780  if ((updated_best_choice &&
781  (*best_choice_acceptable =
782  getDict().AcceptableChoice(char_choices, word->best_choice,
783  fixpt, CHOPPER_CALLER, &replaced))) ||
784  char_choices->length() >= MAX_NUM_CHUNKS) {
785  done = true;
786  }
787  if (replaced) update_blob_classifications(word->chopped_word,
788  *char_choices);
789  if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices);
790  if (done) break;
791  }
792 }
bool improve_one_blob(WERD_RES *word_res, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, DANGERR *fixpt, bool split_next_to_fragment, BlamerBundle *blamer_bundle)
Definition: chopper.cpp:332
virtual void clear()
void CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to)
Definition: wordrec.cpp:148
int inT32
Definition: host.h:102
void print_state(const char *label, STATE *state, int num_joints)
Definition: states.cpp:214
TBLOB * blobs
Definition: blobs.h:274
bool fragments_guide_chopper
Definition: wordrec.h:105
SEAMS seam_array
Definition: pageres.h:358
void set_n_ones(STATE *state, int n)
Definition: states.cpp:263
Dict & getDict()
Definition: classify.h:62
bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice)
Definition: permute.cpp:765
void insert_new_chunk(register STATE *state, register int index, register int num_joints)
Definition: states.cpp:133
int count_blobs(TBLOB *blobs)
Definition: blobs.cpp:581
bool fragment_mark() const
Definition: ratngs.h:241
void LogNewSplit(int Blob)
Definition: stopper.cpp:471
WERD_CHOICE * raw_choice
Definition: pageres.h:360
int length() const
Definition: genericvector.h:63
TWERD * chopped_word
Definition: pageres.h:357
void update_blob_classifications(TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices)
Definition: wordclass.cpp:152
BlamerBundle * blamer_bundle
Definition: pageres.h:367
#define MAX_NUM_CHUNKS
Definition: states.h:37
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Wordrec::improve_one_blob ( WERD_RES word_res,
BLOB_CHOICE_LIST_VECTOR char_choices,
inT32 blob_number,
SEAMS seam_list,
DANGERR fixpt,
bool  split_next_to_fragment,
BlamerBundle blamer_bundle 
)

Definition at line 332 of file chopper.cpp.

338  {
339  TWERD* word = word_res->chopped_word;
340  TBLOB *blob;
341  inT16 x = 0;
342  float rating_ceiling = MAX_FLOAT32;
343  BLOB_CHOICE_LIST *answer;
344  BLOB_CHOICE_IT answer_it;
345  SEAM *seam;
346 
347  do {
348  *blob_number = select_blob_to_split_from_fixpt(fixpt);
349  bool split_point_from_dict = (*blob_number != -1);
350  if (split_point_from_dict) {
351  fixpt->clear();
352  } else {
353  *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
354  split_next_to_fragment);
355  }
356  if (chop_debug)
357  cprintf("blob_number = %d\n", *blob_number);
358  if (*blob_number == -1)
359  return false;
360 
361  // TODO(rays) it may eventually help to allow italic_blob to be true,
362  seam = chop_numbered_blob(word, *blob_number, false, *seam_list);
363  if (seam != NULL)
364  break;
365  /* Must split null blobs */
366  answer = char_choices->get(*blob_number);
367  if (answer == NULL)
368  return false;
369  answer_it.set_to_list(answer);
370  if (!split_point_from_dict) {
371  // We chopped the worst rated blob, try something else next time.
372  rating_ceiling = answer_it.data()->rating();
373  }
374  } while (true);
375  /* Split OK */
376  for (blob = word->blobs; x < *blob_number; x++) {
377  blob = blob->next;
378  }
379 
380  *seam_list =
381  insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
382 
383  delete char_choices->get(*blob_number);
384 
385  answer = classify_blob(blob, word_res->denorm, "improve 1:", Red,
386  blamer_bundle);
387  char_choices->insert(answer, *blob_number);
388 
389  answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow,
390  blamer_bundle);
391  char_choices->set(answer, *blob_number + 1);
392 
393  return true;
394 }
inT16 select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:898
Definition: callcpp.h:36
virtual void clear()
void set(T t, int index)
#define NULL
Definition: host.h:144
Definition: blobs.h:233
SEAMS insert_seam(SEAMS seam_list, int index, SEAM *seam, TBLOB *left_blob, TBLOB *first_blob)
Definition: seam.cpp:250
T & get(int index) const
inT16 select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:801
TBLOB * blobs
Definition: blobs.h:274
void insert(T t, int index)
Definition: callcpp.h:35
SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list)
Definition: chopper.cpp:219
Definition: blobs.h:174
#define MAX_FLOAT32
Definition: host.h:124
DENORM denorm
Definition: pageres.h:346
short inT16
Definition: host.h:100
TWERD * chopped_word
Definition: pageres.h:357
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
TBLOB * next
Definition: blobs.h:228
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:62
void tesseract::Wordrec::InitBlamerForSegSearch ( const WERD_CHOICE best_choice,
CHUNKS_RECORD chunks_record,
HEAP pain_points,
BlamerBundle blamer_bundle,
STRING blamer_debug 
)
protected

Definition at line 331 of file segsearch.cpp.

335  {
336  blamer_bundle->segsearch_is_looking_for_blame = true;
337  if (wordrec_debug_blamer) {
338  tprintf("segsearch starting to look for blame\n");
339  }
340  // Clear pain points heap.
341  int pop;
342  float pain_point_priority;
343  MATRIX_COORD *pain_point;
344  while ((pop = HeapPop(pain_points, &pain_point_priority,
345  &pain_point)) != EMPTY) {
346  delete pain_point;
347  }
348  // Fill pain points for any unclassified blob corresponding to the
349  // correct segmentation state.
350  *blamer_debug += "Correct segmentation:\n";
351  for (int idx = 0;
352  idx < blamer_bundle->correct_segmentation_cols.length(); ++idx) {
353  blamer_debug->add_str_int(
354  "col=", blamer_bundle->correct_segmentation_cols[idx]);
355  blamer_debug->add_str_int(
356  " row=", blamer_bundle->correct_segmentation_rows[idx]);
357  *blamer_debug += "\n";
358  if (chunks_record->ratings->get(
359  blamer_bundle->correct_segmentation_cols[idx],
360  blamer_bundle->correct_segmentation_rows[idx]) == NOT_CLASSIFIED) {
361  if (!language_model_->GeneratePainPoint(
362  blamer_bundle->correct_segmentation_cols[idx],
363  blamer_bundle->correct_segmentation_rows[idx],
364  false, -1.0, -1.0, false, -1.0, segsearch_max_char_wh_ratio,
365  NULL, NULL, chunks_record, pain_points)) {
366  blamer_bundle->segsearch_is_looking_for_blame = false;
367  *blamer_debug += "\nFailed to insert pain point\n";
368  blamer_bundle->SetBlame(IRR_SEGSEARCH_HEUR, *blamer_debug, best_choice,
369  wordrec_debug_blamer);
370  break;
371  }
372  }
373  } // end for blamer_bundle->correct_segmentation_cols/rows
374 }
GenericVector< int > correct_segmentation_cols
Definition: pageres.h:190
LIST pop(LIST list)
Definition: oldlist.cpp:305
double segsearch_max_char_wh_ratio
Definition: wordrec.h:152
#define NULL
Definition: host.h:144
T get(int column, int row) const
Definition: matrix.h:117
void add_str_int(const char *str, int number)
Definition: strngs.cpp:334
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr)
Definition: oldheap.cpp:76
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool wordrec_debug_blamer
Definition: wordrec.h:142
#define NOT_CLASSIFIED
Definition: matrix.h:31
int length() const
Definition: genericvector.h:63
LanguageModel * language_model_
Definition: wordrec.h:495
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
bool segsearch_is_looking_for_blame
Definition: pageres.h:184
MATRIX * ratings
Definition: associate.h:52
GenericVector< int > correct_segmentation_rows
Definition: pageres.h:191
#define EMPTY
Definition: oldheap.h:29
int tesseract::Wordrec::is_crossed ( TPOINT  a0,
TPOINT  a1,
TPOINT  b0,
TPOINT  b1 
)

Definition at line 70 of file outlines.cpp.

70  {
71  int b0a1xb0b1, b0b1xb0a0;
72  int a1b1xa1a0, a1a0xa1b0;
73 
74  TPOINT b0a1, b0a0, a1b1, b0b1, a1a0;
75 
76  b0a1.x = a1.x - b0.x;
77  b0a0.x = a0.x - b0.x;
78  a1b1.x = b1.x - a1.x;
79  b0b1.x = b1.x - b0.x;
80  a1a0.x = a0.x - a1.x;
81  b0a1.y = a1.y - b0.y;
82  b0a0.y = a0.y - b0.y;
83  a1b1.y = b1.y - a1.y;
84  b0b1.y = b1.y - b0.y;
85  a1a0.y = a0.y - a1.y;
86 
87  b0a1xb0b1 = CROSS (b0a1, b0b1);
88  b0b1xb0a0 = CROSS (b0b1, b0a0);
89  a1b1xa1a0 = CROSS (a1b1, a1a0);
90  /*a1a0xa1b0=CROSS(a1a0,a1b0); */
91  a1a0xa1b0 = -CROSS (a1a0, b0a1);
92 
93  return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0)
94  || (b0a1xb0b1 < 0 && b0b1xb0a0 < 0))
95  && ((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) || (a1b1xa1a0 < 0 && a1a0xa1b0 < 0));
96 }
inT16 y
Definition: blobs.h:68
inT16 x
Definition: blobs.h:67
Definition: blobs.h:53
#define CROSS(a, b)
Definition: vecfuncs.h:54
int tesseract::Wordrec::is_little_chunk ( EDGEPT point1,
EDGEPT point2 
)

Definition at line 123 of file chop.cpp.

123  {
124  EDGEPT *p = point1; /* Iterator */
125  int counter = 0;
126 
127  do {
128  /* Go from P1 to P2 */
129  if (is_same_edgept (point2, p)) {
130  if (is_small_area (point1, point2))
131  return (TRUE);
132  else
133  break;
134  }
135  p = p->next;
136  }
137  while ((p != point1) && (counter++ < chop_min_outline_points));
138  /* Go from P2 to P1 */
139  p = point2;
140  counter = 0;
141  do {
142  if (is_same_edgept (point1, p)) {
143  return (is_small_area (point2, point1));
144  }
145  p = p->next;
146  }
147  while ((p != point2) && (counter++ < chop_min_outline_points));
148 
149  return (FALSE);
150 }
EDGEPT * next
Definition: blobs.h:106
#define FALSE
Definition: capi.h:28
int chop_min_outline_points
Definition: wordrec.h:113
int is_small_area(EDGEPT *point1, EDGEPT *point2)
Definition: chop.cpp:158
Definition: blobs.h:72
int is_same_edgept(EDGEPT *p1, EDGEPT *p2)
Definition: outlines.cpp:104
#define TRUE
Definition: capi.h:27
int tesseract::Wordrec::is_same_edgept ( EDGEPT p1,
EDGEPT p2 
)

Definition at line 104 of file outlines.cpp.

104  {
105  return (p1 == p2);
106 }
int tesseract::Wordrec::is_small_area ( EDGEPT point1,
EDGEPT point2 
)

Definition at line 158 of file chop.cpp.

158  {
159  EDGEPT *p = point1->next; /* Iterator */
160  int area = 0;
161  TPOINT origin;
162 
163  do {
164  /* Go from P1 to P2 */
165  origin.x = p->pos.x - point1->pos.x;
166  origin.y = p->pos.y - point1->pos.y;
167  area += CROSS (origin, p->vec);
168  p = p->next;
169  }
170  while (!is_same_edgept (point2, p));
171 
172  return (area < chop_min_outline_area);
173 }
EDGEPT * next
Definition: blobs.h:106
VECTOR vec
Definition: blobs.h:101
inT16 y
Definition: blobs.h:68
inT16 x
Definition: blobs.h:67
Definition: blobs.h:53
int chop_min_outline_area
Definition: wordrec.h:115
Definition: blobs.h:72
#define CROSS(a, b)
Definition: vecfuncs.h:54
TPOINT pos
Definition: blobs.h:100
int is_same_edgept(EDGEPT *p1, EDGEPT *p2)
Definition: outlines.cpp:104
BLOB_CHOICE_LIST * tesseract::Wordrec::join_blobs_and_classify ( WERD_RES word,
int  x,
int  y,
int  choice_index,
MATRIX ratings,
BLOB_CHOICE_LIST_VECTOR old_choices 
)

Definition at line 730 of file bestfirst.cpp.

732  {
733  // Join parts to make the blob if needed.
734  if (x != y)
735  join_pieces(word->chopped_word->blobs, word->seam_array, x, y);
736  TBLOB *blob = word->chopped_word->blobs;
737  for (int i = 0; i < x; i++) {
738  blob = blob->next;
739  }
740  // Deep copy this blob into the output word.
741  TBLOB* copy_blob = new TBLOB(*blob);
742  copy_blob->next = word->rebuild_word->blobs;
743  word->rebuild_word->blobs = copy_blob;
744 
745  BLOB_CHOICE_LIST *choices = NULL;
746  // First check to see if we can look up the classification
747  // in old_choices (if there is no need to merge blobs).
748  if (choice_index >= 0 && old_choices != NULL) {
749  choices = old_choices->get(choice_index);
750  old_choices->set(NULL, choice_index);
751  }
752  // The ratings matrix filled in by the associator will contain the next most
753  // up-to-date classification info. Thus we look up the classification there
754  // next, and only call classify_blob() if the classification is not found.
755  if (choices == NULL && ratings != NULL) {
756  choices = ratings->get(x, y);
757  if (choices != NOT_CLASSIFIED) {
758  ratings->put(x, y, NULL);
759  }
760  }
761  // Get the choices for the blob by classification if necessary.
762  if (choices == NULL) {
763  choices = classify_blob(blob, word->denorm, "rebuild", Orange,
764  word->blamer_bundle);
765  }
766  // Undo join_pieces to restore the chopped word to its fully chopped state.
767  if (x != y)
768  break_pieces(blob, word->seam_array, x, y);
769  return choices;
770 }
TWERD * rebuild_word
Definition: pageres.h:381
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end)
Definition: seam.cpp:535
void set(T t, int index)
#define NULL
Definition: host.h:144
T & get(int index) const
T get(int column, int row) const
Definition: matrix.h:117
TBLOB * blobs
Definition: blobs.h:274
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end)
Definition: seam.cpp:564
SEAMS seam_array
Definition: pageres.h:358
void put(int column, int row, const T &thing)
Definition: matrix.h:112
Definition: blobs.h:174
DENORM denorm
Definition: pageres.h:346
#define NOT_CLASSIFIED
Definition: matrix.h:31
Definition: callcpp.h:67
TWERD * chopped_word
Definition: pageres.h:357
BlamerBundle * blamer_bundle
Definition: pageres.h:367
TBLOB * next
Definition: blobs.h:228
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:62
void tesseract::Wordrec::junk_worst_seam ( SEAM_QUEUE  seams,
SEAM new_seam,
float  new_priority 
)

Definition at line 148 of file findseam.cpp.

149  {
150  SEAM *seam;
151  float priority;
152 
153  HeapPopWorst(seams, &priority, &seam);
154  if (priority > new_priority) {
155  delete_seam(seam); /*get rid of it */
156  HeapPush (seams, new_priority, (char *) new_seam);
157  }
158  else {
159  delete_seam(new_seam);
160  HeapPush (seams, priority, (char *) seam);
161  }
162 }
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data)
Definition: oldheap.cpp:195
void delete_seam(void *arg)
Definition: seam.cpp:154
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr)
Definition: oldheap.cpp:124
void tesseract::Wordrec::merge_and_put_fragment_lists ( inT16  row,
inT16  column,
inT16  num_frag_parts,
BLOB_CHOICE_LIST *  choice_lists,
MATRIX ratings 
)

Definition at line 169 of file pieces.cpp.

172  {
173  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
174 
175  for (int i = 0; i < num_frag_parts; i++) {
176  choice_lists_it[i].set_to_list(&choice_lists[i]);
177  choice_lists_it[i].mark_cycle_pt();
178  }
179 
180  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
181  if (merged_choice == NULL)
182  merged_choice = new BLOB_CHOICE_LIST;
183 
184  bool end_of_list = false;
185  BLOB_CHOICE_IT merged_choice_it(merged_choice);
186  while (!end_of_list) {
187  // Find the maximum unichar_id of the current entry the iterators
188  // are pointing at
189  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
190  int max_list = 0;
191  for (int i = 0; i < num_frag_parts; i++) {
192  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
193  if (max_unichar_id < unichar_id) {
194  max_unichar_id = unichar_id;
195  max_list = i;
196  }
197  }
198 
199  // Move each iterator until it gets to an entry that has a
200  // value greater than or equal to max_unichar_id
201  for (int i = 0; i < num_frag_parts; i++) {
202  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
203  while (!choice_lists_it[i].cycled_list() &&
204  unichar_id < max_unichar_id) {
205  choice_lists_it[i].forward();
206  unichar_id = choice_lists_it[i].data()->unichar_id();
207  }
208  if (choice_lists_it[i].cycled_list()) {
209  end_of_list = true;
210  break;
211  }
212  }
213 
214  if (end_of_list)
215  break;
216 
217  // Checks if the fragments are parts of the same character
218  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
219  bool same_unichar = true;
220  for (int i = 1; i < num_frag_parts; i++) {
221  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
222  if (unichar_id != first_unichar_id) {
223  same_unichar = false;
224  break;
225  }
226  }
227 
228  if (same_unichar) {
229  // Add the merged character to the result
230  UNICHAR_ID merged_unichar_id = first_unichar_id;
231  inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
232  inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
233  inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight();
234  inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight();
235  int merged_script_id = choice_lists_it[0].data()->script_id();
236  bool merged_adapted = choice_lists_it[0].data()->adapted();
237 
238  float merged_rating = 0, merged_certainty = 0;
239  for (int i = 0; i < num_frag_parts; i++) {
240  float rating = choice_lists_it[i].data()->rating();
241  float certainty = choice_lists_it[i].data()->certainty();
242 
243  if (i == 0 || certainty < merged_certainty)
244  merged_certainty = certainty;
245  merged_rating += rating;
246 
247  choice_lists_it[i].forward();
248  if (choice_lists_it[i].cycled_list())
249  end_of_list = true;
250  IntersectRange(choice_lists_it[i].data()->min_xheight(),
251  choice_lists_it[i].data()->max_xheight(),
252  &merged_min_xheight, &merged_max_xheight);
253  }
254 
255  merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
256  merged_rating,
257  merged_certainty,
258  merged_fontinfo_id,
259  merged_fontinfo_id2,
260  merged_script_id,
261  merged_min_xheight,
262  merged_max_xheight,
263  merged_adapted));
264  }
265  }
266 
268  print_ratings_list("Merged Fragments", merged_choice,
269  unicharset);
270 
271  if (merged_choice->empty())
272  delete merged_choice;
273  else
274  ratings->put(row, column, merged_choice);
275 
276  delete [] choice_lists_it;
277 }
int UNICHAR_ID
Definition: unichar.h:31
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:95
#define NULL
Definition: host.h:144
T get(int column, int row) const
Definition: matrix.h:117
void put(int column, int row, const T &thing)
Definition: matrix.h:112
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
void tesseract::Wordrec::merge_fragments ( MATRIX ratings,
inT16  num_blobs 
)

Definition at line 324 of file pieces.cpp.

324  {
325  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
326  for (inT16 start = 0; start < num_blobs; start++) {
327  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
328  frag_parts++) {
329  get_fragment_lists(0, start, start, frag_parts, num_blobs,
330  ratings, choice_lists);
331  }
332  }
333 
334  // Delete fragments from the rating matrix
335  for (inT16 x = 0; x < num_blobs; x++) {
336  for (inT16 y = x; y < num_blobs; y++) {
337  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
338  if (choices != NULL) {
339  BLOB_CHOICE_IT choices_it(choices);
340  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
341  choices_it.forward()) {
342  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
343  const CHAR_FRAGMENT *frag =
344  unicharset.get_fragment(choice_unichar_id);
345  if (frag != NULL)
346  delete choices_it.extract();
347  }
348  }
349  }
350  }
351 }
int UNICHAR_ID
Definition: unichar.h:31
#define NULL
Definition: host.h:144
T get(int column, int row) const
Definition: matrix.h:117
static const int kMaxChunks
Definition: unicharset.h:37
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:292
void tesseract::Wordrec::modify_blob_choice ( BLOB_CHOICE_LIST *  answer,
int  chop_index 
)

Definition at line 403 of file chopper.cpp.

404  {
405  char chop_index_string[2];
406  if (chop_index <= 9) {
407  snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
408  } else {
409  chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
410  chop_index_string[1] = '\0';
411  }
412  UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
413  if (unichar_id == INVALID_UNICHAR_ID) {
414  // If the word is very long, we might exhaust the possibilities.
415  unichar_id = 1;
416  }
417  BLOB_CHOICE_IT answer_it(answer);
418  BLOB_CHOICE *modified_blob =
419  new BLOB_CHOICE(unichar_id,
420  answer_it.data()->rating(),
421  answer_it.data()->certainty(),
422  answer_it.data()->fontinfo_id(),
423  answer_it.data()->fontinfo_id2(),
424  answer_it.data()->script_id(),
425  answer_it.data()->min_xheight(),
426  answer_it.data()->max_xheight(),
427  answer_it.data()->adapted());
428  answer->clear();
429  answer_it.set_to_list(answer);
430  answer_it.add_after_then_move(modified_blob);
431 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
UNICHARSET unicharset
Definition: ccutil.h:72
bool tesseract::Wordrec::near_point ( EDGEPT point,
EDGEPT line_pt_0,
EDGEPT line_pt_1,
EDGEPT **  near_pt 
)

Definition at line 116 of file outlines.cpp.

118  {
119  TPOINT p;
120 
121  float slope;
122  float intercept;
123 
124  float x0 = line_pt_0->pos.x;
125  float x1 = line_pt_1->pos.x;
126  float y0 = line_pt_0->pos.y;
127  float y1 = line_pt_1->pos.y;
128 
129  if (x0 == x1) {
130  /* Handle vertical line */
131  p.x = (inT16) x0;
132  p.y = point->pos.y;
133  }
134  else {
135  /* Slope and intercept */
136  slope = (y0 - y1) / (x0 - x1);
137  intercept = y1 - x1 * slope;
138 
139  /* Find perpendicular */
140  p.x = (inT16) ((point->pos.x + (point->pos.y - intercept) * slope) /
141  (slope * slope + 1));
142  p.y = (inT16) (slope * p.x + intercept);
143  }
144 
145  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
146  (!same_point (p, line_pt_0->pos)) && (!same_point (p, line_pt_1->pos))) {
147  /* Intersection on line */
148  *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
149  return true;
150  } else { /* Intersection not on line */
151  *near_pt = closest(point, line_pt_0, line_pt_1);
152  return false;
153  }
154 }
inT16 y
Definition: blobs.h:68
EDGEPT * make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev)
Definition: split.cpp:65
inT16 x
Definition: blobs.h:67
Definition: blobs.h:53
#define same_point(p1, p2)
Definition: outlines.h:49
short inT16
Definition: host.h:100
TPOINT pos
Definition: blobs.h:100
#define is_on_line(p, p0, p1)
Definition: outlines.h:120
#define closest(test_p, p1, p2)
Definition: outlines.h:71
void tesseract::Wordrec::new_max_point ( EDGEPT local_max,
POINT_GROUP  points 
)

Definition at line 303 of file chop.cpp.

303  {
304  inT16 dir;
305 
306  dir = direction (local_max);
307 
308  if (dir > 0) {
309  add_point_to_list(points, local_max);
310  return;
311  }
312 
313  if (dir == 0 && point_priority (local_max) < 0) {
314  add_point_to_list(points, local_max);
315  return;
316  }
317 }
void add_point_to_list(POINT_GROUP point_list, EDGEPT *point)
Definition: chop.cpp:65
short inT16
Definition: host.h:100
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:55
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
void tesseract::Wordrec::new_min_point ( EDGEPT local_min,
POINT_GROUP  points 
)

Definition at line 279 of file chop.cpp.

279  {
280  inT16 dir;
281 
282  dir = direction (local_min);
283 
284  if (dir < 0) {
285  add_point_to_list(points, local_min);
286  return;
287  }
288 
289  if (dir == 0 && point_priority (local_min) < 0) {
290  add_point_to_list(points, local_min);
291  return;
292  }
293 }
void add_point_to_list(POINT_GROUP point_list, EDGEPT *point)
Definition: chop.cpp:65
short inT16
Definition: host.h:100
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:55
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
SEARCH_RECORD * tesseract::Wordrec::new_search ( CHUNKS_RECORD chunks_record,
int  num_joints,
BLOB_CHOICE_LIST_VECTOR best_char_choices,
WERD_CHOICE best_choice,
WERD_CHOICE raw_choice,
STATE state 
)

Definition at line 568 of file bestfirst.cpp.

573  {
574  SEARCH_RECORD *this_search;
575 
576  this_search = (SEARCH_RECORD *) memalloc (sizeof (SEARCH_RECORD));
577 
578  this_search->open_states = MakeHeap (wordrec_num_seg_states * 20);
579  this_search->closed_states = new_hash_table();
580 
581  if (state)
582  this_search->this_state = new_state (state);
583  else
584  cprintf ("error: bad initial state in new_search\n");
585 
586  this_search->first_state = new_state (this_search->this_state);
587  this_search->best_state = new_state (this_search->this_state);
588 
589  this_search->best_choice = best_choice;
590  this_search->raw_choice = raw_choice;
591  this_search->best_char_choices = best_char_choices;
592 
593  this_search->num_joints = num_joints;
594  this_search->num_states = 0;
595  this_search->before_best = 0;
596  this_search->segcost_bias = 0;
597 
598  return (this_search);
599 }
WERD_CHOICE * best_choice
Definition: bestfirst.h:56
int * memalloc(int size)
Definition: freelist.cpp:22
STATE * best_state
Definition: bestfirst.h:51
long before_best
Definition: bestfirst.h:54
WERD_CHOICE * raw_choice
Definition: bestfirst.h:57
STATE * new_state(STATE *oldstate)
Definition: states.cpp:166
HEAP * MakeHeap(int Size)
Definition: oldheap.cpp:49
int num_joints
Definition: bestfirst.h:52
STATE * first_state
Definition: bestfirst.h:50
int wordrec_num_seg_states
Definition: wordrec.h:102
BLOB_CHOICE_LIST_VECTOR * best_char_choices
Definition: bestfirst.h:58
long num_states
Definition: bestfirst.h:53
HEAP * open_states
Definition: bestfirst.h:47
STATE * this_state
Definition: bestfirst.h:49
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
HASH_TABLE closed_states
Definition: bestfirst.h:48
HASH_TABLE new_hash_table()
Definition: closed.cpp:120
float segcost_bias
Definition: bestfirst.h:55
EDGEPT * tesseract::Wordrec::pick_close_point ( EDGEPT critical_point,
EDGEPT vertical_point,
int *  best_dist 
)

Definition at line 182 of file chop.cpp.

184  {
185  EDGEPT *best_point = NULL;
186  int this_distance;
187  int found_better;
188 
189  do {
190  found_better = FALSE;
191 
192  this_distance = edgept_dist (critical_point, vertical_point);
193  if (this_distance <= *best_dist) {
194 
195  if (!(same_point (critical_point->pos, vertical_point->pos) ||
196  same_point (critical_point->pos, vertical_point->next->pos) ||
197  (best_point && same_point (best_point->pos, vertical_point->pos)) ||
198  is_exterior_point (critical_point, vertical_point))) {
199  *best_dist = this_distance;
200  best_point = vertical_point;
201  if (chop_vertical_creep)
202  found_better = TRUE;
203  }
204  }
205  vertical_point = vertical_point->next;
206  }
207  while (found_better == TRUE);
208 
209  return (best_point);
210 }
EDGEPT * next
Definition: blobs.h:106
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
#define edgept_dist(p1, p2)
Definition: outlines.h:87
Definition: blobs.h:72
#define is_exterior_point(edge, point)
Definition: outlines.h:97
#define same_point(p1, p2)
Definition: outlines.h:49
TPOINT pos
Definition: blobs.h:100
bool chop_vertical_creep
Definition: wordrec.h:110
#define TRUE
Definition: capi.h:27
SEAM * tesseract::Wordrec::pick_good_seam ( TBLOB blob)

Definition at line 380 of file findseam.cpp.

380  {
381  SEAM_QUEUE seam_queue;
382  SEAM_PILE seam_pile;
383  POINT_GROUP point_heap;
384  PRIORITY priority;
385  EDGEPT *edge;
386  EDGEPT *points[MAX_NUM_POINTS];
387  EDGEPT_CLIST new_points;
388  SEAM *seam = NULL;
389  TESSLINE *outline;
390  inT16 num_points = 0;
391 
392 #ifndef GRAPHICS_DISABLED
393  if (chop_debug > 2)
394  wordrec_display_splits.set_value(true);
395 
396  draw_blob_edges(blob);
397 #endif
398 
399  point_heap = MakeHeap (MAX_NUM_POINTS);
400  for (outline = blob->outlines; outline; outline = outline->next)
401  prioritize_points(outline, point_heap);
402 
403  while (HeapPop (point_heap, &priority, &edge) == TESS_HEAP_OK) {
404  if (num_points < MAX_NUM_POINTS)
405  points[num_points++] = (EDGEPT *) edge;
406  }
407  FreeHeap(point_heap);
408 
409  /* Initialize queue & pile */
410  create_seam_pile(seam_pile);
411  create_seam_queue(seam_queue);
412 
413  try_point_pairs(points, num_points, seam_queue, &seam_pile, &seam, blob);
414  try_vertical_splits(points, num_points, &new_points,
415  seam_queue, &seam_pile, &seam, blob);
416 
417  if (seam == NULL) {
418  choose_best_seam(seam_queue, &seam_pile, NULL, BAD_PRIORITY, &seam, blob);
419  }
420  else if (seam->priority > chop_good_split) {
421  choose_best_seam (seam_queue, &seam_pile, NULL, seam->priority,
422  &seam, blob);
423  }
424 
425  EDGEPT_C_IT it(&new_points);
426  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
427  EDGEPT *inserted_point = it.data();
428  if (!point_used_by_seam(seam, inserted_point)) {
429  for (outline = blob->outlines; outline; outline = outline->next) {
430  if (outline->loop == inserted_point) {
431  outline->loop = outline->loop->next;
432  }
433  }
434  remove_edgept(inserted_point);
435  }
436  }
437 
438  delete_seam_queue(seam_queue);
439  delete_seam_pile(seam_pile);
440 
441  if (seam) {
442  if (seam->priority > chop_ok_split) {
443  delete_seam(seam);
444  seam = NULL;
445  }
446 #ifndef GRAPHICS_DISABLED
447  else if (wordrec_display_splits) {
448  if (seam->split1)
449  mark_split (seam->split1);
450  if (seam->split2)
451  mark_split (seam->split2);
452  if (seam->split3)
453  mark_split (seam->split3);
454  if (chop_debug > 2) {
455  update_edge_window();
456  edge_window_wait();
457  }
458  }
459 #endif
460  }
461 
462  if (chop_debug)
463  wordrec_display_splits.set_value(false);
464 
465  return (seam);
466 }
PRIORITY priority
Definition: seam.h:42
SPLIT * split2
Definition: seam.h:47
EDGEPT * next
Definition: blobs.h:106
float PRIORITY
Definition: seam.h:38
void delete_seam_pile(SEAM_PILE seam_pile)
Definition: findseam.cpp:365
bool point_used_by_seam(SEAM *seam, EDGEPT *point)
Definition: seam.cpp:92
#define MAX_NUM_POINTS
Definition: chop.h:38
TESSLINE * next
Definition: blobs.h:171
void remove_edgept(EDGEPT *point)
Definition: split.cpp:90
#define NULL
Definition: host.h:144
HEAP * MakeHeap(int Size)
Definition: oldheap.cpp:49
#define BAD_PRIORITY
Definition: findseam.cpp:52
double chop_ok_split
Definition: wordrec.h:121
TESSLINE * outlines
Definition: blobs.h:227
#define update_edge_window()
Definition: plotedges.h:46
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr)
Definition: oldheap.cpp:76
#define create_seam_queue(seam_queue)
Definition: findseam.cpp:90
void prioritize_points(TESSLINE *outline, POINT_GROUP points)
Definition: chop.cpp:220
SPLIT * split1
Definition: seam.h:46
double chop_good_split
Definition: wordrec.h:122
#define FreeHeap(H)
Definition: oldheap.h:46
#define delete_seam_queue(seam_queue)
Definition: findseam.cpp:109
void delete_seam(void *arg)
Definition: seam.cpp:154
Definition: blobs.h:72
#define TESS_HEAP_OK
Definition: oldheap.h:30
Definition: oldheap.h:37
short inT16
Definition: host.h:100
#define create_seam_pile(seam_pile)
Definition: findseam.cpp:99
void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:554
bool wordrec_display_splits
Definition: split.cpp:39
#define edge_window_wait()
Definition: plotedges.h:58
SPLIT * split3
Definition: seam.h:48
void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:512
void mark_split(SPLIT *split)
Definition: plotedges.cpp:130
void draw_blob_edges(TBLOB *blob)
Definition: plotedges.cpp:77
void choose_best_seam(SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob)
Definition: findseam.cpp:178
EDGEPT * loop
Definition: blobs.h:170
PRIORITY tesseract::Wordrec::point_priority ( EDGEPT point)

Definition at line 55 of file chop.cpp.

55  {
56  return (PRIORITY)angle_change(point->prev, point, point->next);
57 }
EDGEPT * next
Definition: blobs.h:106
float PRIORITY
Definition: seam.h:38
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:87
EDGEPT * prev
Definition: blobs.h:107
STATE * tesseract::Wordrec::pop_queue ( HEAP queue)

Definition at line 607 of file bestfirst.cpp.

607  {
608  HEAPENTRY entry;
609 
610  if (GetTopOfHeap (queue, &entry) == TESS_HEAP_OK) {
611 #ifndef GRAPHICS_DISABLED
612  if (wordrec_display_segmentations) {
613  cprintf ("eval state: %8.3f ", entry.Key);
614  print_state ("", (STATE *) entry.Data, num_joints);
615  }
616 #endif
617  return ((STATE *) entry.Data);
618  }
619  else {
620  return (NULL);
621  }
622 }
#define NULL
Definition: host.h:144
int wordrec_display_segmentations
Definition: plotseg.cpp:48
void print_state(const char *label, STATE *state, int num_joints)
Definition: states.cpp:214
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry)
Definition: oldheap.cpp:273
#define TESS_HEAP_OK
Definition: oldheap.h:30
void * Data
Definition: oldheap.h:34
FLOAT32 Key
Definition: oldheap.h:33
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
Definition: states.h:39
void tesseract::Wordrec::prioritize_points ( TESSLINE outline,
POINT_GROUP  points 
)

Definition at line 220 of file chop.cpp.

220  {
221  EDGEPT *this_point;
222  EDGEPT *local_min = NULL;
223  EDGEPT *local_max = NULL;
224 
225  this_point = outline->loop;
226  local_min = this_point;
227  local_max = this_point;
228  do {
229  if (this_point->vec.y < 0) {
230  /* Look for minima */
231  if (local_max != NULL)
232  new_max_point(local_max, points);
233  else if (is_inside_angle (this_point))
234  add_point_to_list(points, this_point);
235  local_max = NULL;
236  local_min = this_point->next;
237  }
238  else if (this_point->vec.y > 0) {
239  /* Look for maxima */
240  if (local_min != NULL)
241  new_min_point(local_min, points);
242  else if (is_inside_angle (this_point))
243  add_point_to_list(points, this_point);
244  local_min = NULL;
245  local_max = this_point->next;
246  }
247  else {
248  /* Flat area */
249  if (local_max != NULL) {
250  if (local_max->prev->vec.y != 0) {
251  new_max_point(local_max, points);
252  }
253  local_max = this_point->next;
254  local_min = NULL;
255  }
256  else {
257  if (local_min->prev->vec.y != 0) {
258  new_min_point(local_min, points);
259  }
260  local_min = this_point->next;
261  local_max = NULL;
262  }
263  }
264 
265  /* Next point */
266  this_point = this_point->next;
267  }
268  while (this_point != outline->loop);
269 }
EDGEPT * next
Definition: blobs.h:106
void add_point_to_list(POINT_GROUP point_list, EDGEPT *point)
Definition: chop.cpp:65
#define is_inside_angle(pt)
Definition: olutil.h:45
EDGEPT * prev
Definition: blobs.h:107
#define NULL
Definition: host.h:144
VECTOR vec
Definition: blobs.h:101
inT16 y
Definition: blobs.h:68
void new_min_point(EDGEPT *local_min, POINT_GROUP points)
Definition: chop.cpp:279
Definition: blobs.h:72
void new_max_point(EDGEPT *local_max, POINT_GROUP points)
Definition: chop.cpp:303
EDGEPT * loop
Definition: blobs.h:170
FLOAT32 tesseract::Wordrec::prioritize_state ( CHUNKS_RECORD chunks_record,
SEARCH_RECORD the_search 
)

Definition at line 289 of file heuristic.cpp.

290  {
291  FLOAT32 shape_cost;
292  FLOAT32 width_cost;
293  FLOAT32 seam_cost;
294 
295  shape_cost = rating_priority(chunks_record,
296  the_search->this_state,
297  the_search->num_joints);
298 
299  width_cost = width_priority(chunks_record,
300  the_search->this_state,
301  the_search->num_joints);
302 
303  // The rating_priority is the same as the original, and the width_priority
304  // is the same as before if assume_fixed_pitch_char_segment == FALSE.
305  // So this would return the original state priority.
306  if (!use_new_state_cost)
307  return width_cost * 1000 + shape_cost;
308 
309  seam_cost = seamcut_priority(chunks_record->splits,
310  the_search->this_state,
311  the_search->num_joints);
312 
313  // TODO(dsl): how do we normalize the scores for these separate evidence?
314  // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001;
315  FLOAT32 total_cost = shape_cost * heuristic_weight_rating +
316  width_cost * heuristic_weight_width +
317  seam_cost * heuristic_weight_seamcut;
318 
319  // We don't have an adjustment model for variable pitch segmentation cost
320  // into word rating
321  if (assume_fixed_pitch_char_segment) {
322  float seg_bias = 1.0;
323  if (width_cost < 1) seg_bias *= 0.85;
324  if (width_cost > 3)
325  seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0);
326  if (seam_cost > 10)
327  seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0));
328  if (shape_cost > 5)
329  seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0);
330  if (segment_adjust_debug) {
331  tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n",
332  total_cost, seg_bias, shape_cost, width_cost, seam_cost);
333  }
334  the_search->segcost_bias = seg_bias;
335  } else {
336  the_search->segcost_bias = 0;
337  }
338 
339  return total_cost;
340 }
int segment_adjust_debug
Definition: wordrec.h:124
SEAMS splits
Definition: associate.h:55
double heuristic_segcost_rating_base
Definition: wordrec.h:132
FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
Definition: heuristic.cpp:175
double heuristic_weight_rating
Definition: wordrec.h:134
float FLOAT32
Definition: host.h:111
int num_joints
Definition: bestfirst.h:52
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:126
FLOAT32 seamcut_priority(SEAMS seams, STATE *state, int num_joints)
Definition: heuristic.cpp:142
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool use_new_state_cost
Definition: wordrec.h:128
STATE * this_state
Definition: bestfirst.h:49
FLOAT32 width_priority(CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
Definition: heuristic.cpp:222
double heuristic_weight_seamcut
Definition: wordrec.h:138
double heuristic_weight_width
Definition: wordrec.h:136
float segcost_bias
Definition: bestfirst.h:55
void tesseract::Wordrec::ProcessSegSearchPainPoint ( float  pain_point_priority,
const MATRIX_COORD pain_point,
const WERD_CHOICE best_choice,
SEG_SEARCH_PENDING_LIST *  pending[],
CHUNKS_RECORD chunks_record,
HEAP pain_points,
BlamerBundle blamer_bundle 
)
protected

Definition at line 257 of file segsearch.cpp.

263  {
264  if (segsearch_debug_level > 0) {
265  tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
266  pain_point_priority, pain_point.col, pain_point.row);
267  }
268  MATRIX *ratings = chunks_record->ratings;
269  BLOB_CHOICE_LIST *classified = classify_piece(
270  chunks_record->chunks, chunks_record->word_res->denorm,
271  chunks_record->splits,
272  pain_point.col, pain_point.row, blamer_bundle);
273  ratings->put(pain_point.col, pain_point.row, classified);
274 
275  if (segsearch_debug_level > 0) {
276  print_ratings_list("Updated ratings matrix with a new entry:",
277  ratings->get(pain_point.col, pain_point.row),
278  getDict().getUnicharset());
279  ratings->print(getDict().getUnicharset());
280  }
281 
282  // Insert initial "pain points" to join the newly classified blob
283  // with its left and right neighbors.
284  if (!classified->empty()) {
285  float worst_piece_cert;
286  bool fragmented;
287  if (pain_point.col > 0) {
289  pain_point.col-1, pain_point.row, chunks_record->ratings,
290  &worst_piece_cert, &fragmented);
292  pain_point.col-1, pain_point.row, false,
294  worst_piece_cert, fragmented, best_choice->certainty(),
296  chunks_record, pain_points);
297  }
298  if (pain_point.row+1 < ratings->dimension()) {
300  pain_point.col, pain_point.row+1, chunks_record->ratings,
301  &worst_piece_cert, &fragmented);
303  pain_point.col, pain_point.row+1, true,
305  worst_piece_cert, fragmented, best_choice->certainty(),
307  chunks_record, pain_points);
308  }
309  }
310 
311  // Record a pending entry with the pain_point and each of its parents.
312  int parent_row = pain_point.col - 1;
313  if (parent_row < 0) { // this node has no parents
314  (*pending)[pain_point.col].add_sorted(
316  new SEG_SEARCH_PENDING(pain_point.row, NULL,
318  } else {
319  for (int parent_col = 0; parent_col < pain_point.col; ++parent_col) {
320  if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
321  (*pending)[pain_point.col].add_sorted(
323  new SEG_SEARCH_PENDING(pain_point.row,
324  ratings->get(parent_col, parent_row),
326  }
327  }
328  }
329 }
WERD_RES * word_res
Definition: associate.h:54
TBLOB * chunks
Definition: associate.h:53
SEAMS splits
Definition: associate.h:55
float certainty() const
Definition: ratngs.h:234
double segsearch_max_char_wh_ratio
Definition: wordrec.h:152
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T get(int column, int row) const
Definition: matrix.h:117
static const float kInitialPainPointPriorityAdjustment
static const LanguageModelFlagsType kAllChangedFlag
Dict & getDict()
Definition: classify.h:62
static int compare(const void *p1, const void *p2)
Definition: wordrec.h:49
void put(int column, int row, const T &thing)
Definition: matrix.h:112
int dimension() const
Definition: matrix.h:190
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
DENORM denorm
Definition: pageres.h:346
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:36
#define NOT_CLASSIFIED
Definition: matrix.h:31
virtual BLOB_CHOICE_LIST * classify_piece(TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:75
Definition: matrix.h:193
void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
LanguageModel * language_model_
Definition: wordrec.h:495
bool GeneratePainPoint(int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
int segsearch_debug_level
Definition: wordrec.h:146
MATRIX * ratings
Definition: associate.h:52
void tesseract::Wordrec::program_editdown ( inT32  elasped_time)

Definition at line 80 of file tface.cpp.

80  {
84  getDict().End();
85 }
BlobMatchTable blob_match_table
Definition: wordrec.h:501
void InitChoiceAccum()
Definition: stopper.cpp:435
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:476
Dict & getDict()
Definition: classify.h:62
void End()
Definition: dict.cpp:335
void tesseract::Wordrec::program_editup ( const char *  textbase,
bool  init_classifier,
bool  init_permute 
)

Definition at line 50 of file tface.cpp.

52  {
53  if (textbase != NULL) imagefile = textbase;
56  InitAdaptiveClassifier(init_classifier);
57  if (init_dict) getDict().Load();
60 }
void Load()
Definition: dict.cpp:219
#define NULL
Definition: host.h:144
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs)
Definition: fxdefs.cpp:42
double chop_ok_split
Definition: wordrec.h:121
void InitAdaptiveClassifier(bool load_pre_trained_templates)
Definition: adaptmatch.cpp:545
Dict & getDict()
Definition: classify.h:62
int wordrec_num_seg_states
Definition: wordrec.h:102
STRING imagefile
Definition: ccutil.h:74
PRIORITY pass2_ok_split
Definition: wordrec.h:496
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:459
void tesseract::Wordrec::push_queue ( HEAP queue,
STATE state,
FLOAT32  worst_priority,
FLOAT32  priority,
bool  debug 
)

Definition at line 629 of file bestfirst.cpp.

630  {
631  HEAPENTRY entry;
632 
633  if (priority < worst_priority) {
634  if (SizeOfHeap (queue) >= MaxSizeOfHeap(queue)) {
635  if (debug) tprintf("Heap is Full\n");
636  return;
637  }
638  entry.Data = (char *) new_state (state);
639  num_pushed++;
640  entry.Key = priority;
641  HeapStore(queue, &entry);
642  }
643 }
STATE * new_state(STATE *oldstate)
Definition: states.cpp:166
#define SizeOfHeap(H)
Definition: oldheap.h:48
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void HeapStore(HEAP *Heap, HEAPENTRY *Entry)
Definition: oldheap.cpp:234
void * Data
Definition: oldheap.h:34
#define MaxSizeOfHeap(H)
Definition: oldheap.h:47
FLOAT32 Key
Definition: oldheap.h:33
FLOAT32 tesseract::Wordrec::rating_priority ( CHUNKS_RECORD chunks_record,
STATE state,
int  num_joints 
)

Definition at line 175 of file heuristic.cpp.

177  {
178  BLOB_CHOICE_LIST *blob_choices;
179  BLOB_CHOICE_IT blob_choice_it;
180  inT16 first_chunk = 0;
181  inT16 last_chunk;
182  inT16 ratings = 0;
183  inT16 weights = 0;
184 
185  PIECES_STATE blob_chunks;
186  bin_to_pieces(state, num_joints, blob_chunks);
187 
188  for (int x = 0; blob_chunks[x]; x++) {
189  last_chunk = first_chunk + blob_chunks[x];
190 
191  blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1);
192  if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) {
193  blob_choice_it.set_to_list(blob_choices);
194  ratings += (inT16) blob_choice_it.data()->rating();
195  for (int y = first_chunk; y < last_chunk; y++) {
196  weights += (inT16) (chunks_record->weights[y]);
197  }
198  }
199  first_chunk = last_chunk;
200  }
201  if (weights <= 0)
202  weights = 1;
203  FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) /
204  static_cast<FLOAT32>(weights);
205  if (segment_adjust_debug > 2)
206  tprintf("rating_cost: r%f / w%f = %f\n", ratings, weights, rating_cost);
207  return rating_cost;
208 }
inT16 * weights
Definition: associate.h:59
int segment_adjust_debug
Definition: wordrec.h:124
T get(int column, int row) const
Definition: matrix.h:117
float FLOAT32
Definition: host.h:111
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define NOT_CLASSIFIED
Definition: matrix.h:31
short inT16
Definition: host.h:100
uinT8 PIECES_STATE[MAX_NUM_CHUNKS+2]
Definition: states.h:49
MATRIX * ratings
Definition: associate.h:52
void bin_to_pieces(STATE *state, int num_joints, PIECES_STATE pieces)
Definition: states.cpp:99
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::rebuild_current_state ( WERD_RES word,
STATE state,
BLOB_CHOICE_LIST_VECTOR old_choices,
MATRIX ratings 
)

rebuild_current_state

Transfers the given state to the word's output fields: rebuild_word, best_state, box_word, and returns the corresponding blob choices.

Definition at line 332 of file bestfirst.cpp.

336  {
337  // Initialize search_state, num_joints, x, y.
338  int num_joints = array_count(word->seam_array);
339 #ifndef GRAPHICS_DISABLED
340  if (wordrec_display_segmentations) {
341  print_state("Rebuilding state", state, num_joints);
342  }
343 #endif
344  // Setup the rebuild_word ready for the output blobs.
345  if (word->rebuild_word != NULL)
346  delete word->rebuild_word;
347  word->rebuild_word = new TWERD;
348  // Setup the best_state.
349  word->best_state.clear();
350  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
351  // See which index is which below for information on x and y.
352  int x = 0;
353  int y;
354  for (int i = 1; i <= search_state[0]; i++) {
355  y = x + search_state[i];
356  x = y + 1;
357  }
358  y = count_blobs(word->chopped_word->blobs) - 1;
359 
360  // Initialize char_choices, expanded_fragment_lengths:
361  // e.g. if fragment_lengths = {1 1 2 3 1},
362  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
363  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
364  STRING expanded_fragment_lengths_str = "";
365  bool state_has_fragments = false;
366  const char *fragment_lengths = NULL;
367 
368  if (word->best_choice->length() > 0) {
369  fragment_lengths = word->best_choice->fragment_lengths();
370  }
371  if (fragment_lengths) {
372  for (int i = 0; i < word->best_choice->length(); ++i) {
373  *char_choices += NULL;
374  word->best_state.push_back(0);
375  if (fragment_lengths[i] > 1) {
376  state_has_fragments = true;
377  }
378  for (int j = 0; j < fragment_lengths[i]; ++j) {
379  expanded_fragment_lengths_str += fragment_lengths[i];
380  }
381  }
382  } else {
383  for (int i = 0; i <= search_state[0]; ++i) {
384  expanded_fragment_lengths_str += (char)1;
385  *char_choices += NULL;
386  word->best_state.push_back(0);
387  }
388  }
389 
390  // Set up variables for concatenating fragments.
391  const char *word_lengths_ptr = NULL;
392  const char *word_ptr = NULL;
393  if (state_has_fragments) {
394  // Make word_lengths_ptr point to the last element in
395  // best_choice->unichar_lengths().
396  word_lengths_ptr = word->best_choice->unichar_lengths().string();
397  word_lengths_ptr += (strlen(word_lengths_ptr)-1);
398  // Make word_str point to the beginning of the last
399  // unichar in best_choice->unichar_string().
400  word_ptr = word->best_choice->unichar_string().string();
401  word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
402  }
403  const char *expanded_fragment_lengths =
404  expanded_fragment_lengths_str.string();
405  char unichar[UNICHAR_LEN + 1];
406 
407  // Populate char_choices list such that it corresponds to search_state.
408  //
409  // If we are rebuilding a state that contains character fragments:
410  // -- combine blobs that belong to character fragments
411  // -- re-classify the blobs to obtain choices list for the merged blob
412  // -- ensure that correct classification appears in the new choices list
413  // NOTE: a choice composed from original fragment choices will always be
414  // added to the new choices list for each character composed from
415  // fragments (even if the choice for the corresponding character appears
416  // in the re-classified choices list for the newly merged blob).
417  int ss_index = search_state[0];
418  // Which index is which?
419  // char_choices_index refers to the finished product: there is one for each
420  // blob/unicharset entry in the final word.
421  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
422  // that were classified together for the best state.
423  // old_choice_index is a copy of ss_index, and accesses the old_choices,
424  // which correspond to chunks in the best state. old_choice_index gets
425  // set to -1 on a fragment set, as there is no corresponding chunk in
426  // the best state.
427  // x and y refer to the underlying blobs and are the first and last blob
428  // indices in a chunk.
429  for (int char_choices_index = char_choices->length() - 1;
430  char_choices_index >= 0;
431  --char_choices_index) {
432  // The start and end of the blob to rebuild.
433  int true_x = x;
434  int true_y = y;
435  // The fake merged fragment choice.
436  BLOB_CHOICE* merged_choice = NULL;
437  // Test for and combine fragments first.
438  int fragment_pieces = expanded_fragment_lengths[ss_index];
439  int old_choice_index = ss_index;
440 
441  if (fragment_pieces > 1) {
442  strncpy(unichar, word_ptr, *word_lengths_ptr);
443  unichar[*word_lengths_ptr] = '\0';
444  merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
445  old_choice_index, old_choices);
446  old_choice_index = -1;
447  }
448  while (fragment_pieces > 0) {
449  true_x = x;
450  // Move left to the previous blob.
451  y = x - 1;
452  x = y - search_state[ss_index--];
453  --fragment_pieces;
454  }
455  word->best_state[char_choices_index] = true_y + 1 - true_x;
456  BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
457  word, true_x, true_y, old_choice_index, ratings, old_choices);
458  if (merged_choice != NULL) {
459  // Insert merged_blob into current_choices, such that current_choices
460  // are still sorted in non-descending order by rating.
461  ASSERT_HOST(!current_choices->empty());
462  BLOB_CHOICE_IT choice_it(current_choices);
463  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
464  merged_choice->rating() > choice_it.data()->rating();
465  choice_it.forward());
466  choice_it.add_before_stay_put(merged_choice);
467  }
468  // Get rid of fragments in current_choices.
469  BLOB_CHOICE_IT choice_it(current_choices);
470  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
471  choice_it.forward()) {
472  if (getDict().getUnicharset().get_fragment(
473  choice_it.data()->unichar_id())) {
474  delete choice_it.extract();
475  }
476  }
477  char_choices->set(current_choices, char_choices_index);
478 
479  // Update word_ptr and word_lengths_ptr.
480  if (word_lengths_ptr != NULL && word_ptr != NULL) {
481  word_lengths_ptr--;
482  word_ptr -= (*word_lengths_ptr);
483  }
484  }
485  old_choices->delete_data_pointers();
486  delete old_choices;
487  memfree(search_state);
488 
489  return char_choices;
490 }
int length() const
Definition: ratngs.h:214
void delete_data_pointers()
TWERD * rebuild_word
Definition: pageres.h:381
const STRING & unichar_string() const
Definition: ratngs.h:395
BLOB_CHOICE_LIST * join_blobs_and_classify(WERD_RES *word, int x, int y, int choice_index, MATRIX *ratings, BLOB_CHOICE_LIST_VECTOR *old_choices)
Definition: bestfirst.cpp:730
void memfree(void *element)
Definition: freelist.cpp:30
virtual void clear()
void set(T t, int index)
#define NULL
Definition: host.h:144
Definition: blobs.h:233
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
GenericVector< int > best_state
Definition: pageres.h:392
int wordrec_display_segmentations
Definition: plotseg.cpp:48
void print_state(const char *label, STATE *state, int num_joints)
Definition: states.cpp:214
BLOB_CHOICE * rebuild_fragments(const char *unichar, const char *expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices)
Definition: bestfirst.cpp:680
TBLOB * blobs
Definition: blobs.h:274
int push_back(T object)
GenericVector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:449
SEARCH_STATE bin_to_chunks(STATE *state, int num_joints)
Definition: states.cpp:49
const char * fragment_lengths() const
Definition: ratngs.h:224
SEAMS seam_array
Definition: pageres.h:358
Dict & getDict()
Definition: classify.h:62
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
const char * string() const
Definition: strngs.cpp:156
int count_blobs(TBLOB *blobs)
Definition: blobs.cpp:581
Definition: strngs.h:40
const STRING & unichar_lengths() const
Definition: ratngs.h:402
int length() const
Definition: genericvector.h:63
TWERD * chopped_word
Definition: pageres.h:357
#define UNICHAR_LEN
Definition: unichar.h:28
#define array_count(a)
Definition: tessarray.h:74
#define ASSERT_HOST(x)
Definition: errcode.h:84
int * SEARCH_STATE
Definition: states.h:46
WERD_CHOICE * best_choice
Definition: pageres.h:359
float rating() const
Definition: ratngs.h:62
BLOB_CHOICE * tesseract::Wordrec::rebuild_fragments ( const char *  unichar,
const char *  expanded_fragment_lengths,
int  choice_index,
BLOB_CHOICE_LIST_VECTOR *  old_choices 
)

Definition at line 680 of file bestfirst.cpp.

684  {
685  float rating = 0.0f;
686  float certainty = 0.0f;
687  inT16 min_xheight = -MAX_INT16;
688  inT16 max_xheight = MAX_INT16;
689  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
690  fragment_pieces >= 0; --fragment_pieces, --choice_index) {
691  // Get a pointer to the classifier results from the old_choices.
692  BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
693  // Populate fragment with updated values and look for the
694  // fragment with the same values in current_choices.
695  // Update rating and certainty of the character being composed.
696  CHAR_FRAGMENT fragment;
697  fragment.set_all(unichar, fragment_pieces,
698  expanded_fragment_lengths[choice_index], false);
699  BLOB_CHOICE_IT choice_it(current_choices);
700  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
701  choice_it.forward()) {
702  BLOB_CHOICE* choice = choice_it.data();
703  const CHAR_FRAGMENT *current_fragment =
705  if (current_fragment && fragment.equals(current_fragment)) {
706  rating += choice->rating();
707  if (choice->certainty() < certainty) {
708  certainty = choice->certainty();
709  }
710  IntersectRange(choice->min_xheight(), choice->max_xheight(),
711  &min_xheight, &max_xheight);
712  break;
713  }
714  }
715  if (choice_it.cycled_list()) {
716  print_ratings_list("Failure", current_choices, unicharset);
717  tprintf("Failed to find fragment %s at index=%d\n",
718  fragment.to_string().string(), choice_index);
719  }
720  ASSERT_HOST(!choice_it.cycled_list()); // Be sure we found the fragment.
721  }
722  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
723  rating, certainty, -1, -1, 0,
724  min_xheight, max_xheight, false);
725 }
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:95
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:67
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:40
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
float certainty() const
Definition: ratngs.h:65
Dict & getDict()
Definition: classify.h:62
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
inT16 max_xheight() const
Definition: ratngs.h:89
#define MAX_INT16
Definition: host.h:119
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 min_xheight() const
Definition: ratngs.h:86
float rating() const
Definition: ratngs.h:62
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:889
TBOX * tesseract::Wordrec::record_blob_bounds ( TBLOB *  blobs)

Definition at line 393 of file pieces.cpp.

// Builds an array holding the bounding box of every blob reachable from
// blobs by following the next pointers, in list order.
// The array is allocated with new[] (sized by count_blobs(blobs)) and is
// returned to the caller; record_piece_ratings, for instance, releases it
// with delete[].
393  {
394  int nblobs = count_blobs(blobs);
395  TBOX *bboxes = new TBOX[nblobs];
396 
// x is the write position in bboxes; it advances once per list node.
397  inT16 x = 0;
398  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
399  bboxes[x] = blob->bounding_box();
400  x++;
401  }
402  return bboxes;
403 }
#define NULL
Definition: host.h:144
Definition: rect.h:29
Definition: blobs.h:174
int count_blobs(TBLOB *blobs)
Definition: blobs.cpp:581
short inT16
Definition: host.h:100
MATRIX * tesseract::Wordrec::record_piece_ratings ( TBLOB *  blobs)

Definition at line 414 of file pieces.cpp.

414  {
415  inT16 num_blobs = count_blobs(blobs);
416  TBOX *bounds = record_blob_bounds(blobs);
417  MATRIX *ratings = new MATRIX(num_blobs);
418 
419  for (int x = 0; x < num_blobs; x++) {
420  for (int y = x; y < num_blobs; y++) {
421  TBOX piecebox = bounds_of_piece(bounds, x, y);
422  BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox);
423  if (choices != NULL) {
424  ratings->put(x, y, choices);
425  }
426  }
427  }
428 
430  merge_fragments(ratings, num_blobs);
431 
432  delete []bounds;
433  return ratings;
434 }
BlobMatchTable blob_match_table
Definition: wordrec.h:501
#define NULL
Definition: host.h:144
void merge_fragments(MATRIX *ratings, inT16 num_blobs)
Definition: pieces.cpp:324
Definition: rect.h:29
void put(int column, int row, const T &thing)
Definition: matrix.h:112
TBOX * record_blob_bounds(TBLOB *blobs)
Definition: pieces.cpp:393
int count_blobs(TBLOB *blobs)
Definition: blobs.cpp:581
short inT16
Definition: host.h:100
TBOX bounds_of_piece(TBOX *bounds, inT16 start, inT16 end)
Definition: pieces.cpp:58
Definition: matrix.h:193
BLOB_CHOICE_LIST * get_match_by_box(const TBOX &box)
Definition: matchtab.cpp:150
bool merge_fragments_in_matrix
Definition: wordrec.h:96
void tesseract::Wordrec::replace_char_widths ( CHUNKS_RECORD *  chunks_record,
SEARCH_STATE  state 
)

Definition at line 651 of file bestfirst.cpp.

// Replaces chunks_record->char_widths with a newly allocated WIDTH_RECORD
// filled from the last_segmentation member.
// The previous record is released with free_widths before the new one is
// built. The number of entries is taken as state[0] + 1.
652  {
653  WIDTH_RECORD *width_record;
654  int num_blobs;
655  int i;
656 
657  free_widths (chunks_record->char_widths);
658 
659  num_blobs = state[0] + 1;
// 2 * num_blobs ints: one for num_chars plus 2 * num_blobs - 1 slots in
// widths (a width per entry and a gap between consecutive entries).
660  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
661  width_record->num_chars = num_blobs;
662 
663  for (i = 0; i < num_blobs; i++) {
664 
// Even slots hold widths, odd slots hold the gap that follows.
665  width_record->widths[2 * i] = last_segmentation[i].width;
666 
// The last entry has no following gap, so no slot is written for it.
667  if (i + 1 < num_blobs)
668  width_record->widths[2 * i + 1] = last_segmentation[i].gap;
669  }
670  chunks_record->char_widths = width_record;
671 }
WIDTH_RECORD * char_widths
Definition: associate.h:58
int * memalloc(int size)
Definition: freelist.cpp:22
#define free_widths(w)
Definition: blobs.h:287
int num_chars
Definition: blobs.h:49
int widths[1]
Definition: blobs.h:50
EVALUATION_ARRAY last_segmentation
Definition: wordrec.h:502
void tesseract::Wordrec::reverse_outline ( EDGEPT *  outline)

Definition at line 164 of file outlines.cpp.

// Reverses the traversal direction of the closed outline that starts at
// outline: every point has its next and prev links exchanged and its vec
// field recomputed as the offset from the point to its new successor.
// The loop stops once the walk arrives back at outline, so the points are
// expected to form a ring.
164  {
165  EDGEPT *edgept = outline;
166  EDGEPT *temp;
167 
168  do {
169  /* Swap next and prev */
170  temp = edgept->prev;
171  edgept->prev = edgept->next;
172  edgept->next = temp;
173  /* Set up vec field */
// Only positions are read here, so it does not matter whether the
// neighbour's links have already been swapped.
174  edgept->vec.x = edgept->next->pos.x - edgept->pos.x;
175  edgept->vec.y = edgept->next->pos.y - edgept->pos.y;
176 
// After the swap, prev holds the original next, i.e. the next point
// in the original direction that still has to be processed.
177  edgept = edgept->prev; /* Go to next point */
178  }
179  while (edgept != outline);
180 }
EDGEPT * next
Definition: blobs.h:106
EDGEPT * prev
Definition: blobs.h:107
VECTOR vec
Definition: blobs.h:101
inT16 y
Definition: blobs.h:68
inT16 x
Definition: blobs.h:67
Definition: blobs.h:72
TPOINT pos
Definition: blobs.h:100
void tesseract::Wordrec::SaveAltChoices ( const LIST &  best_choices,
WERD_RES *  word 
)

Definition at line 173 of file wordrec.cpp.

173  {
174  ASSERT_HOST(word->alt_choices.empty());
175  ASSERT_HOST(word->alt_states.empty());
176  LIST list_it;
177  iterate_list(list_it, best_choices) {
178  VIABLE_CHOICE choice =
179  reinterpret_cast<VIABLE_CHOICE>(first_node(list_it));
180  CHAR_CHOICE *char_choice = &(choice->Blob[0]);
181  WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length);
183  GenericVector<int> &alt_state = word->alt_states.back();
184  for (int i = 0; i < choice->Length; char_choice++, i++) {
186  char_choice->Class, 1, 0, 0);
187  alt_state.push_back(char_choice->NumChunks);
188  }
189  alt_choice->set_rating(choice->Rating);
190  alt_choice->set_certainty(choice->Certainty);
191 
192  ASSERT_HOST(choice->blob_choices != NULL);
193  alt_choice->set_blob_choices(choice->blob_choices);
194  choice->blob_choices = NULL;
195 
196  word->alt_choices.push_back(alt_choice);
197  if (wordrec_debug_level > 0) {
198  tprintf("SaveAltChoices: %s %g\n",
199  alt_choice->unichar_string().string(), alt_choice->rating());
200  }
201  }
202 }
const STRING & unichar_string() const
Definition: ratngs.h:395
void set_rating(float new_val)
Definition: ratngs.h:255
CHAR_CHOICE * Blob
Definition: stopper.h:74
GenericVector< WERD_CHOICE * > alt_choices
Definition: pageres.h:363
UNICHAR_ID Class
Definition: stopper.h:51
#define NULL
Definition: host.h:144
int wordrec_debug_level
Definition: wordrec.h:141
int push_back(T object)
uinT16 NumChunks
Definition: stopper.h:52
GenericVector< GenericVector< int > > alt_states
Definition: pageres.h:364
const UNICHARSET * uch_set
Definition: pageres.h:348
BLOB_CHOICE_LIST_CLIST * blob_choices
Definition: stopper.h:75
const char * string() const
Definition: strngs.cpp:156
bool empty() const
Definition: genericvector.h:68
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void set_certainty(float new_val)
Definition: ratngs.h:258
T & back() const
#define iterate_list(x, l)
Definition: oldlist.h:170
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: ratngs.cpp:184
#define ASSERT_HOST(x)
Definition: errcode.h:84
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.h:331
#define first_node(l)
Definition: oldlist.h:139
float rating() const
Definition: ratngs.h:231
PRIORITY tesseract::Wordrec::seam_priority ( SEAM *  seam,
inT16  xmin,
inT16  xmax 
)

Definition at line 474 of file findseam.cpp.

// Computes the priority of seam for the horizontal range [xmin, xmax].
// A seam without split1 gets priority 0. Otherwise the result is
// seam->priority plus full_split_priority(seam->split1, xmin, xmax).
// When split2 (and split3) are present they are applied to the outline with
// split_outline before split1 is evaluated and are removed again with
// unsplit_outlines afterwards, in the reverse order of application.
474  {
475  PRIORITY priority;
476 
477  if (seam->split1 == NULL)
478  priority = 0;
479 
// Single-split seam: evaluate split1 on the outline as it is.
480  else if (seam->split2 == NULL) {
481  priority = (seam->priority +
482  full_split_priority (seam->split1, xmin, xmax));
483  }
484 
// Two-split seam: split2 is applied only while split1 is being evaluated.
485  else if (seam->split3 == NULL) {
486  split_outline (seam->split2->point1, seam->split2->point2);
487  priority = (seam->priority +
488  full_split_priority (seam->split1, xmin, xmax));
489  unsplit_outlines (seam->split2->point1, seam->split2->point2);
490  }
491 
// Three-split seam: split2 and split3 are applied, then undone last-in
// first-out.
492  else {
493  split_outline (seam->split2->point1, seam->split2->point2);
494  split_outline (seam->split3->point1, seam->split3->point2);
495  priority = (seam->priority +
496  full_split_priority (seam->split1, xmin, xmax));
497  unsplit_outlines (seam->split3->point1, seam->split3->point2);
498  unsplit_outlines (seam->split2->point1, seam->split2->point2);
499  }
500 
501  return (priority);
502 }
PRIORITY priority
Definition: seam.h:42
void split_outline(EDGEPT *join_point1, EDGEPT *join_point2)
Definition: split.cpp:136
PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax)
Definition: gradechop.cpp:74
SPLIT * split2
Definition: seam.h:47
float PRIORITY
Definition: seam.h:38
#define NULL
Definition: host.h:144
void unsplit_outlines(EDGEPT *p1, EDGEPT *p2)
Definition: split.cpp:158
SPLIT * split1
Definition: seam.h:46
EDGEPT * point1
Definition: split.h:39
EDGEPT * point2
Definition: split.h:40
SPLIT * split3
Definition: seam.h:48
FLOAT32 tesseract::Wordrec::seamcut_priority ( SEAMS  seams,
STATE *  state,
int  num_joints 
)

Definition at line 142 of file heuristic.cpp.

// Sums seam->priority over every seam whose bit is set in state and returns
// the total.
// The state is read as a bit string of num_joints bits spread over two
// 32-bit words: joints with x >= 32 are taken from state->part1, the rest
// from state->part2. Joint x (counted down from num_joints - 1) corresponds
// to seams[num_joints - 1 - x].
// The mask is built with unsigned shifts: the signed forms "1 << 31" and
// "1 << (num_joints - 1)" with num_joints == 32 overflow int, and
// num_joints == 0 used to shift by -1; both are undefined behaviour.
// NOTE(review): num_joints > 64 would still shift by 32 or more; the two
// 32-bit words suggest that cannot happen, confirm against callers.
144  {
145  int x;
146  unsigned int mask = (num_joints > 32) ? (1u << (num_joints - 1 - 32))
147  : (num_joints > 0) ? (1u << (num_joints - 1)) : 0u;
148  float seam_cost = 0.0f;
149  for (x = num_joints - 1; x >= 0; x--) {
150  int i = num_joints - 1 - x;
151  uinT32 value = (x < 32) ? state->part2 : state->part1;
152  bool state_on = value & mask;
153  if (state_on) {
154  SEAM* seam = (SEAM *) array_value(seams, i);
155  seam_cost += seam->priority;
156  }
// Once the lowest bit of the current word has been tested, restart at
// the top bit for the next word.
157  if (mask == 1)
158  mask = 1u << 31;
159  else
160  mask >>= 1;
161  }
162  if (segment_adjust_debug > 2)
163  tprintf("seam_cost: %f\n", seam_cost);
164  return seam_cost;
165 }
PRIORITY priority
Definition: seam.h:42
int segment_adjust_debug
Definition: wordrec.h:124
uinT32 part1
Definition: states.h:41
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
uinT32 part2
Definition: states.h:42
#define array_value(a, i)
Definition: tessarray.h:132
unsigned int uinT32
Definition: host.h:103
void tesseract::Wordrec::SegSearch ( CHUNKS_RECORD *  chunks_record,
WERD_CHOICE *  best_choice,
BLOB_CHOICE_LIST_VECTOR *  best_char_choices,
WERD_CHOICE *  raw_choice,
STATE *  output_best_state,
BlamerBundle *  blamer_bundle 
)

Definition at line 35 of file segsearch.cpp.

40  {
41  int row, col = 0;
42  if (segsearch_debug_level > 0) {
43  tprintf("Starting SegSearch on ratings matrix:\n");
44  chunks_record->ratings->print(getDict().getUnicharset());
45  }
46  // Start with a fresh best_choice since rating adjustments
47  // used by the chopper and the new segmentation search are not compatible.
48  best_choice->set_rating(WERD_CHOICE::kBadRating);
49  // TODO(antonova): Due to the fact that we currently do not re-start the
50  // segmentation search from the best choice the chopper found, sometimes
51  // the segmentation search does not find the best path (that the chopper
52  // did discover) and does not have a chance to adapt to it. As soon as we
53  // transition to using new-style language model penalties in the chopper
54  // this issue will be resolved. But for now we are forced to clear the
55  // accumulator choices.
56  //
57  // Clear best choice accumulator (that is used for adaption), so that
58  // choices adjusted by chopper do not interfere with the results from the
59  // segmentation search.
61 
62  MATRIX *ratings = chunks_record->ratings;
63  // Priority queue containing pain points generated by the language model
64  // The priority is set by the language model components, adjustments like
65  // seam cost and width priority are factored into the priority.
66  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);
67 
68  // best_path_by_column records the lowest cost path found so far for each
69  // column of the chunks_record->ratings matrix over all the rows.
70  BestPathByColumn *best_path_by_column =
71  new BestPathByColumn[ratings->dimension()];
72  for (col = 0; col < ratings->dimension(); ++col) {
73  best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
74  best_path_by_column[col].best_vse = NULL;
75  }
76 
77  // Compute scaling factor that will help us recover blob outline length
78  // from classifier rating and certainty for the blob.
79  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
80 
83  best_choice->certainty(),
84  segsearch_max_char_wh_ratio, rating_cert_scale,
85  pain_points, chunks_record, blamer_bundle,
87 
88  MATRIX_COORD *pain_point;
89  float pain_point_priority;
90  BestChoiceBundle best_choice_bundle(
91  output_best_state, best_choice, raw_choice, best_char_choices);
92 
93  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
94  // where i is the column of the child. Initially all the classified entries
95  // in the ratings matrix from column 0 (with parent NULL) are inserted into
96  // pending[0]. As the language model state is updated, new child/parent
97  // pairs are inserted into the lists. Next, the entries in pending[1] are
98  // considered, and so on. It is important that during the update the
99  // children are considered in the non-decreasing order of their column, since
100  // this guarantees that all the parents would be up to date before an update
101  // of a child is done.
102  SEG_SEARCH_PENDING_LIST *pending =
103  new SEG_SEARCH_PENDING_LIST[ratings->dimension()];
104 
105  // Search for the ratings matrix for the initial best path.
106  for (row = 0; row < ratings->dimension(); ++row) {
107  if (ratings->get(0, row) != NOT_CLASSIFIED) {
108  pending[0].add_sorted(
111  }
112  }
113  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
114  pain_points, &best_choice_bundle, blamer_bundle);
115 
116  // Keep trying to find a better path by fixing the "pain points".
117  int num_futile_classifications = 0;
118  STRING blamer_debug;
119  while (!SegSearchDone(num_futile_classifications) ||
120  (blamer_bundle != NULL &&
121  blamer_bundle->segsearch_is_looking_for_blame)) {
122  // Get the next valid "pain point".
123  int pop;
124  while (true) {
125  pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
126  if (pop == EMPTY) break;
127  if (pain_point->Valid(*ratings) &&
128  ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
129  break;
130  } else {
131  delete pain_point;
132  }
133  }
134  if (pop == EMPTY) {
135  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
136  break;
137  }
138  ProcessSegSearchPainPoint(pain_point_priority, *pain_point,
139  best_choice_bundle.best_choice, &pending,
140  chunks_record, pain_points, blamer_bundle);
141 
142  UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
143  chunks_record, pain_points, &best_choice_bundle,
144  blamer_bundle);
145  if (!best_choice_bundle.updated) ++num_futile_classifications;
146 
147  if (segsearch_debug_level > 0) {
148  tprintf("num_futile_classifications %d\n", num_futile_classifications);
149  }
150 
151  best_choice_bundle.updated = false; // reset updated
152  delete pain_point; // done using this pain point
153 
154  // See if it's time to terminate SegSearch or time for starting a guided
155  // search for the true path to find the blame for the incorrect best_choice.
156  if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
157  blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
158  !blamer_bundle->segsearch_is_looking_for_blame &&
159  blamer_bundle->truth_has_char_boxes &&
160  !ChoiceIsCorrect(getDict().getUnicharset(),
161  best_choice, blamer_bundle->truth_text)) {
162  InitBlamerForSegSearch(best_choice_bundle.best_choice, chunks_record,
163  pain_points, blamer_bundle, &blamer_debug);
164  }
165  } // end while loop exploring alternative paths
166  FinishBlamerForSegSearch(best_choice_bundle.best_choice,
167  blamer_bundle, &blamer_debug);
168 
169  if (segsearch_debug_level > 0) {
170  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
172  }
173 
174  // Clean up.
175  FreeHeapData(pain_points, MATRIX_COORD::Delete);
176  delete[] best_path_by_column;
177  delete[] pending;
178  for (row = 0; row < ratings->dimension(); ++row) {
179  for (col = 0; col <= row; ++col) {
180  BLOB_CHOICE_LIST *rating = ratings->get(col, row);
181  if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
182  }
183  }
184 }
bool ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
Definition: wordrec.cpp:159
void FinishBlamerForSegSearch(const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:376
void set_rating(float new_val)
Definition: ratngs.h:255
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:257
LIST pop(LIST list)
Definition: oldlist.cpp:305
float certainty() const
Definition: ratngs.h:234
double segsearch_max_char_wh_ratio
Definition: wordrec.h:152
#define NULL
Definition: host.h:144
HEAP * MakeHeap(int Size)
Definition: oldheap.cpp:49
double certainty_scale
Definition: dict.h:845
T get(int column, int row) const
Definition: matrix.h:117
void ClearBestChoiceAccum()
Clears best_choices_ list accumulated by the stopper.
Definition: stopper.cpp:458
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:516
void FreeHeapData(HEAP *Heap, void_dest destructor)
Definition: oldheap.cpp:327
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer)
static const LanguageModelFlagsType kAllChangedFlag
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:126
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:506
int segsearch_max_pain_points
Definition: wordrec.h:148
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr)
Definition: oldheap.cpp:76
Dict & getDict()
Definition: classify.h:62
static int compare(const void *p1, const void *p2)
Definition: wordrec.h:49
int dimension() const
Definition: matrix.h:190
void InitBlamerForSegSearch(const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:331
static const float kBadRating
Definition: ratngs.h:188
void DeleteState(BLOB_CHOICE_LIST *choices)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool wordrec_debug_blamer
Definition: wordrec.h:142
bool Valid(const MATRIX &m) const
Definition: matrix.h:208
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:36
#define NOT_CLASSIFIED
Definition: matrix.h:31
Definition: strngs.h:40
Definition: oldheap.h:37
Definition: matrix.h:193
LanguageModel * language_model_
Definition: wordrec.h:495
void UpdateSegSearchNodes(int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:186
int segsearch_debug_level
Definition: wordrec.h:146
MATRIX * ratings
Definition: associate.h:52
static void Delete(void *arg)
Definition: matrix.h:202
#define EMPTY
Definition: oldheap.h:29
bool tesseract::Wordrec::SegSearchDone ( int  num_futile_classifications)
inlineprotected

Definition at line 516 of file wordrec.h.

516  {
518  num_futile_classifications >=
520  }
int segsearch_max_futile_classifications
Definition: wordrec.h:150
LanguageModel * language_model_
Definition: wordrec.h:495
inT16 tesseract::Wordrec::select_blob_to_split ( const BLOB_CHOICE_LIST_VECTOR &  char_choices,
float  rating_ceiling,
bool  split_next_to_fragment 
)

Definition at line 801 of file chopper.cpp.

// Picks the index of the blob that should be split next.
// - If any entry of char_choices is NULL, the index of the first such entry
//   is returned immediately.
// - Otherwise only blobs whose first choice has rating < rating_ceiling and
//   certainty < tessedit_certainty_threshold are candidates, and the
//   candidate with the highest rating is remembered as the "worst".
// - When split_next_to_fragment is set, a second "worst" is tracked among
//   candidates that sit next to a fragment: the following position holds a
//   fragment that is not a beginning, or the preceding position holds a
//   fragment that is not an ending. If such a blob exists it is preferred.
// Returns -1 when no blob qualifies.
803  {
804  BLOB_CHOICE_IT blob_choice_it;
805  BLOB_CHOICE *blob_choice;
806  BLOB_CHOICE_IT temp_it;
807  int x;
808  float worst = -MAX_FLOAT32;
809  int worst_index = -1;
810  float worst_near_fragment = -MAX_FLOAT32;
811  int worst_index_near_fragment = -1;
// fragments[i] caches the fragment (or NULL) of the first choice at
// position i; it is only allocated when split_next_to_fragment is set.
812  const CHAR_FRAGMENT **fragments = NULL;
813 
814  if (chop_debug) {
815  if (rating_ceiling < MAX_FLOAT32)
816  cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
817  else
818  cprintf("rating_ceiling = No Limit\n");
819  }
820 
// Seed fragments[0]; later entries are filled one position ahead of x
// inside the main loop.
821  if (split_next_to_fragment && char_choices.length() > 0) {
822  fragments = new const CHAR_FRAGMENT *[char_choices.length()];
823  if (char_choices.get(0) != NULL) {
824  temp_it.set_to_list(char_choices.get(0));
825  fragments[0] = getDict().getUnicharset().get_fragment(
826  temp_it.data()->unichar_id());
827  } else {
828  fragments[0] = NULL;
829  }
830  }
831 
832  for (x = 0; x < char_choices.length(); ++x) {
// An unclassified position wins outright; release the cache first.
833  if (char_choices.get(x) == NULL) {
834  if (fragments != NULL) {
835  delete[] fragments;
836  }
837  return x;
838  } else {
// NOTE(review): data() right after set_to_list is presumably the
// first (best-rated) choice of the list - confirm against the
// iterator implementation.
839  blob_choice_it.set_to_list(char_choices.get(x));
840  blob_choice = blob_choice_it.data();
841  // Populate fragments for the following position.
842  if (split_next_to_fragment && x+1 < char_choices.length()) {
843  if (char_choices.get(x+1) != NULL) {
844  temp_it.set_to_list(char_choices.get(x+1));
845  fragments[x+1] = getDict().getUnicharset().get_fragment(
846  temp_it.data()->unichar_id());
847  } else {
848  fragments[x+1] = NULL;
849  }
850  }
851  if (blob_choice->rating() < rating_ceiling &&
852  blob_choice->certainty() < tessedit_certainty_threshold) {
853  // Update worst and worst_index.
854  if (blob_choice->rating() > worst) {
855  worst_index = x;
856  worst = blob_choice->rating();
857  }
858  if (split_next_to_fragment) {
859  // Update worst_near_fragment and worst_index_near_fragment.
860  bool expand_following_fragment =
861  (x + 1 < char_choices.length() &&
862  fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
863  bool expand_preceding_fragment =
864  (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
865  if ((expand_following_fragment || expand_preceding_fragment) &&
866  blob_choice->rating() > worst_near_fragment) {
867  worst_index_near_fragment = x;
868  worst_near_fragment = blob_choice->rating();
869  if (chop_debug) {
870  cprintf("worst_index_near_fragment=%d"
871  " expand_following_fragment=%d"
872  " expand_preceding_fragment=%d\n",
873  worst_index_near_fragment,
874  expand_following_fragment,
875  expand_preceding_fragment);
876  }
877  }
878  }
879  }
880  }
881  }
882  if (fragments != NULL) {
883  delete[] fragments;
884  }
885  // TODO(daria): maybe a threshold of badness for
886  // worst_near_fragment would be useful.
// The fragment-adjacent candidate takes precedence over the overall worst.
887  return worst_index_near_fragment != -1 ?
888  worst_index_near_fragment : worst_index;
889 }
double tessedit_certainty_threshold
Definition: wordrec.h:107
#define NULL
Definition: host.h:144
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
T & get(int index) const
float certainty() const
Definition: ratngs.h:65
Dict & getDict()
Definition: classify.h:62
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
bool is_ending() const
Definition: unicharset.h:90
#define MAX_FLOAT32
Definition: host.h:124
int length() const
Definition: genericvector.h:63
bool is_beginning() const
Definition: unicharset.h:87
void cprintf(const char *format,...)
Definition: callcpp.cpp:41
float rating() const
Definition: ratngs.h:62
inT16 tesseract::Wordrec::select_blob_to_split_from_fixpt ( DANGERR *  fixpt)

Definition at line 898 of file chopper.cpp.

// Scans *fixpt in order and returns the begin index of the first entry
// that covers a single position (begin == end), is flagged dangerous and
// has correct_is_ngram set.
// Returns -1 when fixpt is null or no entry matches.
898  {
899  if (!fixpt)
900  return -1;
901  for (int i = 0; i < fixpt->size(); i++) {
902  if ((*fixpt)[i].begin == (*fixpt)[i].end &&
903  (*fixpt)[i].dangerous &&
904  (*fixpt)[i].correct_is_ngram) {
905  return (*fixpt)[i].begin;
906  }
907  }
908  return -1;
909 }
int size() const
Definition: genericvector.h:59
void tesseract::Wordrec::set_chopper_blame ( WERD_RES *  word)

Definition at line 917 of file chopper.cpp.

917  {
918  BlamerBundle *blamer_bundle = word->blamer_bundle;
919  assert(blamer_bundle != NULL);
920  if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) ||
921  word->chopped_word->blobs == NULL) {
922  return;
923  }
924  STRING debug;
925  bool missing_chop = false;
926  TBLOB * curr_blob = word->chopped_word->blobs;
927  int b = 0;
928  inT16 truth_x;
929  while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) {
930  truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right();
931  if (curr_blob->bounding_box().right() <
932  (truth_x - blamer_bundle->norm_box_tolerance)) {
933  curr_blob = curr_blob->next;
934  continue; // encountered an extra chop, keep looking
935  } else if (curr_blob->bounding_box().right() >
936  (truth_x + blamer_bundle->norm_box_tolerance)) {
937  missing_chop = true;
938  break;
939  } else {
940  curr_blob = curr_blob->next;
941  ++b;
942  }
943  }
944  if (missing_chop || b < blamer_bundle->norm_truth_word.length()) {
945  STRING debug;
946  char debug_buffer[256];
947  if (missing_chop) {
948  sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ",
949  blamer_bundle->norm_box_tolerance);
950  debug += debug_buffer;
951  curr_blob->bounding_box().append_debug(&debug);
952  debug.add_str_int("\nNo chop for truth at x=", truth_x);
953  } else {
954  debug.add_str_int("Missing chops for last ",
955  blamer_bundle->norm_truth_word.length()-b);
956  debug += " truth box(es)";
957  }
958  debug += "\nMaximally chopped word boxes:\n";
959  for (curr_blob = word->chopped_word->blobs; curr_blob != NULL;
960  curr_blob = curr_blob->next) {
961  const TBOX &tbox = curr_blob->bounding_box();
962  sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
963  tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
964  debug += debug_buffer;
965  }
966  debug += "Truth bounding boxes:\n";
967  for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
968  const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b);
969  sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
970  tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
971  debug += debug_buffer;
972  }
973  blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice,
975  }
976 }
const int length() const
Definition: boxword.h:99
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
void append_debug(STRING *str) const
Definition: rect.h:270
Definition: rect.h:29
TBLOB * blobs
Definition: blobs.h:274
inT16 right() const
Definition: rect.h:74
bool NoTruth() const
Definition: pageres.h:147
void add_str_int(const char *str, int number)
Definition: strngs.cpp:334
tesseract::BoxWord norm_truth_word
Definition: pageres.h:170
Definition: blobs.h:174
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
inT16 top() const
Definition: rect.h:53
bool wordrec_debug_blamer
Definition: wordrec.h:142
TBOX bounding_box() const
Definition: blobs.cpp:384
Definition: strngs.h:40
short inT16
Definition: host.h:100
bool truth_has_char_boxes
Definition: pageres.h:164
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
TWERD * chopped_word
Definition: pageres.h:357
int norm_box_tolerance
Definition: pageres.h:172
BlamerBundle * blamer_bundle
Definition: pageres.h:367
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
inT16 bottom() const
Definition: rect.h:60
void tesseract::Wordrec::set_outline_bounds ( register EDGEPT *  point1,
register EDGEPT *  point2,
BOUNDS_RECT  rect 
)

Definition at line 213 of file gradechop.cpp.

// Fills rect with two pairs of x bounds produced by the find_bounds_loop
// macro: rect[0], rect[1] receive x_min, x_max for the run from point1 to
// point2, and rect[2], rect[3] receive them for the run from point2 to
// point1.
215  {
// NOTE(review): this_point is not referenced directly in this body;
// presumably the find_bounds_loop macro uses it as its cursor - confirm
// against the macro definition in gradechop.cpp.
216  register EDGEPT *this_point;
217  register inT16 x_min;
218  register inT16 x_max;
219 
220  find_bounds_loop(point1, point2, x_min, x_max);
221 
222  rect[0] = x_min;
223  rect[1] = x_max;
224 
225  find_bounds_loop(point2, point1, x_min, x_max);
226 
227  rect[2] = x_min;
228  rect[3] = x_max;
229 }
#define find_bounds_loop(point1, point2, x_min, x_max)
Definition: gradechop.cpp:49
Definition: blobs.h:72
short inT16
Definition: host.h:100
void tesseract::Wordrec::set_pass1 ( )

Definition at line 93 of file tface.cpp.

93  {
94  chop_ok_split.set_value(70.0);
95  wordrec_num_seg_states.set_value(15);
96  SettupPass1();
97 }
double chop_ok_split
Definition: wordrec.h:121
int wordrec_num_seg_states
Definition: wordrec.h:102
void tesseract::Wordrec::set_pass2 ( )

Definition at line 105 of file tface.cpp.

105  {
106  chop_ok_split.set_value(pass2_ok_split);
107  wordrec_num_seg_states.set_value(pass2_seg_states);
108  SettupPass2();
109 }
double chop_ok_split
Definition: wordrec.h:121
int wordrec_num_seg_states
Definition: wordrec.h:102
PRIORITY pass2_ok_split
Definition: wordrec.h:496
WIDTH_RECORD * tesseract::Wordrec::state_char_widths ( WIDTH_RECORD *  chunk_widths,
STATE *  state,
int  num_joints 
)

Definition at line 58 of file heuristic.cpp.

60  {
61  SEARCH_STATE chunks = bin_to_chunks(state, num_joints);
62  int num_chars = chunks[0] + 1;
63 
64  // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a
65  // struct { num_chars, widths[2*n+1]; }
66  WIDTH_RECORD *char_widths = (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2);
67  char_widths->num_chars = num_chars;
68 
69  int first_blob = 0;
70  int last_blob;
71  for (int i = 1; i <= num_chars; i++) {
72  last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i];
73 
74  char_widths->widths[2*i-2] =
75  AssociateUtils::GetChunksWidth(chunk_widths, first_blob, last_blob);
76  if (i <= chunks[0]) {
77  char_widths->widths[2*i-1] =
78  AssociateUtils::GetChunksGap(chunk_widths, last_blob);
79  }
80 
81  if (segment_adjust_debug > 3)
82  tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n",
83  i-1, first_blob, last_blob, chunks[i],
84  char_widths->widths[2*i-2], char_widths->widths[2*i-1],
85  chunk_widths->widths[2*last_blob+1]);
86  first_blob = last_blob + 1;
87  }
88 
89  memfree(chunks);
90  return char_widths;
91 }
int * memalloc(int size)
Definition: freelist.cpp:22
int segment_adjust_debug
Definition: wordrec.h:124
void memfree(void *element)
Definition: freelist.cpp:30
SEARCH_STATE bin_to_chunks(STATE *state, int num_joints)
Definition: states.cpp:49
int num_chars
Definition: blobs.h:49
static int GetChunksWidth(WIDTH_RECORD *width_record, int start_blob, int last_blob)
Definition: associate.cpp:144
static int GetChunksGap(WIDTH_RECORD *width_record, int last_chunk)
Definition: associate.h:132
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int widths[1]
Definition: blobs.h:50
int * SEARCH_STATE
Definition: states.h:46
void tesseract::Wordrec::try_point_pairs ( EDGEPT *  points[MAX_NUM_POINTS],
inT16  num_points,
SEAM_QUEUE  seam_queue,
SEAM_PILE *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 512 of file findseam.cpp.

517  {
518  inT16 x;
519  inT16 y;
520  SPLIT *split;
521  PRIORITY priority;
522 
523  for (x = 0; x < num_points; x++) {
524  for (y = x + 1; y < num_points; y++) {
525 
526  if (points[y] &&
527  weighted_edgept_dist(points[x], points[y],
528  chop_x_y_weight) < chop_split_length &&
529  points[x] != points[y]->next &&
530  points[y] != points[x]->next &&
531  !is_exterior_point(points[x], points[y]) &&
532  !is_exterior_point(points[y], points[x])) {
533  split = new_split (points[x], points[y]);
534  priority = partial_split_priority (split);
535 
536  choose_best_seam(seam_queue, seam_pile, split, priority, seam, blob);
537  }
538  }
539  }
540 
541 }
float PRIORITY
Definition: seam.h:38
#define is_exterior_point(edge, point)
Definition: outlines.h:97
#define partial_split_priority(split)
Definition: gradechop.h:51
short inT16
Definition: host.h:100
SPLIT * new_split(EDGEPT *point1, EDGEPT *point2)
Definition: split.cpp:106
void choose_best_seam(SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob)
Definition: findseam.cpp:178
#define weighted_edgept_dist(p1, p2, chop_x_y_weight)
Definition: olutil.h:67
void tesseract::Wordrec::try_vertical_splits ( EDGEPT *  points[MAX_NUM_POINTS],
inT16  num_points,
EDGEPT_CLIST *  new_points,
SEAM_QUEUE  seam_queue,
SEAM_PILE *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 554 of file findseam.cpp.

560  {
561  EDGEPT *vertical_point = NULL;
562  SPLIT *split;
563  inT16 x;
564  PRIORITY priority;
565  TESSLINE *outline;
566 
567  for (x = 0; x < num_points; x++) {
568  vertical_point = NULL;
569  for (outline = blob->outlines; outline; outline = outline->next) {
570  vertical_projection_point(points[x], outline->loop,
571  &vertical_point, new_points);
572  }
573 
574  if (vertical_point &&
575  points[x] != vertical_point->next &&
576  vertical_point != points[x]->next &&
577  weighted_edgept_dist(points[x], vertical_point,
578  chop_x_y_weight) < chop_split_length) {
579 
580  split = new_split (points[x], vertical_point);
581  priority = partial_split_priority (split);
582 
583  choose_best_seam(seam_queue, seam_pile, split, priority, seam, blob);
584  }
585  }
586 }
EDGEPT * next
Definition: blobs.h:106
float PRIORITY
Definition: seam.h:38
TESSLINE * next
Definition: blobs.h:171
#define NULL
Definition: host.h:144
TESSLINE * outlines
Definition: blobs.h:227
Definition: blobs.h:72
void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
Definition: chop.cpp:332
#define partial_split_priority(split)
Definition: gradechop.h:51
short inT16
Definition: host.h:100
SPLIT * new_split(EDGEPT *point1, EDGEPT *point2)
Definition: split.cpp:106
void choose_best_seam(SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob)
Definition: findseam.cpp:178
#define weighted_edgept_dist(p1, p2, chop_x_y_weight)
Definition: olutil.h:67
EDGEPT * loop
Definition: blobs.h:170
void tesseract::Wordrec::update_blob_classifications ( TWERD *  word,
const BLOB_CHOICE_LIST_VECTOR &  choices 
)

Definition at line 152 of file wordclass.cpp.

153  {
154  TBLOB *tblob = word->blobs;
155  int index = 0;
156  for (; tblob != NULL && index < choices.length();
157  tblob = tblob->next, index++) {
158  blob_match_table.add_to_match(tblob, choices.get(index));
159  }
160 }
BlobMatchTable blob_match_table
Definition: wordrec.h:501
#define NULL
Definition: host.h:144
T & get(int index) const
void add_to_match(TBLOB *blob, BLOB_CHOICE_LIST *ratings)
Definition: matchtab.cpp:183
TBLOB * blobs
Definition: blobs.h:274
Definition: blobs.h:174
int length() const
Definition: genericvector.h:63
TBLOB * next
Definition: blobs.h:228
void tesseract::Wordrec::update_ratings ( const BLOB_CHOICE_LIST_VECTOR &  new_choices,
const CHUNKS_RECORD *  chunks_record,
const SEARCH_STATE  search_state 
)
void tesseract::Wordrec::UpdateSegSearchNodes ( int  starting_col,
SEG_SEARCH_PENDING_LIST *  pending[],
BestPathByColumn *  best_path_by_column[],
CHUNKS_RECORD *  chunks_record,
HEAP *  pain_points,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 186 of file segsearch.cpp.

193  {
194  MATRIX *ratings = chunks_record->ratings;
195  for (int col = starting_col; col < ratings->dimension(); ++col) {
196  if (segsearch_debug_level > 0) {
197  tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
198  }
199  // Iterate over the pending list for this column.
200  SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
201  SEG_SEARCH_PENDING_IT pending_it(pending_list);
202  GenericVector<int> non_empty_rows;
203  while (!pending_it.empty()) {
204  // Update language model state of this child+parent pair.
205  SEG_SEARCH_PENDING *p = pending_it.extract();
206  if (non_empty_rows.length() == 0 ||
207  non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
208  non_empty_rows.push_back(p->child_row);
209  }
210  BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
211  LanguageModelFlagsType new_changed =
212  language_model_->UpdateState(p->changed, col, p->child_row,
213  current_node, p->parent, pain_points,
214  best_path_by_column, chunks_record,
215  best_choice_bundle, blamer_bundle);
216  if (new_changed) {
217  // Since the language model state of this entry changed, add all the
218  // pairs with it as a parent and each of its children to pending, so
219  // that the children are updated as well.
220  int child_col = p->child_row + 1;
221  for (int child_row = child_col;
222  child_row < ratings->dimension(); ++child_row) {
223  if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
224  SEG_SEARCH_PENDING *new_pending =
225  new SEG_SEARCH_PENDING(child_row, current_node, 0);
226  SEG_SEARCH_PENDING *actual_new_pending =
227  reinterpret_cast<SEG_SEARCH_PENDING *>(
228  (*pending)[child_col].add_sorted_and_find(
229  SEG_SEARCH_PENDING::compare, true, new_pending));
230  if (new_pending != actual_new_pending) delete new_pending;
231  actual_new_pending->changed |= new_changed;
232  if (segsearch_debug_level > 0) {
233  tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
234  " changed=0x%x to pending\n", child_col,
235  actual_new_pending->child_row,
236  col, p->child_row, actual_new_pending->changed);
237  }
238  }
239  }
240  } // end if new_changed
241  delete p; // clean up
242  pending_it.forward();
243  } // end while !pending_it.empty()
244  language_model_->GeneratePainPointsFromColumn(
245  col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
246  pain_points, best_path_by_column, chunks_record);
247  } // end for col
248 
249  if (best_choice_bundle->updated) {
250  language_model_->GeneratePainPointsFromBestChoice(
251  pain_points, chunks_record, best_choice_bundle);
252  }
253 
254  language_model_->CleanUp();
255 }
tesseract::LanguageModelFlagsType changed
Definition: wordrec.h:66
void GeneratePainPointsFromBestChoice(HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle)
T get(int column, int row) const
Definition: matrix.h:117
int push_back(T object)
void GeneratePainPointsFromColumn(int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
static int compare(const void *p1, const void *p2)
Definition: wordrec.h:49
int dimension() const
Definition: matrix.h:190
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define NOT_CLASSIFIED
Definition: matrix.h:31
int length() const
Definition: genericvector.h:63
LanguageModelFlagsType UpdateState(LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: matrix.h:193
LanguageModel * language_model_
Definition: wordrec.h:495
int segsearch_debug_level
Definition: wordrec.h:146
MATRIX * ratings
Definition: associate.h:52
unsigned char LanguageModelFlagsType
BLOB_CHOICE_LIST * parent
Definition: wordrec.h:60
void tesseract::Wordrec::vertical_projection_point ( EDGEPT *  split_point,
EDGEPT *  target_point,
EDGEPT **  best_point,
EDGEPT_CLIST *  new_points 
)

Definition at line 332 of file chop.cpp.

334  {
335  EDGEPT *p; /* Iterator */
336  EDGEPT *this_edgept; /* Iterator */
337  EDGEPT_C_IT new_point_it(new_points);
338  int x = split_point->pos.x; /* X value of vertical */
339  int best_dist = LARGE_DISTANCE;/* Best point found */
340 
341  if (*best_point != NULL)
342  best_dist = edgept_dist(split_point, *best_point);
343 
344  p = target_point;
345  /* Look at each edge point */
346  do {
347  if ((((p->pos.x <= x) && (x <= p->next->pos.x)) ||
348  ((p->next->pos.x <= x) && (x <= p->pos.x))) &&
349  !same_point (split_point->pos, p->pos) &&
350  !same_point (split_point->pos, p->next->pos)
351  && (*best_point == NULL || !same_point ((*best_point)->pos, p->pos))) {
352 
353  if (near_point(split_point, p, p->next, &this_edgept)) {
354  new_point_it.add_before_then_move(this_edgept);
355  }
356 
357  if (*best_point == NULL)
358  best_dist = edgept_dist (split_point, this_edgept);
359 
360  this_edgept =
361  pick_close_point(split_point, this_edgept, &best_dist);
362  if (this_edgept)
363  *best_point = this_edgept;
364  }
365 
366  p = p->next;
367  }
368  while (p != target_point);
369 }
EDGEPT * next
Definition: blobs.h:106
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
Definition: outlines.cpp:116
#define NULL
Definition: host.h:144
EDGEPT * pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
Definition: chop.cpp:182
#define edgept_dist(p1, p2)
Definition: outlines.h:87
inT16 x
Definition: blobs.h:67
Definition: blobs.h:72
#define LARGE_DISTANCE
Definition: outlines.h:36
#define same_point(p1, p2)
Definition: outlines.h:49
TPOINT pos
Definition: blobs.h:100
FLOAT32 tesseract::Wordrec::width_priority ( CHUNKS_RECORD *  chunks_record,
STATE *  state,
int  num_joints 
)

Definition at line 222 of file heuristic.cpp.

224  {
225  FLOAT32 penalty = 0.0;
226  WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths,
227  state, num_joints);
228  // When baseline_enable==True, which is the current default for Tesseract,
229  // a fixed value of 128 (BASELINE_SCALE) is always used.
230  FLOAT32 normalizing_height = BASELINE_SCALE;
231  if (assume_fixed_pitch_char_segment) {
232  // For fixed pitch language like CJK, we use the full text height as the
233  // normalizing factor so we are not dependent on xheight calculation.
234  // In the normalized coord. xheight * scale == BASELINE_SCALE(128),
235  // so add proportionally scaled ascender zone to get full text height.
236  const DENORM& denorm = chunks_record->word_res->denorm;
237  normalizing_height = denorm.y_scale() *
238  (denorm.row()->x_height() + denorm.row()->ascenders());
239  if (segment_adjust_debug > 1)
240  tprintf("WidthPriority: %f %f normalizing height = %f\n",
241  denorm.row()->x_height(), denorm.row()->ascenders(),
242  normalizing_height);
243  // Impose additional segmentation penalties if blob widths or gaps
244  // distribution don't fit a fixed-pitch model.
245  FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
246  FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
247  penalty += width_var;
248  penalty += gap_var;
249  }
250 
251  for (int x = 0; x < width_rec->num_chars; x++) {
252  FLOAT32 squat = width_rec->widths[2*x];
253  FLOAT32 gap = (x < width_rec->num_chars-1) ? width_rec->widths[2*x+1] : 0;
254  squat /= normalizing_height;
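255  gap /= normalizing_height;
256  if (assume_fixed_pitch_char_segment) {
257  penalty += AssociateUtils::FixedPitchWidthCost(
258  squat, 0.0f, x == 0 || x == width_rec->num_chars -1,
259  heuristic_max_char_wh_ratio);
260  penalty += AssociateUtils::FixedPitchGapCost(
261  gap, x == width_rec->num_chars - 1);
262  if (width_rec->num_chars == 1 &&
263  squat > AssociateUtils::kMaxFixedPitchCharAspectRatio) {
264  penalty += 10;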
265  }
266  } else {
267  // Original equation when
268  // heuristic_max_char_ratio == AssociateUtils::kMaxSquat
269  if (squat > heuristic_max_char_wh_ratio)
270  penalty += squat - heuristic_max_char_wh_ratio;
271  }
272  }
273 
274  free_widths(width_rec);
275  return (penalty);
276 }
WERD_RES * word_res
Definition: associate.h:54
FLOAT32 get_width_variance(WIDTH_RECORD *wrec, float norm_height)
Definition: heuristic.cpp:96
const ROW * row() const
Definition: normalis.h:270
#define free_widths(w)
Definition: blobs.h:287
int segment_adjust_debug
Definition: wordrec.h:124
WIDTH_RECORD * state_char_widths(WIDTH_RECORD *chunk_widths, STATE *state, int num_joints)
Definition: heuristic.cpp:58
#define f(xc, yc)
Definition: imgscale.cpp:39
float FLOAT32
Definition: host.h:111
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:126
static float FixedPitchGapCost(float norm_gap, bool end_pos)
Definition: associate.h:143
int num_chars
Definition: blobs.h:49
static float FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos, float max_char_wh_ratio)
Definition: associate.cpp:152
float ascenders() const
Definition: ocrrow.h:79
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
DENORM denorm
Definition: pageres.h:346
static const float kMaxFixedPitchCharAspectRatio
Definition: associate.h:98
float x_height() const
Definition: ocrrow.h:61
int widths[1]
Definition: blobs.h:50
WIDTH_RECORD * chunk_widths
Definition: associate.h:57
float y_scale() const
Definition: normalis.h:267
#define BASELINE_SCALE
Definition: baseline.h:39
double heuristic_max_char_wh_ratio
Definition: wordrec.h:140
FLOAT32 get_gap_variance(WIDTH_RECORD *wrec, float norm_height)
Definition: heuristic.cpp:111
MATRIX * tesseract::Wordrec::word_associator ( bool  only_create_ratings_matrix,
WERD_RES *  word,
STATE *  state,
BLOB_CHOICE_LIST_VECTOR *  best_char_choices,
DANGERR *  fixpt,
STATE *  best_state 
)

Definition at line 984 of file chopper.cpp.

989  {
990  CHUNKS_RECORD chunks_record;
991  BLOB_WEIGHTS blob_weights;
992  int x;
993  int num_chunks;
994  BLOB_CHOICE_IT blob_choice_it;
995 
996  num_chunks = array_count(word->seam_array) + 1;
997 
998  TBLOB* blobs = word->chopped_word->blobs;
999  chunks_record.ratings = record_piece_ratings(blobs);
1000  chunks_record.chunks = blobs;
1001  chunks_record.word_res = word;
1002  chunks_record.splits = word->seam_array;
1003  chunks_record.chunk_widths = blobs_widths(blobs);
1004  chunks_record.char_widths = blobs_widths(blobs);
1005  /* Save chunk weights */
1006  for (x = 0; x < num_chunks; x++) {
1007  BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings, blobs,
1008  chunks_record.word_res->denorm,
1009  word->seam_array, x, x,
1010  word->blamer_bundle);
1011  blob_choice_it.set_to_list(choices);
1012  //This is done by Jetsoft. Divide by zero is possible.
1013  if (blob_choice_it.data()->certainty() == 0) {
1014  blob_weights[x]=0;
1015  } else {
1016  blob_weights[x] =
1017  -(inT16) (10 * blob_choice_it.data()->rating() /
1018  blob_choice_it.data()->certainty());
1019  }
1020  }
1021  chunks_record.weights = blob_weights;
1022 
1023  if (chop_debug)
1024  chunks_record.ratings->print(getDict().getUnicharset());
1025 
1026  if (!only_create_ratings_matrix) {
1027  if (enable_new_segsearch) {
1028  SegSearch(&chunks_record, word->best_choice,
1029  best_char_choices, word->raw_choice,
1030  state, word->blamer_bundle);
1031  } else {
1032  best_first_search(&chunks_record, best_char_choices, word,
1033  state, fixpt, best_state);
1034  }
1035  }
1036 
1037  free_widths(chunks_record.chunk_widths);
1038  free_widths(chunks_record.char_widths);
1039  return chunks_record.ratings;
1040 }
WERD_RES * word_res
Definition: associate.h:54
WIDTH_RECORD * char_widths
Definition: associate.h:58
TBLOB * chunks
Definition: associate.h:53
inT16 * weights
Definition: associate.h:59
#define free_widths(w)
Definition: blobs.h:287
void SegSearch(CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:35
void best_first_search(CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_RES *word, STATE *state, DANGERR *fixpt, STATE *best_state)
Definition: bestfirst.cpp:88
SEAMS splits
Definition: associate.h:55
BLOB_CHOICE_LIST * get_piece_rating(MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:362
bool enable_new_segsearch
Definition: wordrec.h:145
TBLOB * blobs
Definition: blobs.h:274
WIDTH_RECORD * blobs_widths(TBLOB *blobs)
Definition: blobs.cpp:549
MATRIX * record_piece_ratings(TBLOB *blobs)
Definition: pieces.cpp:414
inT16 BLOB_WEIGHTS[MAX_NUM_CHUNKS]
Definition: associate.h:32
SEAMS seam_array
Definition: pageres.h:358
Dict & getDict()
Definition: classify.h:62
Definition: blobs.h:174
DENORM denorm
Definition: pageres.h:346
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:36
WERD_CHOICE * raw_choice
Definition: pageres.h:360
short inT16
Definition: host.h:100
WIDTH_RECORD * chunk_widths
Definition: associate.h:57
TWERD * chopped_word
Definition: pageres.h:357
#define array_count(a)
Definition: tessarray.h:74
MATRIX * ratings
Definition: associate.h:52
BlamerBundle * blamer_bundle
Definition: pageres.h:367
WERD_CHOICE * best_choice
Definition: pageres.h:359

Member Data Documentation

bool tesseract::Wordrec::assume_fixed_pitch_char_segment = FALSE

"include fixed-pitch heuristics in char segmentation"

Definition at line 126 of file wordrec.h.

GenericVector<int> tesseract::Wordrec::blame_reasons_

Definition at line 508 of file wordrec.h.

BlobMatchTable tesseract::Wordrec::blob_match_table

Definition at line 501 of file wordrec.h.

double tesseract::Wordrec::chop_center_knob = 0.15

"Split center adjustment"

Definition at line 118 of file wordrec.h.

int tesseract::Wordrec::chop_debug = 0

"Chop debug"

Definition at line 108 of file wordrec.h.

bool tesseract::Wordrec::chop_enable = 1

"Chop enable"

Definition at line 109 of file wordrec.h.

double tesseract::Wordrec::chop_good_split = 50.0

"Good split limit"

Definition at line 122 of file wordrec.h.

int tesseract::Wordrec::chop_inside_angle = -50

"Min Inside Angle Bend"

Definition at line 114 of file wordrec.h.

int tesseract::Wordrec::chop_min_outline_area = 2000

"Min Outline Area"

Definition at line 115 of file wordrec.h.

int tesseract::Wordrec::chop_min_outline_points = 6

"Min Number of Points on Outline"

Definition at line 113 of file wordrec.h.

double tesseract::Wordrec::chop_ok_split = 100.0

"OK split limit"

Definition at line 121 of file wordrec.h.

double tesseract::Wordrec::chop_overlap_knob = 0.9

"Split overlap adjustment"

Definition at line 117 of file wordrec.h.

int tesseract::Wordrec::chop_same_distance = 2

"Same distance"

Definition at line 112 of file wordrec.h.

double tesseract::Wordrec::chop_sharpness_knob = 0.06

"Split sharpness adjustment"

Definition at line 119 of file wordrec.h.

double tesseract::Wordrec::chop_split_dist_knob = 0.5

"Split length adjustment"

Definition at line 116 of file wordrec.h.

int tesseract::Wordrec::chop_split_length = 10000

"Split Length"

Definition at line 111 of file wordrec.h.

bool tesseract::Wordrec::chop_vertical_creep = 0

"Vertical creep"

Definition at line 110 of file wordrec.h.

double tesseract::Wordrec::chop_width_change_knob = 5.0

"Width change adjustment"

Definition at line 120 of file wordrec.h.

int tesseract::Wordrec::chop_x_y_weight = 3

"X / Y length weight"

Definition at line 123 of file wordrec.h.

bool tesseract::Wordrec::enable_new_segsearch = false

"Enable new segmentation search path."

Definition at line 145 of file wordrec.h.

void(Wordrec::* tesseract::Wordrec::fill_lattice_)(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 510 of file wordrec.h.

bool tesseract::Wordrec::force_word_assoc = FALSE

"force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary."

Definition at line 101 of file wordrec.h.

bool tesseract::Wordrec::fragments_guide_chopper = FALSE

"Use information from fragments to guide chopping process"

Definition at line 105 of file wordrec.h.

double tesseract::Wordrec::heuristic_max_char_wh_ratio = 2.0

"max char width-to-height ratio allowed in segmentation"

Definition at line 140 of file wordrec.h.

double tesseract::Wordrec::heuristic_segcost_rating_base = 1.25

"base factor for adding segmentation cost into word rating." "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost."

Definition at line 132 of file wordrec.h.

double tesseract::Wordrec::heuristic_weight_rating = 1

"weight associated with char rating in combined cost of state"

Definition at line 134 of file wordrec.h.

double tesseract::Wordrec::heuristic_weight_seamcut = 0

"weight associated with seam cut in combined cost of state"

Definition at line 138 of file wordrec.h.

double tesseract::Wordrec::heuristic_weight_width = 0

"weight associated with width evidence in combined cost of state"

Definition at line 136 of file wordrec.h.

LanguageModel* tesseract::Wordrec::language_model_

Definition at line 495 of file wordrec.h.

EVALUATION_ARRAY tesseract::Wordrec::last_segmentation

Definition at line 502 of file wordrec.h.

bool tesseract::Wordrec::merge_fragments_in_matrix = TRUE

"Merge the fragments in the ratings matrix and delete them " "after merging"

Definition at line 96 of file wordrec.h.

int tesseract::Wordrec::num_joints

Definition at line 498 of file wordrec.h.

int tesseract::Wordrec::num_popped

Definition at line 500 of file wordrec.h.

int tesseract::Wordrec::num_pushed

Definition at line 499 of file wordrec.h.

PRIORITY tesseract::Wordrec::pass2_ok_split

Definition at line 496 of file wordrec.h.

int tesseract::Wordrec::pass2_seg_states

Definition at line 497 of file wordrec.h.

WERD_CHOICE* tesseract::Wordrec::prev_word_best_choice_

Definition at line 506 of file wordrec.h.

int tesseract::Wordrec::repair_unchopped_blobs = 1

"Fix blobs that aren't chopped"

Definition at line 106 of file wordrec.h.

bool tesseract::Wordrec::save_alt_choices = false

"Save alternative paths found during chopping " "and segmentation search"

Definition at line 158 of file wordrec.h.

int tesseract::Wordrec::segment_adjust_debug = 0

"Segmentation adjustment debug"

Definition at line 124 of file wordrec.h.

int tesseract::Wordrec::segsearch_debug_level = 0

"SegSearch debug level"

Definition at line 146 of file wordrec.h.

double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0

"Maximum character width-to-height ratio"

Definition at line 152 of file wordrec.h.

double tesseract::Wordrec::segsearch_max_fixed_pitch_char_wh_ratio = 2.0

"Maximum character width-to-height ratio for" "fixed pitch fonts"

Definition at line 155 of file wordrec.h.

int tesseract::Wordrec::segsearch_max_futile_classifications = 10

"Maximum number of pain point classifications per word."

Definition at line 150 of file wordrec.h.

int tesseract::Wordrec::segsearch_max_pain_points = 2000

"Maximum number of pain points stored in the queue"

Definition at line 148 of file wordrec.h.

double tesseract::Wordrec::tessedit_certainty_threshold = -2.25

"Good blob limit"

Definition at line 107 of file wordrec.h.

bool tesseract::Wordrec::use_new_state_cost = FALSE

"use new state cost heuristics for segmentation state evaluation"

Definition at line 128 of file wordrec.h.

bool tesseract::Wordrec::wordrec_debug_blamer = false

"Print blamer debug messages"

Definition at line 142 of file wordrec.h.

int tesseract::Wordrec::wordrec_debug_level = 0

"Debug level for wordrec"

Definition at line 141 of file wordrec.h.

bool tesseract::Wordrec::wordrec_enable_assoc = TRUE

"Associator Enable"

Definition at line 98 of file wordrec.h.

bool tesseract::Wordrec::wordrec_no_block = FALSE

"Don't output block information"

Definition at line 97 of file wordrec.h.

int tesseract::Wordrec::wordrec_num_seg_states = 30

"Segmentation states"

Definition at line 102 of file wordrec.h.

bool tesseract::Wordrec::wordrec_run_blamer = false

"Try to set the blame for errors"

Definition at line 143 of file wordrec.h.

double tesseract::Wordrec::wordrec_worst_state = 1

"Worst segmentation state"

Definition at line 103 of file wordrec.h.


The documentation for this class was generated from the following files: