Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract ()
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * BestPix () const
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
bool RetryWithLanguage (WERD_RES *word, BLOCK *block, ROW *row, WordRecognizer recognizer)
 
void classify_word_and_language (WordRecognizer recognizer, BLOCK *block, ROW *row, WERD_RES *word)
 
void classify_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
void ExplodeRepeatedWord (BLOB_CHOICE *best_choice, PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass2 (WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (BLOCK *block, ROW *row, WERD_RES *word)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
BOOL8 recog_interactive (BLOCK *block, ROW *row, WERD_RES *word_res)
 
void set_word_fonts (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
void font_recognition_pass (PAGE_RES *page_res)
 
BOOL8 check_debug_pt (WERD_RES *word, int location)
 
bool init_cube_objects (bool load_combiner, TessdataManager *tessdata_manager)
 
void run_cube_combiner (PAGE_RES *page_res)
 
void cube_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word)
 
CubeObject * cube_recognize_word (BLOCK *block, WERD_RES *word)
 
void cube_combine_word (CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word)
 
bool cube_recognize (CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
 
void fill_werd_res (const BoxWord &cube_box_word, WERD_CHOICE *cube_werd_choice, const char *cube_best_str, WERD_RES *tess_werd_res)
 
bool extract_cube_state (CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
 
bool create_cube_box_word (Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
BOOL8 acceptable_number_string (const char *s, const char *lengths)
 
inT16 count_alphanums (const WERD_CHOICE &word)
 
inT16 count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
BOOL8 process_cmd_win_event (inT32 cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(BLOCK *block, ROW *row, WERD_RES *word_res))
 
BOOL8 word_display (BLOCK *block, ROW *row, WERD_RES *word_res)
 
BOOL8 word_bln_display (BLOCK *block, ROW *row, WERD_RES *word_res)
 
BOOL8 word_blank_and_set_display (BLOCK *block, ROW *row, WERD_RES *word_res)
 
BOOL8 word_set_display (BLOCK *block, ROW *row, WERD_RES *word_res)
 
BOOL8 word_dumper (BLOCK *block, ROW *row, WERD_RES *word_res)
 
void make_reject_map (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass)
 
BOOL8 one_ell_conflict (WERD_RES *word_res, BOOL8 update_map)
 
inT16 first_alphanum_index (const char *word, const char *word_lengths)
 
inT16 first_alphanum_offset (const char *word, const char *word_lengths)
 
inT16 alpha_count (const char *word, const char *word_lengths)
 
BOOL8 word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
inT16 count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
BOOL8 non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
BOOL8 test_ambig_word (WERD_RES *word)
 
void set_done (WERD_RES *word, inT16 pass)
 
inT16 safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
BOOL8 word_adaptable (WERD_RES *word, uinT16 mode)
 
void recog_word_recursive (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
void recog_word (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
void split_and_recog_word (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
inT16 fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)
 
BOOL8 potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
inT16 word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
inT16 count_outline_errs (char c, inT16 outline_count)
 
inT16 word_outline_errs (WERD_RES *word)
 
BOOL8 terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, inT16 &delete_mode)
 
inT16 failure_count (WERD_RES *word)
 
BOOL8 noise_outlines (TWERD *word)
 
void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(BLOCK *block, ROW *row, WERD_RES *word_res))
 
void tess_segment_pass1 (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &filename, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (WERD_RES *werd_res, ROW_RES *row_res, BLOCK_RES *block_res, const char *label, FILE *output_file)
 
CubeRecoContext * GetCubeRecoContext ()
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. words that are done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1), while "PS7a713/7a" scores 2.

BOOL8 digit_or_numeric_punct (WERD_RES *word, int char_position)
 
inT16 eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
inT16 worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
 
uniformly_spaced()

Return true if any of the following is true:

  • All inter-char gaps are the same width
  • The largest gap is no larger than twice the mean/median of the others
  • The largest gap is < normalised_max_nonspace

**** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
BOOL8 uniformly_spaced (WERD_RES *word)
 
BOOL8 fixspace_thinks_word_done (WERD_RES *word)
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_segment_pass2

Segment a word using the pass2 conditions of the tess segmenter.

Parameters
word: word to do
blob_choices: list of blob lists
void tess_segment_pass2 (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
BOOL8 tess_acceptable_word (WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
virtual ~Wordrec ()
 
void CopyCharChoices (const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to)
 
bool ChoiceIsCorrect (const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void update_ratings (const BLOB_CHOICE_LIST_VECTOR &new_choices, const CHUNKS_RECORD *chunks_record, const SEARCH_STATE search_state)
 
void SegSearch (CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, WERD_RES *word_res, inT32 *blob_number, bool italic_blob, SEAMS seam_list)
 
void junk_worst_seam (SEAM_QUEUE seams, SEAM *new_seam, float new_priority)
 
void choose_best_seam (SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob)
 
void combine_seam (SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam)
 
inT16 constrained_split (SPLIT *split, TBLOB *blob)
 
void delete_seam_pile (SEAM_PILE seam_pile)
 
SEAM * pick_good_seam (TBLOB *blob)
 
PRIORITY seam_priority (SEAM *seam, inT16 xmin, inT16 xmax)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY full_split_priority (SPLIT *split, inT16 xmin, inT16 xmax)
 
PRIORITY grade_center_of_blob (register BOUNDS_RECT rect)
 
PRIORITY grade_overlap (register BOUNDS_RECT rect)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
PRIORITY grade_width_change (register BOUNDS_RECT rect)
 
void set_outline_bounds (register EDGEPT *point1, register EDGEPT *point2, BOUNDS_RECT rect)
 
int crosses_outline (EDGEPT *p0, EDGEPT *p1, EDGEPT *outline)
 
int is_crossed (TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1)
 
int is_same_edgept (EDGEPT *p1, EDGEPT *p2)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
void reverse_outline (EDGEPT *outline)
 
virtual BLOB_CHOICE_LIST * classify_piece (TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
BLOB_CHOICE_LIST * get_piece_rating (MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
 
TBOX * record_blob_bounds (TBLOB *blobs)
 
MATRIX * record_piece_ratings (TBLOB *blobs)
 
WIDTH_RECORD * state_char_widths (WIDTH_RECORD *chunk_widths, STATE *state, int num_joints)
 
FLOAT32 get_width_variance (WIDTH_RECORD *wrec, float norm_height)
 
FLOAT32 get_gap_variance (WIDTH_RECORD *wrec, float norm_height)
 
FLOAT32 prioritize_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
 
FLOAT32 width_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
 
FLOAT32 seamcut_priority (SEAMS seams, STATE *state, int num_joints)
 
FLOAT32 rating_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints)
 
void program_editup (const char *textbase, bool init_classifier, bool init_permute)
 
BLOB_CHOICE_LIST_VECTOR * cc_recog (WERD_RES *word)
 
void program_editdown (inT32 elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (const DENORM *denorm, TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
BLOB_CHOICE_LIST * fake_classify_blob (UNICHAR_ID class_id, float rating, float certainty)
 
void update_blob_classifications (TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices)
 
BLOB_CHOICE_LIST_VECTOR * evaluate_chunks (CHUNKS_RECORD *chunks_record, SEARCH_STATE search_state, BlamerBundle *blamer_bundle)
 
void best_first_search (CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_RES *word, STATE *state, DANGERR *fixpt, STATE *best_state)
 
void delete_search (SEARCH_RECORD *the_search)
 
inT16 evaluate_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle)
 
BLOB_CHOICE_LIST_VECTOR * rebuild_current_state (WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *char_choices, MATRIX *ratings)
 
SEARCH_RECORD * new_search (CHUNKS_RECORD *chunks_record, int num_joints, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice, STATE *state)
 
void expand_node (FLOAT32 worst_priority, CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search)
 
void replace_char_widths (CHUNKS_RECORD *chunks_record, SEARCH_STATE state)
 
BLOB_CHOICE * rebuild_fragments (const char *unichar, const char *expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices)
 
BLOB_CHOICE_LIST * join_blobs_and_classify (WERD_RES *word, int x, int y, int choice_index, MATRIX *ratings, BLOB_CHOICE_LIST_VECTOR *old_choices)
 
STATE * pop_queue (HEAP *queue)
 
void push_queue (HEAP *queue, STATE *state, FLOAT32 worst_priority, FLOAT32 priority, bool debug)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (POINT_GROUP point_list, EDGEPT *point)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
int is_little_chunk (EDGEPT *point1, EDGEPT *point2)
 
int is_small_area (EDGEPT *point1, EDGEPT *point2)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, POINT_GROUP points)
 
void new_min_point (EDGEPT *local_min, POINT_GROUP points)
 
void new_max_point (EDGEPT *local_max, POINT_GROUP points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
bool improve_one_blob (WERD_RES *word_res, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, DANGERR *fixpt, bool split_next_to_fragment, BlamerBundle *blamer_bundle)
 
void modify_blob_choice (BLOB_CHOICE_LIST *answer, int chop_index)
 
bool chop_one_blob (TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index)
 
bool chop_one_blob2 (const GenericVector< TBOX > &boxes, WERD_RES *word_res, SEAMS *seam_list)
 
BLOB_CHOICE_LIST_VECTOR * chop_word_main (WERD_RES *word)
 
void improve_by_chopping (WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *updated_best_choice)
 
MATRIX * word_associator (bool only_create_ratings_matrtix, WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, STATE *best_state)
 
inT16 select_blob_to_split (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment)
 
inT16 select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
void set_chopper_blame (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, CP_RESULT_STRUCT *results)
 
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *filename, const char *rejmap, WERD_RES *word)
 
void LearnPieces (const char *filename, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (bool load_pre_trained_templates)
 
void InitAdaptedClass (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AdaptToPunc (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
 
void AmbigClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, ADAPT_RESULTS *Results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int num_classes, const TBOX &blob_box, CLASS_PRUNER_RESULTS results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, const uinT8 *cn_factors, INT_RESULT_STRUCT &int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (ADAPT_RESULTS *results, CLASS_ID class_id, int shape_id, FLOAT32 rating, bool adapted, int config, int fontinfo_id, int fontinfo_id2)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
 
void GetAdaptThresholds (TWERD *Word, const DENORM &denorm, const WERD_CHOICE &BestChoice, const WERD_CHOICE &BestRawChoice, FLOAT32 Thresholds[])
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, const DENORM &denorm, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (FILE *File, ADAPT_RESULTS *Results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int shape_id, BOOL8 AdaptiveOn, BOOL8 PreTrainedOn, ADAPT_RESULTS *Results)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormTrainingSample (bool pruner_only, const TrainingSample &sample, GenericVector< ShapeRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, const DENORM &denorm, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
 
void DisplayAdaptedChar (TBLOB *blob, const DENORM &denorm, INT_CLASS_STRUCT *int_class)
 
int AdaptableWord (TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
 
void EndAdaptiveClassifier ()
 
void PrintAdaptiveStatistics (FILE *File)
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
int GetBaselineFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *CharNormArray, inT32 *BlobLength)
 
int GetCharNormFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *PrunerNormArray, uinT8 *CharNormArray, inT32 *BlobLength, inT32 *FeatureOutlineIndex)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, const DENORM &denorm, TBLOB *Blob)
 
void ResetFeaturesHaveBeenExtracted ()
 
bool AdaptiveClassifierIsFull ()
 
bool LooksLikeGarbage (const DENORM &denorm, TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (FILE *File)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
void ReadClassFile ()
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_dump_pageseg_images = false
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adapt_to_char_fragments = true
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_training_tess = false
 
bool tessedit_dump_choices = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_redo_xheight = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = false
 
int tessedit_bigram_debug = 0
 
int debug_x_ht_level = 0
 
bool debug_acceptable_wds = false
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
bool tessedit_tess_adapt_to_rejmap = false
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool tessedit_matcher_log = false
 
int tessedit_test_adaption_mode = 3
 
bool save_blob_choices = false
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int paragraph_debug_level = 0
 
int cube_debug_level = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool docqual_excuse_outline_errs = false
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = true
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
bool crunch_pot_garbage = true
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_hocr = false
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_space_level = 100
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
bool tessedit_consistent_reps = true
 
int tessedit_reject_mode = 0
 
int tessedit_ok_mode = 5
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
int tessdata_manager_debug_level = 0
 
char * tessedit_load_sublangs = ""
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = FALSE
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
int wordrec_num_seg_states = 30
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
bool use_new_state_cost = FALSE
 
double heuristic_segcost_rating_base = 1.25
 
double heuristic_weight_rating = 1
 
double heuristic_weight_width = 0
 
double heuristic_weight_seamcut = 0
 
double heuristic_max_char_wh_ratio = 2.0
 
int wordrec_debug_level = 0
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
bool enable_new_segsearch = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
double segsearch_max_fixed_pitch_char_wh_ratio = 2.0
 
bool save_alt_choices = false
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
int pass2_seg_states
 
int num_joints
 
int num_pushed
 
int num_popped
 
BlobMatchTable blob_match_table
 
EVALUATION_ARRAY last_segmentation
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_great_threshold = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 30
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 14
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR PrunedProtos
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllProtosOff
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
TessdataManager tessdata_manager
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle)
 
void InitBlamerForSegSearch (const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
void FinishBlamerForSegSearch (const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 
- Protected Attributes inherited from tesseract::CCStruct
Image image_
 

Detailed Description

Definition at line 139 of file tesseractclass.h.

Constructor & Destructor Documentation

tesseract::Tesseract::Tesseract ( )

Definition at line 37 of file tesseractclass.cpp.

39  "Take segmentation and labeling from box file",
40  this->params()),
42  "Conversion of word/line box file to char box file",
43  this->params()),
45  "Generate training data from boxed chars", this->params()),
47  "Generate more boxes from boxed chars", this->params()),
49  "Dump intermediate images made during page segmentation",
50  this->params()),
51  // The default for pageseg_mode is the old behaviour, so as not to
52  // upset anything that relies on that.
54  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
55  " 5=line, 6=word, 7=char"
56  " (Values from PageSegMode enum in publictypes.h)",
57  this->params()),
59  "Which OCR engine(s) to run (Tesseract, Cube, both)."
60  " Defaults to loading and running only Tesseract"
61  " (no Cube,no combiner)."
62  " Values from OcrEngineMode enum in tesseractclass.h)",
63  this->params()),
65  "Blacklist of chars not to recognize", this->params()),
67  "Whitelist of chars to recognize", this->params()),
69  "Perform training for ambiguities", this->params()),
72  "Whether to use the top-line splitting process for Devanagari "
73  "documents while performing page-segmentation.", this->params()),
76  "Whether to use the top-line splitting process for Devanagari "
77  "documents while performing ocr.", this->params()),
79  "Write all parameters to the given file.", this->params()),
81  "Adapt to words that contain "
82  " a character composed form fragments", this->params()),
83  BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
84  " information for adaption", this->params()),
85  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
86  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
88  "Page number to apply boxes from", this->params()),
89  STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows"
90  " this pattern in the image filename. The name of the image"
91  " files are expected to be in the form"
92  " [lang].[fontname].exp[num].tif", this->params()),
94  "Learn both character fragments (as is done in the"
95  " special low exposure mode) as well as unfragmented"
96  " characters.", this->params()),
97  BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box"
98  " is assumed to contain ngrams. Only learn the ngrams"
99  " whose outlines overlap horizontally.", this->params()),
101  "Draw output words", this->params()),
103  "Call Tess to learn blobs", this->params()),
105  "Dump char choices", this->params()),
107  "Try to improve fuzzy spaces", this->params()),
109  "Dont bother with word plausibility", this->params()),
111  "Crunch double hyphens?", this->params()),
113  "Check/Correct x-height", this->params()),
115  "Add words to the document dictionary", this->params()),
117  "Output font info per char", this->params()),
119  "Block and Row stats", this->params()),
121  "Enable correction based on the word bigram dictionary.",
122  this->params()),
124  "Amount of debug output for bigram correction.",
125  this->params()),
126  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
128  "Dump word pass/fail chk", this->params()),
130  "Leading punctuation", this->params()),
132  "1st Trailing punctuation", this->params()),
134  "2nd Trailing punctuation", this->params()),
136  "good_quality_doc lte rejection limit", this->params()),
138  "good_quality_doc gte good blobs limit", this->params()),
140  "good_quality_doc lte outline error limit", this->params()),
142  "good_quality_doc gte good char limit", this->params()),
144  "alphas in a good word", this->params()),
146  "Use reject map to control Tesseract adaption", this->params()),
148  "Adaptation decision algorithm for tess", this->params()),
150  "Do minimal rejection on pass 1 output", this->params()),
152  "Test adaption criteria", this->params()),
154  "Log matcher activity", this->params()),
156  "Adaptation decision algorithm for tess", this->params()),
158  "Save the results of the recognition step (blob_choices)"
159  " within the corresponding WERD_CHOICE", this->params()),
160  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
161  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
162  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
163  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
164  this->params()),
165  INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
166  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
167  this->params()),
168  STRING_MEMBER(outlines_2, "ij!?%\":;",
169  "Non standard number of outlines", this->params()),
171  "Allow outline errs in unrejection?", this->params()),
173  "Reduce rejection on good docs", this->params()),
175  "Reject spaces?", this->params()),
177  "%rej allowed before rej whole doc", this->params()),
179  "%rej allowed before rej whole block", this->params()),
181  "%rej allowed before rej whole row", this->params()),
183  "Number of row rejects in whole word rejects"
184  "which prevents whole row rejection", this->params()),
186  "Only rej partially rejected words in block rejection",
187  this->params()),
189  "Only rej partially rejected words in row rejection",
190  this->params()),
192  "Use word segmentation quality metric", this->params()),
194  "Use word segmentation quality metric", this->params()),
196  "Only preserve wds longer than this", this->params()),
198  "Apply row rejection to good docs", this->params()),
200  "rej good doc wd if more than this fraction rejected",
201  this->params()),
203  "Reject all bad quality wds", this->params()),
205  "Page stats", this->params()),
207  "Output data to debug file", this->params()),
208  BOOL_MEMBER(bland_unrej, false,
209  "unrej potential with no chekcs", this->params()),
211  "good_quality_doc gte good char limit", this->params()),
213  "Mark v.bad words for tilde crunch", this->params()),
215  "Before word crunch?", this->params()),
217  "Take out ~^ early?", this->params()),
219  "crunch rating lt this", this->params()),
220  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
222  "crunch garbage cert lt this", this->params()),
224  "crunch garbage rating lt this", this->params()),
226  "POTENTIAL crunch rating lt this", this->params()),
228  "POTENTIAL crunch cert lt this", this->params()),
230  "POTENTIAL crunch garbage", this->params()),
232  "POTENTIAL crunch rating lt this", this->params()),
234  "POTENTIAL crunch cert lt this", this->params()),
236  "Del if word ht lt xht x this", this->params()),
238  "Del if word ht gt xht x this", this->params()),
240  "Del if word width lt xht x this", this->params()),
242  "Del if word gt xht x this above bl", this->params()),
244  "Del if word gt xht x this below bl", this->params()),
246  "Small if lt xht x this", this->params()),
248  "For adj length in rating per ch", this->params()),
250  "How many potential indicators needed", this->params()),
252  "Dont touch sensible strings", this->params()),
254  "Use acceptability in okstring", this->params()),
256  "Dont pot crunch sensible strings", this->params()),
258  "Fiddle alpha figures", this->params()),
260  "Dont crunch words with long lower case strings",
261  this->params()),
263  "Dont crunch words with long lower case strings",
264  this->params()),
266  "Crunch words with long repetitions", this->params()),
267  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
269  "How many non-noise blbs either side?", this->params()),
271  "Small if lt xht x this", this->params()),
273  "Reward punctation joins", this->params()),
275  "What constitues done for spacing", this->params()),
277  "Contextual fixspace debug", this->params()),
279  "Punct. chs expected WITHIN numbers", this->params()),
281  "Max allowed deviation of blob top outside of font data",
282  this->params()),
284  "Min change in xht before actually trying it", this->params()),
286  "Write block separators in output", this->params()),
288  "Write repetition char code", this->params()),
290  "Write .unlv output file", this->params()),
292  "Write .html hOCR output file", this->params()),
294  "Output char for unidentified blobs", this->params()),
295  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
297  "Min suspect level for rejecting spaces", this->params()),
299  "Dont Suspect dict wds longer than this", this->params()),
301  "UNLV keep 1Il chars rejected", this->params()),
303  "Dont touch bad rating limit", this->params()),
305  "Accept good rating limit", this->params()),
307  "Only reject tess failures", this->params()),
309  "Dont reject ANYTHING", this->params()),
311  "Make output have exactly one word per WERD", this->params()),
313  "Dont reject ANYTHING AT ALL", this->params()),
315  "Force all rep chars the same", this->params()),
316  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
318  "Acceptance decision algorithm", this->params()),
320  "Adaption debug", this->params()),
322  "Contextual 0O O0 flips", this->params()),
324  "Aspect ratio dot/hyphen test", this->params()),
326  "Aspect ratio dot/hyphen test", this->params()),
328  "Use DOC dawg in 11l conf. detector", this->params()),
330  "Use dictword test", this->params()),
332  "Dont double check", this->params()),
334  "Individual rejection control", this->params()),
336  "Individual rejection control", this->params()),
338  "Individual rejection control", this->params()),
340  "Extend permuter check", this->params()),
342  "Extend permuter check", this->params()),
344  "if >this fract", this->params()),
346  "Rej blbs near image edge limit", this->params()),
348  "Allow NN to unrej", this->params()),
350  "Il1 conflict set", this->params()),
352  "Reject any x-ht lt or eq than this", this->params()),
354  "Output text with boxes", this->params()),
355  INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages"
356  " , else specifc page to process", this->params()),
358  "Capture the image from the IPE", this->params()),
359  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
360  this->params()),
361  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
363  "According to dict_word", this->params()),
364  INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for"
365  " TessdataManager functions.", this->params()),
367  "List of languages to load with this one", this->params()),
369  "Min acceptable orientation margin", this->params()),
370  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
371  this->params()),
372  BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
373  this->params()),
375  "Only initialize with the config file. Useful if the "
376  "instance is not going to be used for OCR but say only "
377  "for layout analysis.", this->params()),
378  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
379  this->params()),
380  backup_config_file_(NULL),
381  pix_binary_(NULL),
382  cube_binary_(NULL),
383  pix_grey_(NULL),
384  source_resolution_(0),
385  textord_(this),
386  right_to_left_(false),
387  scaled_color_(NULL),
388  scaled_factor_(-1),
389  deskew_(1.0f, 0.0f),
390  reskew_(1.0f, 0.0f),
391  most_recently_used_(this),
392  font_table_size_(0),
393  cube_cntxt_(NULL),
394  tess_cube_combiner_(NULL),
395  equ_detect_(NULL) {
396 }
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:284
bool tessedit_preserve_row_rej_perfect_wds
double tessedit_reject_block_percent
#define NULL
Definition: host.h:144
double tessedit_whole_wd_rej_row_percent
#define f(xc, yc)
Definition: imgscale.cpp:39
#define FALSE
Definition: capi.h:28
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:272
bool tessedit_resegment_from_line_boxes
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:287
bool crunch_early_convert_bad_unlv_chs
double tessedit_reject_doc_percent
bool tessedit_enable_bigram_correction
bool applybox_learn_chars_and_char_frags_mode
bool tessedit_preserve_blk_rej_perfect_wds
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:278
double rej_whole_of_mostly_reject_word_fract
char * tessedit_write_params_to_file
double tessedit_reject_row_percent
char * ok_repeated_ch_non_alphanum_wds
double tessedit_good_doc_still_rowrej_wd
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:156
ParamsVectors * params()
Definition: ccutil.h:65
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:275
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:281
tesseract::Tesseract::~Tesseract ( )

Definition at line 398 of file tesseractclass.cpp.

398  {
399  Clear();
400  end_tesseract();
401  sub_langs_.delete_data_pointers();
402  // Delete cube objects.
403  if (cube_cntxt_ != NULL) {
404  delete cube_cntxt_;
405  cube_cntxt_ = NULL;
406  }
407  if (tess_cube_combiner_ != NULL) {
408  delete tess_cube_combiner_;
409  tess_cube_combiner_ = NULL;
410  }
411 }
#define NULL
Definition: host.h:144

Member Function Documentation

BOOL8 tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 485 of file output.cpp.

486  {
487  BOOL8 prev_digit = FALSE;
488 
489  if (*lengths == 1 && *s == '(')
490  s++;
491 
492  if (*lengths == 1 &&
493  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
494  s++;
495 
496  for (; *s != '\0'; s += *(lengths++)) {
497  if (unicharset.get_isdigit(s, *lengths))
498  prev_digit = TRUE;
499  else if (prev_digit &&
500  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
501  prev_digit = FALSE;
502  else if (prev_digit && *lengths == 1 &&
503  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
504  return TRUE;
505  else if (prev_digit &&
506  *lengths == 1 && (*s == '%') &&
507  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
508  (*(s + *lengths + *(lengths + 1)) == '\0'))
509  return TRUE;
510  else
511  return FALSE;
512  }
513  return TRUE;
514 }
unsigned char BOOL8
Definition: host.h:113
#define FALSE
Definition: capi.h:28
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
UNICHARSET unicharset
Definition: ccutil.h:72
#define TRUE
Definition: capi.h:27
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1284 of file control.cpp.

1285  {
1286  int i = 0;
1287  int offset = 0;
1288  int leading_punct_count;
1289  int upper_count = 0;
1290  int hyphen_pos = -1;
1291  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1292 
1293  if (strlen (lengths) > 20)
1294  return word_type;
1295 
1296  /* Single Leading punctuation char*/
1297 
1298  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1299  offset += lengths[i++];
1300  leading_punct_count = i;
1301 
1302  /* Initial cap */
1303  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1304  offset += lengths[i++];
1305  upper_count++;
1306  }
1307  if (upper_count > 1) {
1308  word_type = AC_UPPER_CASE;
1309  } else {
1310  /* Lower case word, possibly with an initial cap */
1311  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1312  offset += lengths[i++];
1313  }
1314  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1315  goto not_a_word;
1316  /*
1317  Allow a single hyphen in a lower case word
1318  - dont trust upper case - I've seen several cases of "H" -> "I-I"
1319  */
1320  if (lengths[i] == 1 && s[offset] == '-') {
1321  hyphen_pos = i;
1322  offset += lengths[i++];
1323  if (s[offset] != '\0') {
1324  while ((s[offset] != '\0') &&
1325  char_set.get_islower(s + offset, lengths[i])) {
1326  offset += lengths[i++];
1327  }
1328  if (i < hyphen_pos + 3)
1329  goto not_a_word;
1330  }
1331  } else {
1332  /* Allow "'s" in NON hyphenated lower case words */
1333  if (lengths[i] == 1 && (s[offset] == '\'') &&
1334  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1335  offset += lengths[i++];
1336  offset += lengths[i++];
1337  }
1338  }
1339  if (upper_count > 0)
1340  word_type = AC_INITIAL_CAP;
1341  else
1342  word_type = AC_LOWER_CASE;
1343  }
1344 
1345  /* Up to two different, constrained trailing punctuation chars */
1346  if (lengths[i] == 1 && s[offset] != '\0' &&
1347  STRING(chs_trailing_punct1).contains(s[offset]))
1348  offset += lengths[i++];
1349  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1350  s[offset - lengths[i - 1]] != s[offset] &&
1351  STRING(chs_trailing_punct2).contains (s[offset]))
1352  offset += lengths[i++];
1353 
1354  if (s[offset] != '\0')
1355  word_type = AC_UNACCEPTABLE;
1356 
1357  not_a_word:
1358 
1359  if (word_type == AC_UNACCEPTABLE) {
1360  /* Look for abbreviation string */
1361  i = 0;
1362  offset = 0;
1363  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1364  word_type = AC_UC_ABBREV;
1365  while (s[offset] != '\0' &&
1366  char_set.get_isupper(s + offset, lengths[i]) &&
1367  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1368  offset += lengths[i++];
1369  offset += lengths[i++];
1370  }
1371  }
1372  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1373  word_type = AC_LC_ABBREV;
1374  while (s[offset] != '\0' &&
1375  char_set.get_islower(s + offset, lengths[i]) &&
1376  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1377  offset += lengths[i++];
1378  offset += lengths[i++];
1379  }
1380  }
1381  if (s[offset] != '\0')
1382  word_type = AC_UNACCEPTABLE;
1383  }
1384 
1385  return word_type;
1386 }
Unacceptable word.
Definition: control.h:37
ALL upper case.
Definition: control.h:39
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
BOOL8 contains(const char c) const
Definition: strngs.cpp:147
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
ALL but initial lc.
Definition: control.h:40
Definition: strngs.h:40
a.b.c.
Definition: control.h:41
ALL lower case.
Definition: control.h:38
ACCEPTABLE_WERD_TYPE
Definition: control.h:35
A.B.C.
Definition: control.h:42
inT16 tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 659 of file reject.cpp.

660  {
661  inT16 i;
662  inT16 offset;
663  inT16 count = 0;
664 
665  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
666  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
667  count++;
668  }
669  return count;
670 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
int count(LIST var_list)
Definition: oldlist.cpp:108
void tesseract::Tesseract::ambigs_classify_and_output ( WERD_RES *  werd_res,
ROW_RES *  row_res,
BLOCK_RES *  block_res,
const char *  label,
FILE *  output_file 
)

Definition at line 163 of file recogtraining.cpp.

167  {
168  int offset;
169  // Classify word.
170  fflush(stdout);
171  classify_word_pass1(block_res->block, row_res->row, werd_res);
172  WERD_CHOICE *best_choice = werd_res->best_choice;
173  ASSERT_HOST(best_choice != NULL);
174  ASSERT_HOST(best_choice->blob_choices() != NULL);
175 
176  // Compute the number of unichars in the label.
177  int label_num_unichars = 0;
178  int step = 1; // should be non-zero on the first iteration
179  for (offset = 0; label[offset] != '\0' && step > 0;
180  step = werd_res->uch_set->step(label + offset),
181  offset += step, ++label_num_unichars);
182  if (step == 0) {
183  tprintf("Not outputting illegal unichar %s\n", label);
184  return;
185  }
186 
187  // Output all classifier choices for the unigrams (1->1 classifications).
188  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
189  BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
190  outer_blob_choice_it.set_to_list(best_choice->blob_choices());
191  BLOB_CHOICE_IT blob_choice_it;
192  blob_choice_it.set_to_list(outer_blob_choice_it.data());
193  for (blob_choice_it.mark_cycle_pt();
194  !blob_choice_it.cycled_list();
195  blob_choice_it.forward()) {
196  BLOB_CHOICE *blob_choice = blob_choice_it.data();
197  if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
198  fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
199  unicharset.id_to_unichar(blob_choice->unichar_id()),
200  label, blob_choice->rating(), blob_choice->certainty());
201  }
202  }
203  }
204  // Output raw choices for many->many and 1->many classifications.
205  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
206 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
BLOB_CHOICE_LIST_CLIST * blob_choices()
Definition: ratngs.h:244
BLOCK * block
Definition: pageres.h:258
#define NULL
Definition: host.h:144
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
float certainty() const
Definition: ratngs.h:65
Dict & getDict()
Definition: classify.h:62
const UNICHARSET * uch_set
Definition: pageres.h:348
void classify_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:860
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
void PrintAmbigAlternatives(FILE *file, const char *label, int label_num_unichars)
Print all the choices in raw_choices_ list for non 1-1 ambiguities.
Definition: stopper.cpp:358
int step(const char *str) const
Definition: unicharset.cpp:192
#define ASSERT_HOST(x)
Definition: errcode.h:84
ROW * row
Definition: pageres.h:286
WERD_CHOICE * best_choice
Definition: pageres.h:359
float rating() const
Definition: ratngs.h:62
PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING & fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 111 of file applybox.cpp.

113  {
114  int box_count = 0;
115  int box_failures = 0;
116 
117  FILE* box_file = OpenBoxFile(fname);
118  TBOX box;
119  GenericVector<TBOX> boxes;
120  GenericVector<STRING> texts, full_texts;
121 
122  bool found_box = true;
123  while (found_box) {
124  int line_number = 0; // Line number of the box file.
125  STRING text, full_text;
126  found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
127  if (found_box) {
128  ++box_count;
129  MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
130  } else {
131  full_text = "";
132  }
133  boxes.push_back(box);
134  texts.push_back(text);
135  full_texts.push_back(full_text);
136  }
137 
138  // In word mode, we use the boxes to make a word for each box, but
139  // in blob mode we use the existing words and maximally chop them first.
140  PAGE_RES* page_res = find_segmentation ?
141  NULL : SetupApplyBoxes(boxes, block_list);
142  clear_any_old_text(block_list);
143 
144  for (int i = 0; i < boxes.size() - 1; i++) {
145  bool foundit = false;
146  if (page_res != NULL) {
147  if (i == 0) {
148  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
149  full_texts[i].string());
150  } else {
151  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
152  boxes[i + 1], full_texts[i].string());
153  }
154  } else {
155  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
156  texts[i].string());
157  }
158  if (!foundit) {
159  box_failures++;
160  ReportFailedBox(i, boxes[i], texts[i].string(),
161  "FAILURE! Couldn't find a matching blob");
162  }
163  }
164 
165  if (page_res == NULL) {
166  // In word/line mode, we now maximally chop all the words and resegment
167  // them with the classifier.
168  page_res = SetupApplyBoxes(boxes, block_list);
169  ReSegmentByClassification(page_res);
170  }
171  if (applybox_debug > 0) {
172  tprintf("APPLY_BOXES:\n");
173  tprintf(" Boxes read from boxfile: %6d\n", box_count);
174  if (box_failures > 0)
175  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
176  }
177  TidyUp(page_res);
178  return page_res;
179 }
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:439
#define NULL
Definition: host.h:144
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197
Definition: rect.h:29
int push_back(T object)
const char * string() const
Definition: strngs.cpp:156
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:59
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:510
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:756
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:702
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:341
FILE * OpenBoxFile(const STRING &fname)
Definition: boxread.cpp:33
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
Definition: boxread.cpp:155
void tesseract::Tesseract::ApplyBoxTraining ( const STRING & filename,
PAGE_RES * page_res 
)

Definition at line 786 of file applybox.cpp.

786  {
787  PAGE_RES_IT pr_it(page_res);
788  int word_count = 0;
789  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
790  word_res = pr_it.forward()) {
791  LearnWord(filename.string(), NULL, word_res);
792  ++word_count;
793  }
794  tprintf("Generated training data for %d words\n", word_count);
795 }
void LearnWord(const char *filename, const char *rejmap, WERD_RES *word)
Definition: adaptmatch.cpp:254
#define NULL
Definition: host.h:144
WERD * word
Definition: pageres.h:334
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int tesseract::Tesseract::AutoPageSeg ( bool  single_column,
bool  osd,
bool  only_osd,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
Tesseract * osd_tess,
OSResults * osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If single_column is true, then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 218 of file pagesegmain.cpp.

220  {
221  if (textord_debug_images) {
222  WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
223  }
224  Pix* photomask_pix = NULL;
225  Pix* musicmask_pix = NULL;
226  // The blocks made by the ColumnFinder. Moved to blocks before return.
227  BLOCK_LIST found_blocks;
228  TO_BLOCK_LIST temp_blocks;
229 
230  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
231  single_column, osd, only_osd, blocks, osd_tess, osr,
232  &temp_blocks, &photomask_pix, &musicmask_pix);
233  if (finder != NULL) {
234  TO_BLOCK_IT to_block_it(&temp_blocks);
235  TO_BLOCK* to_block = to_block_it.data();
236  if (musicmask_pix != NULL) {
237  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
238  // blocks separately. For now combine with photomask_pix.
239  pixOr(photomask_pix, photomask_pix, musicmask_pix);
240  }
241  if (equ_detect_) {
242  finder->SetEquationDetect(equ_detect_);
243  }
244  if (finder->FindBlocks(single_column, scaled_color_, scaled_factor_,
245  to_block, photomask_pix,
246  &found_blocks, to_blocks) < 0) {
247  pixDestroy(&photomask_pix);
248  pixDestroy(&musicmask_pix);
249  return -1;
250  }
251  finder->GetDeskewVectors(&deskew_, &reskew_);
252  delete finder;
253  }
254  pixDestroy(&photomask_pix);
255  pixDestroy(&musicmask_pix);
256  blocks->clear();
257  BLOCK_IT block_it(blocks);
258  // Move the found blocks to the input/output blocks.
259  block_it.add_list_after(&found_blocks);
260 
261  if (textord_debug_images) {
262  // The debug image is no longer needed so delete it.
263  unlink(AlignedBlob::textord_debug_pix().string());
264  }
265  return 0;
266 }
bool textord_debug_images
Definition: alignedblob.cpp:34
#define NULL
Definition: host.h:144
bool textord_debug_printable
Definition: alignedblob.cpp:35
static const STRING & textord_debug_pix()
Definition: alignedblob.h:112
ColumnFinder * SetupPageSegAndDetectOrientation(bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 181 of file tesseractclass.h.

181  {
182  return pix_grey_ != NULL ? pix_grey_ : pix_binary_;
183  }
#define NULL
Definition: host.h:144
void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES * page_res)

Definition at line 419 of file control.cpp.

419  {
420  PAGE_RES_IT word_it(page_res);
421 
422  WERD_RES *w_prev = NULL;
423  WERD_RES *w = word_it.word();
424  while (1) {
425  w_prev = w;
426  while (word_it.forward() != NULL &&
427  (!word_it.word() || word_it.word()->part_of_combo)) {
428  // advance word_it, skipping over parts of combos
429  }
430  if (!word_it.word()) break;
431  w = word_it.word();
432  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
433  continue;
434  }
435  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
436  if (tessedit_bigram_debug) {
437  tprintf("Skipping because one of the words is W_REP_CHAR\n");
438  }
439  continue;
440  }
441  // Two words sharing the same language model, excellent!
442  if (w->alt_choices.empty()) {
443  if (tessedit_bigram_debug) {
444  tprintf("Alt choices not set up for word choice: %s\n",
446  }
447  continue;
448  }
449  if (w_prev->alt_choices.empty()) {
450  if (tessedit_bigram_debug) {
451  tprintf("Alt choices not set up for word choice: %s\n",
452  w_prev->best_choice->unichar_string().string());
453  }
454  continue;
455  }
456 
457  // We saved alternate choices, excellent!
458  GenericVector<WERD_CHOICE *> overrides_word1;
459  GenericVector<GenericVector<int> *> overrides_word1_state;
460  GenericVector<WERD_CHOICE *> overrides_word2;
461  GenericVector<GenericVector<int> *> overrides_word2_state;
462 
463  STRING orig_w1_str = w_prev->best_choice->unichar_string();
464  STRING orig_w2_str = w->best_choice->unichar_string();
465  WERD_CHOICE prev_best(w->uch_set);
466  {
467  int w1start, w1end;
468  w_prev->WithoutFootnoteSpan(&w1start, &w1end);
469  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
470  }
471  WERD_CHOICE this_best(w->uch_set);
472  {
473  int w2start, w2end;
474  w->WithoutFootnoteSpan(&w2start, &w2end);
475  this_best = w->best_choice->shallow_copy(w2start, w2end);
476  }
477 
478  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
479  if (tessedit_bigram_debug) {
480  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
481  orig_w1_str.string(), orig_w2_str.string());
482  }
483  continue;
484  }
485  if (tessedit_bigram_debug > 2) {
486  tprintf("Examining alt choices for \"%s %s\".\n",
487  orig_w1_str.string(), orig_w2_str.string());
488  }
489  if (tessedit_bigram_debug > 1) {
490  if (w_prev->alt_choices.size() > 1) {
492  }
493  if (w->alt_choices.size() > 1) {
495  }
496  }
497  float best_rating = 0.0;
498  int best_idx = 0;
499  for (int i = 0; i < w_prev->alt_choices.size(); i++) {
500  WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
501  WERD_CHOICE strip1(w->uch_set);
502  {
503  int p1start, p1end;
504  w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
505  &p1start, &p1end);
506  strip1 = p1->shallow_copy(p1start, p1end);
507  }
508  for (int j = 0; j < w->alt_choices.size(); j++) {
509  WERD_CHOICE *p2 = w->alt_choices.get(j);
510  WERD_CHOICE strip2(w->uch_set);
511  {
512  int p2start, p2end;
513  w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
514  strip2 = p2->shallow_copy(p2start, p2end);
515  }
516  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
517  overrides_word1.push_back(p1);
518  overrides_word1_state.push_back(&w_prev->alt_states.get(i));
519  overrides_word2.push_back(p2);
520  overrides_word2_state.push_back(&w->alt_states.get(j));
521  if (overrides_word1.size() == 1 ||
522  p1->rating() + p2->rating() < best_rating) {
523  best_rating = p1->rating() + p2->rating();
524  best_idx = overrides_word1.size() - 1;
525  }
526  }
527  }
528  }
529  if (overrides_word1.size() >= 1) {
530  // Excellent, we have some bigram matches.
532  *overrides_word1[best_idx]) &&
534  *overrides_word2[best_idx])) {
535  if (tessedit_bigram_debug > 1) {
536  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
537  "model.\n", orig_w1_str.string(), orig_w2_str.string());
538  }
539  continue;
540  }
541  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
542  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
543  if (new_w1_str != orig_w1_str) {
544  w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
545  *overrides_word1_state[best_idx]);
546  }
547  if (new_w2_str != orig_w2_str) {
548  w->ReplaceBestChoice(*overrides_word2[best_idx],
549  *overrides_word2_state[best_idx]);
550  }
551  if (tessedit_bigram_debug > 0) {
552  STRING choices_description;
553  int num_bigram_choices
554  = overrides_word1.size() * overrides_word2.size();
555  if (num_bigram_choices == 1) {
556  choices_description = "This was the unique bigram choice.";
557  } else {
558  if (tessedit_bigram_debug > 1) {
559  STRING bigrams_list;
560  const int kMaxChoicesToPrint = 20;
561  for (int i = 0; i < overrides_word1.size() &&
562  i < kMaxChoicesToPrint; i++) {
563  if (i > 0) { bigrams_list += ", "; }
564  WERD_CHOICE *p1 = overrides_word1[i];
565  WERD_CHOICE *p2 = overrides_word2[i];
566  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
567  if (i == kMaxChoicesToPrint) {
568  bigrams_list += " ...";
569  }
570  }
571  choices_description = "There were many choices: {";
572  choices_description += bigrams_list;
573  choices_description += "}";
574  } else {
575  choices_description.add_str_int("There were ", num_bigram_choices);
576  choices_description += " compatible bigrams.";
577  }
578  }
579  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
580  orig_w1_str.string(), orig_w2_str.string(),
581  new_w1_str.string(), new_w2_str.string(),
582  choices_description.string());
583  }
584  }
585  }
586 }
const STRING & unichar_string() const
Definition: ratngs.h:395
GenericVector< WERD_CHOICE * > alt_choices
Definition: pageres.h:363
tesseract::Tesseract * tesseract
Definition: pageres.h:403
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:849
#define NULL
Definition: host.h:144
T & get(int index) const
int push_back(T object)
GenericVector< GenericVector< int > > alt_states
Definition: pageres.h:364
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
void add_str_int(const char *str, int number)
Definition: strngs.cpp:334
Dict & getDict()
Definition: classify.h:62
WERD * word
Definition: pageres.h:334
const UNICHARSET * uch_set
Definition: pageres.h:348
void print_word_alternates_list(WERD_CHOICE *word, GenericVector< WERD_CHOICE * > *alternates)
Definition: ratngs.cpp:628
const char * string() const
Definition: strngs.cpp:156
bool empty() const
Definition: genericvector.h:68
void WithoutFootnoteSpan(int *start, int *end) const
Definition: pageres.cpp:510
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:483
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
void ReplaceBestChoice(const WERD_CHOICE &choice, const GenericVector< int > &segmentation_state)
Definition: pageres.cpp:436
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:259
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::blamer_pass ( PAGE_RES * page_res)

Definition at line 684 of file control.cpp.

684  {
685  if (!wordrec_run_blamer) return;
686  PAGE_RES_IT page_res_it(page_res);
687  for (page_res_it.restart_page(); page_res_it.word() != NULL;
688  page_res_it.forward()) {
689  WERD_RES *word = page_res_it.word();
690  if (word->blamer_bundle == NULL) {
691  word->blamer_bundle = new BlamerBundle();
694  word->blamer_bundle->debug += " to blame";
695  } else if (word->blamer_bundle->incorrect_result_reason ==
696  IRR_NO_TRUTH) {
697  word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
699  } else {
700  bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
701  word->blamer_bundle->truth_text);
704  if (irr == IRR_CORRECT && !correct) {
705  STRING debug = "Choice is incorrect after recognition";
706  word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug,
707  word->best_choice,
709  } else if (irr != IRR_CORRECT && correct) {
710  if (wordrec_debug_blamer) {
711  tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
712  }
714  word->blamer_bundle->debug = "";
715  }
716  }
718  }
719  tprintf("Blame reasons:\n");
720  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
722  static_cast<IncorrectResultReason>(bl)),
723  page_res->blame_reasons[bl]);
724  }
725  if (page_res->misadaption_log.length() > 0) {
726  tprintf("Misadaption log:\n");
727  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
728  tprintf("%s\n", page_res->misadaption_log[i].string());
729  }
730  }
731 }
bool ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
Definition: wordrec.cpp:159
const char * IncorrectReason() const
Definition: pageres.cpp:59
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
#define NULL
Definition: host.h:144
WERD * word
Definition: pageres.h:334
const UNICHARSET * uch_set
Definition: pageres.h:348
bool wordrec_run_blamer
Definition: wordrec.h:143
IncorrectResultReason
Definition: pageres.h:45
GenericVector< STRING > truth_text
Definition: pageres.h:174
const char * string() const
Definition: strngs.cpp:156
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool wordrec_debug_blamer
Definition: wordrec.h:142
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: pageres.cpp:55
Definition: strngs.h:40
GenericVector< int > blame_reasons
Definition: pageres.h:228
STRING debug
Definition: pageres.h:178
int length() const
Definition: genericvector.h:63
GenericVector< STRING > misadaption_log
Definition: pageres.h:233
BlamerBundle * blamer_bundle
Definition: pageres.h:367
WERD_CHOICE * best_choice
Definition: pageres.h:359
float tesseract::Tesseract::blob_noise_score ( TBLOB * blob)

Definition at line 844 of file fixspace.cpp.

844  {
845  TBOX box; // BB of outline
846  inT16 outline_count = 0;
847  inT16 max_dimension;
848  inT16 largest_outline_dimension = 0;
849 
850  for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
851  outline_count++;
852  box = ol->bounding_box();
853  if (box.height() > box.width()) {
854  max_dimension = box.height();
855  } else {
856  max_dimension = box.width();
857  }
858 
859  if (largest_outline_dimension < max_dimension)
860  largest_outline_dimension = max_dimension;
861  }
862 
863  if (outline_count > 5) {
864  // penalise LOTS of blobs
865  largest_outline_dimension *= 2;
866  }
867 
868  box = blob->bounding_box();
869  if (box.bottom() > kBlnBaselineOffset * 4 ||
870  box.top() < kBlnBaselineOffset / 2) {
871  // Lax blob is if high or low
872  largest_outline_dimension /= 2;
873  }
874 
875  return largest_outline_dimension;
876 }
const int kBlnBaselineOffset
Definition: normalis.h:28
#define NULL
Definition: host.h:144
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
TESSLINE * outlines
Definition: blobs.h:227
inT16 top() const
Definition: rect.h:53
TBOX bounding_box() const
Definition: blobs.cpp:384
short inT16
Definition: host.h:100
inT16 height() const
Definition: rect.h:97
inT16 bottom() const
Definition: rect.h:60
void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 699 of file fixspace.cpp.

699  {
700  WERD_RES_IT word_it(&words);
701  WERD_RES_IT worst_word_it;
702  float worst_noise_score = 9999;
703  int worst_blob_index = -1; // Noisiest blob of noisiest wd
704  int blob_index; // of wds noisiest blob
705  float noise_score; // of wds noisiest blob
706  WERD_RES *word_res;
707  C_BLOB_IT blob_it;
708  C_BLOB_IT rej_cblob_it;
709  C_BLOB_LIST new_blob_list;
710  C_BLOB_IT new_blob_it;
711  C_BLOB_IT new_rej_cblob_it;
712  WERD *new_word;
713  inT16 start_of_noise_blob;
714  inT16 i;
715 
716  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
717  blob_index = worst_noise_blob(word_it.data(), &noise_score);
718  if (blob_index > -1 && worst_noise_score > noise_score) {
719  worst_noise_score = noise_score;
720  worst_blob_index = blob_index;
721  worst_word_it = word_it;
722  }
723  }
724  if (worst_blob_index < 0) {
725  words.clear(); // signal termination
726  return;
727  }
728 
729  /* Now split the worst_word_it */
730 
731  word_res = worst_word_it.data();
732 
733  /* Move blobs before noise blob to a new bloblist */
734 
735  new_blob_it.set_to_list(&new_blob_list);
736  blob_it.set_to_list(word_res->word->cblob_list());
737  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
738  new_blob_it.add_after_then_move(blob_it.extract());
739  }
740  start_of_noise_blob = blob_it.data()->bounding_box().left();
741  delete blob_it.extract(); // throw out noise blob
742 
743  new_word = new WERD(&new_blob_list, word_res->word);
744  new_word->set_flag(W_EOL, FALSE);
745  word_res->word->set_flag(W_BOL, FALSE);
746  word_res->word->set_blanks(1); // After break
747 
748  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
749  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
750  for (;
751  (!rej_cblob_it.empty() &&
752  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
753  rej_cblob_it.forward()) {
754  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
755  }
756 
757  WERD_RES* new_word_res = new WERD_RES(new_word);
758  new_word_res->combination = TRUE;
759  worst_word_it.add_before_then_move(new_word_res);
760 
761  word_res->ClearResults();
762 }
void ClearResults()
Definition: pageres.cpp:799
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:764
BOOL8 combination
Definition: pageres.h:450
#define FALSE
Definition: capi.h:28
WERD * word
Definition: pageres.h:334
Definition: werd.h:35
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:123
Definition: werd.h:60
short inT16
Definition: host.h:100
Definition: werd.h:36
#define TRUE
Definition: capi.h:27
SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 256 of file pgedit.cpp.

256  {
257  SVMenuNode* parent_menu;
258  SVMenuNode* root_menu_item = new SVMenuNode();
259 
260  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
261 
262  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
263  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
264  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
265  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
266  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
267  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
268  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
269 
270  parent_menu = root_menu_item->AddChild("DISPLAY");
271 
272  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
273  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
274  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
275  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
276  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
277  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
278  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
279  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
280  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
281  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
282  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
283  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
284  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
285  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
286  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
287 
288 
289  parent_menu = root_menu_item->AddChild("OTHER");
290 
291  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
292  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
293  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
294  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
295  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
296  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
297 
298  return root_menu_item;
299 }
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:64
#define FALSE
Definition: capi.h:28
#define TRUE
Definition: capi.h:27
BOOL8 tesseract::Tesseract::check_debug_pt ( WERD_RES * word,
int  location 
)

Definition at line 1388 of file control.cpp.

1388  {
1389  BOOL8 show_map_detail = FALSE;
1390  inT16 i;
1391 
1392  #ifndef SECURE_NAMES
1393  if (!test_pt)
1394  return FALSE;
1395 
1396  tessedit_rejection_debug.set_value (FALSE);
1397  debug_x_ht_level.set_value (0);
1398 
1399  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1400  if (location < 0)
1401  return TRUE; // For breakpoint use
1402  tessedit_rejection_debug.set_value (TRUE);
1403  debug_x_ht_level.set_value (20);
1404  tprintf ("\n\nTESTWD::");
1405  switch (location) {
1406  case 0:
1407  tprintf ("classify_word_pass1 start\n");
1408  word->word->print();
1409  break;
1410  case 10:
1411  tprintf ("make_reject_map: initial map");
1412  break;
1413  case 20:
1414  tprintf ("make_reject_map: after NN");
1415  break;
1416  case 30:
1417  tprintf ("classify_word_pass2 - START");
1418  break;
1419  case 40:
1420  tprintf ("classify_word_pass2 - Pre Xht");
1421  break;
1422  case 50:
1423  tprintf ("classify_word_pass2 - END");
1424  show_map_detail = TRUE;
1425  break;
1426  case 60:
1427  tprintf ("fixspace");
1428  break;
1429  case 70:
1430  tprintf ("MM pass START");
1431  break;
1432  case 80:
1433  tprintf ("MM pass END");
1434  break;
1435  case 90:
1436  tprintf ("After Poor quality rejection");
1437  break;
1438  case 100:
1439  tprintf ("unrej_good_quality_words - START");
1440  break;
1441  case 110:
1442  tprintf ("unrej_good_quality_words - END");
1443  break;
1444  case 120:
1445  tprintf ("Write results pass");
1446  show_map_detail = TRUE;
1447  break;
1448  }
1449  tprintf(" \"%s\" ",
1450  word->best_choice->unichar_string().string());
1451  word->reject_map.print (debug_fp);
1452  tprintf ("\n");
1453  if (show_map_detail) {
1454  tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
1455  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1456  tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1457  word->reject_map[i].full_print(debug_fp);
1458  }
1459  }
1460  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1461  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1462  return TRUE;
1463  }
1464  else
1465  #endif
1466  return FALSE;
1467 }
void full_print(FILE *fp)
Definition: rejctmap.cpp:412
const STRING & unichar_string() const
Definition: ratngs.h:395
TBOX bounding_box()
Definition: werd.cpp:164
BOOL8 done
Definition: pageres.h:419
bool contains(const FCOORD pt) const
Definition: rect.h:323
void print(FILE *fp)
Definition: rejctmap.cpp:400
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
#define FALSE
Definition: capi.h:28
WERD * word
Definition: pageres.h:334
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: points.h:189
short inT16
Definition: host.h:100
BOOL8 tess_accepted
Definition: pageres.h:417
void print()
Definition: werd.cpp:256
FILE * debug_fp
Definition: tessvars.cpp:25
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::classify_word_and_language ( WordRecognizer  recognizer,
BLOCK * block,
ROW * row,
WERD_RES * word 
)

Definition at line 795 of file control.cpp.

798  {
800  tprintf("Processing word with lang %s at:",
801  most_recently_used_->lang.string());
802  word->word->bounding_box().print();
803  }
804  const char* result_type = "Initial";
805  bool initially_done = !word->tess_failed && word->done;
806  if (initially_done) {
807  // If done on pass1, we reuse the tesseract that did it, and don't try
808  // any more. The only need to call the classifier at all is for the
809  // cube combiner and xheight fixing (which may be bogus on a done word.)
810  most_recently_used_ = word->tesseract;
811  result_type = "Already done";
812  }
813  (most_recently_used_->*recognizer)(block, row, word);
814  if (!word->tess_failed && word->tess_accepted)
815  result_type = "Accepted";
817  tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
818  result_type,
819  word->best_choice->unichar_string().string(),
820  word->best_choice->rating(),
821  word->best_choice->certainty(),
822  word->tess_accepted, word->tess_would_adapt);
823  }
824  if (word->tess_failed || !word->tess_accepted) {
825  // Try all the other languages to see if they are any better.
826  Tesseract* previous_used = most_recently_used_;
827  if (most_recently_used_ != this) {
828  if (classify_debug_level) {
829  tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
830  }
831  if (RetryWithLanguage(word, block, row, recognizer)) {
832  most_recently_used_ = this;
833  if (!word->tess_failed && word->tess_accepted)
834  return; // No need to look at the others.
835  }
836  }
837 
838  for (int i = 0; i < sub_langs_.size(); ++i) {
839  if (sub_langs_[i] != previous_used) {
840  if (classify_debug_level) {
841  tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
842  i, sub_langs_[i]->lang.string());
843  }
844  if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) {
845  most_recently_used_ = sub_langs_[i];
846  if (!word->tess_failed && word->tess_accepted)
847  return; // No need to look at the others.
848  }
849  }
850  }
851  }
852 }
const STRING & unichar_string() const
Definition: ratngs.h:395
TBOX bounding_box()
Definition: werd.cpp:164
BOOL8 done
Definition: pageres.h:419
float certainty() const
Definition: ratngs.h:234
tesseract::Tesseract * tesseract
Definition: pageres.h:403
BOOL8 tess_would_adapt
Definition: pageres.h:418
WERD * word
Definition: pageres.h:334
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool RetryWithLanguage(WERD_RES *word, BLOCK *block, ROW *row, WordRecognizer recognizer)
Definition: control.cpp:756
STRING lang
Definition: ccutil.h:69
BOOL8 tess_accepted
Definition: pageres.h:417
BOOL8 tess_failed
Definition: pageres.h:409
void print() const
Definition: rect.h:263
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::classify_word_pass1 ( BLOCK block,
ROW row,
WERD_RES word 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 860 of file control.cpp.

// Listing of Tesseract::classify_word_pass1 (control.cpp lines 860-952):
// runs first-pass recognition on one word and, for non-repeated-char words,
// may hand the word to the adaptive classifier through LearnWord().
// NOTE(review): this rendering omits source lines 862, 875-876, 897, 901,
// 907, 910, 930, 934, 937 and 945-946, so several conditions and call heads
// are not visible below; check control.cpp before relying on the exact flow.
860  {
861  // If we only intend to run cube - run it and return.
// NOTE(review): the condition guarding this early return (line 862) is elided.
863  cube_word_pass1(block, row, word);
864  return;
865  }
866 
// Heap-allocated list passed to tess_segment_pass1() and the fix-up helpers,
// and finally handed to word->best_choice->set_blob_choices() on line 951.
867  BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
868  BOOL8 adapt_ok;
869  const char *rejmap;
870  inT16 index;
871  STRING mapstr = "";
872 
873  check_debug_pt(word, 0);
// tess_segment_pass1() only runs when SetupForTessRecognition() returns true.
// NOTE(review): two argument lines of this call (875-876) are elided.
874  if (word->SetupForTessRecognition(unicharset, this, BestPix(),
877  row, block))
878  tess_segment_pass1(word, blob_choices);
879  if (!word->tess_failed) {
880  /*
881  The adaption step used to be here. It has been moved to after
882  make_reject_map so that we know whether the word will be accepted in the
883  first pass or not. This move will PREVENT adaption to words containing
884  double quotes because the word will not be identical to what tess thinks
885  its best choice is. (See CurrentBestChoiceIs in
886  stopper.cpp which is used by AdaptableWord in
887  adaptmatch.cpp)
888  */
889 
// Words flagged W_REP_CHAR skip the whole fix-up / reject-map / adaption path.
890  if (!word->word->flag(W_REP_CHAR)) {
891  // TODO(daria) delete these hacks when replaced by more generic code.
892  // Convert '' (double single) to " (single double).
893  word->fix_quotes(blob_choices);
894  if (tessedit_fix_hyphens) // turn -- to -
895  word->fix_hyphens(blob_choices);
896 
// NOTE(review): the head of the statement ending on line 898 (line 897) is elided.
898  word->raw_choice);
899 
// tess_would_adapt requires both choices to be non-NULL.
// NOTE(review): the call head on line 901 is elided.
900  word->tess_would_adapt = word->best_choice && word->raw_choice &&
902  *word->best_choice,
903  *word->raw_choice);
904  // Also sets word->done flag
905  make_reject_map(word, blob_choices, row, 1);
906 
// NOTE(review): adapt_ok is not assigned in any visible line; the assignment
// is presumably on the elided line 907 - confirm against control.cpp.
908 
909  if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
// NOTE(review): line 910 is elided, so the nesting of the `else` below
// cannot be determined from this listing alone.
911  rejmap = NULL;
912  } else {
913  ASSERT_HOST(word->reject_map.length() ==
914  word->best_choice->length());
915 
// Build one character per reject_map entry: '1' when adapt_ok is set or the
// entry is accepted, '0' otherwise.
916  for (index = 0; index < word->reject_map.length(); index++) {
917  if (adapt_ok || word->reject_map[index].accepted())
918  mapstr += '1';
919  else
920  mapstr += '0';
921  }
// rejmap points into mapstr's buffer, so it is only valid while mapstr lives.
922  rejmap = mapstr.string();
923  }
924  // Send word to adaptive classifier for training.
925  word->BestChoiceToCorrectText();
926  set_word_fonts(word, blob_choices);
927  LearnWord(NULL, rejmap, word);
928  // Mark misadaptions if running blamer.
// NOTE(review): one clause of this condition (line 930) is elided.
929  if (word->blamer_bundle != NULL &&
931  !ChoiceIsCorrect(*word->uch_set, word->best_choice,
932  word->blamer_bundle->truth_text)) {
933  word->blamer_bundle->misadaption_debug ="misadapt to word (";
// NOTE(review): lines 934 and 937 (statement heads) are elided.
935  word->best_choice->permuter_name();
936  word->blamer_bundle->misadaption_debug += "): ";
938  "", word->best_choice, &(word->blamer_bundle->misadaption_debug));
939  if (wordrec_debug_blamer) {
940  tprintf("%s\n", word->blamer_bundle->misadaption_debug.string());
941  }
942  }
943  }
944 
// NOTE(review): lines 945-946 are elided.
947  }
948  }
949 
950  // Save best choices in the WERD_CHOICE if needed
// Runs unconditionally, i.e. also when word->tess_failed is set.
951  word->best_choice->set_blob_choices(blob_choices);
952 }
int length() const
Definition: ratngs.h:214
TWERD * rebuild_word
Definition: pageres.h:381
bool ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text)
Definition: wordrec.cpp:159
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
Definition: pageres.cpp:272
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
void LearnWord(const char *filename, const char *rejmap, WERD_RES *word)
Definition: adaptmatch.cpp:254
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
Definition: pageres.cpp:63
#define NULL
Definition: host.h:144
void make_reject_map(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass)
BOOL8 tess_would_adapt
Definition: pageres.h:418
Pix * BestPix() const
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:700
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
int AdaptableWord(TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
Definition: adaptmatch.cpp:894
BOOL8 tess_acceptable_word(WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice)
Definition: tessbox.cpp:102
WERD * word
Definition: pageres.h:334
const UNICHARSET * uch_set
Definition: pageres.h:348
bool classify_bln_numeric_mode
Definition: classify.h:455
GenericVector< STRING > truth_text
Definition: pageres.h:174
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool wordrec_debug_blamer
Definition: wordrec.h:142
WERD_CHOICE * raw_choice
Definition: pageres.h:360
UNICHARSET unicharset
Definition: ccutil.h:72
void BestChoiceToCorrectText()
Definition: pageres.cpp:572
Definition: strngs.h:40
short inT16
Definition: host.h:100
const char * permuter_name() const
Definition: ratngs.cpp:174
BOOL8 tess_accepted
Definition: pageres.h:417
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: ratngs.cpp:184
inT32 length() const
Definition: rejctmap.h:238
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:50
void set_word_fonts(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: control.cpp:1500
STRING misadaption_debug
Definition: pageres.h:180
BOOL8 tess_failed
Definition: pageres.h:409
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:114
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:670
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tess_segment_pass1(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tessbox.cpp:42
BlamerBundle * blamer_bundle
Definition: pageres.h:367
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::classify_word_pass2 ( BLOCK block,
ROW row,
WERD_RES word 
)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1026 of file control.cpp.

// Listing of Tesseract::classify_word_pass2 (control.cpp lines 1026-1089):
// second-pass handling of one word - optional re-match, x-height fix,
// small-caps detection and script positions, plus an optional debug plot.
// NOTE(review): this rendering omits source lines 1028-1029, 1033, 1046,
// 1056, 1063, 1073, 1076, 1084 and 1087; several conditions are not visible.
1026  {
1027  // Return if we do not want to run Tesseract.
// NOTE(review): the condition for this early return (lines 1028-1029) is elided.
1030  return;
1031 
// NOTE(review): done_this_pass is written below but not read in any visible
// line; an elided line may consume it.
1032  bool done_this_pass = false;
1034  check_debug_pt(word, 30);
// Re-match the word unless pass 1 already marked it done (always re-match
// when tessedit_training_tess is set).
1035  if (!word->done || tessedit_training_tess) {
1036  word->caps_height = 0.0;
// Fall back to the row's x-height when the word has none of its own.
1037  if (word->x_height == 0.0f)
1038  word->x_height = row->x_height();
1039  match_word_pass2(word, row, block);
1040  done_this_pass = TRUE;
1041  check_debug_pt(word, 40);
1042  }
1043 
1044  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1045  bool accept_new_xht = false;
// NOTE(review): the condition opening this block (line 1046) is elided.
1047  // Use the tops and bottoms since they are available.
1048  accept_new_xht = TrainedXheightFix(word, block, row);
1049  }
1050  if (accept_new_xht)
1051  done_this_pass = true;
1052  // Test for small caps. Word capheight must be close to block xheight,
1053  // and word must contain no lower case letters, and at least one upper case.
// The accepted band is small_cap_xheight +/- small_cap_delta, where the delta
// is half the gap between the block x-height and the small-cap x-height.
1054  double small_cap_xheight = block->x_height() * kXHeightCapRatio;
1055  double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0;
// NOTE(review): the first clause of this `if` (line 1056) is elided.
1057  small_cap_xheight - small_cap_delta <= word->x_height &&
1058  word->x_height <= small_cap_xheight + small_cap_delta) {
1059  // Scan for upper/lower.
1060  int num_upper = 0;
1061  int num_lower = 0;
1062  for (int i = 0; i < word->best_choice->length(); ++i) {
// NOTE(review): the upper-case test (line 1063) is elided.
1064  ++num_upper;
1065  else if (unicharset.get_islower(word->best_choice->unichar_id(i)))
1066  ++num_lower;
1067  }
1068  if (num_upper > 0 && num_lower == 0)
1069  word->small_caps = true;
1070  }
1071  word->SetScriptPositions();
1072 
// NOTE(review): line 1073 is elided.
1074  }
1075 #ifndef GRAPHICS_DISABLED
// Debug display: lazily create the fx window, clear it, plot the rebuilt
// word and zoom to its bounding box.
// NOTE(review): the condition enabling this block (line 1076) is elided.
1077  if (fx_win == NULL)
1078  create_fx_win();
1079  clear_fx_win();
1080  word->rebuild_word->plot(fx_win);
1081  TBOX wbox = word->rebuild_word->bounding_box();
1082  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1083  wbox.right(), wbox.bottom());
// NOTE(review): line 1084 is elided.
1085  }
1086 #endif
// NOTE(review): line 1087 is elided.
1088  check_debug_pt(word, 50);
1089 }
int length() const
Definition: ratngs.h:214
TWERD * rebuild_word
Definition: pageres.h:381
bool script_has_xheight() const
Definition: unicharset.h:770
bool top_bottom_useful() const
Definition: unicharset.h:438
BOOL8 done
Definition: pageres.h:419
#define SUBLOC_NORM
Definition: errcode.h:59
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:760
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:976
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:53
Definition: rect.h:29
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
inT32 x_height() const
return xheight
Definition: ocrblock.h:111
inT16 right() const
Definition: rect.h:74
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
WERD * word
Definition: pageres.h:334
static void Update()
Definition: scrollview.cpp:710
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:106
float x_height
Definition: pageres.h:431
inT16 top() const
Definition: rect.h:53
static const double kXHeightCapRatio
Definition: ccstruct.h:38
float x_height() const
Definition: ocrrow.h:61
TBOX bounding_box() const
Definition: blobs.cpp:483
UNICHARSET unicharset
Definition: ccutil.h:72
void plot(ScrollView *window)
Definition: blobs.cpp:522
void match_word_pass2(WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1098
void create_fx_win()
Definition: drawfx.cpp:62
void SetScriptPositions()
Definition: pageres.cpp:505
BOOL8 tess_failed
Definition: pageres.h:409
void clear_fx_win()
Definition: drawfx.cpp:75
float caps_height
Definition: pageres.h:432
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
bool small_caps
Definition: pageres.h:420
#define TRUE
Definition: capi.h:27
inT16 bottom() const
Definition: rect.h:60
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::Clear ( )

Definition at line 413 of file tesseractclass.cpp.

// Listing of Tesseract::Clear (tesseractclass.cpp lines 413-425): releases
// the per-page images and resets per-page state, then does the same for
// every sub-language instance.
// NOTE(review): source line 422 is missing from this rendering.
413  {
// Destroy the binary, cube-binary, grey and scaled-colour images.
414  pixDestroy(&pix_binary_);
415  pixDestroy(&cube_binary_);
416  pixDestroy(&pix_grey_);
417  pixDestroy(&scaled_color_);
// Reset both skew vectors to (1, 0).
418  deskew_ = FCOORD(1.0f, 0.0f);
419  reskew_ = FCOORD(1.0f, 0.0f);
420  splitter_.Clear();
421  scaled_factor_ = -1;
// Propagate the reset to each entry of sub_langs_.
423  for (int i = 0; i < sub_langs_.size(); ++i)
424  sub_langs_[i]->Clear();
425 }
#define f(xc, yc)
Definition: imgscale.cpp:39
void ResetFeaturesHaveBeenExtracted()
Definition: points.h:189
float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES word_res)

Definition at line 96 of file fixxht.cpp.

96  {
97  STATS top_stats(0, MAX_UINT8);
98  TBLOB* blob = word_res->rebuild_word->blobs;
99  int blob_id = 0;
100  for (; blob != NULL; blob = blob->next, ++blob_id) {
101  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
102  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
103  int top = blob->bounding_box().top();
104  // Clip the top to the limit of normalized feature space.
105  if (top >= INT_FEAT_RANGE)
106  top = INT_FEAT_RANGE - 1;
107  int bottom = blob->bounding_box().bottom();
108  int min_bottom, max_bottom, min_top, max_top;
109  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
110  &min_top, &max_top);
111  // Chars with a wild top range would mess up the result so ignore them.
112  if (max_top - min_top > kMaxCharTopRange)
113  continue;
114  int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
115  top - (max_top + x_ht_acceptance_tolerance));
116  int height = top - kBlnBaselineOffset;
117  if (debug_x_ht_level >= 20) {
118  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
119  unicharset.id_to_unichar(class_id),
120  height, min_bottom, max_bottom, min_top, max_top,
121  bottom, top);
122  }
123  // Use only chars that fit in the expected bottom range, and where
124  // the range of tops is sensibly near the xheight.
125  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
126  bottom - x_ht_acceptance_tolerance <= max_bottom &&
127  min_top > kBlnBaselineOffset &&
128  max_top - kBlnBaselineOffset >= kBlnXHeight &&
129  misfit_dist > 0) {
130  // Compute the x-height position using proportionality between the
131  // actual height and expected height.
132  int min_xht = DivRounded(height * kBlnXHeight,
133  max_top - kBlnBaselineOffset);
134  int max_xht = DivRounded(height * kBlnXHeight,
135  min_top - kBlnBaselineOffset);
136  if (debug_x_ht_level >= 20) {
137  tprintf(" xht range min=%d, max=%d\n",
138  min_xht, max_xht);
139  }
140  // The range of expected heights gets a vote equal to the distance
141  // of the actual top from the expected top.
142  for (int y = min_xht; y <= max_xht; ++y)
143  top_stats.add(y, misfit_dist);
144  } else if (debug_x_ht_level >= 20) {
145  tprintf(" already OK\n");
146  }
147  }
148  }
149  if (top_stats.get_total() == 0)
150  return 0.0f;
151  // The new xheight is just the median vote, which is then scaled out
152  // of BLN space back to pixel space to get the x-height in pixel space.
153  float new_xht = top_stats.median();
154  if (debug_x_ht_level >= 20) {
155  tprintf("Median xht=%f\n", new_xht);
156  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
157  new_xht, new_xht / word_res->denorm.y_scale());
158  }
159  // The xheight must change by at least x_ht_min_change to be used.
160  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
161  return new_xht / word_res->denorm.y_scale();
162  else
163  return 0.0f;
164 }
TWERD * rebuild_word
Definition: pageres.h:381
const int kBlnXHeight
Definition: normalis.h:27
#define MAX_UINT8
Definition: host.h:121
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
const int kBlnBaselineOffset
Definition: normalis.h:28
#define NULL
Definition: host.h:144
int DivRounded(int a, int b)
Definition: helpers.h:115
#define f(xc, yc)
Definition: imgscale.cpp:39
TBLOB * blobs
Definition: blobs.h:274
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:459
Definition: blobs.h:174
const int kMaxCharTopRange
Definition: fixxht.cpp:61
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
DENORM denorm
Definition: pageres.h:346
TBOX bounding_box() const
Definition: blobs.cpp:384
UNICHARSET unicharset
Definition: ccutil.h:72
Definition: statistc.h:29
float y_scale() const
Definition: normalis.h:267
#define INT_FEAT_RANGE
Definition: float2int.h:27
#define MAX(x, y)
Definition: ndminx.h:24
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
TBLOB * next
Definition: blobs.h:228
inT16 bottom() const
Definition: rect.h:60
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES word_res)

Definition at line 666 of file docqual.cpp.

666  {
667  int i;
668  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
669  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
670  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
671  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
672  bool modified = false;
673  for (i = 0; i < word_res->reject_map.length(); ++i) {
674  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
675  word_res->best_choice->set_unichar_id(unichar_dash, i);
676  modified = true;
677  if (word_res->reject_map[i].accepted ())
678  word_res->reject_map[i].setrej_unlv_rej ();
679  }
680  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
681  word_res->best_choice->set_unichar_id(unichar_space, i);
682  modified = true;
683  if (word_res->reject_map[i].accepted ())
684  word_res->reject_map[i].setrej_unlv_rej ();
685  }
686  }
687 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
REJMAP reject_map
Definition: pageres.h:408
const UNICHARSET * uch_set
Definition: pageres.h:348
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:247
inT32 length() const
Definition: rejctmap.h:238
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Definition at line 536 of file applybox.cpp.

537  {
538  for (int step = 0; *utf8 != '\0'; utf8 += step) {
539  const char* next_space = strchr(utf8, ' ');
540  if (next_space == NULL)
541  next_space = utf8 + strlen(utf8);
542  step = next_space - utf8;
543  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
544  if (class_id == INVALID_UNICHAR_ID) {
545  return false;
546  }
547  while (utf8[step] == ' ')
548  ++step;
549  class_ids->push_back(class_id);
550  }
551  return true;
552 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
#define NULL
Definition: host.h:144
int push_back(T object)
UNICHARSET unicharset
Definition: ccutil.h:72
void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES page_res)

Definition at line 764 of file applybox.cpp.

764  {
765  PAGE_RES_IT pr_it(page_res);
766  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
767  word_res = pr_it.forward()) {
768  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
769  word_res->correct_text.size());
770  for (int i = 0; i < word_res->correct_text.size(); ++i) {
771  // The part before the first space is the real ground truth, and the
772  // rest is the bounding box location and page number.
773  GenericVector<STRING> tokens;
774  word_res->correct_text[i].split(' ', &tokens);
775  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
776  choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
777  }
778  if (word_res->best_choice != NULL)
779  delete word_res->best_choice;
780  word_res->best_choice = choice;
781  }
782 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
#define NULL
Definition: host.h:144
#define f(xc, yc)
Definition: imgscale.cpp:39
WERD * word
Definition: pageres.h:334
UNICHARSET unicharset
Definition: ccutil.h:72
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.h:331
// Definition at line 474 of file output.cpp.
//
// Returns how many characters of word are classed as alphabetic or as a
// digit by the word's own unicharset.
inT16 tesseract::Tesseract::count_alphanums(const WERD_CHOICE& word) {
  int total = 0;
  const int num_chars = word.length();
  for (int pos = 0; pos < num_chars; ++pos) {
    const bool is_alpha = word.unicharset()->get_isalpha(word.unichar_id(pos));
    if (is_alpha || word.unicharset()->get_isdigit(word.unichar_id(pos)))
      ++total;
  }
  return total;
}
int length() const
Definition: ratngs.h:214
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
int count(LIST var_list)
Definition: oldlist.cpp:108
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
// Definition at line 737 of file reject.cpp.
//
// Returns how many positions of word_res are both accepted in
// word_res->reject_map and hold a character that word_res->uch_set classes
// as alphabetic or as a digit. Positions are scanned over
// reject_map.length().
inT16 tesseract::Tesseract::count_alphanums(WERD_RES* word_res) {
  const WERD_CHOICE* choice = word_res->best_choice;
  int total = 0;
  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    // Rejected positions never count, whatever character they hold.
    if (!word_res->reject_map[pos].accepted())
      continue;
    if (word_res->uch_set->get_isalpha(choice->unichar_id(pos)) ||
        word_res->uch_set->get_isdigit(choice->unichar_id(pos))) {
      ++total;
    }
  }
  return total;
}
int count(LIST var_list)
Definition: oldlist.cpp:108
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
// Definition at line 464 of file output.cpp.
//
// Returns how many characters of word are classed as alphabetic by the
// word's own unicharset.
inT16 tesseract::Tesseract::count_alphas(const WERD_CHOICE& word) {
  int total = 0;
  int pos = 0;
  while (pos < word.length()) {
    const bool is_alpha = word.unicharset()->get_isalpha(word.unichar_id(pos));
    if (is_alpha)
      ++total;
    ++pos;
  }
  return total;
}
int length() const
Definition: ratngs.h:214
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
int count(LIST var_list)
Definition: oldlist.cpp:108
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
// Definition at line 131 of file docqual.cpp.
//
// Returns the absolute difference between outline_count and the number of
// outlines expected for character c: 2 when c occurs in outlines_2, 1
// otherwise. Characters listed in outlines_odd are never scored and always
// yield 0.
inT16 tesseract::Tesseract::count_outline_errs(char c, inT16 outline_count) {
  // Dont use this char
  if (STRING(outlines_odd).contains(c))
    return 0;

  const int expected = STRING(outlines_2).contains(c) ? 2 : 1;
  return abs(outline_count - expected);
}
Definition: strngs.h:40
int tesseract::Tesseract::CountMisfitTops ( WERD_RES word_res)

Definition at line 64 of file fixxht.cpp.

64  {
65  int bad_blobs = 0;
66  TBLOB* blob = word_res->rebuild_word->blobs;
67  int blob_id = 0;
68  for (; blob != NULL; blob = blob->next, ++blob_id) {
69  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
70  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
71  int top = blob->bounding_box().top();
72  if (top >= INT_FEAT_RANGE)
73  top = INT_FEAT_RANGE - 1;
74  int min_bottom, max_bottom, min_top, max_top;
75  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
76  &min_top, &max_top);
77  if (max_top - min_top > kMaxCharTopRange)
78  continue;
79  bool bad = top < min_top - x_ht_acceptance_tolerance ||
80  top > max_top + x_ht_acceptance_tolerance;
81  if (bad)
82  ++bad_blobs;
83  if (debug_x_ht_level >= 1) {
84  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
85  unicharset.id_to_unichar(class_id),
86  bad ? "Misfit" : "OK", top, min_top, max_top,
87  static_cast<int>(x_ht_acceptance_tolerance));
88  }
89  }
90  }
91  return bad_blobs;
92 }
TWERD * rebuild_word
Definition: pageres.h:381
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:459
Definition: blobs.h:174
const int kMaxCharTopRange
Definition: fixxht.cpp:61
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TBOX bounding_box() const
Definition: blobs.cpp:384
UNICHARSET unicharset
Definition: ccutil.h:72
#define INT_FEAT_RANGE
Definition: float2int.h:27
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::create_cube_box_word ( Boxa *  char_boxes,
int  num_chars,
TBOX  word_box,
BoxWord box_word 
)

Definition at line 116 of file cube_control.cpp.

119  {
120  if (!box_word) {
121  if (cube_debug_level > 0) {
122  tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
123  }
124  return false;
125  }
126 
127  // Find the x-coordinate of left-most char_box, which could be
128  // nonzero if the word image was padded before recognition took place.
129  int x_offset = -1;
130  for (int i = 0; i < num_chars; ++i) {
131  Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
132  if (x_offset < 0 || char_box->x < x_offset) {
133  x_offset = char_box->x;
134  }
135  boxDestroy(&char_box);
136  }
137 
138  for (int i = 0; i < num_chars; ++i) {
139  Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
140  TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
141  boxDestroy(&char_box);
142  box_word->InsertBox(i, tbox);
143  }
144  return true;
145 }
Definition: rect.h:29
TBOX char_box_to_tbox(Box *char_box, TBOX word_box, int x_offset)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void tesseract::Tesseract::cube_combine_word ( CubeObject cube_obj,
WERD_RES cube_word,
WERD_RES tess_word 
)

Definition at line 323 of file cube_control.cpp.

// Listing of Tesseract::cube_combine_word (cube_control.cpp lines 324-358):
// asks tess_cube_combiner_ to arbitrate between the tesseract result in
// tess_word and the cube result in cube_word; when cube wins, tess_word
// takes over cube_word's results via ConsumeWordResults().
// NOTE(review): source lines 334 and 352 (the conditions that open the two
// debug-print blocks) are missing from this rendering.
324  {
325  float combiner_prob = tess_cube_combiner_->CombineResults(tess_word,
326  cube_obj);
327  // If combiner probability is greater than tess/cube combiner
328  // classifier threshold, i.e. tesseract wins, then just return the
329  // tesseract result unchanged, as the combiner knows nothing about how
330  // correct the answer is. If cube and tesseract agree, then improve the
331  // scores before returning.
332  WERD_CHOICE* tess_best = tess_word->best_choice;
333  WERD_CHOICE* cube_best = cube_word->best_choice;
335  tprintf("Combiner prob = %g vs threshold %g\n",
336  combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh());
337  }
// Tesseract wins: keep tess_word, halving rating and certainty only when
// both engines produced the same string.
338  if (combiner_prob >=
339  cube_cntxt_->Params()->CombinerClassifierThresh()) {
340  if (tess_best->unichar_string() == cube_best->unichar_string()) {
341  // Cube and tess agree, so improve the scores.
342  tess_best->set_rating(tess_best->rating() / 2);
343  tess_best->set_certainty(tess_best->certainty() / 2);
344  }
345  return;
346  }
347  // Cube wins.
348  // It is better for the language combiner to have all tesseract scores,
349  // so put them in the cube result.
350  cube_best->set_rating(tess_best->rating());
351  cube_best->set_certainty(tess_best->certainty());
353  tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n",
354  tess_best->unichar_string().string(),
355  cube_best->unichar_string().string());
356  }
357  tess_word->ConsumeWordResults(cube_word);
358 }
const STRING & unichar_string() const
Definition: ratngs.h:395
float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj)
void set_rating(float new_val)
Definition: ratngs.h:255
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:411
float certainty() const
Definition: ratngs.h:234
TuningParams * Params() const
double CombinerClassifierThresh() const
Definition: tuning_params.h:63
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void set_certainty(float new_val)
Definition: ratngs.h:258
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::cube_recognize ( CubeObject cube_obj,
BLOCK block,
WERD_RES word 
)

Definition at line 366 of file cube_control.cpp.

// Listing of Tesseract::cube_recognize (cube_control.cpp lines 367-446):
// runs cube on one word via cube_obj and, on success, stores cube's best
// result in word through fill_werd_res(). Returns false on every failure
// path; most of them first call word->SetupFake(unicharset).
// NOTE(review): source line 436 (the condition opening the final debug
// print) is missing from this rendering.
367  {
368  if (!word->SetupForCubeRecognition(unicharset, this, block)) {
369  return false; // Graphics block.
370  }
371 
372  // Run cube
373  WordAltList *cube_alt_list = cube_obj->RecognizeWord();
374  if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
375  if (cube_debug_level > 0) {
376  tprintf("Cube returned nothing for word at:");
377  word->word->bounding_box().print();
378  }
379  word->SetupFake(unicharset);
380  return false;
381  }
382 
383  // Get cube's best result and its probability, mapped to tesseract's
384  // certainty range
385  char_32 *cube_best_32 = cube_alt_list->Alt(0);
386  double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
387  float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
388  string cube_best_str;
389  CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);
390 
391  // Retrieve Cube's character bounding boxes and CharSamples,
392  // corresponding to the most recent call to RecognizeWord().
393  Boxa *char_boxes = NULL;
394  CharSamp **char_samples = NULL;;
395  int num_chars;
// NOTE(review): because of the `&& cube_debug_level > 0`, a failed
// extract_cube_state() is only handled when debugging is on. With
// cube_debug_level == 0 execution continues past this block using whatever
// the call left in num_chars (declared uninitialized above), char_boxes and
// char_samples. The other failure paths in this function print
// conditionally but always bail out; this one looks like it should too.
396  if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)
397  && cube_debug_level > 0) {
398  tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
399  "cube state.\n");
400  word->SetupFake(unicharset);
401  return false;
402  }
403 
404  // Convert cube's character bounding boxes to a BoxWord.
405  BoxWord cube_box_word;
406  TBOX tess_word_box = word->word->bounding_box();
// Rotate the word box by the block's re_rotation when the denorm has a block.
407  if (word->denorm.block() != NULL)
408  tess_word_box.rotate(word->denorm.block()->re_rotation());
409  bool box_word_success = create_cube_box_word(char_boxes, num_chars,
410  tess_word_box,
411  &cube_box_word);
412  boxaDestroy(&char_boxes);
// NOTE(review): this early return happens before the `delete []char_samples`
// on line 426, so the array is not released on this path - confirm whether
// extract_cube_state() hands ownership of it to the caller.
413  if (!box_word_success) {
414  if (cube_debug_level > 0) {
415  tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
416  "create cube BoxWord\n");
417  }
418  word->SetupFake(unicharset);
419  return false;
420  }
421 
422  // Create cube's best choice.
423  WERD_CHOICE* cube_werd_choice = create_werd_choice(
424  char_samples, num_chars, cube_best_str.c_str(), cube_certainty,
425  unicharset, cube_cntxt_->CharacterSet());
// Only the array itself is freed here, not the CharSamp objects it points to.
426  delete []char_samples;
427 
428  if (!cube_werd_choice) {
429  if (cube_debug_level > 0) {
430  tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
431  "create cube WERD_CHOICE\n");
432  }
433  word->SetupFake(unicharset);
434  return false;
435  }
437  tprintf("Cube result: %s r=%g, c=%g\n",
438  cube_werd_choice->unichar_string().string(),
439  cube_werd_choice->rating(),
440  cube_werd_choice->certainty());
441  }
442 
443  // Fill tesseract result's fields with cube results
444  fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word);
445  return true;
446 }
void fill_werd_res(const BoxWord &cube_box_word, WERD_CHOICE *cube_werd_choice, const char *cube_best_str, WERD_RES *tess_werd_res)
const STRING & unichar_string() const
Definition: ratngs.h:395
TBOX bounding_box()
Definition: werd.cpp:164
bool SetupForCubeRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, const BLOCK *block)
Definition: pageres.cpp:317
float certainty() const
Definition: ratngs.h:234
#define NULL
Definition: host.h:144
Definition: rect.h:29
bool extract_cube_state(CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples)
static double Cost2Prob(int cost)
Definition: cube_utils.cpp:43
WERD * word
Definition: pageres.h:334
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char_32
Definition: string_32.h:40
DENORM denorm
Definition: pageres.h:346
bool create_cube_box_word(Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word)
UNICHARSET unicharset
Definition: ccutil.h:72
CharSet * CharacterSet() const
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:340
const BLOCK * block() const
Definition: normalis.h:276
void rotate(const FCOORD &vec)
Definition: rect.h:182
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:349
FCOORD re_rotation() const
Definition: ocrblock.h:139
void print() const
Definition: rect.h:263
float rating() const
Definition: ratngs.h:231
CubeObject * tesseract::Tesseract::cube_recognize_word ( BLOCK * block,
WERD_RES * word 
)

Definition at line 286 of file cube_control.cpp.

286  {
287  if (!cube_binary_ || !cube_cntxt_) {
288  if (cube_debug_level > 0 && !cube_binary_)
289  tprintf("Tesseract::run_cube(): NULL binary image.\n");
290  word->SetupFake(unicharset);
291  return NULL;
292  }
293  TBOX word_box = word->word->bounding_box();
294  if (block != NULL && (block->re_rotation().x() != 1.0f ||
295  block->re_rotation().y() != 0.0f)) {
296  // TODO(rays) We have to rotate the bounding box to get the true coords.
297  // This will be achieved in the future via DENORM.
298  // In the mean time, cube can't process this word.
299  if (cube_debug_level > 0) {
300  tprintf("Cube can't process rotated word at:");
301  word_box.print();
302  }
303  word->SetupFake(unicharset);
304  return NULL;
305  }
306  CubeObject* cube_obj = new tesseract::CubeObject(
307  cube_cntxt_, cube_binary_, word_box.left(),
308  pixGetHeight(cube_binary_) - word_box.top(),
309  word_box.width(), word_box.height());
310  if (!cube_recognize(cube_obj, block, word)) {
311  delete cube_obj;
312  return NULL;
313  }
314  return cube_obj;
315 }
TBOX bounding_box()
Definition: werd.cpp:164
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
bool cube_recognize(CubeObject *cube_obj, BLOCK *block, WERD_RES *word)
WERD * word
Definition: pageres.h:334
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:340
float y() const
Definition: points.h:212
FCOORD re_rotation() const
Definition: ocrblock.h:139
void print() const
Definition: rect.h:263
inT16 height() const
Definition: rect.h:97
float x() const
Definition: points.h:209
void tesseract::Tesseract::cube_word_pass1 ( BLOCK * block,
ROW * row,
WERD_RES * word 
)

Definition at line 275 of file cube_control.cpp.

275  {
276  CubeObject *cube_obj = cube_recognize_word(block, word);
277  delete cube_obj;
278 }
CubeObject * cube_recognize_word(BLOCK *block, WERD_RES *word)
void tesseract::Tesseract::debug_word ( PAGE_RES * page_res,
const TBOX & selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 636 of file pgedit.cpp.

636  {
638  recog_all_words(page_res, NULL, &selection_box, word_config_.string(), 0);
639 }
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:178
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:156
BOOL8 tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES * word,
int  char_position 
)

Definition at line 344 of file fixspace.cpp.

344  {
345  int i;
346  int offset;
347 
348  for (i = 0, offset = 0; i < char_position;
349  offset += word->best_choice->unichar_lengths()[i++]);
350  return (
351  word->uch_set->get_isdigit(
352  word->best_choice->unichar_string().string() + offset,
353  word->best_choice->unichar_lengths()[i]) ||
354  (word->best_choice->permuter() == NUMBER_PERM &&
356  word->best_choice->unichar_string().string()[offset])));
357 }
const STRING & unichar_string() const
Definition: ratngs.h:395
BOOL8 contains(const char c) const
Definition: strngs.cpp:147
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const UNICHARSET * uch_set
Definition: pageres.h:348
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
Definition: strngs.h:40
const STRING & unichar_lengths() const
Definition: ratngs.h:402
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::do_re_display ( BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res)  word_painter)

do_re_display()

Redisplay page

Definition at line 306 of file pgedit.cpp.

309  {
311  int block_count = 1;
312 
313  image_win->Clear();
314  if (display_image != 0) {
315  image_win->Image(pix_binary_, 0, 0);
316  }
317 
318  for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
319  (this->*word_painter)(pr_it.block()->block, pr_it.row()->row, word);
320  if (display_baselines && pr_it.row() != pr_it.prev_row())
321  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
322  if (display_blocks && pr_it.block() != pr_it.prev_block())
323  pr_it.block()->block->plot(image_win, block_count++, ScrollView::RED);
324  }
325  image_win->Update();
326 }
void Clear()
Definition: scrollview.cpp:590
PAGE_RES * current_page_res
Definition: pgedit.cpp:127
#define NULL
Definition: host.h:144
BOOL8 display_image
Definition: pgedit.cpp:123
BOOL8 display_baselines
Definition: pgedit.cpp:125
WERD * word
Definition: pageres.h:334
static void Update()
Definition: scrollview.cpp:710
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:768
ScrollView * image_win
Definition: pgedit.cpp:106
BOOL8 display_blocks
Definition: pgedit.cpp:124
void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT & page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 238 of file docqual.cpp.

240  {
241  inT16 block_no = 0;
242  inT16 row_no = 0;
243  BLOCK_RES *current_block;
244  ROW_RES *current_row;
245 
246  BOOL8 rej_word;
247  BOOL8 prev_word_rejected;
248  inT16 char_quality = 0;
249  inT16 accepted_char_quality;
250 
251  if (page_res_it.page_res->rej_count * 100.0 /
253  reject_whole_page(page_res_it);
255  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
256  page_res_it.page_res->char_count,
257  page_res_it.page_res->rej_count);
258  }
259  } else {
261  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
262  page_res_it.page_res->char_count,
263  page_res_it.page_res->rej_count);
264  }
265 
266  /* Walk blocks testing for block rejection */
267 
268  page_res_it.restart_page();
269  WERD_RES* word;
270  while ((word = page_res_it.word()) != NULL) {
271  current_block = page_res_it.block();
272  block_no = current_block->block->index();
273  if (current_block->char_count > 0 &&
274  (current_block->rej_count * 100.0 / current_block->char_count) >
277  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
278  block_no, current_block->char_count,
279  current_block->rej_count);
280  }
281  prev_word_rejected = FALSE;
282  while ((word = page_res_it.word()) != NULL &&
283  (page_res_it.block() == current_block)) {
285  rej_word = word->reject_map.reject_count() > 0 ||
287  if (rej_word && tessedit_dont_blkrej_good_wds &&
290  *word->uch_set,
291  word->best_choice->unichar_string().string(),
292  word->best_choice->unichar_lengths().string()) !=
293  AC_UNACCEPTABLE) {
294  word_char_quality(word, page_res_it.row()->row,
295  &char_quality,
296  &accepted_char_quality);
297  rej_word = char_quality != word->reject_map.length();
298  }
299  } else {
300  rej_word = TRUE;
301  }
302  if (rej_word) {
303  /*
304  Reject spacing if both current and prev words are rejected.
305  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
306  generated more space errors.
307  */
309  prev_word_rejected &&
310  page_res_it.prev_row() == page_res_it.row() &&
311  word->word->space() == 1)
312  word->reject_spaces = TRUE;
314  }
315  prev_word_rejected = rej_word;
316  page_res_it.forward();
317  }
318  } else {
320  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
321  block_no, page_res_it.block()->char_count,
322  page_res_it.block()->rej_count);
323  }
324 
325  /* Walk rows in block testing for row rejection */
326  row_no = 0;
327  while ((word = page_res_it.word()) != NULL &&
328  page_res_it.block() == current_block) {
329  current_row = page_res_it.row();
330  row_no++;
331  /* Reject whole row if:
332  fraction of chars on row which are rejected exceed a limit AND
333  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
334  limit
335  */
336  if (current_row->char_count > 0 &&
337  (current_row->rej_count * 100.0 / current_row->char_count) >
339  (current_row->whole_word_rej_count * 100.0 /
340  current_row->rej_count) <
343  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
344  row_no, current_row->char_count,
345  current_row->rej_count);
346  }
347  prev_word_rejected = FALSE;
348  while ((word = page_res_it.word()) != NULL &&
349  page_res_it.row () == current_row) {
350  /* Preserve words on good docs unless they are mostly rejected*/
351  if (!tessedit_row_rej_good_docs && good_quality_doc) {
352  rej_word = word->reject_map.reject_count() /
353  static_cast<float>(word->reject_map.length()) >
356  /* Preserve perfect words anyway */
357  rej_word = word->reject_map.reject_count() > 0 ||
359  if (rej_word && tessedit_dont_rowrej_good_wds &&
362  word->best_choice->unichar_string().string(),
363  word->best_choice->unichar_lengths().string()) !=
364  AC_UNACCEPTABLE) {
365  word_char_quality(word, page_res_it.row()->row,
366  &char_quality,
367  &accepted_char_quality);
368  rej_word = char_quality != word->reject_map.length();
369  }
370  } else {
371  rej_word = TRUE;
372  }
373  if (rej_word) {
374  /*
375  Reject spacing if both current and prev words are rejected.
376  NOTE - this is NOT restricted to FUZZY spaces. - When tried
377  this generated more space errors.
378  */
380  prev_word_rejected &&
381  page_res_it.prev_row() == page_res_it.row() &&
382  word->word->space () == 1)
383  word->reject_spaces = TRUE;
385  }
386  prev_word_rejected = rej_word;
387  page_res_it.forward();
388  }
389  } else {
391  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
392  row_no, current_row->char_count, current_row->rej_count);
393  }
394  while (page_res_it.word() != NULL &&
395  page_res_it.row() == current_row)
396  page_res_it.forward();
397  }
398  }
399  }
400  }
401  }
402 }
const STRING & unichar_string() const
Definition: ratngs.h:395
inT32 rej_count
Definition: pageres.h:221
Unacceptable word.
Definition: control.h:37
BLOCK * block
Definition: pageres.h:258
ROW_RES * row() const
Definition: pageres.h:760
bool tessedit_preserve_row_rej_perfect_wds
double tessedit_reject_block_percent
WERD_RES * restart_page()
Definition: pageres.h:713
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
inT32 char_count
Definition: pageres.h:259
uinT8 space()
Definition: werd.h:104
double tessedit_whole_wd_rej_row_percent
#define FALSE
Definition: capi.h:28
BOOL8 reject_spaces
Definition: pageres.h:452
void rej_word_block_rej()
Definition: rejctmap.cpp:512
double tessedit_reject_doc_percent
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
WERD * word
Definition: pageres.h:334
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:413
const UNICHARSET * uch_set
Definition: pageres.h:348
inT32 rej_count
Definition: pageres.h:260
const char * string() const
Definition: strngs.cpp:156
WERD_RES * forward()
Definition: pageres.h:737
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void rej_word_row_rej()
Definition: rejctmap.cpp:521
inT32 whole_word_rej_count
Definition: pageres.h:289
bool tessedit_preserve_blk_rej_perfect_wds
ROW_RES * prev_row() const
Definition: pageres.h:751
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
inT32 char_count
Definition: pageres.h:287
int index() const
Definition: pdblock.h:80
inT32 length() const
Definition: rejctmap.h:238
double tessedit_reject_row_percent
inT32 char_count
Definition: pageres.h:220
double tessedit_good_doc_still_rowrej_wd
inT32 rej_count
Definition: pageres.h:288
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1284
PAGE_RES * page_res
Definition: pageres.h:691
inT16 reject_count()
Definition: rejctmap.h:244
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:100
ROW * row
Definition: pageres.h:286
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::dont_allow_1Il ( WERD_RES * word)

Definition at line 705 of file reject.cpp.

705  {
706  int i = 0;
707  int offset;
708  int word_len = word->reject_map.length();
709  const char *s = word->best_choice->unichar_string().string();
710  const char *lengths = word->best_choice->unichar_lengths().string();
711  BOOL8 accepted_1Il = FALSE;
712 
713  for (i = 0, offset = 0; i < word_len;
714  offset += word->best_choice->unichar_lengths()[i++]) {
715  if (word->reject_map[i].accepted()) {
716  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
717  accepted_1Il = TRUE;
718  } else {
719  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
720  word->uch_set->get_isdigit(s + offset, lengths[i]))
721  return; // >=1 non 1Il ch accepted
722  }
723  }
724  }
725  if (!accepted_1Il)
726  return; //Nothing to worry about
727 
728  for (i = 0, offset = 0; i < word_len;
729  offset += word->best_choice->unichar_lengths()[i++]) {
730  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
731  word->reject_map[i].accepted())
732  word->reject_map[i].setrej_postNN_1Il();
733  }
734 }
const STRING & unichar_string() const
Definition: ratngs.h:395
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
BOOL8 contains(const char c) const
Definition: strngs.cpp:147
#define FALSE
Definition: capi.h:28
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const UNICHARSET * uch_set
Definition: pageres.h:348
const char * string() const
Definition: strngs.cpp:156
Definition: strngs.h:40
const STRING & unichar_lengths() const
Definition: ratngs.h:402
inT32 length() const
Definition: rejctmap.h:238
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
inT16  score,
inT16  mode,
BOOL8  improved 
)

Definition at line 450 of file fixspace.cpp.

451  {
452  WERD_RES_IT word_res_it(&perm);
453 
454  if (debug_fix_space_level > 0) {
455  if (mode == 1) {
456  stats_.dump_words_str = "";
457  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
458  word_res_it.forward()) {
459  if (!word_res_it.data()->part_of_combo) {
460  stats_.dump_words_str +=
461  word_res_it.data()->best_choice->unichar_string();
462  stats_.dump_words_str += ' ';
463  }
464  }
465  }
466 
467  #ifndef SECURE_NAMES
468  if (debug_fix_space_level > 1) {
469  switch (mode) {
470  case 1:
471  tprintf("EXTRACTED (%d): \"", score);
472  break;
473  case 2:
474  tprintf("TESTED (%d): \"", score);
475  break;
476  case 3:
477  tprintf("RETURNED (%d): \"", score);
478  break;
479  }
480 
481  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
482  word_res_it.forward()) {
483  if (!word_res_it.data()->part_of_combo) {
484  tprintf("%s/%1d ",
485  word_res_it.data()->best_choice->unichar_string().string(),
486  (int)word_res_it.data()->best_choice->permuter());
487  }
488  }
489  tprintf("\"\n");
490  } else if (improved) {
491  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
492  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
493  word_res_it.forward()) {
494  if (!word_res_it.data()->part_of_combo) {
495  tprintf("%s/%1d ",
496  word_res_it.data()->best_choice->unichar_string().string(),
497  (int)word_res_it.data()->best_choice->permuter());
498  }
499  }
500  tprintf("\"\n");
501  }
502  #endif
503  }
504 }
const char * string() const
Definition: strngs.cpp:156
CMD_EVENTS mode
Definition: pgedit.cpp:115
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void tesseract::Tesseract::end_tesseract ( )

Definition at line 431 of file tessedit.cpp.

431  {
432  end_recog();
433 }
inT16 tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 240 of file fixspace.cpp.

240  {
241  WERD_RES_IT word_res_it(&word_res_list);
242  inT16 total_score = 0;
243  inT16 word_count = 0;
244  inT16 done_word_count = 0;
245  inT16 word_len;
246  inT16 i;
247  inT16 offset;
248  WERD_RES *word; // current word
249  inT16 prev_word_score = 0;
250  BOOL8 prev_word_done = FALSE;
251  BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
252  BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
253  BOOL8 current_char_1 = FALSE;
254  BOOL8 current_word_ok_so_far;
255  STRING punct_chars = "!\"`',.:;";
256  BOOL8 prev_char_punct = FALSE;
257  BOOL8 current_char_punct = FALSE;
258  BOOL8 word_done = FALSE;
259 
260  do {
261  word = word_res_it.data();
262  word_done = fixspace_thinks_word_done(word);
263  word_count++;
264  if (word->tess_failed) {
265  total_score += prev_word_score;
266  if (prev_word_done)
267  done_word_count++;
268  prev_word_score = 0;
269  prev_char_1 = FALSE;
270  prev_char_digit = FALSE;
271  prev_word_done = FALSE;
272  } else {
273  /*
274  Can we add the prev word score and potentially count this word?
275  Yes IF it didnt end in a 1 when the first char of this word is a digit
276  AND it didnt end in a digit when the first char of this word is a 1
277  */
278  word_len = word->reject_map.length();
279  current_word_ok_so_far = FALSE;
280  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
281  (prev_char_digit && (
282  (word_done &&
283  word->best_choice->unichar_lengths().string()[0] == 1 &&
284  word->best_choice->unichar_string()[0] == '1') ||
285  (!word_done && STRING(conflict_set_I_l_1).contains(
286  word->best_choice->unichar_string()[0])))))) {
287  total_score += prev_word_score;
288  if (prev_word_done)
289  done_word_count++;
290  current_word_ok_so_far = word_done;
291  }
292 
293  if (current_word_ok_so_far) {
294  prev_word_done = TRUE;
295  prev_word_score = word_len;
296  } else {
297  prev_word_done = FALSE;
298  prev_word_score = 0;
299  }
300 
301  /* Add 1 to total score for every joined 1 regardless of context and
302  rejtn */
303  for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
304  current_char_1 = word->best_choice->unichar_string()[i] == '1';
305  if (prev_char_1 || (current_char_1 && (i > 0)))
306  total_score++;
307  prev_char_1 = current_char_1;
308  }
309 
310  /* Add 1 to total score for every joined punctuation regardless of context
311  and rejtn */
313  for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
314  offset += word->best_choice->unichar_lengths()[i++]) {
315  current_char_punct =
316  punct_chars.contains(word->best_choice->unichar_string()[offset]);
317  if (prev_char_punct || (current_char_punct && i > 0))
318  total_score++;
319  prev_char_punct = current_char_punct;
320  }
321  }
322  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
323  for (i = 0, offset = 0; i < word_len - 1;
324  offset += word->best_choice->unichar_lengths()[i++]);
325  prev_char_1 =
326  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
327  || (!word_done && STRING(conflict_set_I_l_1).contains(
328  word->best_choice->unichar_string()[offset])));
329  }
330  /* Find next word */
331  do {
332  word_res_it.forward();
333  } while (word_res_it.data()->part_of_combo);
334  } while (!word_res_it.at_first());
335  total_score += prev_word_score;
336  if (prev_word_done)
337  done_word_count++;
338  if (done_word_count == word_count)
339  return PERFECT_WERDS;
340  else
341  return total_score;
342 }
const STRING & unichar_string() const
Definition: ratngs.h:395
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
#define PERFECT_WERDS
Definition: fixspace.cpp:35
BOOL8 contains(const char c) const
Definition: strngs.cpp:147
#define FALSE
Definition: capi.h:28
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:344
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:586
const char * string() const
Definition: strngs.cpp:156
Definition: strngs.h:40
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
inT32 length() const
Definition: rejctmap.h:238
BOOL8 tess_failed
Definition: pageres.h:409
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::ExplodeRepeatedWord ( BLOB_CHOICE * best_choice,
PAGE_RES_IT * page_res_it 
)

Definition at line 1252 of file control.cpp.

1253  {
1254  WERD_RES *word_res = page_res_it->word();
1255  ASSERT_HOST(best_choice != NULL);
1256 
1257  // Make a new word for each blob in the original.
1258  WERD* werd = word_res->word;
1259  C_BLOB_IT blob_it(werd->cblob_list());
1260  for (; !blob_it.empty(); blob_it.forward()) {
1261  bool first_blob = blob_it.at_first();
1262  bool last_blob = blob_it.at_last();
1263  WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
1264  blob_it.extract());
1265  // Note that blamer_bundle (truth information) is not copied, which is
1266  // desirable, since the newly inserted words would not have the original
1267  // bounding box corresponding to the one recorded in truth fields.
1268  WERD_RES* rep_word =
1269  page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
1270  // Setup the single char WERD_RES
1271  if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
1272  false,
1274  page_res_it->row()->row,
1275  page_res_it->block()->block)) {
1276  rep_word->CloneChoppedToRebuild();
1277  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
1278  rep_word->FakeClassifyWord(1, &blob_choice);
1279  }
1280  }
1281  page_res_it->DeleteCurrentWord();
1282 }
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
Definition: pageres.cpp:272
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
BLOCK * block
Definition: pageres.h:258
ROW_RES * row() const
Definition: pageres.h:760
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:918
#define NULL
Definition: host.h:144
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:141
Pix * BestPix() const
WERD_RES * word() const
Definition: pageres.h:757
void CloneChoppedToRebuild()
Definition: pageres.cpp:480
BLOCK_RES * block() const
Definition: pageres.h:763
WERD * word
Definition: pageres.h:334
const UNICHARSET * uch_set
Definition: pageres.h:348
Definition: werd.h:60
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:549
#define ASSERT_HOST(x)
Definition: errcode.h:84
void DeleteCurrentWord()
Definition: pageres.cpp:950
ROW * row
Definition: pageres.h:286
bool tesseract::Tesseract::extract_cube_state ( CubeObject * cube_obj,
int *  num_chars,
Boxa **  char_boxes,
CharSamp ***  char_samples 
)

Definition at line 65 of file cube_control.cpp.

68  {
69  if (!cube_obj) {
70  if (cube_debug_level > 0) {
71  tprintf("Cube WARNING (extract_cube_state): Invalid cube object "
72  "passed to extract_cube_state\n");
73  }
74  return false;
75  }
76 
77  // Note that the CubeObject accessors return either the deslanted or
78  // regular objects search object or beam search object, whichever
79  // was used in the last call to Recognize()
80  CubeSearchObject* cube_search_obj = cube_obj->SrchObj();
81  if (!cube_search_obj) {
82  if (cube_debug_level > 0) {
83  tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
84  "cube's search object in extract_cube_state.\n");
85  }
86  return false;
87  }
88  BeamSearch *beam_search_obj = cube_obj->BeamObj();
89  if (!beam_search_obj) {
90  if (cube_debug_level > 0) {
91  tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
92  "cube's beam search object in extract_cube_state.\n");
93  }
94  return false;
95  }
96 
97  // Get the character samples and bounding boxes by backtracking
98  // through the beam search path
99  int best_node_index = beam_search_obj->BestPresortedNodeIndex();
100  *char_samples = beam_search_obj->BackTrack(
101  cube_search_obj, best_node_index, num_chars, NULL, char_boxes);
102  if (!*char_samples)
103  return false;
104  return true;
105 }
#define NULL
Definition: host.h:144
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT16 tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 975 of file docqual.cpp.

975  {
976  const char *str = word->best_choice->unichar_string().string();
977  int tess_rejs = 0;
978 
979  for (; *str != '\0'; str++) {
980  if (*str == ' ')
981  tess_rejs++;
982  }
983  return tess_rejs;
984 }
const STRING & unichar_string() const
Definition: ratngs.h:395
const char * string() const
Definition: strngs.cpp:156
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::fill_werd_res ( const BoxWord & cube_box_word,
WERD_CHOICE * cube_werd_choice,
const char *  cube_best_str,
WERD_RES * tess_werd_res 
)

Definition at line 454 of file cube_control.cpp.

457  {
458  // Replace tesseract results's best choice with cube's
459  tess_werd_res->best_choice = cube_werd_choice;
460  tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice);
461 
462  delete tess_werd_res->box_word;
463  tess_werd_res->box_word = new BoxWord(cube_box_word);
464  tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
465  tess_werd_res->word);
466  // Fill text and remaining fields
467  tess_werd_res->word->set_text(cube_best_str);
468  tess_werd_res->tess_failed = FALSE;
469  tess_werd_res->tess_accepted =
470  tess_acceptable_word(tess_werd_res->best_choice,
471  tess_werd_res->raw_choice);
472  // There is no output word, so we can' call AdaptableWord, but then I don't
473  // think we need to. Fudge the result with accepted.
474  tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
475 
476  // Initialize the reject_map and set it to done, i.e., ignore all of
477  // tesseract's tests for rejection
478  tess_werd_res->reject_map.initialise(cube_werd_choice->length());
479  tess_werd_res->done = tess_werd_res->tess_accepted;
480 
481  // Some sanity checks
482  ASSERT_HOST(tess_werd_res->best_choice->length() ==
483  tess_werd_res->best_choice->blob_choices()->length());
484  ASSERT_HOST(tess_werd_res->best_choice->length() ==
485  tess_werd_res->reject_map.length());
486 }
int length() const
Definition: ratngs.h:214
void set_text(const char *new_text)
Definition: werd.h:120
BLOB_CHOICE_LIST_CLIST * blob_choices()
Definition: ratngs.h:244
BOOL8 done
Definition: pageres.h:419
REJMAP reject_map
Definition: pageres.h:408
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:138
#define FALSE
Definition: capi.h:28
BOOL8 tess_would_adapt
Definition: pageres.h:418
BOOL8 tess_acceptable_word(WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice)
Definition: tessbox.cpp:102
WERD * word
Definition: pageres.h:334
DENORM denorm
Definition: pageres.h:346
WERD_CHOICE * raw_choice
Definition: pageres.h:360
tesseract::BoxWord * box_word
Definition: pageres.h:387
BOOL8 tess_accepted
Definition: pageres.h:417
const BLOCK * block() const
Definition: normalis.h:276
inT32 length() const
Definition: rejctmap.h:238
BOOL8 tess_failed
Definition: pageres.h:409
#define ASSERT_HOST(x)
Definition: errcode.h:84
void initialise(inT16 length)
Definition: rejctmap.cpp:324
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES * word_res 
)

Definition at line 560 of file applybox.cpp.

561  {
563  // Classify all required combinations of blobs and save results in choices.
564  int word_length = word_res->box_word->length();
566  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
567  for (int i = 0; i < word_length; ++i) {
568  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
569  BLOB_CHOICE_LIST* match_result = classify_piece(
570  word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
571  i, i + j - 1, word_res->blamer_bundle);
572  if (applybox_debug > 2) {
573  tprintf("%d+%d:", i, j);
574  print_ratings_list("Segment:", match_result, unicharset);
575  }
576  choices[i].push_back(match_result);
577  }
578  }
579  // Search the segmentation graph for the target text. Must be an exact
580  // match. Using wildcards makes it difficult to find the correct
581  // segmentation even when it is there.
582  word_res->best_state.clear();
583  GenericVector<int> search_segmentation;
584  float best_rating = 0.0f;
585  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
586  &search_segmentation, &best_rating, &word_res->best_state);
588  for (int i = 0; i < word_length; ++i)
589  choices[i].delete_data_pointers();
590  delete [] choices;
591  if (word_res->best_state.empty()) {
592  // Build the original segmentation and if it is the same length as the
593  // truth, assume it will do.
594  int blob_count = 1;
595  for (int s = 0; s < array_count(word_res->seam_array); ++s) {
596  SEAM* seam =
597  reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
598  if (seam->split1 == NULL) {
599  word_res->best_state.push_back(blob_count);
600  blob_count = 1;
601  } else {
602  ++blob_count;
603  }
604  }
605  word_res->best_state.push_back(blob_count);
606  if (word_res->best_state.size() != target_text.size()) {
607  word_res->best_state.clear(); // No good. Original segmentation bad size.
608  return false;
609  }
610  }
611  word_res->correct_text.clear();
612  for (int i = 0; i < target_text.size(); ++i) {
613  word_res->correct_text.push_back(
614  STRING(unicharset.id_to_unichar(target_text[i])));
615  }
616  return true;
617 }
const int kMaxGroupSize
Definition: applybox.cpp:41
const int length() const
Definition: boxword.h:99
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
virtual void clear()
BlobMatchTable blob_match_table
Definition: wordrec.h:501
#define NULL
Definition: host.h:144
GenericVector< int > best_state
Definition: pageres.h:392
#define f(xc, yc)
Definition: imgscale.cpp:39
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:625
TBLOB * blobs
Definition: blobs.h:274
int push_back(T object)
SEAMS seam_array
Definition: pageres.h:358
SPLIT * split1
Definition: seam.h:46
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:511
bool empty() const
Definition: genericvector.h:68
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
DENORM denorm
Definition: pageres.h:346
UNICHARSET unicharset
Definition: ccutil.h:72
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
virtual BLOB_CHOICE_LIST * classify_piece(TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:75
GenericVector< STRING > correct_text
Definition: pageres.h:396
tesseract::BoxWord * box_word
Definition: pageres.h:387
TWERD * chopped_word
Definition: pageres.h:357
#define array_count(a)
Definition: tessarray.h:74
#define array_value(a, i)
Definition: tessarray.h:132
BlamerBundle * blamer_bundle
Definition: pageres.h:367
inT16 tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 633 of file reject.cpp.

634  {
635  inT16 i;
636  inT16 offset;
637 
638  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
639  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
640  unicharset.get_isdigit(word + offset, word_lengths[i]))
641  return i;
642  }
643  return -1;
644 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
inT16 tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 646 of file reject.cpp.

647  {
648  inT16 i;
649  inT16 offset;
650 
651  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
652  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
653  unicharset.get_isdigit(word + offset, word_lengths[i]))
654  return offset;
655  }
656  return -1;
657 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW *  row,
BLOCK *  block
)

Definition at line 146 of file fixspace.cpp.

148  {
149  inT16 best_score;
150  WERD_RES_LIST current_perm;
151  inT16 current_score;
152  BOOL8 improved = FALSE;
153 
154  best_score = eval_word_spacing(best_perm); // default score
155  dump_words(best_perm, best_score, 1, improved);
156 
157  if (best_score != PERFECT_WERDS)
158  initialise_search(best_perm, current_perm);
159 
160  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
161  match_current_words(current_perm, row, block);
162  current_score = eval_word_spacing(current_perm);
163  dump_words(current_perm, current_score, 2, improved);
164  if (current_score > best_score) {
165  best_perm.clear();
166  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
167  best_score = current_score;
168  improved = TRUE;
169  }
170  if (current_score < PERFECT_WERDS)
171  transform_to_next_perm(current_perm);
172  }
173  dump_words(best_perm, best_score, 3, improved);
174 }
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:450
unsigned char BOOL8
Definition: host.h:113
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:373
#define PERFECT_WERDS
Definition: fixspace.cpp:35
#define FALSE
Definition: capi.h:28
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:668
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:178
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:197
short inT16
Definition: host.h:100
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:240
#define TRUE
Definition: capi.h:27
void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC *  monitor,
inT32  word_count,
PAGE_RES *  page_res
)

Definition at line 49 of file fixspace.cpp.

51  {
52  BLOCK_RES_IT block_res_it;
53  ROW_RES_IT row_res_it;
54  WERD_RES_IT word_res_it_from;
55  WERD_RES_IT word_res_it_to;
56  WERD_RES *word_res;
57  WERD_RES_LIST fuzzy_space_words;
58  inT16 new_length;
59  BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
60  inT32 word_index; // current word
61 
62  block_res_it.set_to_list(&page_res->block_res_list);
63  word_index = 0;
64  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
65  block_res_it.forward()) {
66  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
67  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
68  row_res_it.forward()) {
69  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
70  while (!word_res_it_from.at_last()) {
71  word_res = word_res_it_from.data();
72  while (!word_res_it_from.at_last() &&
73  !(word_res->combination ||
74  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
75  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
76  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
77  block_res_it.data()->block);
78  word_res = word_res_it_from.forward();
79  word_index++;
80  if (monitor != NULL) {
81  monitor->ocr_alive = TRUE;
82  monitor->progress = 90 + 5 * word_index / word_count;
83  if (monitor->deadline_exceeded() ||
84  (monitor->cancel != NULL &&
85  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
86  return;
87  }
88  }
89 
90  if (!word_res_it_from.at_last()) {
91  word_res_it_to = word_res_it_from;
92  prevent_null_wd_fixsp =
93  word_res->word->cblob_list()->empty();
94  if (check_debug_pt(word_res, 60))
95  debug_fix_space_level.set_value(10);
96  word_res_it_to.forward();
97  word_index++;
98  if (monitor != NULL) {
99  monitor->ocr_alive = TRUE;
100  monitor->progress = 90 + 5 * word_index / word_count;
101  if (monitor->deadline_exceeded() ||
102  (monitor->cancel != NULL &&
103  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
104  return;
105  }
106  while (!word_res_it_to.at_last () &&
107  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
108  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
109  if (check_debug_pt(word_res, 60))
110  debug_fix_space_level.set_value(10);
111  if (word_res->word->cblob_list()->empty())
112  prevent_null_wd_fixsp = TRUE;
113  word_res = word_res_it_to.forward();
114  }
115  if (check_debug_pt(word_res, 60))
116  debug_fix_space_level.set_value(10);
117  if (word_res->word->cblob_list()->empty())
118  prevent_null_wd_fixsp = TRUE;
119  if (prevent_null_wd_fixsp) {
120  word_res_it_from = word_res_it_to;
121  } else {
122  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
123  &word_res_it_to);
124  fix_fuzzy_space_list(fuzzy_space_words,
125  row_res_it.data()->row,
126  block_res_it.data()->block);
127  new_length = fuzzy_space_words.length();
128  word_res_it_from.add_list_before(&fuzzy_space_words);
129  for (;
130  !word_res_it_from.at_last() && new_length > 0;
131  new_length--) {
132  word_res_it_from.forward();
133  }
134  }
135  if (test_pt)
136  debug_fix_space_level.set_value(0);
137  }
138  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
139  block_res_it.data()->block);
140  // Last word in row
141  }
142  }
143  }
144 }
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:618
inT16 progress
Definition: ocrclass.h:115
bool deadline_exceeded() const
Definition: ocrclass.h:144
BLOCK_RES_LIST block_res_list
Definition: pageres.h:222
CANCEL_FUNC cancel
Definition: ocrclass.h:119
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
BOOL8 combination
Definition: pageres.h:450
void * cancel_this
Definition: ocrclass.h:120
int inT32
Definition: host.h:102
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
volatile inT8 ocr_alive
Definition: ocrclass.h:117
short inT16
Definition: host.h:100
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:146
#define TRUE
Definition: capi.h:27
void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW *  row,
BLOCK *  block
)

Definition at line 652 of file fixspace.cpp.

653  {
654  inT16 best_score;
655  WERD_RES_IT best_perm_it(&best_perm);
656  WERD_RES_LIST current_perm;
657  WERD_RES_IT current_perm_it(&current_perm);
658  WERD_RES *old_word_res;
659  WERD_RES *new_word_res;
660  inT16 current_score;
661  BOOL8 improved = FALSE;
662 
663  best_score = fp_eval_word_spacing(best_perm); // default score
664 
665  dump_words(best_perm, best_score, 1, improved);
666 
667  new_word_res = new WERD_RES;
668  old_word_res = best_perm_it.data();
669  old_word_res->combination = TRUE; // Kludge to force deep copy
670  *new_word_res = *old_word_res; // deep copy
671  old_word_res->combination = FALSE; // Undo kludge
672  current_perm_it.add_to_end(new_word_res);
673 
674  break_noisiest_blob_word(current_perm);
675 
676  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
677  match_current_words(current_perm, row, block);
678  current_score = fp_eval_word_spacing(current_perm);
679  dump_words(current_perm, current_score, 2, improved);
680  if (current_score > best_score) {
681  best_perm.clear();
682  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
683  best_score = current_score;
684  improved = TRUE;
685  }
686  if (current_score < PERFECT_WERDS) {
687  break_noisiest_blob_word(current_perm);
688  }
689  }
690  dump_words(best_perm, best_score, 3, improved);
691 }
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:699
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:450
unsigned char BOOL8
Definition: host.h:113
#define PERFECT_WERDS
Definition: fixspace.cpp:35
BOOL8 combination
Definition: pageres.h:450
#define FALSE
Definition: capi.h:28
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:668
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:914
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:197
short inT16
Definition: host.h:100
#define TRUE
Definition: capi.h:27
void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT *  page_res_it)

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1200 of file control.cpp.

1200  {
1201  WERD_RES *word_res = page_res_it->word();
1202  const WERD_CHOICE &word = *(word_res->best_choice);
1203 
1204  // Find the frequency of each unique character in the word.
1205  UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
1206  SortHelper<UNICHAR_ID> rep_ch(word.length());
1207  for (int i = 0; i < word.length(); ++i) {
1208  if (word.unichar_id(i) != space)
1209  rep_ch.Add(word.unichar_id(i), 1);
1210  }
1211 
1212  // Find the most frequent result.
1213  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1214  int max_count = rep_ch.MaxCount(&maxch_id);
1215  // Find the best exemplar of a classifier result for maxch_id.
1216  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1217  if (best_choice == NULL) {
1218  tprintf("Failed to find a choice for %s, occurring %d times\n",
1219  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1220  return;
1221  }
1222  word_res->done = TRUE;
1223 
1224  // Measure the mean space.
1225  int total_gap = 0;
1226  int gap_count = 0;
1227  WERD* werd = word_res->word;
1228  C_BLOB_IT blob_it(werd->cblob_list());
1229  C_BLOB* prev_blob = blob_it.data();
1230  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1231  C_BLOB* blob = blob_it.data();
1232  int gap = blob->bounding_box().left();
1233  gap -= prev_blob->bounding_box().right();
1234  total_gap += gap;
1235  ++gap_count;
1236  prev_blob = blob;
1237  }
1238  if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
1239  // Needs spaces between.
1240  ExplodeRepeatedWord(best_choice, page_res_it);
1241  } else {
1242  // Just correct existing classification.
1243  CorrectRepcharChoices(best_choice, word_res);
1244  word_res->reject_map.initialise(word.length());
1245  }
1246 }
int length() const
Definition: ratngs.h:214
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
int UNICHAR_ID
Definition: unichar.h:31
BOOL8 done
Definition: pageres.h:419
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:285
const double kRepcharGapThreshold
Definition: control.cpp:60
WERD_RES * word() const
Definition: pageres.h:757
WERD * word
Definition: pageres.h:334
const UNICHARSET * uch_set
Definition: pageres.h:348
void ExplodeRepeatedWord(BLOB_CHOICE *best_choice, PAGE_RES_IT *page_res_it)
Definition: control.cpp:1252
const char * string() const
Definition: strngs.cpp:156
float x_height
Definition: pageres.h:431
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: werd.h:60
void Add(T value, int count)
Definition: sorthelper.h:63
TBOX bounding_box()
Definition: stepblob.cpp:192
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
void initialise(inT16 length)
Definition: rejctmap.cpp:324
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW *  row,
BLOCK *  block
)

Definition at line 618 of file fixspace.cpp.

619  {
620  WERD_RES *word_res;
621  WERD_RES_LIST sub_word_list;
622  WERD_RES_IT sub_word_list_it(&sub_word_list);
623  inT16 blob_index;
624  inT16 new_length;
625  float junk;
626 
627  word_res = word_res_it.data();
628  if (word_res->word->flag(W_REP_CHAR) ||
629  word_res->combination ||
630  word_res->part_of_combo ||
631  !word_res->word->flag(W_DONT_CHOP))
632  return;
633 
634  blob_index = worst_noise_blob(word_res, &junk);
635  if (blob_index < 0)
636  return;
637 
638  if (debug_fix_space_level > 1) {
639  tprintf("FP fixspace working on \"%s\"\n",
640  word_res->best_choice->unichar_string().string());
641  }
642  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
643  sub_word_list_it.add_after_stay_put(word_res_it.extract());
644  fix_noisy_space_list(sub_word_list, row, block);
645  new_length = sub_word_list.length();
646  word_res_it.add_list_before(&sub_word_list);
647  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
648  word_res_it.forward();
649  }
650 }
const STRING & unichar_string() const
Definition: ratngs.h:395
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
BOOL8 part_of_combo
Definition: pageres.h:451
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:764
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:652
BOOL8 combination
Definition: pageres.h:450
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
WERD * word
Definition: pageres.h:334
int c_blob_comparator(const void *blob1p, const void *blob2p)
Definition: genblob.cpp:31
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
short inT16
Definition: host.h:100
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES *  word)

Definition at line 586 of file fixspace.cpp.

586  {
587  if (word->done)
588  return TRUE;
589 
590  /*
591  Use all the standard pass 2 conditions for mode 5 in set_done() in
592  reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
593  CARE WHETHER WE HAVE of/at on/an etc.
594  */
595  if (fixsp_done_mode > 0 &&
596  (word->tess_accepted ||
597  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
598  fixsp_done_mode == 3) &&
599  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
600  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
601  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
602  (word->best_choice->permuter() == USER_DAWG_PERM) ||
603  (word->best_choice->permuter() == NUMBER_PERM))) {
604  return TRUE;
605  } else {
606  return FALSE;
607  }
608 }
const STRING & unichar_string() const
Definition: ratngs.h:395
BOOL8 done
Definition: pageres.h:419
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
BOOL8 tess_accepted
Definition: pageres.h:417
inT16 reject_count()
Definition: rejctmap.h:244
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::flip_0O ( WERD_RES *  word)

Definition at line 856 of file reject.cpp.

856  {
857  WERD_CHOICE *best_choice = word_res->best_choice;
858  int i;
859  TBOX out_box;
860 
861  if (!tessedit_flip_0O)
862  return;
863 
864  TBLOB* blob = word_res->rebuild_word->blobs;
865  for (i = 0; i < best_choice->length() && blob != NULL; ++i,
866  blob = blob->next) {
867  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
868  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
869  out_box = blob->bounding_box();
870  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
871  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
872  return; //Beware words with sub/superscripts
873  }
874  }
875  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
876  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
877  if (unichar_0 == INVALID_UNICHAR_ID ||
878  !word_res->uch_set->get_enabled(unichar_0) ||
879  unichar_O == INVALID_UNICHAR_ID ||
880  !word_res->uch_set->get_enabled(unichar_O)) {
881  return; // 0 or O are not present/enabled in unicharset
882  }
883  bool modified = false;
884  for (i = 1; i < best_choice->length(); ++i) {
885  if (best_choice->unichar_id(i) == unichar_0 ||
886  best_choice->unichar_id(i) == unichar_O) {
887  /* A0A */
888  if ((i+1) < best_choice->length() &&
889  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
890  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
891  best_choice->set_unichar_id(unichar_O, i);
892  modified = true;
893  }
894  /* A00A */
895  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
896  (i+1) < best_choice->length() &&
897  (best_choice->unichar_id(i+1) == unichar_0 ||
898  best_choice->unichar_id(i+1) == unichar_O) &&
899  (i+2) < best_choice->length() &&
900  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
901  best_choice->set_unichar_id(unichar_O, i);
902  modified = true;
903  i++;
904  }
905  /* AA0<non digit or end of word> */
906  if ((i > 1) &&
907  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
908  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
909  (((i+1) < best_choice->length() &&
910  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
911  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
912  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
913  (i == best_choice->length() - 1))) {
914  best_choice->set_unichar_id(unichar_O, i);
915  modified = true;
916  }
917  /* 9O9 */
918  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
919  (i+1) < best_choice->length() &&
920  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
921  best_choice->set_unichar_id(unichar_0, i);
922  modified = true;
923  }
924  /* 9OOO */
925  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
926  (i+2) < best_choice->length() &&
927  (best_choice->unichar_id(i+1) == unichar_0 ||
928  best_choice->unichar_id(i+1) == unichar_O) &&
929  (best_choice->unichar_id(i+2) == unichar_0 ||
930  best_choice->unichar_id(i+2) == unichar_O)) {
931  best_choice->set_unichar_id(unichar_0, i);
932  best_choice->set_unichar_id(unichar_0, i+1);
933  best_choice->set_unichar_id(unichar_0, i+2);
934  modified = true;
935  i += 2;
936  }
937  /* 9OO<non upper> */
938  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
939  (i+2) < best_choice->length() &&
940  (best_choice->unichar_id(i+1) == unichar_0 ||
941  best_choice->unichar_id(i+1) == unichar_O) &&
942  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
943  best_choice->set_unichar_id(unichar_0, i);
944  best_choice->set_unichar_id(unichar_0, i+1);
945  modified = true;
946  i++;
947  }
948  /* 9O<non upper> */
949  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
950  (i+1) < best_choice->length() &&
951  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
952  best_choice->set_unichar_id(unichar_0, i);
953  }
954  /* 9[.,]OOO.. */
955  if ((i > 1) &&
956  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
957  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
958  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
959  best_choice->unichar_id(i-2) == unichar_O)) {
960  if (best_choice->unichar_id(i-2) == unichar_O) {
961  best_choice->set_unichar_id(unichar_0, i-2);
962  modified = true;
963  }
964  while (i < best_choice->length() &&
965  (best_choice->unichar_id(i) == unichar_O ||
966  best_choice->unichar_id(i) == unichar_0)) {
967  best_choice->set_unichar_id(unichar_0, i);
968  modified = true;
969  i++;
970  }
971  i--;
972  }
973  }
974  }
975 }
int length() const
Definition: ratngs.h:214
const int kBlnXHeight
Definition: normalis.h:27
int UNICHAR_ID
Definition: unichar.h:31
const int kBlnBaselineOffset
Definition: normalis.h:28
#define NULL
Definition: host.h:144
Definition: rect.h:29
Definition: blobs.h:174
inT16 top() const
Definition: rect.h:53
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:977
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:247
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:981
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
TBLOB * next
Definition: blobs.h:228
inT16 bottom() const
Definition: rect.h:60
void tesseract::Tesseract::flip_hyphens ( WERD_RES *  word)

Definition at line 796 of file reject.cpp.

796  {
797  WERD_CHOICE *best_choice = word_res->best_choice;
798  int i;
799  int prev_right = -9999;
800  int next_left;
801  TBOX out_box;
802  float aspect_ratio;
803 
804  if (tessedit_lower_flip_hyphen <= 1)
805  return;
806 
807  TBLOB* blob = word_res->rebuild_word->blobs;
808  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
809  bool modified = false;
810  for (i = 0; i < best_choice->length() && blob != NULL; ++i,
811  blob = blob->next) {
812  out_box = blob->bounding_box();
813  if (blob->next == NULL)
814  next_left = 9999;
815  else
816  next_left = blob->next->bounding_box().left();
817  // Dont touch small or touching blobs - it is too dangerous.
818  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
819  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
820  aspect_ratio = out_box.width() / (float) out_box.height();
821  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
822  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
823  word_res->uch_set->contains_unichar_id(unichar_dash) &&
824  word_res->uch_set->get_enabled(unichar_dash)) {
825  /* Certain HYPHEN */
826  best_choice->set_unichar_id(unichar_dash, i);
827  modified = true;
828  if (word_res->reject_map[i].rejected())
829  word_res->reject_map[i].setrej_hyphen_accept();
830  }
831  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
832  word_res->reject_map[i].accepted())
833  //Suspected HYPHEN
834  word_res->reject_map[i].setrej_hyphen ();
835  }
836  else if (best_choice->unichar_id(i) == unichar_dash) {
837  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
838  (word_res->reject_map[i].rejected()))
839  word_res->reject_map[i].setrej_hyphen_accept();
840  //Certain HYPHEN
841 
842  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
843  (word_res->reject_map[i].accepted()))
844  //Suspected HYPHEN
845  word_res->reject_map[i].setrej_hyphen();
846  }
847  }
848  prev_right = out_box.right();
849  }
850 }
int length() const
Definition: ratngs.h:214
int UNICHAR_ID
Definition: unichar.h:31
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
Definition: blobs.h:174
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:247
inT16 height() const
Definition: rect.h:97
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
TBLOB * next
Definition: blobs.h:228
void tesseract::Tesseract::font_recognition_pass ( PAGE_RES *  page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 1590 of file control.cpp.

1590  {
1591  PAGE_RES_IT page_res_it(page_res);
1592  WERD_RES *word; // current word
1593  STATS doc_fonts(0, font_table_size_); // font counters
1594 
1595  // Gather font id statistics.
1596  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1597  page_res_it.forward()) {
1598  word = page_res_it.word();
1599  if (word->fontinfo != NULL) {
1600  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
1601  }
1602  if (word->fontinfo2 != NULL) {
1603  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
1604  }
1605  }
1606  inT16 doc_font; // modal font
1607  inT8 doc_font_count; // modal font
1608  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1609  if (doc_font_count == 0)
1610  return;
1611  // Get the modal font pointer.
1612  const FontInfo* modal_font = NULL;
1613  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1614  page_res_it.forward()) {
1615  word = page_res_it.word();
1616  if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
1617  modal_font = word->fontinfo;
1618  break;
1619  }
1620  if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
1621  modal_font = word->fontinfo2;
1622  break;
1623  }
1624  }
1625  ASSERT_HOST(modal_font != NULL);
1626 
1627  // Assign modal font to weak words.
1628  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1629  page_res_it.forward()) {
1630  word = page_res_it.word();
1631  int length = word->best_choice->length();
1632 
1633  // 1st choices got 2 pts, so we need to halve the score for the mode.
1634  int count = (word->fontinfo_id_count + 1) / 2;
1635  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
1636  word->fontinfo = modal_font;
1637  // Counts only get 1 as it came from the doc.
1638  word->fontinfo_id_count = 1;
1639  word->italic = modal_font->is_italic() ? 1 : -1;
1640  word->bold = modal_font->is_bold() ? 1 : -1;
1641  }
1642  }
1643 }
int length() const
Definition: ratngs.h:214
const FontInfo * fontinfo
Definition: pageres.h:424
const FontInfo * fontinfo2
Definition: pageres.h:425
bool is_italic() const
Definition: fontinfo.h:84
#define NULL
Definition: host.h:144
bool is_bold() const
Definition: fontinfo.h:85
WERD * word
Definition: pageres.h:334
inT8 bold
Definition: pageres.h:422
inT8 fontinfo_id2_count
Definition: pageres.h:427
short inT16
Definition: host.h:100
Definition: statistc.h:29
SIGNED char inT8
Definition: host.h:98
inT8 italic
Definition: pageres.h:421
inT8 fontinfo_id_count
Definition: pageres.h:426
#define ASSERT_HOST(x)
Definition: errcode.h:84
int count(LIST var_list)
Definition: oldlist.cpp:108
WERD_CHOICE * best_choice
Definition: pageres.h:359
inT16 tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 914 of file fixspace.cpp.

914  {
915  WERD_RES_IT word_it(&word_res_list);
916  WERD_RES *word;
917  inT16 word_length;
918  inT16 score = 0;
919  inT16 i;
920  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
921 
922  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
923  word = word_it.data();
924  if (word->rebuild_word == NULL)
925  continue; // Can't handle cube words.
926  word_length = word->reject_map.length();
927  if (word->done ||
928  word->tess_accepted ||
929  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
930  word->best_choice->permuter() == FREQ_DAWG_PERM ||
931  word->best_choice->permuter() == USER_DAWG_PERM ||
932  safe_dict_word(word) > 0) {
933  TBLOB* blob = word->rebuild_word->blobs;
934  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
935  for (i = 0; i < word->best_choice->length() && blob != NULL;
936  ++i, blob = blob->next) {
937  if (word->best_choice->unichar_id(i) == space ||
938  blob_noise_score(blob) < small_limit) {
939  score -= 1; // penalise possibly erroneous non-space
940  } else if (word->reject_map[i].accepted()) {
941  score++;
942  }
943  }
944  }
945  }
946  if (score < 0)
947  score = 0;
948  return score;
949 }
int length() const
Definition: ratngs.h:214
TWERD * rebuild_word
Definition: pageres.h:381
const int kBlnXHeight
Definition: normalis.h:27
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:844
int UNICHAR_ID
Definition: unichar.h:31
BOOL8 done
Definition: pageres.h:419
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:786
const UNICHARSET * uch_set
Definition: pageres.h:348
uinT8 permuter() const
Definition: ratngs.h:237
Definition: blobs.h:174
short inT16
Definition: host.h:100
BOOL8 tess_accepted
Definition: pageres.h:417
inT32 length() const
Definition: rejctmap.h:238
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES *  word,
BOOL8  ok_dict_word 
)

Definition at line 689 of file docqual.cpp.

689  {
690  enum STATES
691  {
692  JUNK,
693  FIRST_UPPER,
694  FIRST_LOWER,
695  FIRST_NUM,
696  SUBSEQUENT_UPPER,
697  SUBSEQUENT_LOWER,
698  SUBSEQUENT_NUM
699  };
700  const char *str = word->best_choice->unichar_string().string();
701  const char *lengths = word->best_choice->unichar_lengths().string();
702  STATES state = JUNK;
703  int len = 0;
704  int isolated_digits = 0;
705  int isolated_alphas = 0;
706  int bad_char_count = 0;
707  int tess_rejs = 0;
708  int dodgy_chars = 0;
709  int ok_chars;
710  UNICHAR_ID last_char = -1;
711  int alpha_repetition_count = 0;
712  int longest_alpha_repetition_count = 0;
713  int longest_lower_run_len = 0;
714  int lower_string_count = 0;
715  int longest_upper_run_len = 0;
716  int upper_string_count = 0;
717  int total_alpha_count = 0;
718  int total_digit_count = 0;
719 
720  for (; *str != '\0'; str += *(lengths++)) {
721  len++;
722  if (word->uch_set->get_isupper (str, *lengths)) {
723  total_alpha_count++;
724  switch (state) {
725  case SUBSEQUENT_UPPER:
726  case FIRST_UPPER:
727  state = SUBSEQUENT_UPPER;
728  upper_string_count++;
729  if (longest_upper_run_len < upper_string_count)
730  longest_upper_run_len = upper_string_count;
731  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
732  alpha_repetition_count++;
733  if (longest_alpha_repetition_count < alpha_repetition_count) {
734  longest_alpha_repetition_count = alpha_repetition_count;
735  }
736  }
737  else {
738  last_char = word->uch_set->unichar_to_id(str, *lengths);
739  alpha_repetition_count = 1;
740  }
741  break;
742  case FIRST_NUM:
743  isolated_digits++;
744  default:
745  state = FIRST_UPPER;
746  last_char = word->uch_set->unichar_to_id(str, *lengths);
747  alpha_repetition_count = 1;
748  upper_string_count = 1;
749  break;
750  }
751  }
752  else if (word->uch_set->get_islower (str, *lengths)) {
753  total_alpha_count++;
754  switch (state) {
755  case SUBSEQUENT_LOWER:
756  case FIRST_LOWER:
757  state = SUBSEQUENT_LOWER;
758  lower_string_count++;
759  if (longest_lower_run_len < lower_string_count)
760  longest_lower_run_len = lower_string_count;
761  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
762  alpha_repetition_count++;
763  if (longest_alpha_repetition_count < alpha_repetition_count) {
764  longest_alpha_repetition_count = alpha_repetition_count;
765  }
766  }
767  else {
768  last_char = word->uch_set->unichar_to_id(str, *lengths);
769  alpha_repetition_count = 1;
770  }
771  break;
772  case FIRST_NUM:
773  isolated_digits++;
774  default:
775  state = FIRST_LOWER;
776  last_char = word->uch_set->unichar_to_id(str, *lengths);
777  alpha_repetition_count = 1;
778  lower_string_count = 1;
779  break;
780  }
781  }
782  else if (word->uch_set->get_isdigit (str, *lengths)) {
783  total_digit_count++;
784  switch (state) {
785  case FIRST_NUM:
786  state = SUBSEQUENT_NUM;
787  case SUBSEQUENT_NUM:
788  break;
789  case FIRST_UPPER:
790  case FIRST_LOWER:
791  isolated_alphas++;
792  default:
793  state = FIRST_NUM;
794  break;
795  }
796  }
797  else {
798  if (*lengths == 1 && *str == ' ')
799  tess_rejs++;
800  else
801  bad_char_count++;
802  switch (state) {
803  case FIRST_NUM:
804  isolated_digits++;
805  break;
806  case FIRST_UPPER:
807  case FIRST_LOWER:
808  isolated_alphas++;
809  default:
810  break;
811  }
812  state = JUNK;
813  }
814  }
815 
816  switch (state) {
817  case FIRST_NUM:
818  isolated_digits++;
819  break;
820  case FIRST_UPPER:
821  case FIRST_LOWER:
822  isolated_alphas++;
823  default:
824  break;
825  }
826 
828  total_alpha_count += total_digit_count - isolated_digits;
829  }
830 
831  if (crunch_leave_ok_strings && len >= 4 &&
832  2 * (total_alpha_count - isolated_alphas) > len &&
833  longest_alpha_repetition_count < crunch_long_repetitions) {
834  if ((crunch_accept_ok &&
835  acceptable_word_string(*word->uch_set, str, lengths) !=
836  AC_UNACCEPTABLE) ||
837  longest_lower_run_len > crunch_leave_lc_strings ||
838  longest_upper_run_len > crunch_leave_uc_strings)
839  return G_NEVER_CRUNCH;
840  }
841  if (word->reject_map.length() > 1 &&
842  strpbrk(str, " ") == NULL &&
843  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
844  word->best_choice->permuter() == FREQ_DAWG_PERM ||
845  word->best_choice->permuter() == USER_DAWG_PERM ||
846  word->best_choice->permuter() == NUMBER_PERM ||
847  acceptable_word_string(*word->uch_set, str, lengths) !=
848  AC_UNACCEPTABLE || ok_dict_word))
849  return G_OK;
850 
851  ok_chars = len - bad_char_count - isolated_digits -
852  isolated_alphas - tess_rejs;
853 
854  if (crunch_debug > 3) {
855  tprintf("garbage_word: \"%s\"\n",
856  word->best_choice->unichar_string().string());
857  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
858  len,
859  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
860  }
861  if (bad_char_count == 0 &&
862  tess_rejs == 0 &&
863  (len > isolated_digits + isolated_alphas || len <= 2))
864  return G_OK;
865 
866  if (tess_rejs > ok_chars ||
867  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
868  return G_TERRIBLE;
869 
870  if (len > 4) {
871  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
872  isolated_alphas;
873  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
874  return G_DODGY;
875  else
876  return G_OK;
877  } else {
878  dodgy_chars = 2 * tess_rejs + bad_char_count;
879  if ((len == 4 && dodgy_chars > 2) ||
880  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
881  return G_DODGY;
882  else
883  return G_OK;
884  }
885 }
Definition: docqual.h:29
const STRING & unichar_string() const
Definition: ratngs.h:395
int UNICHAR_ID
Definition: unichar.h:31
Unacceptable word.
Definition: control.h:37
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const UNICHARSET * uch_set
Definition: pageres.h:348
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const STRING & unichar_lengths() const
Definition: ratngs.h:402
inT32 length() const
Definition: rejctmap.h:238
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1284
WERD_CHOICE * best_choice
Definition: pageres.h:359
UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES *  word)

Definition at line 349 of file output.cpp.

349  { // what char is repeated?
350  int i;
351  for (i = 0; ((i < word->reject_map.length()) &&
352  (word->reject_map[i].rejected())); ++i);
353 
354  if (i < word->reject_map.length()) {
355  return word->best_choice->unichar_id(i);
356  } else {
357  return word->uch_set->unichar_to_id(unrecognised_char.string());
358  }
359 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
REJMAP reject_map
Definition: pageres.h:408
const UNICHARSET * uch_set
Definition: pageres.h:348
inT32 length() const
Definition: rejctmap.h:238
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
WERD_CHOICE * best_choice
Definition: pageres.h:359
Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 219 of file tesseractclass.h.

219  {
220  return sub_langs_[index];
221  }
CubeRecoContext* tesseract::Tesseract::GetCubeRecoContext ( )
inline

Definition at line 914 of file tesseractclass.h.

914 { return cube_cntxt_; }
int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 193 of file tesseractclass.h.

193  {
194  return pixGetHeight(pix_binary_);
195  }
int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 190 of file tesseractclass.h.

190  {
191  return pixGetWidth(pix_binary_);
192  }
bool tesseract::Tesseract::init_cube_objects ( bool  load_combiner,
TessdataManager *  tessdata_manager 
)

Definition at line 202 of file cube_control.cpp.

203  {
204  ASSERT_HOST(cube_cntxt_ == NULL);
205  ASSERT_HOST(tess_cube_combiner_ == NULL);
206 
207  // Create the cube context object
208  cube_cntxt_ = CubeRecoContext::Create(this, tessdata_manager, &unicharset);
209  if (cube_cntxt_ == NULL) {
210  if (cube_debug_level > 0) {
211  tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to "
212  "instantiate CubeRecoContext\n");
213  }
214  return false;
215  }
216 
217  // Create the combiner object and load the combiner net for target languages.
218  if (load_combiner) {
219  tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_);
220  if (!tess_cube_combiner_ || !tess_cube_combiner_->LoadCombinerNet()) {
221  delete cube_cntxt_;
222  cube_cntxt_ = NULL;
223  if (tess_cube_combiner_ != NULL) {
224  delete tess_cube_combiner_;
225  tess_cube_combiner_ = NULL;
226  }
227  if (cube_debug_level > 0)
228  tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n");
229  return false;
230  }
231  }
232  return true;
233 }
static CubeRecoContext * Create(Tesseract *tess_obj, TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
#define NULL
Definition: host.h:144
TessdataManager tessdata_manager
Definition: ccutil.h:71
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84
FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 37 of file recogtraining.cpp.

37  {
39  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
40  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
41  save_blob_choices.set_value(1); // save individual char choices
42  getDict().save_raw_choices.set_value(1); // save raw choices
43  getDict().permute_only_top.set_value(true); // use only top choice permuter
44  tessedit_ok_mode.set_value(0); // turn off context checking
45  // Explore all segmentations.
47  }
48 
49  STRING output_fname = fname;
50  const char *lastdot = strrchr(output_fname.string(), '.');
51  if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
52  output_fname += ".txt";
53  FILE *output_file = open_file(output_fname.string(), "a+");
54  return output_file;
55 }
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
#define NULL
Definition: host.h:144
Dict & getDict()
Definition: classify.h:62
const char * string() const
Definition: strngs.cpp:156
bool save_raw_choices
Definition: dict.h:864
Definition: strngs.h:40
bool stopper_no_acceptable_choices
Definition: dict.h:859
bool permute_only_top
Definition: dict.h:910
int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 270 of file tessedit.cpp.

275  {
276  GenericVector<STRING> langs_to_load;
277  GenericVector<STRING> langs_not_to_load;
278  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
279 
280  sub_langs_.delete_data_pointers();
281  sub_langs_.clear();
282  // Find the first loadable lang and load into this.
283  // Add any languages that this language requires
284  bool loaded_primary = false;
285  // Load the rest into sub_langs_.
286  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
287  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
288  const char *lang_str = langs_to_load[lang_index].string();
289  Tesseract *tess_to_init;
290  if (!loaded_primary) {
291  tess_to_init = this;
292  } else {
293  tess_to_init = new Tesseract;
294  }
295 
296  int result = tess_to_init->init_tesseract_internal(
297  arg0, textbase, lang_str, oem, configs, configs_size,
298  vars_vec, vars_values, set_only_non_debug_params);
299 
300  if (!loaded_primary) {
301  if (result < 0) {
302  tprintf("Failed loading language '%s'\n", lang_str);
303  } else {
305  tprintf("Loaded language '%s' as main language\n", lang_str);
306  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
307  &langs_to_load, &langs_not_to_load);
308  loaded_primary = true;
309  }
310  } else {
311  if (result < 0) {
312  tprintf("Failed loading language '%s'\n", lang_str);
313  delete tess_to_init;
314  } else {
316  tprintf("Loaded language '%s' as secondary language\n", lang_str);
317  sub_langs_.push_back(tess_to_init);
318  // Add any languages that this language requires
319  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
320  &langs_to_load, &langs_not_to_load);
321  }
322  }
323  }
324  }
325  if (!loaded_primary) {
326  tprintf("Tesseract couldn't load any languages!\n");
327  return -1; // Couldn't load any language!
328  }
330  return 0;
331 }
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:234
void SetupUniversalFontIds()
Definition: tessedit.cpp:399
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59
int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 352 of file tesseractclass.h.

354  {
355  return init_tesseract(datapath, NULL, language, oem,
356  NULL, 0, NULL, NULL, false);
357  }
#define NULL
Definition: host.h:144
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:270
int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 349 of file tessedit.cpp.

354  {
355  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
356  configs_size, vars_vec, vars_values,
357  set_only_non_debug_params)) {
358  return -1;
359  }
362  return 0;
363  }
364  // If only Cube will be used, skip loading Tesseract classifier's
365  // pre-trained templates.
366  bool init_tesseract_classifier =
369  // If only Cube will be used and if it has its own Unicharset,
370  // skip initializing permuter and loading Tesseract Dawgs.
371  bool init_dict =
374  program_editup(textbase, init_tesseract_classifier, init_dict);
376  return 0; //Normal exit
377 }
TessdataManager tessdata_manager
Definition: ccutil.h:71
void program_editup(const char *textbase, bool init_classifier, bool init_permute)
Definition: tface.cpp:50
bool SeekToStart(TessdataType tessdata_type)
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:98
bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params 
)

Definition at line 98 of file tessedit.cpp.

103  {
104  // Set the basename, compute the data directory.
105  main_setup(arg0, textbase);
106 
107  // Set the language data path prefix
108  lang = language != NULL ? language : "eng";
112 
113  // Initialize TessdataManager.
114  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
115  if (!tessdata_manager.Init(tessdata_path.string(),
117  return false;
118  }
119 
120  // If a language specific config file (lang.config) exists, load it in.
127  tprintf("Loaded language config file\n");
128  }
129  }
130 
131  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
133  // Load tesseract variables from config files. This is done after loading
134  // language-specific variables from [lang].traineddata file, so that custom
135  // config files can override values in [lang].traineddata file.
136  for (int i = 0; i < configs_size; ++i) {
137  read_config_file(configs[i], set_params_constraint);
138  }
139 
140  // Set params specified in vars_vec (done after setting params from config
141  // files, so that params in vars_vec can override those from files).
142  if (vars_vec != NULL && vars_values != NULL) {
143  for (int i = 0; i < vars_vec->size(); ++i) {
144  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
145  (*vars_values)[i].string(),
146  set_params_constraint, this->params())) {
147  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
148  exit(1);
149  }
150  }
151  }
152 
153  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
154  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
155  if (params_file != NULL) {
156  ParamUtils::PrintParams(params_file, this->params());
157  fclose(params_file);
159  tprintf("Wrote parameters to %s\n",
160  tessedit_write_params_to_file.string());
161  }
162  } else {
163  tprintf("Failed to open %s for writing params.\n",
164  tessedit_write_params_to_file.string());
165  }
166  }
167 
168  // Determine which ocr engine(s) should be loaded and used for recognition.
169  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
171  tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
172  static_cast<int>(tessedit_ocr_engine_mode));
173  }
174 
175  // If we are only loading the config file (and so not planning on doing any
176  // recognition) then there's nothing else do here.
179  tprintf("Returning after loading config file\n");
180  }
181  return true;
182  }
183 
184  // Load the unicharset
187  return false;
188  }
189  if (unicharset.size() > MAX_NUM_CLASSES) {
190  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
191  return false;
192  }
193  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
194  right_to_left_ = unicharset.major_right_to_left();
195 
202  if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
203  }
204 
205  // Load Cube objects if necessary.
209  tprintf("Loaded Cube w/out combiner\n");
213  tprintf("Loaded Cube with combiner\n");
214  }
215 
216  return true;
217 }
STRING datadir
Definition: ccutil.h:67
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:44
int size() const
Definition: unicharset.h:264
bool Init(const char *data_file_name, int debug_level)
FILE * GetDataFilePtr() const
#define NULL
Definition: host.h:144
TessdataManager tessdata_manager
Definition: ccutil.h:71
STRING language_data_path_prefix
Definition: ccutil.h:70
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:298
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:71
SetParamConstraint
Definition: params.h:36
int ambigs_debug_level
Definition: ccutil.h:89
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:67
bool SeekToStart(TessdataType tessdata_type)
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
const char * string() const
Definition: strngs.cpp:156
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:106
inT64 GetEndOffset(TessdataType tessdata_type) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
STRING lang
Definition: ccutil.h:69
UNICHARSET unicharset
Definition: ccutil.h:72
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
bool major_right_to_left() const
Definition: unicharset.cpp:813
bool use_ambigs_for_adaption
Definition: ccutil.h:93
char * tessedit_write_params_to_file
void main_setup(const char *argv0, const char *basename)
Definition: mainblk.cpp:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:188
ParamsVectors * params()
Definition: ccutil.h:65
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73
int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language 
)

Definition at line 420 of file tessedit.cpp.

422  {
423  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
424  NULL, 0, NULL, NULL, false))
425  return -1;
426  getDict().Load();
428  return 0;
429 }
void Load()
Definition: dict.cpp:219
#define NULL
Definition: host.h:144
TessdataManager tessdata_manager
Definition: ccutil.h:71
Dict & getDict()
Definition: classify.h:62
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:98
void tesseract::Tesseract::make_reject_map ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices,
ROW *  row,
inT16  pass 
)
void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW *  row,
BLOCK *  block 
)

Definition at line 197 of file fixspace.cpp.

198  {
199  WERD_RES_IT word_it(&words);
200  WERD_RES *word;
201  // Since we are not using PAGE_RES to iterate over words, we need to update
202  // prev_word_best_choice_ before calling classify_word_pass2().
204  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
205  word = word_it.data();
206  if ((!word->part_of_combo) && (word->box_word == NULL)) {
208  block, row, word);
209  }
211  }
212 }
void classify_word_and_language(WordRecognizer recognizer, BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:795
BOOL8 part_of_combo
Definition: pageres.h:451
#define NULL
Definition: host.h:144
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:506
tesseract::BoxWord * box_word
Definition: pageres.h:387
void classify_word_pass2(BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:1026
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::match_word_pass2 ( WERD_RES *  word,
ROW *  row,
BLOCK *  block 
)

match_word_pass2

Baseline normalize the word and pass it to Tess.

Definition at line 1098 of file control.cpp.

1100  {
1101  BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
1102 
1103  if (word->SetupForTessRecognition(unicharset, this, BestPix(),
1106  row, block))
1107  tess_segment_pass2(word, blob_choices);
1108 
1109  if (!word->tess_failed) {
1110  if (!word->word->flag (W_REP_CHAR)) {
1111  word->fix_quotes(blob_choices);
1113  word->fix_hyphens(blob_choices);
1114  /* Dont trust fix_quotes! - though I think I've fixed the bug */
1115  if (word->best_choice->length() != word->box_word->length() ||
1116  word->best_choice->length() != blob_choices->length()) {
1117  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1118  " #Blobs=%d; #Choices=%d\n",
1119  word->best_choice->debug_string().string(),
1120  word->best_choice->length(),
1121  word->box_word->length(), blob_choices->length());
1122 
1123  }
1125  word->raw_choice);
1126 
1127  make_reject_map (word, blob_choices, row, 2);
1128  }
1129  }
1130 
1131  // Save best choices in the WERD_CHOICE if needed
1132  word->best_choice->set_blob_choices(blob_choices);
1133  set_word_fonts(word, blob_choices);
1134 
1135  assert (word->raw_choice != NULL);
1136 }
int length() const
Definition: ratngs.h:214
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
Definition: pageres.cpp:272
const int length() const
Definition: boxword.h:99
const STRING debug_string() const
Definition: ratngs.h:373
#define NULL
Definition: host.h:144
void make_reject_map(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass)
Pix * BestPix() const
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:700
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
BOOL8 tess_acceptable_word(WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice)
Definition: tessbox.cpp:102
WERD * word
Definition: pageres.h:334
bool classify_bln_numeric_mode
Definition: classify.h:455
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
WERD_CHOICE * raw_choice
Definition: pageres.h:360
UNICHARSET unicharset
Definition: ccutil.h:72
tesseract::BoxWord * box_word
Definition: pageres.h:387
BOOL8 tess_accepted
Definition: pageres.h:417
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: ratngs.cpp:184
void set_word_fonts(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: control.cpp:1500
BOOL8 tess_failed
Definition: pageres.h:409
void tess_segment_pass2(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tessbox.cpp:73
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: pageres.cpp:670
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK *  block,
ROW *  row,
WERD_RES *  word_res 
)

Definition at line 257 of file applybox.cpp.

259  {
260  if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
262  row, block)) {
263  word_res->CloneChoppedToRebuild();
264  return;
265  }
266  if (chop_debug) {
267  tprintf("Maximally chopping word at:");
268  word_res->word->bounding_box().print();
269  }
271  BLOB_CHOICE_LIST *match_result;
272  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
273  ASSERT_HOST(word_res->chopped_word->blobs != NULL);
274  float rating = static_cast<float>(MAX_INT8);
275  for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
276  blob = blob->next) {
277  // The rating and certainty are not quite arbitrary. Since
278  // select_blob_to_chop uses the worst certainty to choose, they all have
279  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
280  // in here, and then divide by e each time they are chopped, which
281  // should guarantee a set of unequal values for the whole tree of blobs
282  // produced, however much chopping is required. The chops are thus only
283  // limited by the ability of the chopper to find suitable chop points,
284  // and not by the value of the certainties.
285  match_result = fake_classify_blob(0, rating, -rating);
286  modify_blob_choice(match_result, 0);
287  ASSERT_HOST(!match_result->empty());
288  *char_choices += match_result;
289  rating -= 0.125f;
290  }
291  inT32 blob_number;
292  int right_chop_index = 0;
294  // We only chop if the language is not fixed pitch like CJK.
295  if (prioritize_division) {
296  while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
297  } else {
298  while (chop_one_blob(word_res->chopped_word, char_choices,
299  &blob_number, &word_res->seam_array,
300  &right_chop_index));
301  }
302  }
303  MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
304  MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
305  word_res->CloneChoppedToRebuild();
307  if (char_choices != NULL) {
308  char_choices->delete_data_pointers();
309  delete char_choices;
310  }
311 }
void delete_data_pointers()
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
Definition: pageres.cpp:272
TBOX bounding_box()
Definition: werd.cpp:164
#define MAX_INT8
Definition: host.h:118
bool chop_one_blob(TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index)
Definition: chopper.cpp:441
BlobMatchTable blob_match_table
Definition: wordrec.h:501
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
TBLOB * blobs
Definition: blobs.h:274
GenericVector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:449
Pix * BestPix() const
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:126
void CloneChoppedToRebuild()
Definition: pageres.cpp:480
SEAMS seam_array
Definition: pageres.h:358
WERD * word
Definition: pageres.h:334
BLOB_CHOICE_LIST * fake_classify_blob(UNICHAR_ID class_id, float rating, float certainty)
Definition: wordclass.cpp:136
Definition: blobs.h:174
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool chop_one_blob2(const GenericVector< TBOX > &boxes, WERD_RES *word_res, SEAMS *seam_list)
Definition: chopper.cpp:502
WERD_CHOICE * raw_choice
Definition: pageres.h:360
UNICHARSET unicharset
Definition: ccutil.h:72
TWERD * chopped_word
Definition: pageres.h:357
void modify_blob_choice(BLOB_CHOICE_LIST *answer, int chop_index)
Definition: chopper.cpp:403
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool prioritize_division
Definition: classify.h:354
void print() const
Definition: rect.h:263
WERD_CHOICE * best_choice
Definition: pageres.h:359
Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 160 of file tesseractclass.h.

160  {
161  Clear();
162  return &pix_binary_;
163  }
Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 209 of file tesseractclass.h.

209  {
210  return &textord_;
211  }
void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)
void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)
BOOL8 tesseract::Tesseract::noise_outlines ( TWERD *  word)

Definition at line 987 of file docqual.cpp.

987  {
988  TBOX box; // BB of outline
989  inT16 outline_count = 0;
990  inT16 small_outline_count = 0;
991  inT16 max_dimension;
992  float small_limit = kBlnXHeight * crunch_small_outlines_size;
993 
994  for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
995  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
996  outline_count++;
997  box = ol->bounding_box();
998  if (box.height() > box.width())
999  max_dimension = box.height();
1000  else
1001  max_dimension = box.width();
1002  if (max_dimension < small_limit)
1003  small_outline_count++;
1004  }
1005  }
1006  return (small_outline_count >= outline_count);
1007 }
const int kBlnXHeight
Definition: normalis.h:27
#define NULL
Definition: host.h:144
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
TBLOB * blobs
Definition: blobs.h:274
Definition: blobs.h:174
short inT16
Definition: host.h:100
inT16 height() const
Definition: rect.h:97
BOOL8 tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 981 of file reject.cpp.

981  {
982  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
983 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:555
BOOL8 tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 977 of file reject.cpp.

977  {
978  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
979 }
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:555
int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 216 of file tesseractclass.h.

216  {
217  return sub_langs_.size();
218  }
BOOL8 tesseract::Tesseract::one_ell_conflict ( WERD_RES word_res,
BOOL8  update_map 
)

Definition at line 456 of file reject.cpp.

456  {
457  const char *word;
458  const char *lengths;
459  inT16 word_len; //its length
460  inT16 first_alphanum_index_;
461  inT16 first_alphanum_offset_;
462  inT16 i;
463  inT16 offset;
464  BOOL8 non_conflict_set_char; //non conf set a/n?
465  BOOL8 conflict = FALSE;
466  BOOL8 allow_1s;
467  ACCEPTABLE_WERD_TYPE word_type;
468  BOOL8 dict_perm_type;
469  BOOL8 dict_word_ok;
470  int dict_word_type;
471 
472  word = word_res->best_choice->unichar_string().string ();
473  lengths = word_res->best_choice->unichar_lengths().string();
474  word_len = strlen (lengths);
475  /*
476  If there are no occurrences of the conflict set characters then the word
477  is OK.
478  */
479  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
480  return FALSE;
481 
482  /*
483  There is a conflict if there are NO other (confirmed) alphanumerics apart
484  from those in the conflict set.
485  */
486 
487  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
488  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
489  non_conflict_set_char =
490  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
491  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
492  !STRING (conflict_set_I_l_1).contains (word[offset]);
493  if (!non_conflict_set_char) {
494  if (update_map)
495  reject_I_1_L(word_res);
496  return TRUE;
497  }
498 
499  /*
500  If the word is accepted by a dawg permuter, and the first alpha character
501  is "I" or "l", check to see if the alternative is also a dawg word. If it
502  is, then there is a potential error otherwise the word is ok.
503  */
504 
505  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
506  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
508  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
509  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
510  dict_word_type = dict_word(*(word_res->best_choice));
511  dict_word_ok = (dict_word_type > 0) &&
512  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
513 
514  if ((rej_1Il_use_dict_word && dict_word_ok) ||
515  (rej_1Il_trust_permuter_type && dict_perm_type) ||
516  (dict_perm_type && dict_word_ok)) {
517  first_alphanum_index_ = first_alphanum_index (word, lengths);
518  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
519  if (lengths[first_alphanum_index_] == 1 &&
520  word[first_alphanum_offset_] == 'I') {
521  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
522  if (safe_dict_word(word_res) > 0) {
523  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
524  if (update_map)
525  word_res->reject_map[first_alphanum_index_].
526  setrej_1Il_conflict();
527  return TRUE;
528  }
529  else {
530  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
531  return FALSE;
532  }
533  }
534 
535  if (lengths[first_alphanum_index_] == 1 &&
536  word[first_alphanum_offset_] == 'l') {
537  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
538  if (safe_dict_word(word_res) > 0) {
539  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
540  if (update_map)
541  word_res->reject_map[first_alphanum_index_].
542  setrej_1Il_conflict();
543  return TRUE;
544  }
545  else {
546  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
547  return FALSE;
548  }
549  }
550  return FALSE;
551  }
552 
553  /*
554  NEW 1Il code. The old code relied on permuter types too much. In fact,
555  tess will use TOP_CHOICE permute for good things like "palette".
556  In this code the string is examined independently to see if it looks like
557  a well formed word.
558  */
559 
560  /*
561  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
562  dictionary word.
563  */
564  first_alphanum_index_ = first_alphanum_index (word, lengths);
565  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
566  if (lengths[first_alphanum_index_] == 1 &&
567  word[first_alphanum_offset_] == 'l') {
568  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
569  if (safe_dict_word(word_res) > 0)
570  return FALSE;
571  else
572  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
573  }
574  else if (lengths[first_alphanum_index_] == 1 &&
575  word[first_alphanum_offset_] == 'I') {
576  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
577  if (safe_dict_word(word_res) > 0)
578  return FALSE;
579  else
580  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
581  }
582  /*
583  For strings containing digits:
584  If there are no alphas OR the numeric permuter liked the word,
585  reject any non 1 conflict chs
586  Else reject all conflict chs
587  */
588  if (word_contains_non_1_digit (word, lengths)) {
589  allow_1s = (alpha_count (word, lengths) == 0) ||
590  (word_res->best_choice->permuter () == NUMBER_PERM);
591 
592  inT16 offset;
593  conflict = FALSE;
594  for (i = 0, offset = 0; word[offset] != '\0';
595  offset += word_res->best_choice->unichar_lengths()[i++]) {
596  if ((!allow_1s || (word[offset] != '1')) &&
597  STRING (conflict_set_I_l_1).contains (word[offset])) {
598  if (update_map)
599  word_res->reject_map[i].setrej_1Il_conflict ();
600  conflict = TRUE;
601  }
602  }
603  return conflict;
604  }
605  /*
606  For anything else. See if it conforms to an acceptable word type. If so,
607  treat accordingly.
608  */
609  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
610  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
611  first_alphanum_index_ = first_alphanum_index (word, lengths);
612  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
613  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
614  if (update_map)
615  word_res->reject_map[first_alphanum_index_].
616  setrej_1Il_conflict ();
617  return TRUE;
618  }
619  else
620  return FALSE;
621  }
622  else if (word_type == AC_UPPER_CASE) {
623  return FALSE;
624  }
625  else {
626  if (update_map)
627  reject_I_1_L(word_res);
628  return TRUE;
629  }
630 }
const STRING & unichar_string() const
Definition: ratngs.h:395
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
ALL upper case.
Definition: control.h:39
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:303
inT16 first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:633
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
BOOL8 contains(const char c) const
Definition: strngs.cpp:147
#define FALSE
Definition: capi.h:28
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:786
const UNICHARSET * uch_set
Definition: pageres.h:348
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:133
ALL but initial lc.
Definition: control.h:40
Definition: strngs.h:40
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:646
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:659
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:673
ALL lower case.
Definition: control.h:38
ACCEPTABLE_WERD_TYPE
Definition: control.h:35
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1284
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::output_pass ( PAGE_RES_IT page_res_it,
const TBOX target_word_box 
)

Definition at line 72 of file output.cpp.

74  {
75  BLOCK_RES *block_of_last_word;
76  inT16 block_id;
77  BOOL8 force_eol; //During output
78  BLOCK *nextblock; //block of next word
79  WERD *nextword; //next word
80 
81  page_res_it.restart_page ();
82  block_of_last_word = NULL;
83  while (page_res_it.word () != NULL) {
84  check_debug_pt (page_res_it.word (), 120);
85 
86  if (target_word_box)
87  {
88 
89  TBOX current_word_box=page_res_it.word ()->word->bounding_box();
90  FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
91  if (!target_word_box->contains(center_pt))
92  {
93  page_res_it.forward ();
94  continue;
95  }
96 
97  }
99  block_of_last_word != page_res_it.block ()) {
100  block_of_last_word = page_res_it.block ();
101  block_id = block_of_last_word->block->index();
102  }
103 
104  force_eol = (tessedit_write_block_separators &&
105  (page_res_it.block () != page_res_it.next_block ())) ||
106  (page_res_it.next_word () == NULL);
107 
108  if (page_res_it.next_word () != NULL)
109  nextword = page_res_it.next_word ()->word;
110  else
111  nextword = NULL;
112  if (page_res_it.next_block () != NULL)
113  nextblock = page_res_it.next_block ()->block;
114  else
115  nextblock = NULL;
116  //regardless of tilde crunching
117  write_results(page_res_it,
118  determine_newline_type(page_res_it.word()->word,
119  page_res_it.block()->block,
120  nextword, nextblock), force_eol);
121  page_res_it.forward();
122  }
123 }
TBOX bounding_box()
Definition: werd.cpp:164
BLOCK * block
Definition: pageres.h:258
bool contains(const FCOORD pt) const
Definition: rect.h:323
BLOCK_RES * next_block() const
Definition: pageres.h:772
WERD_RES * restart_page()
Definition: pageres.h:713
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
Definition: output.cpp:138
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
WERD * word
Definition: pageres.h:334
WERD_RES * next_word() const
Definition: pageres.h:766
Definition: ocrblock.h:31
inT16 top() const
Definition: rect.h:53
WERD_RES * forward()
Definition: pageres.h:737
Definition: points.h:189
Definition: werd.h:60
short inT16
Definition: host.h:100
int index() const
Definition: pdblock.h:80
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:311
inT16 bottom() const
Definition: rect.h:60
void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 234 of file tessedit.cpp.

236  {
237  STRING remains(lang_str);
238  while (remains.length() > 0) {
239  // Find the start of the lang code and which vector to add to.
240  const char* start = remains.string();
241  while (*start == '+')
242  ++start;
243  GenericVector<STRING>* target = to_load;
244  if (*start == '~') {
245  target = not_to_load;
246  ++start;
247  }
248  // Find the index of the end of the lang code in string start.
249  int end = strlen(start);
250  const char* plus = strchr(start, '+');
251  if (plus != NULL && plus - start < end)
252  end = plus - start;
253  STRING lang_code(start);
254  lang_code.truncate_at(end);
255  STRING next(start + end);
256  remains = next;
257  // Check whether lang_code is already in the target vector and add.
258  if (!IsStrInList(lang_code, *target)) {
260  tprintf("Adding language '%s' to list\n", lang_code.string());
261  target->push_back(lang_code);
262  }
263  }
264 }
#define NULL
Definition: host.h:144
int push_back(T object)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: strngs.h:40
void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 336 of file pgedit.cpp.

336  {
337  current_page_res = page_res;
338  if (current_page_res->block_res_list.empty())
339  return;
340 
341  recog_done = false;
342  stillRunning = true;
343 
344  build_image_window(width, height);
347 #ifndef GRAPHICS_DISABLED
348  pe = new ParamsEditor(this, image_win);
349 #endif
350  PGEventHandler pgEventHandler(this);
351 
352  image_win->AddEventHandler(&pgEventHandler);
354 
355  SVMenuNode* svMenuRoot = build_menu_new();
356 
357  svMenuRoot->BuildMenu(image_win);
358  image_win->SetVisible(true);
359 
362 }
void AddMessageBox()
Definition: scrollview.cpp:579
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:132
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:444
PAGE_RES * current_page_res
Definition: pgedit.cpp:127
BLOCK_RES_LIST block_res_list
Definition: pageres.h:222
void build_image_window(int width, int height)
Definition: pgedit.cpp:192
void SetVisible(bool visible)
Definition: scrollview.cpp:550
#define NULL
Definition: host.h:144
BOOL8 word_set_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:931
ParamsEditor * pe
Definition: pgedit.cpp:107
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:413
bool stillRunning
Definition: pgedit.cpp:108
ScrollView * image_win
Definition: pgedit.cpp:106
bool recog_done
Definition: pgedit.cpp:117
void turn_on_bit(uinT8 bit_num)
Definition: bits16.h:37
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(BLOCK *block, ROW *row, WERD_RES *word_res))
Definition: pgedit.cpp:306
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:256
BITS16 word_display_mode
Definition: pgedit.cpp:121
Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 164 of file tesseractclass.h.

164  {
165  return pix_binary_;
166  }
Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 167 of file tesseractclass.h.

167  {
168  return pix_grey_;
169  }
BOOL8 tesseract::Tesseract::potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
BOOL8  ok_dict_word 
)

Definition at line 548 of file docqual.cpp.

550  {
551  float rating_per_ch;
552  int adjusted_len;
553  const char *str = word->best_choice->unichar_string().string();
554  const char *lengths = word->best_choice->unichar_lengths().string();
555  BOOL8 word_crunchable;
556  int poor_indicator_count = 0;
557 
558  word_crunchable = !crunch_leave_accept_strings ||
559  word->reject_map.length() < 3 ||
561  str, lengths) == AC_UNACCEPTABLE &&
562  !ok_dict_word);
563 
564  adjusted_len = word->reject_map.length();
565  if (adjusted_len > 10)
566  adjusted_len = 10;
567  rating_per_ch = word->best_choice->rating() / adjusted_len;
568 
569  if (rating_per_ch > crunch_pot_poor_rate) {
570  if (crunch_debug > 2) {
571  tprintf("Potential poor rating on \"%s\"\n",
572  word->best_choice->unichar_string().string());
573  }
574  poor_indicator_count++;
575  }
576 
577  if (word_crunchable &&
579  if (crunch_debug > 2) {
580  tprintf("Potential poor cert on \"%s\"\n",
581  word->best_choice->unichar_string().string());
582  }
583  poor_indicator_count++;
584  }
585 
586  if (garbage_level != G_OK) {
587  if (crunch_debug > 2) {
588  tprintf("Potential garbage on \"%s\"\n",
589  word->best_choice->unichar_string().string());
590  }
591  poor_indicator_count++;
592  }
593  return poor_indicator_count >= crunch_pot_indicators;
594 }
Definition: docqual.h:29
const STRING & unichar_string() const
Definition: ratngs.h:395
Unacceptable word.
Definition: control.h:37
float certainty() const
Definition: ratngs.h:234
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
const UNICHARSET * uch_set
Definition: pageres.h:348
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const STRING & unichar_lengths() const
Definition: ratngs.h:402
inT32 length() const
Definition: rejctmap.h:238
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1284
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 461 of file tesseractclass.cpp.

461  {
463  pixDestroy(&cube_binary_);
464  cube_binary_ = pixClone(pix_binary());
465  // Find the max splitter strategy over all langs.
466  ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
469  for (int i = 0; i < sub_langs_.size(); ++i) {
470  ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
472  static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
473  if (pageseg_strategy > max_pageseg_strategy)
474  max_pageseg_strategy = pageseg_strategy;
475  // Clone the cube image to all the sub langs too.
476  pixDestroy(&sub_langs_[i]->cube_binary_);
477  sub_langs_[i]->cube_binary_ = pixClone(pix_binary());
478  pixDestroy(&sub_langs_[i]->pix_binary_);
479  sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
480  }
481  // Perform shiro-rekha (top-line) splitting and replace the current image by
482  // the newly splitted image.
483  splitter_.set_orig_pix(pix_binary());
484  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
485  if (splitter_.Split(true)) {
486  ASSERT_HOST(splitter_.splitted_image());
487  pixDestroy(&pix_binary_);
488  pix_binary_ = pixClone(splitter_.splitted_image());
489  }
490 }
int inT32
Definition: host.h:102
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg)
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:56
Pix * pix_binary() const
void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract *  osd_tess,
OSResults *  osr 
)

Definition at line 497 of file tesseractclass.cpp.

498  {
499  // Find the max splitter strategy over all langs.
500  ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
502  static_cast<inT32>(ocr_devanagari_split_strategy));
503  for (int i = 0; i < sub_langs_.size(); ++i) {
504  ShiroRekhaSplitter::SplitStrategy ocr_strategy =
506  static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
507  if (ocr_strategy > max_ocr_strategy)
508  max_ocr_strategy = ocr_strategy;
509  }
510  // Utilize the segmentation information available.
511  splitter_.set_segmentation_block_list(block_list);
512  splitter_.set_ocr_split_strategy(max_ocr_strategy);
513  // Run the splitter for OCR
514  bool split_for_ocr = splitter_.Split(false);
515  // Restore pix_binary to the binarized original pix for future reference.
516  ASSERT_HOST(splitter_.orig_pix());
517  pixDestroy(&pix_binary_);
518  pix_binary_ = pixClone(splitter_.orig_pix());
519  // If the pageseg and ocr strategies are different, refresh the block list
520  // (from the last SegmentImage call) with blobs from the real image to be used
521  // for OCR.
522  if (splitter_.HasDifferentSplitStrategies()) {
523  BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
524  pixGetHeight(pix_binary_));
525  Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
526  splitter_.orig_pix();
527  extract_edges(pix_for_ocr, &block);
528  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
529  }
530  // The splitter isn't needed any more after this, so save memory by clearing.
531  splitter_.Clear();
532 }
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:335
int inT32
Definition: host.h:102
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
Definition: ocrblock.h:31
void set_segmentation_block_list(BLOCK_LIST *block_list)
bool Split(bool split_for_pageseg)
void set_ocr_split_strategy(SplitStrategy strategy)
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define TRUE
Definition: capi.h:27
BOOL8 tesseract::Tesseract::process_cmd_win_event ( inT32  cmd_event,
char *  new_value 
)

Definition at line 396 of file pgedit.cpp.

399  {
400  char msg[160];
401  BOOL8 exit = FALSE;
402 
403  color_mode = CM_RAINBOW;
404 
405  // Run recognition on the full page if needed.
406  switch (cmd_event) {
407  case BLAMER_CMD_EVENT:
411  case SHOW_BOLD_CMD_EVENT:
417  if (!recog_done) {
419  recog_done = true;
420  }
421  break;
422  default:
423  break;
424  }
425 
426  switch (cmd_event) {
427  case NULL_CMD_EVENT:
428  break;
429 
431  case DUMP_WERD_CMD_EVENT:
434  case RECOG_WERDS:
435  case RECOG_PSEUDO:
436  mode =(CMD_EVENTS) cmd_event;
437  break;
440  word_config_ = image_win->ShowInputDialog("Config File Name");
441  break;
443  if (new_value[0] == 'T')
445  else
448  break;
449  case BLAMER_CMD_EVENT:
450  if (new_value[0] == 'T')
452  else
456  break;
458  if (new_value[0] == 'T')
460  else
463  break;
464  case POLYGONAL_CMD_EVENT:
465  if (new_value[0] == 'T')
467  else
470  break;
471  case BL_NORM_CMD_EVENT:
472  if (new_value[0] == 'T')
474  else
477  break;
478  case BITMAP_CMD_EVENT:
479  if (new_value[0] == 'T')
481  else
484  break;
487  break;
488  case IMAGE_CMD_EVENT:
489  display_image =(new_value[0] == 'T');
491  break;
492  case BLOCKS_CMD_EVENT:
493  display_blocks =(new_value[0] == 'T');
495  break;
496  case BASELINES_CMD_EVENT:
497  display_baselines =(new_value[0] == 'T');
499  break;
501  color_mode = CM_SUBSCRIPT;
503  break;
505  color_mode = CM_SUPERSCRIPT;
507  break;
509  color_mode = CM_ITALIC;
511  break;
512  case SHOW_BOLD_CMD_EVENT:
513  color_mode = CM_BOLD;
515  break;
517  color_mode = CM_UNDERLINE;
519  break;
521  color_mode = CM_FIXEDPITCH;
523  break;
525  color_mode = CM_SERIF;
527  break;
529  color_mode = CM_SMALLCAPS;
531  break;
533  color_mode = CM_DROPCAPS;
535  break;
536  case REFRESH_CMD_EVENT:
538  break;
539  case QUIT_CMD_EVENT:
540  exit = TRUE;
542  break;
543 
544  default:
545  sprintf(msg, "Unrecognised event " INT32FORMAT "(%s)",
546  cmd_event, new_value);
547  image_win->AddMessage(msg);
548  break;
549  }
550  return exit;
551 }
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:178
PAGE_RES * current_page_res
Definition: pgedit.cpp:127
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
BOOL8 word_set_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:931
#define INT32FORMAT
Definition: host.h:115
static void Exit()
Definition: scrollview.cpp:584
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:735
void AddMessage(const char *format,...)
Definition: scrollview.cpp:562
#define FALSE
Definition: capi.h:28
Definition: werd.h:51
BOOL8 display_image
Definition: pgedit.cpp:123
BOOL8 display_baselines
Definition: pgedit.cpp:125
BOOL8 word_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:747
ScrollView * image_win
Definition: pgedit.cpp:106
CMD_EVENTS mode
Definition: pgedit.cpp:115
bool recog_done
Definition: pgedit.cpp:117
Definition: werd.h:55
void turn_off_bit(uinT8 bit_num)
Definition: bits16.h:42
void turn_on_bit(uinT8 bit_num)
Definition: bits16.h:37
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(BLOCK *block, ROW *row, WERD_RES *word_res))
Definition: pgedit.cpp:306
BITS16 word_display_mode
Definition: pgedit.cpp:121
BOOL8 display_blocks
Definition: pgedit.cpp:124
Definition: werd.h:50
#define TRUE
Definition: capi.h:27
void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 563 of file pgedit.cpp.

564  {
565  // The following variable should remain static, since it is used by
566  // debug editor, which uses a single Tesseract instance.
567  static ICOORD down;
568  ICOORD up;
569  TBOX selection_box;
570  char msg[80];
571 
572  switch(event.type) {
573 
574  case SVET_SELECTION:
575  if (event.type == SVET_SELECTION) {
576  down.set_x(event.x + event.x_size);
577  down.set_y(event.y + event.y_size);
578  if (mode == SHOW_POINT_CMD_EVENT)
579  show_point(current_page_res, event.x, event.y);
580  }
581 
582  up.set_x(event.x);
583  up.set_y(event.y);
584 
585  selection_box = TBOX(down, up);
586 
587  switch(mode) {
591  selection_box,
593  break;
594  case DUMP_WERD_CMD_EVENT:
596  selection_box,
598  break;
601  selection_box,
603  break;
605  debug_word(current_page_res, selection_box);
606  break;
608  break; // ignore up event
609 
610  case RECOG_WERDS:
611  image_win->AddMessage("Recogging selected words");
613  selection_box,
615  break;
616  case RECOG_PSEUDO:
617  image_win->AddMessage("Recogging selected blobs");
618  recog_pseudo_word(current_page_res, selection_box);
619  break;
620 
621  default:
622  sprintf(msg, "Mode %d not yet implemented", mode);
623  image_win->AddMessage(msg);
624  break;
625  }
626  default:
627  break;
628  }
629 }
BOOL8 recog_interactive(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: control.cpp:97
PAGE_RES * current_page_res
Definition: pgedit.cpp:127
int x_size
Definition: scrollview.h:68
void show_point(PAGE_RES *page_res, float x, float y)
Definition: pgedit.cpp:650
Definition: rect.h:29
void AddMessage(const char *format,...)
Definition: scrollview.cpp:562
SVEventType type
Definition: scrollview.h:64
BOOL8 word_blank_and_set_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:711
int x
Definition: scrollview.h:66
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:72
BOOL8 word_dumper(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:908
void set_y(inT16 yin)
rewrite function
Definition: points.h:65
ScrollView * image_win
Definition: pgedit.cpp:106
CMD_EVENTS mode
Definition: pgedit.cpp:115
BOOL8 word_bln_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:724
void set_x(inT16 xin)
rewrite function
Definition: points.h:61
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(BLOCK *block, ROW *row, WERD_RES *word_res))
Definition: pagewalk.cpp:31
integer coordinate
Definition: points.h:30
int y
Definition: scrollview.h:67
int y_size
Definition: scrollview.h:69
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:636
void tesseract::Tesseract::process_selected_words ( PAGE_RES *  page_res,
TBOX &  selection_box,
BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res)  word_processor 
)

Definition at line 31 of file pagewalk.cpp.

35  {
36  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
37  page_res_it.forward()) {
38  WERD* word = page_res_it.word()->word;
39  if (word->bounding_box().overlap(selection_box)) {
40  if (!((this->*word_processor)(page_res_it.block()->block,
41  page_res_it.row()->row,
42  page_res_it.word())))
43  return;
44  }
45  }
46 }
TBOX bounding_box()
Definition: werd.cpp:164
#define NULL
Definition: host.h:144
bool overlap(const TBOX &box) const
Definition: rect.h:345
WERD_RES * word() const
Definition: pageres.h:757
Definition: werd.h:60
bool tesseract::Tesseract::ProcessTargetWord ( const TBOX &  word_box,
const TBOX &  target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 128 of file control.cpp.

131  {
132  if (word_config != NULL) {
133  if (word_box.major_overlap(target_word_box)) {
134  if (backup_config_file_ == NULL) {
135  backup_config_file_ = kBackUpConfigFile;
136  FILE* config_fp = fopen(backup_config_file_, "wb");
137  ParamUtils::PrintParams(config_fp, params());
138  fclose(config_fp);
139  ParamUtils::ReadParamsFile(word_config,
141  params());
142  }
143  } else {
144  if (backup_config_file_ != NULL) {
145  ParamUtils::ReadParamsFile(backup_config_file_,
147  params());
148  backup_config_file_ = NULL;
149  }
150  }
151  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
152  return false;
153  }
154  return true;
155 }
#define NULL
Definition: host.h:144
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:43
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:188
ParamsVectors * params()
Definition: ccutil.h:65
const char *const kBackUpConfigFile
Definition: control.cpp:58
void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 143 of file docqual.cpp.

144  {
145  if ((tessedit_good_quality_unrej && good_quality_doc))
146  unrej_good_quality_words(page_res_it);
147  doc_and_block_rejection(page_res_it, good_quality_doc);
148  if (unlv_tilde_crunching) {
149  tilde_crunch(page_res_it);
150  tilde_delete(page_res_it);
151  }
152 }
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:424
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:596
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:238
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:166
void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 67 of file tessedit.cpp.

68  {
69  STRING path = datadir;
70  path += "configs/";
71  path += filename;
72  FILE* fp;
73  if ((fp = fopen(path.string(), "rb")) != NULL) {
74  fclose(fp);
75  } else {
76  path = datadir;
77  path += "tessconfigs/";
78  path += filename;
79  if ((fp = fopen(path.string(), "rb")) != NULL) {
80  fclose(fp);
81  } else {
82  path = filename;
83  }
84  }
85  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
86 }
STRING datadir
Definition: ccutil.h:67
#define NULL
Definition: host.h:144
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:43
const char * string() const
Definition: strngs.cpp:156
Definition: strngs.h:40
ParamsVectors * params()
Definition: ccutil.h:65
bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 just pass 1, 2 passes 2 and higher

Definition at line 178 of file control.cpp.

182  {
183  PAGE_RES_IT page_res_it;
184  inT32 word_index; // current word
185 
187  tessedit_test_adaption.set_value (TRUE);
188  tessedit_minimal_rejection.set_value (TRUE);
189  }
190 
191  // Before the main recognition loop below, walk through the whole page and set
192  // up fake words. That way, if we run out of time a user will still get the
193  // expected best_choice and box_words out the end; they'll just be empty.
194  page_res_it.page_res = page_res;
195  for (page_res_it.restart_page(); page_res_it.word() != NULL;
196  page_res_it.forward()) {
197  page_res_it.word()->SetupFake(unicharset);
198  }
199 
200  if (dopasses==0 || dopasses==1) {
201  page_res_it.page_res=page_res;
202  page_res_it.restart_page();
203 
204  // ****************** Pass 1 *******************
205 
206  // Clear adaptive classifier at the beginning of the page if it is full.
207  // This is done only at the beginning of the page to ensure that the
208  // classifier is not reset at an arbitrary point while processing the page,
209  // which would cripple Passes 2+ if the reset happens towards the end of
210  // Pass 1 on a page with very difficult text.
211  // TODO(daria): preemptively clear the classifier if it is almost full.
213  // Now check the sub-langs as well.
214  for (int i = 0; i < sub_langs_.size(); ++i) {
215  if (sub_langs_[i]->AdaptiveClassifierIsFull())
216  sub_langs_[i]->ResetAdaptiveClassifierInternal();
217  }
218 
219  stats_.word_count = 0;
220  if (monitor != NULL) {
221  monitor->ocr_alive = TRUE;
222  while (page_res_it.word() != NULL) {
223  stats_.word_count++;
224  page_res_it.forward();
225  }
226  page_res_it.restart_page();
227  } else {
228  stats_.word_count = 1;
229  }
230 
231  word_index = 0;
232 
233  stats_.dict_words = 0;
234  stats_.doc_blob_quality = 0;
235  stats_.doc_outline_errs = 0;
236  stats_.doc_char_quality = 0;
237  stats_.good_char_count = 0;
238  stats_.doc_good_char_quality = 0;
239 
240  most_recently_used_ = this;
241  while (page_res_it.word() != NULL) {
243  word_index++;
244  if (monitor != NULL) {
245  monitor->ocr_alive = TRUE;
246  monitor->progress = 30 + 50 * word_index / stats_.word_count;
247  if (monitor->deadline_exceeded() ||
248  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
249  stats_.dict_words)))
250  return false;
251  }
252  if (target_word_box &&
253  !ProcessTargetWord(page_res_it.word()->word->bounding_box(),
254  *target_word_box, word_config, 1)) {
255  page_res_it.forward();
256  continue;
257  }
259  page_res_it.block()->block,
260  page_res_it.row()->row,
261  page_res_it.word());
262  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
263  fix_rep_char(&page_res_it);
264  page_res_it.forward();
265  continue;
266  }
267  if (tessedit_dump_choices) {
268  word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
269  tprintf("Pass1: %s [%s]\n",
270  page_res_it.word()->best_choice->unichar_string().string(),
271  page_res_it.word()->best_choice->debug_string().string());
272  }
273 
274  // tessedit_test_adaption enables testing of the accuracy of the
275  // input to the adaptive classifier.
277  if (!word_adaptable (page_res_it.word(),
279  page_res_it.word()->reject_map.rej_word_tess_failure();
280  // FAKE PERM REJ
281  } else {
282  // Override rejection mechanisms for this word.
283  UNICHAR_ID space = unicharset.unichar_to_id(" ");
284  for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) {
285  if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
286  page_res_it.word()->reject_map[i].rejected())
287  page_res_it.word()->reject_map[i].setrej_minimal_rej_accept();
288  }
289  }
290  }
291 
292  // Count dict words.
293  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
294  ++(stats_.dict_words);
295 
296  // Update misadaption log (we only need to do it on pass 1, since
297  // adaption only happens on this pass).
298  if (page_res_it.word()->blamer_bundle != NULL &&
299  page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
300  page_res->misadaption_log.push_back(
301  page_res_it.word()->blamer_bundle->misadaption_debug);
302  }
303 
304  page_res_it.forward();
305  }
306  }
307 
308  if (dopasses == 1) return true;
309 
310  // ****************** Pass 2 *******************
311  page_res_it.restart_page();
312  word_index = 0;
313  most_recently_used_ = this;
314  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
316  word_index++;
317  if (monitor != NULL) {
318  monitor->ocr_alive = TRUE;
319  monitor->progress = 80 + 10 * word_index / stats_.word_count;
320  if (monitor->deadline_exceeded() ||
321  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
322  stats_.dict_words)))
323  return false;
324  }
325 
326  // changed by jetsoft
327  // specific to its needs to extract one word when need
328  if (target_word_box &&
329  !ProcessTargetWord(page_res_it.word()->word->bounding_box(),
330  *target_word_box, word_config, 2)) {
331  page_res_it.forward();
332  continue;
333  }
334  // end jetsoft
335 
337  page_res_it.block()->block,
338  page_res_it.row()->row,
339  page_res_it.word());
340  if (page_res_it.word()->word->flag(W_REP_CHAR) &&
341  !page_res_it.word()->done) {
342  fix_rep_char(&page_res_it);
343  page_res_it.forward();
344  continue;
345  }
346  if (tessedit_dump_choices) {
347  word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
348  tprintf("Pass2: %s [%s]\n",
349  page_res_it.word()->best_choice->unichar_string().string(),
350  page_res_it.word()->best_choice->debug_string().string());
351  }
352  page_res_it.forward();
353  }
354 
355  // The next passes can only be run if tesseract has been used, as cube
356  // doesn't set all the necessary outputs in WERD_RES.
359  // ****************** Pass 3 *******************
360  // Fix fuzzy spaces.
362 
365  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
366 
367  // ****************** Pass 4 *******************
369 
370  // ****************** Pass 5,6 *******************
371  rejection_passes(page_res, monitor, target_word_box, word_config);
372 
373  // ****************** Pass 7 *******************
374  // Cube combiner.
375  // If cube is loaded and its combiner is present, run it.
377  run_cube_combiner(page_res);
378  }
379 
380  // ****************** Pass 8 *******************
381  font_recognition_pass(page_res);
382 
383  // ****************** Pass 9 *******************
384  // Check the correctness of the final results.
385  blamer_pass(page_res);
386  }
387 
388  if (!save_blob_choices) {
389  // We aren't saving the blob choices so get rid of them now.
390  // set_blob_choices() does a deep clear.
391  page_res_it.restart_page();
392  while (page_res_it.word() != NULL) {
393  WERD_RES* word = page_res_it.word();
395  page_res_it.forward();
396  }
397  }
398 
399  // Write results pass.
401  // This is now redundant, but retained commented so show how to obtain
402  // bounding boxes and style information.
403 
404  // changed by jetsoft
405  // needed for dll to output memory structure
406  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
407  output_pass(page_res_it, target_word_box);
408  // end jetsoft
409  PageSegMode pageseg_mode = static_cast<PageSegMode>(
410  static_cast<int>(tessedit_pageseg_mode));
411  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
412 
413  if (monitor != NULL) {
414  monitor->progress = 100;
415  }
416  return true;
417 }
int length() const
Definition: ratngs.h:214
void classify_word_and_language(WordRecognizer recognizer, BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:795
const STRING & unichar_string() const
Definition: ratngs.h:395
TBOX bounding_box()
Definition: werd.cpp:164
int UNICHAR_ID
Definition: unichar.h:31
BOOL8 done
Definition: pageres.h:419
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
inT16 progress
Definition: ocrclass.h:115
BLOCK * block
Definition: pageres.h:258
const STRING debug_string() const
Definition: ratngs.h:373
#define LOC_PASS1
Definition: errcode.h:47
ROW_RES * row() const
Definition: pageres.h:760
bool deadline_exceeded() const
Definition: ocrclass.h:144
inT32 length() const
Definition: strngs.cpp:151
CANCEL_FUNC cancel
Definition: ocrclass.h:119
WERD_RES * restart_page()
Definition: pageres.h:713
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1590
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:72
void * cancel_this
Definition: ocrclass.h:120
int inT32
Definition: host.h:102
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
int push_back(T object)
bool right_to_left() const
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:588
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:100
void run_cube_combiner(PAGE_RES *page_res)
WERD_RES * word() const
Definition: pageres.h:757
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:49
bool tessedit_enable_bigram_correction
volatile inT8 ocr_alive
Definition: ocrclass.h:117
BLOCK_RES * block() const
Definition: pageres.h:763
WERD * word
Definition: pageres.h:334
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:128
uinT8 permuter() const
Definition: ratngs.h:237
BOOL8 word_dumper(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:908
void classify_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:860
void rej_word_tess_failure()
Definition: rejctmap.cpp:431
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:419
const char * string() const
Definition: strngs.cpp:156
WERD_RES * forward()
Definition: pageres.h:737
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:684
UNICHARSET unicharset
Definition: ccutil.h:72
#define LOC_PASS2
Definition: errcode.h:48
void classify_word_pass2(BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:1026
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:339
bool AdaptiveClassifierIsFull()
Definition: classify.h:319
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1200
GenericVector< STRING > misadaption_log
Definition: pageres.h:233
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:340
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: ratngs.cpp:184
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:50
STRING misadaption_debug
Definition: pageres.h:180
PAGE_RES * page_res
Definition: pageres.h:691
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:636
BlamerBundle * blamer_bundle
Definition: pageres.h:367
ROW * row
Definition: pageres.h:286
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::recog_interactive ( BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

recog_interactive

Recognize a single word in interactive mode.

Parameters
block: block
row: row of word
word_res: word to recognise

Definition at line 97 of file control.cpp.

97  {
98  inT16 char_qual;
99  inT16 good_char_qual;
100 
102  block, row, word_res);
104  word_char_quality(word_res, row, &char_qual, &good_char_qual);
105  tprintf
106  ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
107  word_res->reject_map.length(), word_blob_quality(word_res, row),
108  word_outline_errs(word_res), char_qual, good_char_qual);
109  }
110  return TRUE;
111 }
void classify_word_and_language(WordRecognizer recognizer, BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:795
REJMAP reject_map
Definition: pageres.h:408
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:80
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:68
short inT16
Definition: host.h:100
void classify_word_pass2(BLOCK *block, ROW *row, WERD_RES *word)
Definition: control.cpp:1026
inT32 length() const
Definition: rejctmap.h:238
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:100
#define TRUE
Definition: capi.h:27
void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES * page_res,
TBOX selection_box 
)

Definition at line 72 of file control.cpp.

73  {
74  WERD *word;
75  ROW *pseudo_row; // row of word
76  BLOCK *pseudo_block; // block of word
77 
78  word = make_pseudo_word(page_res, selection_box,
79  pseudo_block, pseudo_row);
80  if (word != NULL) {
81  WERD_RES word_res(word);
82  recog_interactive(pseudo_block, pseudo_row, &word_res);
83  delete word;
84  }
85 }
BOOL8 recog_interactive(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: control.cpp:97
#define NULL
Definition: host.h:144
Definition: ocrrow.h:32
Definition: ocrblock.h:31
WERD * make_pseudo_word(PAGE_RES *page_res, TBOX &selection_box, BLOCK *&pseudo_block, ROW *&pseudo_row)
Definition: werdit.cpp:30
Definition: werd.h:60
void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 88 of file recogtraining.cpp.

91  {
92  STRING box_fname = fname;
93  const char *lastdot = strrchr(box_fname.string(), '.');
94  if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
95  box_fname += ".box";
96  // read_next_box() will close box_file
97  FILE *box_file = open_file(box_fname.string(), "r");
98 
99  PAGE_RES_IT page_res_it;
100  page_res_it.page_res = page_res;
101  page_res_it.restart_page();
102  STRING label;
103 
104  // Process all the words on this page.
105  TBOX tbox; // tesseract-identified box
106  TBOX bbox; // box from the box file
107  bool keep_going;
108  int line_number = 0;
109  int examined_words = 0;
110  do {
111  keep_going = read_t(&page_res_it, &tbox);
112  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
113  &bbox);
114  // Align bottom left points of the TBOXes.
115  while (keep_going &&
116  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117  keep_going = (bbox.bottom() < tbox.bottom()) ?
118  read_t(&page_res_it, &tbox) :
119  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
120  }
121  while (keep_going &&
122  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
123  keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
124  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
125  }
126  // OCR the word if top right points of the TBOXes are similar.
127  if (keep_going &&
128  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
129  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
130  ambigs_classify_and_output(page_res_it.prev_word(),
131  page_res_it.prev_row(),
132  page_res_it.prev_block(),
133  label.string(), output_file);
134  examined_words++;
135  }
136  } while (keep_going);
137 
138  // Set up scripts on all of the words that did not get sent to
139  // ambigs_classify_and_output. They all should have, but if all the
140  // werd_res's don't get uch_sets, tesseract will crash when you try
141  // to iterate over them. :-(
142  int total_words = 0;
143  for (page_res_it.restart_page(); page_res_it.block() != NULL;
144  page_res_it.forward()) {
145  if (page_res_it.word()) {
146  if (page_res_it.word()->uch_set == NULL)
147  page_res_it.word()->SetupFake(unicharset);
148  total_words++;
149  }
150  }
151  if (examined_words < 0.85 * total_words) {
152  tprintf("TODO(antonova): clean up recog_training_segmented; "
153  " It examined only a small fraction of the ambigs image.\n");
154  }
155  tprintf("recog_training_segmented: examined %d / %d words.\n",
156  examined_words, total_words);
157 }
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox)
const inT16 kMaxBoxEdgeDiff
const char * string() const
Definition: strngs.cpp:156
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:59
inT16 top() const
Definition: rect.h:53
void ambigs_classify_and_output(WERD_RES *werd_res, ROW_RES *row_res, BLOCK_RES *block_res, const char *label, FILE *output_file)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
Definition: strngs.h:40
PAGE_RES * page_res
Definition: pageres.h:691
inT16 bottom() const
Definition: rect.h:60
void tesseract::Tesseract::recog_word ( WERD_RES * word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 54 of file tfacepp.cpp.

55  {
57  recog_word_recursive(word, blob_choices);
58  word->SetupBoxWord();
59  if ((word->best_choice->length() != word->box_word->length()) ||
60  (word->best_choice->length() != blob_choices->length())) {
61  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
62  "Strlen=%d; #Blobs=%d; #Choices=%d\n",
63  word->best_choice->debug_string().string(),
64  word->best_choice->length(), word->box_word->length(),
65  blob_choices->length());
66  }
67  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
68  ASSERT_HOST(word->best_choice->length() == blob_choices->length());
70  /* Override the permuter type if a straight dictionary check disagrees. */
71  uinT8 perm_type = word->best_choice->permuter();
72  if ((perm_type != SYSTEM_DAWG_PERM) &&
73  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
74  uinT8 real_dict_perm_type = dict_word(*word->best_choice);
75  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
76  (real_dict_perm_type == FREQ_DAWG_PERM) ||
77  (real_dict_perm_type == USER_DAWG_PERM)) &&
79  word->best_choice->unichar_lengths().string()) > 0)) {
80  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
81  }
82  }
84  perm_type != word->best_choice->permuter()) {
85  tprintf("Permuter Type Flipped from %d to %d\n",
86  perm_type, word->best_choice->permuter());
87  }
88  }
89  // Factored out from control.cpp
90  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
91  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
92  strspn(word->best_choice->unichar_string().string(), " ") ==
93  word->best_choice->length()) {
94  word->tess_failed = true;
95  word->reject_map.initialise(word->box_word->length());
97  } else {
98  word->tess_failed = false;
99  }
100 }
int length() const
Definition: ratngs.h:214
const STRING & unichar_string() const
Definition: ratngs.h:395
const int length() const
Definition: boxword.h:99
const STRING debug_string() const
Definition: ratngs.h:373
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
uinT8 permuter() const
Definition: ratngs.h:237
void rej_word_tess_failure()
Definition: rejctmap.cpp:431
const char * string() const
Definition: strngs.cpp:156
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:133
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
WERD_CHOICE * raw_choice
Definition: pageres.h:360
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:659
const STRING & unichar_lengths() const
Definition: ratngs.h:402
tesseract::BoxWord * box_word
Definition: pageres.h:387
TWERD * chopped_word
Definition: pageres.h:357
unsigned char uinT8
Definition: host.h:99
BOOL8 tess_failed
Definition: pageres.h:409
#define ASSERT_HOST(x)
Definition: errcode.h:84
void recog_word_recursive(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:109
void SetupBoxWord()
Definition: pageres.cpp:495
void initialise(inT16 length)
Definition: rejctmap.cpp:324
WERD_CHOICE * best_choice
Definition: pageres.h:359
void set_permuter(uinT8 perm)
Definition: ratngs.h:261
void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 109 of file tfacepp.cpp.

110  {
111  int word_length = word->chopped_word->NumBlobs(); // no of blobs
112  if (word_length > MAX_UNDIVIDED_LENGTH) {
113  return split_and_recog_word(word, blob_choices);
114  }
115  int initial_blob_choice_len = blob_choices->length();
116  BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);
117 
118  // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
119  BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
120  for (int i = 0; i < tess_ratings->length(); ++i) {
121  blob_choices_it.add_to_end(tess_ratings->get(i));
122  }
123  delete tess_ratings;
124 
125  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
126  // Pad raw_choice with spaces if needed.
127  if (word->raw_choice->length() < word_length) {
128  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
129  while (word->raw_choice->length() < word_length) {
130  word->raw_choice->append_unichar_id(space_id, 1, 0.0,
131  word->raw_choice->certainty());
132  }
133  }
134 
135  // Do sanity checks and minor fixes on best_choice.
136  if (word->best_choice->length() > word_length) {
137  word->best_choice->make_bad(); // should never happen
138  tprintf("recog_word: Discarded long string \"%s\""
139  " (%d characters vs %d blobs)\n",
140  word->best_choice->unichar_string().string(),
141  word->best_choice->length(), word_length);
142  tprintf("Word is at:");
143  word->word->bounding_box().print();
144  }
145  if (blob_choices->length() - initial_blob_choice_len != word_length) {
146  word->best_choice->make_bad(); // force rejection
147  tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
148  blob_choices->length(), word_length);
149  blob_choices_it.set_to_list(blob_choices); // list of lists
150  while (blob_choices->length() - initial_blob_choice_len < word_length) {
151  blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
152  tprintf("recog_word: Added dummy choice list\n");
153  }
154  while (blob_choices->length() - initial_blob_choice_len > word_length) {
155  blob_choices_it.move_to_last(); // should never happen
156  delete blob_choices_it.extract();
157  tprintf("recog_word: Deleted choice list\n");
158  }
159  }
160  if (word->best_choice->length() < word_length) {
161  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
162  while (word->best_choice->length() < word_length) {
163  word->best_choice->append_unichar_id(space_id, 1, 0.0,
164  word->best_choice->certainty());
165  }
166  }
167 }
int length() const
Definition: ratngs.h:214
TWERD * rebuild_word
Definition: pageres.h:381
const STRING & unichar_string() const
Definition: ratngs.h:395
TBOX bounding_box()
Definition: werd.cpp:164
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
BLOB_CHOICE_LIST_VECTOR * cc_recog(WERD_RES *word)
Definition: tface.cpp:117
float certainty() const
Definition: ratngs.h:234
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, float rating, float certainty)
Definition: ratngs.cpp:313
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:321
int NumBlobs() const
Definition: blobs.h:263
T & get(int index) const
WERD * word
Definition: pageres.h:334
const char * string() const
Definition: strngs.cpp:156
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:43
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
WERD_CHOICE * raw_choice
Definition: pageres.h:360
UNICHARSET unicharset
Definition: ccutil.h:72
int length() const
Definition: genericvector.h:63
TWERD * chopped_word
Definition: pageres.h:357
void split_and_recog_word(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:177
void print() const
Definition: rect.h:263
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::recognize_page ( STRING image_name)
void tesseract::Tesseract::reject_edge_blobs ( WERD_RES * word)

Definition at line 427 of file reject.cpp.

427  {
428  TBOX word_box = word->word->bounding_box();
429  // Use the box_word as it is already denormed back to image coordinates.
430  int blobcount = word->box_word->length();
431 
432  if (word_box.left() < tessedit_image_border ||
433  word_box.bottom() < tessedit_image_border ||
434  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
435  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
436  ASSERT_HOST(word->reject_map.length() == blobcount);
437  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
438  TBOX blob_box = word->box_word->BlobBox(blobindex);
439  if (blob_box.left() < tessedit_image_border ||
440  blob_box.bottom() < tessedit_image_border ||
441  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
442  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
443  word->reject_map[blobindex].setrej_edge_char();
444  // Close to edge
445  }
446  }
447  }
448 }
const int length() const
Definition: boxword.h:99
TBOX bounding_box()
Definition: werd.cpp:164
int ImageHeight() const
REJMAP reject_map
Definition: pageres.h:408
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
WERD * word
Definition: pageres.h:334
inT16 top() const
Definition: rect.h:53
tesseract::BoxWord * box_word
Definition: pageres.h:387
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
inT32 length() const
Definition: rejctmap.h:238
int ImageWidth() const
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 bottom() const
Definition: rect.h:60
void tesseract::Tesseract::reject_I_1_L ( WERD_RES * word)

Definition at line 303 of file reject.cpp.

303  {
304  inT16 i;
305  inT16 offset;
306 
307  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
308  offset += word->best_choice->unichar_lengths()[i], i += 1) {
310  contains (word->best_choice->unichar_string()[offset])) {
311  //rej 1Il conflict
312  word->reject_map[i].setrej_1Il_conflict ();
313  }
314  }
315 }
const STRING & unichar_string() const
Definition: ratngs.h:395
REJMAP reject_map
Definition: pageres.h:408
Definition: strngs.h:40
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES * word)

Definition at line 752 of file reject.cpp.

752  {
753  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
754 
755  if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
758 }
REJMAP reject_map
Definition: pageres.h:408
double rej_whole_of_mostly_reject_word_fract
inT32 length() const
Definition: rejctmap.h:238
void rej_word_mostly_rej()
Definition: rejctmap.cpp:485
inT16 reject_count()
Definition: rejctmap.h:244
void tesseract::Tesseract::rejection_passes ( PAGE_RES * page_res,
ETEXT_DESC * monitor,
const TBOX * target_word_box,
const char *  word_config 
)

Definition at line 588 of file control.cpp.

591  {
592  PAGE_RES_IT page_res_it(page_res);
593  // ****************** Pass 5 *******************
594  // Gather statistics on rejects.
595  int word_index = 0;
596  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
598  WERD_RES* word = page_res_it.word();
599  word_index++;
600  if (monitor != NULL) {
601  monitor->ocr_alive = TRUE;
602  monitor->progress = 95 + 5 * word_index / stats_.word_count;
603  }
604  if (word->rebuild_word == NULL) {
605  // Word was not processed by tesseract.
606  page_res_it.forward();
607  continue;
608  }
609  check_debug_pt(word, 70);
610 
611  // changed by jetsoft
612  // specific to its needs to extract one word when need
613  if (target_word_box &&
615  *target_word_box, word_config, 4)) {
616  page_res_it.forward();
617  continue;
618  }
619  // end jetsoft
620 
621  page_res_it.rej_stat_word();
622  int chars_in_word = word->reject_map.length();
623  int rejects_in_word = word->reject_map.reject_count();
624 
625  int blob_quality = word_blob_quality(word, page_res_it.row()->row);
626  stats_.doc_blob_quality += blob_quality;
627  int outline_errs = word_outline_errs(word);
628  stats_.doc_outline_errs += outline_errs;
629  inT16 all_char_quality;
630  inT16 accepted_all_char_quality;
631  word_char_quality(word, page_res_it.row()->row,
632  &all_char_quality, &accepted_all_char_quality);
633  stats_.doc_char_quality += all_char_quality;
634  uinT8 permuter_type = word->best_choice->permuter();
635  if ((permuter_type == SYSTEM_DAWG_PERM) ||
636  (permuter_type == FREQ_DAWG_PERM) ||
637  (permuter_type == USER_DAWG_PERM)) {
638  stats_.good_char_count += chars_in_word - rejects_in_word;
639  stats_.doc_good_char_quality += accepted_all_char_quality;
640  }
641  check_debug_pt(word, 80);
643  (blob_quality == 0) && (outline_errs >= chars_in_word))
645  check_debug_pt(word, 90);
646  page_res_it.forward();
647  }
648 
650  tprintf
651  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
652  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
653  page_res->char_count, page_res->rej_count,
654  page_res->rej_count / static_cast<float>(page_res->char_count),
655  stats_.doc_blob_quality,
656  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
657  stats_.doc_outline_errs,
658  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
659  stats_.doc_char_quality,
660  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
661  stats_.doc_good_char_quality,
662  (stats_.good_char_count > 0) ?
663  (stats_.doc_good_char_quality /
664  static_cast<float>(stats_.good_char_count)) : 0.0);
665  }
666  BOOL8 good_quality_doc =
667  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
668  quality_rej_pc) &&
669  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
670  quality_blob_pc) &&
671  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
673  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
675 
676  // ****************** Pass 6 *******************
677  // Do whole document or whole block rejection pass
678  if (!tessedit_test_adaption) {
680  quality_based_rejection(page_res_it, good_quality_doc);
681  }
682 }
TWERD * rebuild_word
Definition: pageres.h:381
TBOX bounding_box()
Definition: werd.cpp:164
inT32 rej_count
Definition: pageres.h:221
inT16 progress
Definition: ocrclass.h:115
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:100
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
volatile inT8 ocr_alive
Definition: ocrclass.h:117
WERD * word
Definition: pageres.h:334
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:128
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
#define LOC_MM_ADAPT
Definition: errcode.h:52
void rej_word_bad_quality()
Definition: rejctmap.cpp:494
uinT8 permuter() const
Definition: ratngs.h:237
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:80
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:68
short inT16
Definition: host.h:100
unsigned char uinT8
Definition: host.h:99
inT32 length() const
Definition: rejctmap.h:238
inT32 char_count
Definition: pageres.h:220
inT16 reject_count()
Definition: rejctmap.h:244
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:100
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES * word,
ROW * row 
)

Definition at line 761 of file reject.cpp.

761  {
762  inT16 char_quality;
763  inT16 accepted_char_quality;
764 
765  if (word->best_choice->unichar_lengths().length() <= 1)
766  return FALSE;
767 
769  contains(word->best_choice->unichar_string()[0]))
770  return FALSE;
771 
772  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
773  for (int i = 1; i < word->best_choice->length(); ++i) {
774  if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
775  }
776 
777  word_char_quality(word, row, &char_quality, &accepted_char_quality);
778 
779  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
780  (char_quality == accepted_char_quality))
781  return TRUE;
782  else
783  return FALSE;
784 }
int length() const
Definition: ratngs.h:214
const STRING & unichar_string() const
Definition: ratngs.h:395
int UNICHAR_ID
Definition: unichar.h:31
inT32 length() const
Definition: strngs.cpp:151
#define FALSE
Definition: capi.h:28
Definition: strngs.h:40
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
char * ok_repeated_ch_non_alphanum_wds
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:100
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Definition at line 756 of file applybox.cpp.

757  {
758  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
759  boxfile_lineno, box_ch,
760  box.left(), box.bottom(), box.right(), box.top(), err_msg);
761 }
inT16 left() const
Definition: rect.h:67
inT16 right() const
Definition: rect.h:74
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT16 bottom() const
Definition: rect.h:60
void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES * word,
WERD_RES * new_word 
)

Definition at line 955 of file control.cpp.

956  {
957  tprintf("New XHT Match:%s = %s ",
958  word->best_choice->unichar_string().string(),
959  word->best_choice->debug_string().string());
960  word->reject_map.print(debug_fp);
961  tprintf(" -> %s = %s ",
962  new_word->best_choice->unichar_string().string(),
963  new_word->best_choice->debug_string().string());
964  new_word->reject_map.print(debug_fp);
965  tprintf(" %s->%s %s %s\n",
966  word->guessed_x_ht ? "GUESS" : "CERT",
967  new_word->guessed_x_ht ? "GUESS" : "CERT",
968  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
969  accept_new_word ? "ACCEPTED" : "");
970 }
const STRING & unichar_string() const
Definition: ratngs.h:395
const STRING debug_string() const
Definition: ratngs.h:373
void print(FILE *fp)
Definition: rejctmap.cpp:400
REJMAP reject_map
Definition: pageres.h:408
BOOL8 guessed_x_ht
Definition: pageres.h:428
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
FILE * debug_fp
Definition: tessvars.cpp:25
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES * page_res)

Definition at line 510 of file applybox.cpp.

510  {
511  PAGE_RES_IT pr_it(page_res);
512  WERD_RES* word_res;
513  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
514  WERD* word = word_res->word;
515  if (word->text() == NULL || word->text()[0] == '\0')
516  continue; // Ignore words that have no text.
517  // Convert the correct text to a vector of UNICHAR_ID
518  GenericVector<UNICHAR_ID> target_text;
519  if (!ConvertStringToUnichars(word->text(), &target_text)) {
520  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
521  word->text());
522  pr_it.DeleteCurrentWord();
523  continue;
524  }
525  if (!FindSegmentation(target_text, word_res)) {
526  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
527  word->text());
528  pr_it.DeleteCurrentWord();
529  continue;
530  }
531  }
532 }
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:536
#define NULL
Definition: host.h:144
const char * text() const
Definition: werd.h:119
WERD * word
Definition: pageres.h:334
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: werd.h:60
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:560
bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES * page_res,
const TBOX * prev_box,
const TBOX & box,
const TBOX & next_box,
const char *  correct_text 
)

Definition at line 341 of file applybox.cpp.

343  {
344  if (applybox_debug > 1) {
345  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
346  }
347  PAGE_RES_IT page_res_it(page_res);
348  WERD_RES* word_res;
349  for (word_res = page_res_it.word(); word_res != NULL;
350  word_res = page_res_it.forward()) {
351  if (!word_res->box_word->bounding_box().major_overlap(box))
352  continue;
353  if (applybox_debug > 1) {
354  tprintf("Checking word box:");
355  word_res->box_word->bounding_box().print();
356  }
357  int word_len = word_res->box_word->length();
358  for (int i = 0; i < word_len; ++i) {
359  TBOX char_box = TBOX();
360  int blob_count = 0;
361  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
362  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
363  if (!blob_box.major_overlap(box))
364  break;
365  if (word_res->correct_text[i + blob_count].length() > 0)
366  break; // Blob is claimed already.
367  double current_box_miss_metric = BoxMissMetric(blob_box, box);
368  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
369  if (applybox_debug > 2) {
370  tprintf("Checking blob:");
371  blob_box.print();
372  tprintf("Current miss metric = %g, next = %g\n",
373  current_box_miss_metric, next_box_miss_metric);
374  }
375  if (current_box_miss_metric > next_box_miss_metric)
376  break; // Blob is a better match for next box.
377  char_box += blob_box;
378  }
379  if (blob_count > 0) {
380  if (applybox_debug > 1) {
381  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
382  }
383  if (!char_box.almost_equal(box, 3) &&
384  (box.x_gap(next_box) < -3 ||
385  (prev_box != NULL && prev_box->x_gap(box) < -3))) {
386  return false;
387  }
388  // We refine just the box_word, best_state and correct_text here.
389  // The rebuild_word is made in TidyUp.
390  // blob_count blobs are put together to match the box. Merge the
391  // box_word boxes, save the blob_count in the state and the text.
392  word_res->box_word->MergeBoxes(i, i + blob_count);
393  word_res->best_state[i] = blob_count;
394  word_res->correct_text[i] = correct_text;
395  if (applybox_debug > 2) {
396  tprintf("%d Blobs match: blob box:", blob_count);
397  word_res->box_word->BlobBox(i).print();
398  tprintf("Matches box:");
399  box.print();
400  tprintf("With next box:");
401  next_box.print();
402  }
403  // Eliminate the best_state and correct_text entries for the consumed
404  // blobs.
405  for (int j = 1; j < blob_count; ++j) {
406  word_res->best_state.remove(i + 1);
407  word_res->correct_text.remove(i + 1);
408  }
409  // Assume that no box spans multiple source words, so we are done with
410  // this box.
411  if (applybox_debug > 1) {
412  tprintf("Best state = ");
413  for (int j = 0; j < word_res->best_state.size(); ++j) {
414  tprintf("%d ", word_res->best_state[j]);
415  }
416  tprintf("\n");
417  tprintf("Correct text = [[ ");
418  for (int j = 0; j < word_res->correct_text.size(); ++j) {
419  tprintf("%s ", word_res->correct_text[j].string());
420  }
421  tprintf("]]\n");
422  }
423  return true;
424  }
425  }
426  }
427  if (applybox_debug > 0) {
428  tprintf("FAIL!\n");
429  }
430  return false; // Failure.
431 }
const int length() const
Definition: boxword.h:99
#define NULL
Definition: host.h:144
void MergeBoxes(int start, int end)
Definition: boxword.cpp:177
GenericVector< int > best_state
Definition: pageres.h:392
Definition: rect.h:29
WERD * word
Definition: pageres.h:334
const TBOX & bounding_box() const
Definition: boxword.h:96
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59
int length() const
Definition: genericvector.h:63
GenericVector< STRING > correct_text
Definition: pageres.h:396
tesseract::BoxWord * box_word
Definition: pageres.h:387
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
virtual void remove(int index)
int x_gap(const TBOX &box) const
Definition: rect.h:210
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:249
void print() const
Definition: rect.h:263
bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX & box,
const TBOX & next_box,
const char *  correct_text 
)

Definition at line 439 of file applybox.cpp.

441  {
442  if (applybox_debug > 1) {
443  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
444  }
445  WERD* new_word = NULL;
446  BLOCK_IT b_it(block_list);
447  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
448  BLOCK* block = b_it.data();
449  if (!box.major_overlap(block->bounding_box()))
450  continue;
451  ROW_IT r_it(block->row_list());
452  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
453  ROW* row = r_it.data();
454  if (!box.major_overlap(row->bounding_box()))
455  continue;
456  WERD_IT w_it(row->word_list());
457  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
458  WERD* word = w_it.data();
459  if (applybox_debug > 2) {
460  tprintf("Checking word:");
461  word->bounding_box().print();
462  }
463  if (word->text() != NULL && word->text()[0] != '\0')
464  continue; // Ignore words that are already done.
465  if (!box.major_overlap(word->bounding_box()))
466  continue;
467  C_BLOB_IT blob_it(word->cblob_list());
468  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
469  blob_it.forward()) {
470  C_BLOB* blob = blob_it.data();
471  TBOX blob_box = blob->bounding_box();
472  if (!blob_box.major_overlap(box))
473  continue;
474  double current_box_miss_metric = BoxMissMetric(blob_box, box);
475  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
476  if (applybox_debug > 2) {
477  tprintf("Checking blob:");
478  blob_box.print();
479  tprintf("Current miss metric = %g, next = %g\n",
480  current_box_miss_metric, next_box_miss_metric);
481  }
482  if (current_box_miss_metric > next_box_miss_metric)
483  continue; // Blob is a better match for next box.
484  if (applybox_debug > 2) {
485  tprintf("Blob match: blob:");
486  blob_box.print();
487  tprintf("Matches box:");
488  box.print();
489  tprintf("With next box:");
490  next_box.print();
491  }
492  if (new_word == NULL) {
493  // Make a new word with a single blob.
494  new_word = word->shallow_copy();
495  new_word->set_text(correct_text);
496  w_it.add_to_end(new_word);
497  }
498  C_BLOB_IT new_blob_it(new_word->cblob_list());
499  new_blob_it.add_to_end(blob_it.extract());
500  }
501  }
502  }
503  }
504  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
505  return new_word != NULL;
506 }
void set_text(const char *new_text)
Definition: werd.h:120
TBOX bounding_box()
Definition: werd.cpp:164
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
#define NULL
Definition: host.h:144
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:121
Definition: rect.h:29
const char * text() const
Definition: werd.h:119
Definition: ocrrow.h:32
Definition: ocrblock.h:31
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
WERD * shallow_copy()
Definition: werd.cpp:342
Definition: werd.h:60
TBOX bounding_box() const
Definition: ocrrow.h:85
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
WERD_LIST * word_list()
Definition: ocrrow.h:52
TBOX bounding_box()
Definition: stepblob.cpp:192
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:70
void print() const
Definition: rect.h:263
void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 433 of file tesseractclass.cpp.

433  {
435  for (int i = 0; i < sub_langs_.size(); ++i) {
436  sub_langs_[i]->ResetAdaptiveClassifierInternal();
437  }
438 }
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:636
void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 441 of file tesseractclass.cpp.

441  {
443  for (int i = 0; i < sub_langs_.size(); ++i) {
444  sub_langs_[i]->getDict().ResetDocumentDictionary();
445  }
446 }
void ResetDocumentDictionary()
Definition: dict.h:478
Dict & getDict()
Definition: classify.h:62
const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 156 of file tesseractclass.h.

156  {
157  return reskew_;
158  }
bool tesseract::Tesseract::RetryWithLanguage ( WERD_RES * word,
BLOCK * block,
ROW * row,
WordRecognizer  recognizer 
)

Definition at line 756 of file control.cpp.

757  {
759  tprintf("Retrying word using lang %s, oem %d\n",
760  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
761  }
762  // Setup a trial WERD_RES in which to classify.
763  WERD_RES lang_word;
764  lang_word.InitForRetryRecognition(*word);
765  // Run the recognizer on the word.
766  // Initial version is a bit of a hack based on better certainty and rating
767  // (to reduce false positives from cube) or a dictionary vs non-dictionary
768  // word.
769  (this->*recognizer)(block, row, &lang_word);
770  bool new_is_better = NewWordBetter(*word, lang_word);
772  if (lang_word.best_choice == NULL) {
773  tprintf("New result %s better:%s\n",
774  new_is_better ? "IS" : "NOT");
775  } else {
776  tprintf("New result %s better:%s, r=%g, c=%g\n",
777  new_is_better ? "IS" : "NOT",
778  lang_word.best_choice->unichar_string().string(),
779  lang_word.best_choice->rating(),
780  lang_word.best_choice->certainty());
781  }
782  }
783  if (new_is_better) {
784  word->ConsumeWordResults(&lang_word);
785  }
786  return new_is_better;
787 }
const STRING & unichar_string() const
Definition: ratngs.h:395
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:411
float certainty() const
Definition: ratngs.h:234
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
STRING lang
Definition: ccutil.h:69
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:260
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 213 of file tesseractclass.h.

213  {
214  return right_to_left_;
215  }
void tesseract::Tesseract::run_cube_combiner ( PAGE_RES * page_res)

Definition at line 241 of file cube_control.cpp.

241  {
242  if (page_res == NULL || tess_cube_combiner_ == NULL)
243  return;
244  PAGE_RES_IT page_res_it(page_res);
245  // Iterate through the word results and call cube on each word.
246  for (page_res_it.restart_page(); page_res_it.word () != NULL;
247  page_res_it.forward()) {
248  WERD_RES* word = page_res_it.word();
249  // Skip cube entirely if tesseract's certainty is greater than threshold.
250  int combiner_run_thresh = convert_prob_to_tess_certainty(
251  cube_cntxt_->Params()->CombinerRunThresh());
252  if (word->best_choice->certainty() >= combiner_run_thresh) {
253  continue;
254  }
255  // Use the same language as Tesseract used for the word.
256  Tesseract* lang_tess = word->tesseract;
257 
258  // Setup a trial WERD_RES in which to classify with cube.
259  WERD_RES cube_word;
260  cube_word.InitForRetryRecognition(*word);
261  CubeObject *cube_obj = lang_tess->cube_recognize_word(
262  page_res_it.block()->block, &cube_word);
263  if (cube_obj != NULL)
264  lang_tess->cube_combine_word(cube_obj, &cube_word, word);
265  delete cube_obj;
266  }
267 }
float certainty() const
Definition: ratngs.h:234
tesseract::Tesseract * tesseract
Definition: pageres.h:403
TuningParams * Params() const
double CombinerRunThresh() const
Definition: tuning_params.h:62
#define NULL
Definition: host.h:144
WERD * word
Definition: pageres.h:334
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:260
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::RunOldFixXht ( WERD_RES word,
BLOCK block,
ROW row 
)
inT16 tesseract::Tesseract::safe_dict_word ( const WERD_RES * werd_res)

Definition at line 786 of file reject.cpp.

786  {
787  const WERD_CHOICE &word = *werd_res->best_choice;
788  int dict_word_type = werd_res->tesseract->dict_word(word);
789  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
790 }
tesseract::Tesseract * tesseract
Definition: pageres.h:403
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:133
WERD_CHOICE * best_choice
Definition: pageres.h:359
Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 196 of file tesseractclass.h.

196  {
197  return scaled_color_;
198  }
int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 199 of file tesseractclass.h.

199  {
200  return scaled_factor_;
201  }
void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST * > *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

Definition at line 625 of file applybox.cpp.

631  {
633  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
634  // Rating of matching choice or worst choice if no match.
635  float choice_rating = 0.0f;
636  // Find the corresponding best BLOB_CHOICE.
637  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
638  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
639  choice_it.forward()) {
640  BLOB_CHOICE* choice = choice_it.data();
641  choice_rating = choice->rating();
642  UNICHAR_ID class_id = choice->unichar_id();
643  if (class_id == target_text[text_index]) {
644  break;
645  }
646  // Search ambigs table.
647  if (class_id < table.size() && table[class_id] != NULL) {
648  AmbigSpec_IT spec_it(table[class_id]);
649  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
650  spec_it.forward()) {
651  const AmbigSpec *ambig_spec = spec_it.data();
652  // We'll only do 1-1.
653  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
654  ambig_spec->correct_ngram_id == target_text[text_index])
655  break;
656  }
657  if (!spec_it.cycled_list())
658  break; // Found an ambig.
659  }
660  }
661  if (choice_it.cycled_list())
662  continue; // No match.
663  segmentation->push_back(length);
664  if (choices_pos + length == choices_length &&
665  text_index + 1 == target_text.size()) {
666  // This is a complete match. If the rating is good record a new best.
667  if (applybox_debug > 2) {
668  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
669  rating + choice_rating, *best_rating, segmentation->size(),
670  best_segmentation->size());
671  }
672  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
673  *best_segmentation = *segmentation;
674  *best_rating = rating + choice_rating;
675  }
676  } else if (choices_pos + length < choices_length &&
677  text_index + 1 < target_text.size()) {
678  if (applybox_debug > 3) {
679  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
680  target_text[text_index],
681  unicharset.id_to_unichar(target_text[text_index]),
682  choice_it.data()->unichar_id() == target_text[text_index]
683  ? "Match" : "Ambig",
684  choices_pos, length);
685  }
686  SearchForText(choices, choices_pos + length, choices_length, target_text,
687  text_index + 1, rating + choice_rating, segmentation,
688  best_rating, best_segmentation);
689  if (applybox_debug > 3) {
690  tprintf("End recursion for %d=%s\n", target_text[text_index],
691  unicharset.id_to_unichar(target_text[text_index]));
692  }
693  }
694  segmentation->truncate(segmentation->size() - 1);
695  }
696 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
#define NULL
Definition: host.h:144
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:150
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:625
int push_back(T object)
Dict & getDict()
Definition: classify.h:62
bool empty() const
Definition: genericvector.h:68
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
int size() const
Definition: genericvector.h:59
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:139
const UnicharAmbigs & getUnicharAmbigs()
Definition: dict.h:106
virtual void truncate(int size)
float rating() const
Definition: ratngs.h:62
int tesseract::Tesseract::SegmentPage ( const STRING * input_file,
BLOCK_LIST *  blocks,
Tesseract * osd_tess,
OSResults * osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.

Definition at line 107 of file pagesegmain.cpp.

108  {
109  ASSERT_HOST(pix_binary_ != NULL);
110  int width = pixGetWidth(pix_binary_);
111  int height = pixGetHeight(pix_binary_);
112  // Get page segmentation mode.
113  PageSegMode pageseg_mode = static_cast<PageSegMode>(
114  static_cast<int>(tessedit_pageseg_mode));
115  // If a UNLV zone file can be found, use that instead of segmentation.
116  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
117  input_file != NULL && input_file->length() > 0) {
118  STRING name = *input_file;
119  const char* lastdot = strrchr(name.string(), '.');
120  if (lastdot != NULL)
121  name[lastdot - name.string()] = '\0';
122  read_unlv_file(name, width, height, blocks);
123  }
124  if (blocks->empty()) {
125  // No UNLV file present. Work according to the PageSegMode.
126  // First make a single block covering the whole image.
127  BLOCK_IT block_it(blocks);
128  BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
130  block_it.add_to_end(block);
131  } else {
132  // UNLV file present. Use PSM_SINGLE_BLOCK.
133  pageseg_mode = PSM_SINGLE_BLOCK;
134  }
135  bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode);
136  bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode);
137  bool osd_only = pageseg_mode == PSM_OSD_ONLY;
138 
139  int auto_page_seg_ret_val = 0;
140  TO_BLOCK_LIST to_blocks;
141  if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) {
142  auto_page_seg_ret_val =
143  AutoPageSeg(single_column, osd_enabled, osd_only,
144  blocks, &to_blocks, osd_tess, osr);
145  if (osd_only)
146  return auto_page_seg_ret_val;
147  // To create blobs from the image region bounds uncomment this line:
148  // to_blocks.clear(); // Uncomment to go back to the old mode.
149  } else {
150  deskew_ = FCOORD(1.0f, 0.0f);
151  reskew_ = FCOORD(1.0f, 0.0f);
152  if (pageseg_mode == PSM_CIRCLE_WORD) {
153  Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
154  if (pixcleaned != NULL) {
155  pixDestroy(&pix_binary_);
156  pix_binary_ = pixcleaned;
157  }
158  }
159  }
160 
161  if (auto_page_seg_ret_val < 0) {
162  return -1;
163  }
164 
165  if (blocks->empty()) {
167  tprintf("Empty page\n");
168  return 0; // AutoPageSeg found an empty page.
169  }
170 
171  textord_.TextordPage(pageseg_mode, width, height, pix_binary_,
172  blocks, &to_blocks);
173  return auto_page_seg_ret_val;
174 }
#define PSM_OSD_ENABLED(pageseg_mode)
Definition: publictypes.h:170
bool read_unlv_file(STRING name, inT32 xsize, inT32 ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:37
#define PSM_COL_FIND_ENABLED(pageseg_mode)
Definition: publictypes.h:171
inT32 length() const
Definition: strngs.cpp:151
#define PSM_BLOCK_FIND_ENABLED(pageseg_mode)
Definition: publictypes.h:173
#define NULL
Definition: host.h:144
#define f(xc, yc)
Definition: imgscale.cpp:39
void TextordPage(PageSegMode pageseg_mode, int width, int height, Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:265
bool right_to_left() const
int textord_debug_tabfind
Definition: alignedblob.cpp:28
int AutoPageSeg(bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, Tesseract *osd_tess, OSResults *osr)
void set_right_to_left(bool value)
Definition: ocrblock.h:87
Orientation and script detection only.
Definition: publictypes.h:148
const char * string() const
Definition: strngs.cpp:156
Definition: ocrblock.h:31
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: points.h:189
Definition: strngs.h:40
Treat the image as a single word in a circle.
Definition: publictypes.h:159
#define ASSERT_HOST(x)
Definition: errcode.h:84
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:156
#define TRUE
Definition: capi.h:27
void tesseract::Tesseract::set_done ( WERD_RES word,
inT16  pass 
)
void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 170 of file tesseractclass.h.

170  {
171  pixDestroy(&pix_grey_);
172  pix_grey_ = grey_pix;
173  }
void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 187 of file tesseractclass.h.

187  {
188  source_resolution_ = ppi;
189  }
void tesseract::Tesseract::set_unlv_suspects ( WERD_RES * word)

Definition at line 371 of file output.cpp.

371  {
372  int len = word_res->reject_map.length();
373  const WERD_CHOICE &word = *(word_res->best_choice);
374  const UNICHARSET &uchset = *word.unicharset();
375  int i;
376  float rating_per_ch;
377 
378  if (suspect_level == 0) {
379  for (i = 0; i < len; i++) {
380  if (word_res->reject_map[i].rejected())
381  word_res->reject_map[i].setrej_minimal_rej_accept();
382  }
383  return;
384  }
385 
386  if (suspect_level >= 3)
387  return; //Use defaults
388 
389  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
390 
391  if (safe_dict_word(word_res) &&
392  (count_alphas(word) > suspect_short_words)) {
393  /* Unreject alphas in dictionary words */
394  for (i = 0; i < len; ++i) {
395  if (word_res->reject_map[i].rejected() &&
396  uchset.get_isalpha(word.unichar_id(i)))
397  word_res->reject_map[i].setrej_minimal_rej_accept();
398  }
399  }
400 
401  rating_per_ch = word.rating() / word_res->reject_map.length();
402 
403  if (rating_per_ch >= suspect_rating_per_ch)
404  return; //Dont touch bad ratings
405 
406  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
407  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
408  for (i = 0; i < len; ++i) {
409  if (word_res->reject_map[i].rejected() &&
410  (!uchset.eq(word.unichar_id(i), " ")))
411  word_res->reject_map[i].setrej_minimal_rej_accept();
412  }
413  }
414 
415  for (i = 0; i < len; i++) {
416  if (word_res->reject_map[i].rejected()) {
417  if (word_res->reject_map[i].flag(R_DOC_REJ))
418  word_res->reject_map[i].setrej_minimal_rej_accept();
419  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
420  word_res->reject_map[i].setrej_minimal_rej_accept();
421  if (word_res->reject_map[i].flag(R_ROW_REJ))
422  word_res->reject_map[i].setrej_minimal_rej_accept();
423  }
424  }
425 
426  if (suspect_level == 2)
427  return;
428 
429  if (!suspect_constrain_1Il ||
430  (word_res->reject_map.length() <= suspect_short_words)) {
431  for (i = 0; i < len; i++) {
432  if (word_res->reject_map[i].rejected()) {
433  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
434  word_res->reject_map[i].flag(R_POSTNN_1IL)))
435  word_res->reject_map[i].setrej_minimal_rej_accept();
436 
437  if (!suspect_constrain_1Il &&
438  word_res->reject_map[i].flag(R_MM_REJECT))
439  word_res->reject_map[i].setrej_minimal_rej_accept();
440  }
441  }
442  }
443 
444  if (acceptable_word_string(*word_res->uch_set,
445  word.unichar_string().string(),
446  word.unichar_lengths().string()) !=
447  AC_UNACCEPTABLE ||
449  word.unichar_lengths().string())) {
450  if (word_res->reject_map.length() > suspect_short_words) {
451  for (i = 0; i < len; i++) {
452  if (word_res->reject_map[i].rejected() &&
453  (!word_res->reject_map[i].perm_rejected() ||
454  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
455  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
456  word_res->reject_map[i].flag (R_MM_REJECT))) {
457  word_res->reject_map[i].setrej_minimal_rej_accept();
458  }
459  }
460  }
461  }
462 }
const STRING & unichar_string() const
Definition: ratngs.h:395
inT16 count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:464
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
Unacceptable word.
Definition: control.h:37
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:786
const char * string() const
Definition: strngs.cpp:156
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:555
const STRING & unichar_lengths() const
Definition: ratngs.h:402
BOOL8 acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:485
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1284
const UNICHARSET * unicharset() const
Definition: ratngs.h:211
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
float rating() const
Definition: ratngs.h:231
void tesseract::Tesseract::set_word_fonts ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

set_word_fonts

Get the fonts for the word.

Definition at line 1500 of file control.cpp.

1501  {
1502  if (blob_choices == NULL) return;
1503  // Don't try to set the word fonts for a cube word, as the configs
1504  // will be meaningless.
1505  if (word->chopped_word == NULL) return;
1506 
1507  inT32 index; // char id index
1508  // character iterator
1509  BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1510  BLOB_CHOICE_IT choice_it; // choice iterator
1511  int fontinfo_size = get_fontinfo_table().size();
1512  int fontset_size = get_fontset_table().size();
1513  if (fontinfo_size == 0 || fontset_size == 0) return;
1514  STATS fonts(0, fontinfo_size); // font counters
1515 
1516  word->italic = 0;
1517  word->bold = 0;
1518  if (!word->best_choice_fontinfo_ids.empty()) {
1520  }
1521  // Compute the modal font for the word
1522  for (char_it.mark_cycle_pt(), index = 0;
1523  !char_it.cycled_list(); ++index, char_it.forward()) {
1524  UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
1525  choice_it.set_to_list(char_it.data());
1526  if (tessedit_debug_fonts) {
1527  tprintf("Examining fonts in %s\n",
1528  word->best_choice->debug_string().string());
1529  }
1530  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1531  choice_it.forward()) {
1532  UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
1533  if (blob_ch_id == word_ch_id) {
1534  if (tessedit_debug_fonts) {
1535  tprintf("%s font %s (%d) font2 %s (%d)\n",
1536  word->uch_set->id_to_unichar(blob_ch_id),
1537  choice_it.data()->fontinfo_id() < 0 ? "unknown" :
1538  fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
1539  choice_it.data()->fontinfo_id(),
1540  choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
1541  fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
1542  choice_it.data()->fontinfo_id2());
1543  }
1544  // 1st choice font gets 2 pts, 2nd choice 1 pt.
1545  if (choice_it.data()->fontinfo_id() >= 0) {
1546  fonts.add(choice_it.data()->fontinfo_id(), 2);
1547  }
1548  if (choice_it.data()->fontinfo_id2() >= 0) {
1549  fonts.add(choice_it.data()->fontinfo_id2(), 1);
1550  }
1551  break;
1552  }
1553  }
1554  }
1555  inT16 font_id1, font_id2;
1556  find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
1557  find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
1558  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
1559  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
1560  // All the blobs get the word's best choice font.
1561  for (int i = 0; i < word->best_choice->length(); ++i) {
1562  word->best_choice_fontinfo_ids.push_back(font_id1);
1563  }
1564  if (word->fontinfo_id_count > 0) {
1565  FontInfo fi = fontinfo_table_.get(font_id1);
1566  if (tessedit_debug_fonts) {
1567  if (word->fontinfo_id2_count > 0) {
1568  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1569  fi.name, word->fontinfo_id_count,
1570  fontinfo_table_.get(font_id2).name,
1571  word->fontinfo_id2_count);
1572  } else {
1573  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
1574  fi.name, word->fontinfo_id_count);
1575  }
1576  }
1577  // 1st choices got 2 pts, so we need to halve the score for the mode.
1578  word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
1579  word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
1580  }
1581 }
int length() const
Definition: ratngs.h:214
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
const STRING debug_string() const
Definition: ratngs.h:373
const FontInfo * fontinfo
Definition: pageres.h:424
virtual void clear()
UnicityTable< FontSet > & get_fontset_table()
Definition: classify.h:339
const FontInfo * fontinfo2
Definition: pageres.h:425
bool is_italic() const
Definition: fontinfo.h:84
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
int push_back(T object)
GenericVector< inT8 > best_choice_fontinfo_ids
Definition: pageres.h:454
bool is_bold() const
Definition: fontinfo.h:85
inT8 bold
Definition: pageres.h:422
const UNICHARSET * uch_set
Definition: pageres.h:348
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:336
const char * string() const
Definition: strngs.cpp:156
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:443
bool empty() const
Definition: genericvector.h:68
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT8 fontinfo_id2_count
Definition: pageres.h:427
short inT16
Definition: host.h:100
Definition: statistc.h:29
inT8 italic
Definition: pageres.h:421
TWERD * chopped_word
Definition: pageres.h:357
inT8 fontinfo_id_count
Definition: pageres.h:426
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 448 of file tesseractclass.cpp.

448  {
449  // Set the white and blacklists (if any)
451  tessedit_char_whitelist.string());
452  // Black and white lists should apply to all loaded classifiers.
453  for (int i = 0; i < sub_langs_.size(); ++i) {
454  sub_langs_[i]->unicharset.set_black_and_whitelist(
456  }
457 }
void set_black_and_whitelist(const char *blacklist, const char *whitelist)
Definition: unicharset.cpp:829
UNICHARSET unicharset
Definition: ccutil.h:72
void tesseract::Tesseract::SetEquationDetect ( EquationDetect *  detector)

Definition at line 427 of file tesseractclass.cpp.

// Installs `detector` as this object's equation detector by storing it in
// equ_detect_, then registers this Tesseract instance with the detector via
// SetLangTesseract(this).
// NOTE(review): the pointer is dereferenced without a NULL check, so this
// assumes callers always pass a valid detector -- confirm against callers.
427  {
428  equ_detect_ = detector;
429  equ_detect_->SetLangTesseract(this);
430 }
void SetLangTesseract(Tesseract *lang_tesseract)
void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 202 of file tesseractclass.h.

// Records the scaled colour image and its scale factor by assigning them to
// scaled_factor_ and scaled_color_. The Pix pointer is stored as given; no
// copy is made in this function.
202  {
203  scaled_factor_ = factor;
204  scaled_color_ = color;
205  }
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Definition at line 197 of file applybox.cpp.

// Prepares block_list for box application and builds the PAGE_RES for it:
//  1. rows whose x-height differs from the median by more than
//     kMaxXHeightDeviationFraction * median are reset to the median;
//  2. words with an empty cblob list are removed and deleted, and the
//     fuzzy-space flags are cleared on the remaining words;
//  3. a new PAGE_RES is created and MaximallyChopWord is run on every word.
// Returns the PAGE_RES allocated here with `new` (presumably owned by the
// caller -- confirm against callers).
198  {
     // Reference x-height for the whole page and the largest deviation a row
     // may have before it is overridden.
199  double median_xheight = MedianXHeight(block_list);
200  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
201  // Strip all fuzzy space markers to simplify the PAGE_RES.
202  BLOCK_IT b_it(block_list);
203  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
204  BLOCK* block = b_it.data();
205  ROW_IT r_it(block->row_list());
206  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
207  ROW* row = r_it.data();
     // Rows too far from the median x-height are forced to the median.
208  float diff = fabs(row->x_height() - median_xheight);
209  if (diff > max_deviation) {
210  if (applybox_debug) {
211  tprintf("row xheight=%g, but median xheight = %g\n",
212  row->x_height(), median_xheight);
213  }
214  row->set_x_height(static_cast<float>(median_xheight));
215  }
216  WERD_IT w_it(row->word_list());
217  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
218  WERD* word = w_it.data();
     // Words without blobs are extracted from the list and deleted; all
     // other words just lose their W_FUZZY_SP / W_FUZZY_NON flags.
219  if (word->cblob_list()->empty()) {
220  delete w_it.extract();
221  } else {
222  word->set_flag(W_FUZZY_SP, false);
223  word->set_flag(W_FUZZY_NON, false);
224  }
225  }
226  }
227  }
     // Build the result structure and chop every word in it.
228  PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
229  PAGE_RES_IT pr_it(page_res);
230  WERD_RES* word_res;
231  while ((word_res = pr_it.word()) != NULL) {
232  MaximallyChopWord(boxes, pr_it.block()->block,
233  pr_it.row()->row, word_res);
234  pr_it.forward();
235  }
236  return page_res;
237 }
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:44
#define NULL
Definition: host.h:144
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:121
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:257
WERD * word
Definition: pageres.h:334
Definition: ocrrow.h:32
Definition: ocrblock.h:31
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:123
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
float x_height() const
Definition: ocrrow.h:61
Definition: werd.h:60
WERD_LIST * word_list()
Definition: ocrrow.h:52
void set_x_height(float new_xheight)
Definition: ocrrow.h:64
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( bool  single_column,
bool  osd,
bool  only_osd,
BLOCK_LIST *  blocks,
Tesseract *  osd_tess,
OSResults *  osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 281 of file pagesegmain.cpp.

284  {
285  int vertical_x = 0;
286  int vertical_y = 1;
287  TabVector_LIST v_lines;
288  TabVector_LIST h_lines;
289  ICOORD bleft(0, 0);
290 
291  ASSERT_HOST(pix_binary_ != NULL);
293  pixWrite("tessinput.png", pix_binary_, IFF_PNG);
294  }
295  // Leptonica is used to find the rule/separator lines in the input.
296  LineFinder::FindAndRemoveLines(source_resolution_,
297  textord_tabfind_show_vlines, pix_binary_,
298  &vertical_x, &vertical_y, music_mask_pix,
299  &v_lines, &h_lines);
301  pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
302  // Leptonica is used to find a mask of the photo regions in the input.
303  *photo_mask_pix = ImageFind::FindImages(pix_binary_);
305  pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
306  if (single_column)
307  v_lines.clear();
308 
309  // The rest of the algorithm uses the usual connected components.
310  textord_.find_components(pix_binary_, blocks, to_blocks);
311 
312  TO_BLOCK_IT to_block_it(to_blocks);
313  // There must be exactly one input block.
314  // TODO(rays) handle new textline finding with a UNLV zone file.
315  ASSERT_HOST(to_blocks->singleton());
316  TO_BLOCK* to_block = to_block_it.data();
317  TBOX blkbox = to_block->block->bounding_box();
318  ColumnFinder* finder = NULL;
319 
320  if (to_block->line_size >= 2) {
321  finder = new ColumnFinder(static_cast<int>(to_block->line_size),
322  blkbox.botleft(), blkbox.topright(),
323  source_resolution_,
324  &v_lines, &h_lines, vertical_x, vertical_y);
325 
326  finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
327 
328  if (equ_detect_) {
329  equ_detect_->LabelSpecialText(to_block);
330  }
331 
332  BLOBNBOX_CLIST osd_blobs;
333  // osd_orientation is the number of 90 degree rotations to make the
334  // characters upright. (See osdetect.h for precise definition.)
335  // We want the text lines horizontal, (vertical text indicates vertical
336  // textlines) which may conflict (eg vertically written CJK).
337  int osd_orientation = 0;
338  bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
339  if (osd && osd_tess != NULL && osr != NULL) {
340  os_detect_blobs(&osd_blobs, osr, osd_tess);
341  if (only_osd) {
342  delete finder;
343  return NULL;
344  }
345  osd_orientation = osr->best_result.orientation_id;
346  double osd_score = osr->orientations[osd_orientation];
347  double osd_margin = min_orientation_margin * 2;
348  for (int i = 0; i < 4; ++i) {
349  if (i != osd_orientation &&
350  osd_score - osr->orientations[i] < osd_margin) {
351  osd_margin = osd_score - osr->orientations[i];
352  }
353  }
354  if (osd_margin < min_orientation_margin) {
355  // The margin is weak.
356  int best_script_id = osr->best_result.script_id;
357  bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
358  (best_script_id == osd_tess->unicharset.hiragana_sid()) ||
359  (best_script_id == osd_tess->unicharset.katakana_sid());
360 
361  if (!cjk && !vertical_text && osd_orientation == 2) {
362  // upside down latin text is improbable with such a weak margin.
363  tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
364  "Don't rotate.\n", osd_margin);
365  osd_orientation = 0;
366  } else {
367  tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
368  "but using orientation anyway: %d\n",
369  osd_blobs.length(), osd_margin, osd_orientation);
370  }
371  }
372  }
373  osd_blobs.shallow_clear();
374  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
375  }
376 
377  return finder;
378 }
#define NULL
Definition: host.h:144
Definition: rect.h:29
const ICOORD & topright() const
Definition: rect.h:93
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:208
const ICOORD & botleft() const
Definition: rect.h:81
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
float orientations[4]
Definition: osdetect.h:74
static void FindAndRemoveLines(int resolution, bool debug, Pix *pix, int *vertical_x, int *vertical_y, Pix **pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:243
integer coordinate
Definition: points.h:30
static Pix * FindImages(Pix *pix)
Definition: imagefind.cpp:66
int os_detect_blobs(BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:275
int orientation_id
Definition: osdetect.h:41
int script_id
Definition: osdetect.h:42
#define ASSERT_HOST(x)
Definition: errcode.h:84
OSBestResult best_result
Definition: osdetect.h:79
int LabelSpecialText(TO_BLOCK *to_block)
void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 399 of file tessedit.cpp.

399  {
400  // Note that we can get away with bitwise copying FontInfo in
401  // all_fonts, as it is a temporary structure and we avoid setting the
402  // delete callback.
403  UnicityTable<FontInfo> all_fonts;
405 
406  // Create the universal ID table.
407  CollectFonts(get_fontinfo_table(), &all_fonts);
408  for (int i = 0; i < sub_langs_.size(); ++i) {
409  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
410  }
411  // Assign ids from the table to each font table.
412  AssignIds(all_fonts, &get_fontinfo_table());
413  for (int i = 0; i < sub_langs_.size(); ++i) {
414  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
415  }
416  font_table_size_ = all_fonts.size();
417 }
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:25
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:336
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)
int size() const
Return the size used.
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)
int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 184 of file tesseractclass.h.

// Accessor: returns the value held in source_resolution_. The companion
// setter in the member list names its argument `ppi`, so the unit is
// presumably pixels per inch.
184  {
185  return source_resolution_;
186  }
void tesseract::Tesseract::split_and_recog_word ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 177 of file tfacepp.cpp.

178  {
179  // Find the biggest blob gap in the chopped_word.
180  int bestgap = -MAX_INT32;
181  TPOINT best_split_pt;
182  TBLOB* best_end = NULL;
183  TBLOB* prev_blob = NULL;
184  for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
185  blob = blob->next) {
186  if (prev_blob != NULL) {
187  TBOX prev_box = prev_blob->bounding_box();
188  TBOX blob_box = blob->bounding_box();
189  int gap = blob_box.left() - prev_box.right();
190  if (gap > bestgap) {
191  bestgap = gap;
192  best_end = prev_blob;
193  best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
194  best_split_pt.y = (prev_box.top() + prev_box.bottom() +
195  blob_box.top() + blob_box.bottom()) / 4;
196  }
197  }
198  prev_blob = blob;
199  }
200  ASSERT_HOST(best_end != NULL);
201  ASSERT_HOST(best_end->next != NULL);
202 
203  // Make a copy of the word to put the 2nd half in.
204  WERD_RES* word2 = new WERD_RES(*word);
205  // Blow away the copied chopped_word, as we want to work with the blobs
206  // from the input chopped_word so the seam_arrays can be merged.
207  delete word2->chopped_word;
208  word2->chopped_word = new TWERD;
209  word2->chopped_word->blobs = best_end->next;
210  best_end->next = NULL;
211  // Make a new seamarray on both words.
212  free_seam_list(word->seam_array);
214  word2->seam_array = start_seam_list(word2->chopped_word->blobs);
215  BlamerBundle *orig_bb = word->blamer_bundle;
216  STRING blamer_debug;
217  // Try to adjust truth information.
218  if (orig_bb != NULL) {
219  // Find truth boxes that correspond to the split in the blobs.
220  int b;
221  int begin2_truth_index = -1;
222  if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH &&
223  orig_bb->truth_has_char_boxes) {
224  int end1_x = best_end->bounding_box().right();
225  int begin2_x = word2->chopped_word->blobs->bounding_box().left();
226  blamer_debug = "Looking for truth split at";
227  blamer_debug.add_str_int(" end1_x ", end1_x);
228  blamer_debug.add_str_int(" begin2_x ", begin2_x);
229  blamer_debug += "\nnorm_truth_word boxes:\n";
230  if (orig_bb->norm_truth_word.length() > 1) {
231  orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
232  for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {
233  orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
234  if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <
235  orig_bb->norm_box_tolerance) &&
236  (abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
237  orig_bb->norm_box_tolerance)) {
238  begin2_truth_index = b;
239  blamer_debug += "Split found\n";
240  break;
241  }
242  }
243  }
244  }
245  // Populate truth information in word and word2 with the first and second
246  // part of the original truth.
247  word->blamer_bundle = new BlamerBundle();
248  word2->blamer_bundle = new BlamerBundle();
249  if (begin2_truth_index > 0) {
250  word->blamer_bundle->truth_has_char_boxes = true;
252  word2->blamer_bundle->truth_has_char_boxes = true;
254  BlamerBundle *curr_bb = word->blamer_bundle;
255  for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
256  if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;
257  curr_bb->norm_truth_word.InsertBox(
258  b, orig_bb->norm_truth_word.BlobBox(b));
259  curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
260  curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
261  }
262  } else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {
265  } else {
266  blamer_debug += "Truth split not found";
267  blamer_debug += orig_bb->truth_has_char_boxes ?
268  "\n" : " (no truth char boxes)\n";
269  word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
271  word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
273  }
274  }
275 
276  // Recognize the first part of the word.
277  recog_word_recursive(word, blob_choices);
278  // Recognize the second part of the word.
279  recog_word_recursive(word2, blob_choices);
280  // Tack the word2 outputs onto the end of the word outputs.
281  // New blobs might have appeared on the end of word1.
282  for (best_end = word->chopped_word->blobs; best_end->next != NULL;
283  best_end = best_end->next);
284  best_end->next = word2->chopped_word->blobs;
285  TBLOB* blob;
286  for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
287  blob->next = word2->rebuild_word->blobs;
288  word2->chopped_word->blobs = NULL;
289  word2->rebuild_word->blobs = NULL;
290  // Copy the seams onto the end of the word1 seam_array.
291  // Since the seam list is one element short, an empty seam marking the
292  // end of the last blob in the first word is needed first.
293  word->seam_array = add_seam(word->seam_array,
294  new_seam(0.0, best_split_pt, NULL, NULL, NULL));
295  for (int i = 0; i < array_count(word2->seam_array); ++i) {
296  SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
297  array_value(word2->seam_array, i) = NULL;
298  word->seam_array = add_seam(word->seam_array, seam);
299  }
300  word->best_state += word2->best_state;
301  // Append the word choices.
302  *word->best_choice += *word2->best_choice;
303  *word->raw_choice += *word2->raw_choice;
304 
305  // How many alt choices from each should we try to get?
306  const int kAltsPerPiece = 2;
307  // When do we start throwing away extra alt choices?
308  const int kTooManyAltChoices = 100;
309 
310  if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
311  // Construct the cartesian product of the alt choices of word(1) and word2.
312  int num_first_alt_choices = word->alt_choices.size();
313  // Nota Bene: For the main loop here, we leave in place word1-only
314  // alt_choices in
315  // word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1]
316  // These will get fused with the best choices for word2 below.
317  for (int j = 1; j < word2->alt_choices.size() &&
318  (j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);
319  j++) {
320  for (int i = 0; i < num_first_alt_choices &&
321  (i <= kAltsPerPiece ||
322  word->alt_choices.size() < kTooManyAltChoices);
323  i++) {
324  WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);
325  *wc += *word2->alt_choices[j];
326  word->alt_choices.push_back(wc);
327 
329  GenericVector<int> &alt_state = word->alt_states.back();
330  alt_state += word->alt_states[i];
331  alt_state += word2->alt_states[j];
332  }
333  }
334  // Now that we've filled in as many alternates as we want, paste the best
335  // choice for word2 onto the original word alt_choices.
336  for (int i = 0; i < num_first_alt_choices; i++) {
337  *word->alt_choices[i] += *word2->alt_choices[0];
338  word->alt_states[i] += word2->alt_states[0];
339  }
340  }
341 
342  // Restore the pointer to original blamer bundle and combine blamer
343  // information recorded in the splits.
344  if (orig_bb != NULL) {
346  if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";
350  blamer_debug += "Blame from part 1: ";
351  blamer_debug += word->blamer_bundle->debug;
353  }
357  blamer_debug += "Blame from part 2: ";
358  blamer_debug += word2->blamer_bundle->debug;
359  if (irr == IRR_CORRECT) {
361  } else if (irr != word2->blamer_bundle->incorrect_result_reason) {
362  irr = IRR_UNKNOWN;
363  }
364  }
365  delete word->blamer_bundle;
366  word->blamer_bundle = orig_bb;
368  if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
369  word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
371  }
372  }
373  delete word2;
374 }
TWERD * rebuild_word
Definition: pageres.h:381
const int length() const
Definition: boxword.h:99
SEAMS start_seam_list(TBLOB *blobs)
Definition: seam.cpp:175
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
GenericVector< WERD_CHOICE * > alt_choices
Definition: pageres.h:363
#define NULL
Definition: host.h:144
Definition: blobs.h:233
inT16 left() const
Definition: rect.h:67
GenericVector< int > best_state
Definition: pageres.h:392
void append_debug(STRING *str) const
Definition: rect.h:270
Definition: rect.h:29
TBLOB * blobs
Definition: blobs.h:274
int push_back(T object)
inT16 right() const
Definition: rect.h:74
inT16 y
Definition: blobs.h:68
GenericVector< GenericVector< int > > alt_states
Definition: pageres.h:364
tesseract::BoxWord truth_word
Definition: pageres.h:167
void add_str_int(const char *str, int number)
Definition: strngs.cpp:334
SEAMS seam_array
Definition: pageres.h:358
#define MAX_INT32
Definition: host.h:120
inT16 x
Definition: blobs.h:67
tesseract::BoxWord norm_truth_word
Definition: pageres.h:170
IncorrectResultReason
Definition: pageres.h:45
GenericVector< STRING > truth_text
Definition: pageres.h:174
Definition: blobs.h:53
Definition: blobs.h:174
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: pageres.h:151
inT16 top() const
Definition: rect.h:53
void free_seam_list(SEAMS seam_list)
Definition: seam.cpp:200
bool wordrec_debug_blamer
Definition: wordrec.h:142
WERD_CHOICE * raw_choice
Definition: pageres.h:360
TBOX bounding_box() const
Definition: blobs.cpp:384
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
STRING debug
Definition: pageres.h:178
T & back() const
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:194
bool truth_has_char_boxes
Definition: pageres.h:164
const TBOX & BlobBox(int index) const
Definition: boxword.h:102
TWERD * chopped_word
Definition: pageres.h:357
SEAM * new_seam(PRIORITY priority, const TPOINT &location, SPLIT *split1, SPLIT *split2, SPLIT *split3)
Definition: seam.cpp:421
SEAMS add_seam(SEAMS seam_list, SEAM *seam)
Definition: seam.cpp:104
#define array_count(a)
Definition: tessarray.h:74
#define array_value(a, i)
Definition: tessarray.h:132
#define ASSERT_HOST(x)
Definition: errcode.h:84
void recog_word_recursive(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:109
int norm_box_tolerance
Definition: pageres.h:172
BlamerBundle * blamer_bundle
Definition: pageres.h:367
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
inT16 bottom() const
Definition: rect.h:60
BOOL8 tesseract::Tesseract::terrible_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 510 of file docqual.cpp.

// Decides whether `word` is bad enough to be crunched. Returns TRUE when any
// of the following holds (crunch_mode records which one, for debug output):
//   1 - the best choice string is empty or consists only of spaces;
//   2 - rating per character exceeds crunch_terrible_rating;
//   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE;
//   4 - certainty is below crunch_poor_garbage_cert and garbage_level != G_OK;
//   5 - rating per character exceeds crunch_poor_garbage_rate and
//       garbage_level != G_OK.
// Otherwise returns FALSE.
511  {
512  float rating_per_ch;
513  int adjusted_len;
514  int crunch_mode = 0;
515 
     // strspn() counts the leading run of spaces; if it equals the string
     // length, the string contains nothing but spaces.
516  if ((word->best_choice->unichar_string().length () == 0) ||
517  (strspn (word->best_choice->unichar_string().string(), " ") ==
518  word->best_choice->unichar_string().length ()))
519  crunch_mode = 1;
520  else {
     // The length used to normalise the rating is capped at
     // crunch_rating_max.
     // NOTE(review): assumes reject_map is non-empty whenever the string is
     // non-empty; a zero length would make this a division by zero.
521  adjusted_len = word->reject_map.length ();
522  if (adjusted_len > crunch_rating_max)
523  adjusted_len = crunch_rating_max;
524  rating_per_ch = word->best_choice->rating () / adjusted_len;
525 
526  if (rating_per_ch > crunch_terrible_rating)
527  crunch_mode = 2;
528  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
529  crunch_mode = 3;
530  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
531  (garbage_level != G_OK))
532  crunch_mode = 4;
533  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
534  (garbage_level != G_OK))
535  crunch_mode = 5;
536  }
537  if (crunch_mode > 0) {
538  if (crunch_debug > 2) {
539  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
540  crunch_mode, word->best_choice->unichar_string().string());
541  }
542  return TRUE;
543  }
544  else
545  return FALSE;
546 }
Definition: docqual.h:29
const STRING & unichar_string() const
Definition: ratngs.h:395
inT32 length() const
Definition: strngs.cpp:151
float certainty() const
Definition: ratngs.h:234
REJMAP reject_map
Definition: pageres.h:408
#define FALSE
Definition: capi.h:28
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT32 length() const
Definition: rejctmap.h:238
#define TRUE
Definition: capi.h:27
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::tess_acceptable_word ( WERD_CHOICE *  word_choice,
WERD_CHOICE *  raw_choice 
)

Definition at line 102 of file tessbox.cpp.

// Returns the dictionary's AcceptableResult() verdict for *word_choice.
// raw_choice is not used by this body.
104  { // before context
105  return getDict().AcceptableResult(*word_choice);
106 }
Dict & getDict()
Definition: classify.h:62
bool AcceptableResult(const WERD_CHOICE &BestChoice)
Definition: stopper.cpp:254
void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE *  word_choice)

Definition at line 114 of file tessbox.cpp.

// Forwards *word_choice to the dictionary's add_document_word(), which adds
// the word to the document-specific dictionary.
114  {
115  getDict().add_document_word(*word_choice);
116 }
Dict & getDict()
Definition: classify.h:62
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:690
void tesseract::Tesseract::tess_segment_pass1 ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 42 of file tessbox.cpp.

// Recognizes `word` with pass-1 settings: calls set_pass1() and then
// recog_word(word, blob_choices).
// For words flagged W_DONT_CHOP, wordrec_enable_assoc and chop_enable are
// set to 0 for the duration of the call and restored afterwards; if the word
// is also flagged W_REP_CHAR, the dictionary's permute_only_top is switched
// on for the call.
43  {
44  int saved_enable_assoc = 0;
45  int saved_chop_enable = 0;
46 
47  if (word->word->flag(W_DONT_CHOP)) {
    // Remember the current settings so they can be put back after recog_word.
48  saved_enable_assoc = wordrec_enable_assoc;
49  saved_chop_enable = chop_enable;
50  wordrec_enable_assoc.set_value(0);
51  chop_enable.set_value(0);
52  if (word->word->flag(W_REP_CHAR))
53  getDict().permute_only_top.set_value(true);
54  }
55  set_pass1();
56  recog_word(word, blob_choices);
57  if (word->word->flag(W_DONT_CHOP)) {
58  wordrec_enable_assoc.set_value(saved_enable_assoc);
59  chop_enable.set_value(saved_chop_enable);
    // NOTE(review): permute_only_top is reset to false rather than to its
    // previous value, and also for words that are not W_REP_CHAR.
60  getDict().permute_only_top.set_value(false);
61  }
62 }
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
Dict & getDict()
Definition: classify.h:62
WERD * word
Definition: pageres.h:334
void set_pass1()
Definition: tface.cpp:93
bool wordrec_enable_assoc
Definition: wordrec.h:98
void recog_word(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:54
bool permute_only_top
Definition: dict.h:910
void tesseract::Tesseract::tess_segment_pass2 ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 73 of file tessbox.cpp.

// Recognizes `word` with pass-2 settings: calls set_pass2() and then
// recog_word(word, blob_choices).
// For words flagged W_DONT_CHOP, wordrec_enable_assoc and chop_enable are
// set to 0 for the duration of the call and restored afterwards; if the word
// is also flagged W_REP_CHAR, the dictionary's permute_only_top is switched
// on for the call.
74  {
75  int saved_enable_assoc = 0;
76  int saved_chop_enable = 0;
77 
78  if (word->word->flag(W_DONT_CHOP)) {
    // Remember the current settings so they can be put back after recog_word.
79  saved_enable_assoc = wordrec_enable_assoc;
80  saved_chop_enable = chop_enable;
81  wordrec_enable_assoc.set_value(0);
82  chop_enable.set_value(0);
83  if (word->word->flag(W_REP_CHAR))
84  getDict().permute_only_top.set_value(true);
85  }
86  set_pass2();
87  recog_word(word, blob_choices);
88  if (word->word->flag(W_DONT_CHOP)) {
89  wordrec_enable_assoc.set_value(saved_enable_assoc);
90  chop_enable.set_value(saved_chop_enable);
    // NOTE(review): permute_only_top is reset to false rather than to its
    // previous value, and also for words that are not W_REP_CHAR.
91  getDict().permute_only_top.set_value(false);
92  }
93 }
void set_pass2()
Definition: tface.cpp:105
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
Dict & getDict()
Definition: classify.h:62
WERD * word
Definition: pageres.h:334
bool wordrec_enable_assoc
Definition: wordrec.h:98
void recog_word(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
Definition: tfacepp.cpp:54
bool permute_only_top
Definition: dict.h:910
BOOL8 tesseract::Tesseract::test_ambig_word ( WERD_RES *  word)

Definition at line 687 of file reject.cpp.

// Returns TRUE when the word's best choice came from one of the dawg
// permuters (SYSTEM_DAWG_PERM, FREQ_DAWG_PERM or USER_DAWG_PERM) and the
// dictionary's NoDangerousAmbig() check fails for it. Words from any other
// permuter are never tested and yield FALSE.
688  {
689  BOOL8 ambig = FALSE;
690 
691  if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
692  (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
693  (word->best_choice->permuter () == USER_DAWG_PERM)) {
     // Called with fix_replaceable = false and no fixpt/Choices/
     // modified_blobs outputs, i.e. only the boolean result is used.
694  ambig = !getDict().NoDangerousAmbig(
695  word->best_choice, NULL, false, NULL, NULL);
696  }
697  return ambig;
698 }
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
Dict & getDict()
Definition: classify.h:62
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
Definition: stopper.cpp:581
uinT8 permuter() const
Definition: ratngs.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:359
const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 206 of file tesseractclass.h.

// Read-only accessor: returns a const reference to the textord_ member.
206  {
207  return textord_;
208  }
void tesseract::Tesseract::TidyUp ( PAGE_RES page_res)

Definition at line 702 of file applybox.cpp.

702  {
703  int ok_blob_count = 0;
704  int bad_blob_count = 0;
705  int ok_word_count = 0;
706  int unlabelled_words = 0;
707  PAGE_RES_IT pr_it(page_res);
708  WERD_RES* word_res;
709  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
710  int ok_in_word = 0;
711  BLOB_CHOICE_LIST_VECTOR char_choices;
712  for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
713  if (word_res->correct_text[i].length() > 0) {
714  ++ok_in_word;
715  }
716  // Since we only need a fake word_res->best_choice, the actual
717  // unichar_ids do not matter. Which is fortunate, since TidyUp()
718  // can be called while training Tesseract, at the stage where
719  // unicharset is not meaningful yet.
720  char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
721  }
722  if (ok_in_word > 0) {
723  ok_blob_count += ok_in_word;
724  bad_blob_count += word_res->correct_text.size() - ok_in_word;
725  MakeWordChoice(char_choices, unicharset, word_res->best_choice);
726  } else {
727  ++unlabelled_words;
728  if (applybox_debug > 0) {
729  tprintf("APPLY_BOXES: Unlabelled word at :");
730  word_res->word->bounding_box().print();
731  }
732  pr_it.DeleteCurrentWord();
733  }
734  char_choices.delete_data_pointers();
735  }
736  pr_it.restart_page();
737  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
738  // Denormalize back to a BoxWord.
739  word_res->RebuildBestState();
740  word_res->SetupBoxWord();
741  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
742  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
743  }
744  if (applybox_debug > 0) {
745  tprintf(" Found %d good blobs.\n", ok_blob_count);
746  if (bad_blob_count > 0) {
747  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
748  bad_blob_count, ok_word_count);
749  }
750  if (unlabelled_words > 0)
751  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
752  }
753 }
void delete_data_pointers()
TBOX bounding_box()
Definition: werd.cpp:164
#define NULL
Definition: host.h:144
WERD * word
Definition: pageres.h:334
BLOB_CHOICE_LIST * fake_classify_blob(UNICHAR_ID class_id, float rating, float certainty)
Definition: wordclass.cpp:136
Definition: werd.h:35
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:123
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHARSET unicharset
Definition: ccutil.h:72
int size() const
Definition: genericvector.h:59
void RebuildBestState()
Definition: pageres.cpp:452
int length() const
Definition: genericvector.h:63
GenericVector< STRING > correct_text
Definition: pageres.h:396
Definition: werd.h:36
void SetupBoxWord()
Definition: pageres.cpp:495
void print() const
Definition: rect.h:263
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 424 of file docqual.cpp.

424  {
425  WERD_RES *word;
426  GARBAGE_LEVEL garbage_level;
427  PAGE_RES_IT copy_it;
428  BOOL8 prev_potential_marked = FALSE;
429  BOOL8 found_terrible_word = FALSE;
430  BOOL8 ok_dict_word;
431 
432  page_res_it.restart_page();
433  while (page_res_it.word() != NULL) {
434  POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
435  if (pb != NULL && !pb->IsText()) {
436  page_res_it.forward();
437  continue;
438  }
439  word = page_res_it.word();
440 
442  convert_bad_unlv_chs(word);
443 
445  word->merge_tess_fails();
446 
447  if (word->reject_map.accept_count () != 0) {
448  found_terrible_word = FALSE;
449  //Forget earlier potential crunches
450  prev_potential_marked = FALSE;
451  }
452  else {
453  ok_dict_word = safe_dict_word(word);
454  garbage_level = garbage_word (word, ok_dict_word);
455 
456  if ((garbage_level != G_NEVER_CRUNCH) &&
457  (terrible_word_crunch (word, garbage_level))) {
458  if (crunch_debug > 0) {
459  tprintf ("T CRUNCHING: \"%s\"\n",
460  word->best_choice->unichar_string().string());
461  }
463  if (prev_potential_marked) {
464  while (copy_it.word () != word) {
465  if (crunch_debug > 0) {
466  tprintf ("P1 CRUNCHING: \"%s\"\n",
467  copy_it.word()->best_choice->unichar_string().string());
468  }
469  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
470  copy_it.forward ();
471  }
472  prev_potential_marked = FALSE;
473  }
474  found_terrible_word = TRUE;
475  }
476  else if ((garbage_level != G_NEVER_CRUNCH) &&
477  (potential_word_crunch (word,
478  garbage_level, ok_dict_word))) {
479  if (found_terrible_word) {
480  if (crunch_debug > 0) {
481  tprintf ("P2 CRUNCHING: \"%s\"\n",
482  word->best_choice->unichar_string().string());
483  }
485  }
486  else if (!prev_potential_marked) {
487  copy_it = page_res_it;
488  prev_potential_marked = TRUE;
489  if (crunch_debug > 1) {
490  tprintf ("P3 CRUNCHING: \"%s\"\n",
491  word->best_choice->unichar_string().string());
492  }
493  }
494  }
495  else {
496  found_terrible_word = FALSE;
497  //Forget earlier potential crunches
498  prev_potential_marked = FALSE;
499  if (crunch_debug > 2) {
500  tprintf ("NO CRUNCH: \"%s\"\n",
501  word->best_choice->unichar_string().string());
502  }
503  }
504  }
505  page_res_it.forward ();
506  }
507 }
const STRING & unichar_string() const
Definition: ratngs.h:395
BLOCK * block
Definition: pageres.h:258
GARBAGE_LEVEL
Definition: docqual.h:26
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:548
WERD_RES * restart_page()
Definition: pageres.h:713
unsigned char BOOL8
Definition: host.h:113
void merge_tess_fails()
Definition: pageres.cpp:721
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
bool crunch_early_convert_bad_unlv_chs
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:786
POLY_BLOCK * poly_block() const
Definition: pdblock.h:62
const char * string() const
Definition: strngs.cpp:156
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:666
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:510
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:689
WERD_RES * forward()
Definition: pageres.h:737
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT16 accept_count()
Definition: rejctmap.cpp:337
bool IsText() const
Definition: polyblk.h:54
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:430
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 596 of file docqual.cpp.

596  {
597  WERD_RES *word;
598  PAGE_RES_IT copy_it;
599  BOOL8 deleting_from_bol = FALSE;
600  BOOL8 marked_delete_point = FALSE;
601  inT16 debug_delete_mode;
602  CRUNCH_MODE delete_mode;
603  inT16 x_debug_delete_mode;
604  CRUNCH_MODE x_delete_mode;
605 
606  page_res_it.restart_page();
607  while (page_res_it.word() != NULL) {
608  word = page_res_it.word();
609 
610  delete_mode = word_deletable (word, debug_delete_mode);
611  if (delete_mode != CR_NONE) {
612  if (word->word->flag (W_BOL) || deleting_from_bol) {
613  if (crunch_debug > 0) {
614  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
615  debug_delete_mode,
616  word->best_choice->unichar_string().string());
617  }
618  word->unlv_crunch_mode = delete_mode;
619  deleting_from_bol = TRUE;
620  } else if (word->word->flag(W_EOL)) {
621  if (marked_delete_point) {
622  while (copy_it.word() != word) {
623  x_delete_mode = word_deletable (copy_it.word (),
624  x_debug_delete_mode);
625  if (crunch_debug > 0) {
626  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
627  x_debug_delete_mode,
628  copy_it.word()->best_choice->unichar_string().string());
629  }
630  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
631  copy_it.forward ();
632  }
633  }
634  if (crunch_debug > 0) {
635  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
636  debug_delete_mode,
637  word->best_choice->unichar_string().string());
638  }
639  word->unlv_crunch_mode = delete_mode;
640  deleting_from_bol = FALSE;
641  marked_delete_point = FALSE;
642  }
643  else {
644  if (!marked_delete_point) {
645  copy_it = page_res_it;
646  marked_delete_point = TRUE;
647  }
648  }
649  }
650  else {
651  deleting_from_bol = FALSE;
652  //Forget earlier potential crunches
653  marked_delete_point = FALSE;
654  }
655  /*
656  The following step has been left till now as the tess fails are used to
657  determine if the word is deletable.
658  */
660  word->merge_tess_fails();
661  page_res_it.forward ();
662  }
663 }
const STRING & unichar_string() const
Definition: ratngs.h:395
WERD_RES * restart_page()
Definition: pageres.h:713
unsigned char BOOL8
Definition: host.h:113
void merge_tess_fails()
Definition: pageres.cpp:721
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:904
WERD_RES * word() const
Definition: pageres.h:757
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
CRUNCH_MODE
Definition: pageres.h:304
WERD * word
Definition: pageres.h:334
Definition: werd.h:35
const char * string() const
Definition: strngs.cpp:156
WERD_RES * forward()
Definition: pageres.h:737
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
short inT16
Definition: host.h:100
Definition: werd.h:36
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:430
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES * word,
BLOCK * block,
ROW * row 
)

Definition at line 976 of file control.cpp.

976  {
977  bool accept_new_x_ht = false;
978  int original_misfits = CountMisfitTops(word);
979  if (original_misfits == 0)
980  return false;
981  float new_x_ht = ComputeCompatibleXheight(word);
982  if (new_x_ht > 0.0f) {
983  WERD_RES new_x_ht_word(word->word);
984  if (word->blamer_bundle != NULL) {
985  new_x_ht_word.blamer_bundle = new BlamerBundle();
986  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
987  }
988  new_x_ht_word.x_height = new_x_ht;
989  new_x_ht_word.caps_height = 0.0;
990  match_word_pass2(&new_x_ht_word, row, block);
991  if (!new_x_ht_word.tess_failed) {
992  int new_misfits = CountMisfitTops(&new_x_ht_word);
993  if (debug_x_ht_level >= 1) {
994  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
995  original_misfits, word->x_height,
996  new_misfits, new_x_ht);
997  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
998  word->best_choice->rating(), word->best_choice->certainty(),
999  new_x_ht_word.best_choice->rating(),
1000  new_x_ht_word.best_choice->certainty());
1001  }
1002  // The misfits must improve and either the rating or certainty.
1003  accept_new_x_ht = new_misfits < original_misfits &&
1004  (new_x_ht_word.best_choice->certainty() >
1005  word->best_choice->certainty() ||
1006  new_x_ht_word.best_choice->rating() <
1007  word->best_choice->rating());
1008  if (debug_x_ht_level >= 1) {
1009  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1010  }
1011  }
1012  if (accept_new_x_ht) {
1013  word->ConsumeWordResults(&new_x_ht_word);
1014  return true;
1015  }
1016  }
1017  return false;
1018 }
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:411
float certainty() const
Definition: ratngs.h:234
float ComputeCompatibleXheight(WERD_RES *word_res)
Definition: fixxht.cpp:96
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:64
#define NULL
Definition: host.h:144
#define f(xc, yc)
Definition: imgscale.cpp:39
WERD * word
Definition: pageres.h:334
float x_height
Definition: pageres.h:431
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void match_word_pass2(WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1098
BlamerBundle * blamer_bundle
Definition: pageres.h:367
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:955
BOOL8 tesseract::Tesseract::uniformly_spaced ( WERD_RES * word)

Definition at line 515 of file fixspace.cpp.

515  {
516  TBOX box;
517  inT16 prev_right = -MAX_INT16;
518  inT16 gap;
519  inT16 max_gap = -MAX_INT16;
520  inT16 max_gap_count = 0;
521  STATS gap_stats(0, MAXSPACING);
522  BOOL8 result;
523  const ROW *row = word->denorm.row();
524  float max_non_space;
525  float normalised_max_nonspace;
526  inT16 i = 0;
527  inT16 offset = 0;
528  STRING punct_chars = "\"`',.:;";
529 
530  for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
531  blob = blob->next) {
532  box = blob->bounding_box();
533  if ((prev_right > -MAX_INT16) &&
534  (!punct_chars.contains(
535  word->best_choice->unichar_string()
536  [offset - word->best_choice->unichar_lengths()[i - 1]]) &&
537  !punct_chars.contains(
538  word->best_choice->unichar_string()[offset]))) {
539  gap = box.left() - prev_right;
540  if (gap < max_gap) {
541  gap_stats.add(gap, 1);
542  } else if (gap == max_gap) {
543  max_gap_count++;
544  } else {
545  if (max_gap_count > 0)
546  gap_stats.add(max_gap, max_gap_count);
547  max_gap = gap;
548  max_gap_count = 1;
549  }
550  }
551  prev_right = box.right();
552  offset += word->best_choice->unichar_lengths()[i++];
553  }
554 
555  max_non_space = (row->space() + 3 * row->kern()) / 4;
556  normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();
557 
558  result = (
559  gap_stats.get_total() == 0 ||
560  max_gap <= normalised_max_nonspace ||
561  (gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
562  (gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
563  #ifndef SECURE_NAMES
564  if ((debug_fix_space_level > 1)) {
565  if (result) {
566  tprintf(
567  "ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
568  "total=%d mean=%f median=%f\n",
569  word->best_choice->unichar_string().string(), normalised_max_nonspace,
570  max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
571  gap_stats.median());
572  } else {
573  tprintf(
574  "REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
575  "total=%d mean=%f median=%f\n",
576  word->best_choice->unichar_string().string(), normalised_max_nonspace,
577  max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
578  gap_stats.median());
579  }
580  }
581  #endif
582 
583  return result;
584 }
TWERD * rebuild_word
Definition: pageres.h:381
const int kBlnXHeight
Definition: normalis.h:27
const STRING & unichar_string() const
Definition: ratngs.h:395
const ROW * row() const
Definition: normalis.h:270
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
inT32 kern() const
Definition: ocrrow.h:67
BOOL8 contains(const char c) const
Definition: strngs.cpp:147
Definition: rect.h:29
#define MAXSPACING
Definition: fixspace.cpp:36
TBLOB * blobs
Definition: blobs.h:274
inT16 right() const
Definition: rect.h:74
inT32 space() const
Definition: ocrrow.h:76
Definition: ocrrow.h:32
Definition: blobs.h:174
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
DENORM denorm
Definition: pageres.h:346
float x_height() const
Definition: ocrrow.h:61
Definition: strngs.h:40
short inT16
Definition: host.h:100
const STRING & unichar_lengths() const
Definition: ratngs.h:402
Definition: statistc.h:29
#define MAX_INT16
Definition: host.h:119
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word,
ROW * row 
)

Definition at line 120 of file docqual.cpp.

120  {
121  if (word->bln_boxes == NULL ||
122  word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
123  return;
124 
125  DocQualCallbacks cb(word);
127  *word->rebuild_word,
129 }
TWERD * rebuild_word
Definition: pageres.h:381
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:229
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:52
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
tesseract::BoxWord * bln_boxes
Definition: pageres.h:343
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 166 of file docqual.cpp.

167  {
168  WERD_RES *word;
169  ROW_RES *current_row;
170  BLOCK_RES *current_block;
171  int i;
172 
173  page_res_it.restart_page ();
174  while (page_res_it.word () != NULL) {
175  check_debug_pt (page_res_it.word (), 100);
176  if (bland_unrej) {
177  word = page_res_it.word ();
178  for (i = 0; i < word->reject_map.length (); i++) {
179  if (word->reject_map[i].accept_if_good_quality ())
180  word->reject_map[i].setrej_quality_accept ();
181  }
182  page_res_it.forward ();
183  }
184  else if ((page_res_it.row ()->char_count > 0) &&
185  ((page_res_it.row ()->rej_count /
186  (float) page_res_it.row ()->char_count) <=
188  word = page_res_it.word ();
192  word->best_choice->unichar_string().string(),
194  != AC_UNACCEPTABLE)) {
195  unrej_good_chs(word, page_res_it.row ()->row);
196  }
197  page_res_it.forward ();
198  }
199  else {
200  /* Skip to end of dodgy row */
201  current_row = page_res_it.row ();
202  while ((page_res_it.word () != NULL) &&
203  (page_res_it.row () == current_row))
204  page_res_it.forward ();
205  }
206  check_debug_pt (page_res_it.word (), 110);
207  }
208  page_res_it.restart_page ();
209  page_res_it.page_res->char_count = 0;
210  page_res_it.page_res->rej_count = 0;
211  current_block = NULL;
212  current_row = NULL;
213  while (page_res_it.word () != NULL) {
214  if (current_block != page_res_it.block ()) {
215  current_block = page_res_it.block ();
216  current_block->char_count = 0;
217  current_block->rej_count = 0;
218  }
219  if (current_row != page_res_it.row ()) {
220  current_row = page_res_it.row ();
221  current_row->char_count = 0;
222  current_row->rej_count = 0;
223  current_row->whole_word_rej_count = 0;
224  }
225  page_res_it.rej_stat_word ();
226  page_res_it.forward ();
227  }
228 }
const STRING & unichar_string() const
Definition: ratngs.h:395
inT32 rej_count
Definition: pageres.h:221
Unacceptable word.
Definition: control.h:37
ROW_RES * row() const
Definition: pageres.h:760
WERD_RES * restart_page()
Definition: pageres.h:713
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
inT32 char_count
Definition: pageres.h:259
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:360
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
const UNICHARSET * uch_set
Definition: pageres.h:348
inT32 rej_count
Definition: pageres.h:260
void rej_stat_word()
Definition: pageres.cpp:1137
const char * string() const
Definition: strngs.cpp:156
WERD_RES * forward()
Definition: pageres.h:737
inT32 whole_word_rej_count
Definition: pageres.h:289
const STRING & unichar_lengths() const
Definition: ratngs.h:402
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:120
inT32 char_count
Definition: pageres.h:287
inT32 length() const
Definition: rejctmap.h:238
inT32 char_count
Definition: pageres.h:220
inT32 rej_count
Definition: pageres.h:288
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1284
PAGE_RES * page_res
Definition: pageres.h:691
ROW * row
Definition: pageres.h:286
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uinT16  mode 
)

Definition at line 50 of file adaptions.cpp.

52  {
54  tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
55  word->best_choice == NULL ? "" :
57  word->best_choice->rating(), word->best_choice->certainty());
58  }
59 
60  BOOL8 status = FALSE;
61  BITS16 flags(mode);
62 
63  enum MODES
64  {
65  ADAPTABLE_WERD,
66  ACCEPTABLE_WERD,
67  CHECK_DAWGS,
68  CHECK_SPACES,
69  CHECK_ONE_ELL_CONFLICT,
70  CHECK_AMBIG_WERD
71  };
72 
73  /*
74  0: NO adaption
75  */
76  if (mode == 0) {
77  if (tessedit_adaption_debug) tprintf("adaption disabled\n");
78  return FALSE;
79  }
80 
81  if (flags.bit (ADAPTABLE_WERD)) {
82  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
83  if (tessedit_adaption_debug && !status) {
84  tprintf("tess_would_adapt bit is false\n");
85  }
86  }
87 
88  if (flags.bit (ACCEPTABLE_WERD)) {
89  status |= word->tess_accepted;
90  if (tessedit_adaption_debug && !status) {
91  tprintf("tess_accepted bit is false\n");
92  }
93  }
94 
95  if (!status) { // If not set then
96  return FALSE; // ignore other checks
97  }
98 
99  if (flags.bit (CHECK_DAWGS) &&
100  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
101  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
102  (word->best_choice->permuter () != USER_DAWG_PERM) &&
103  (word->best_choice->permuter () != NUMBER_PERM)) {
104  if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
105  return FALSE;
106  }
107 
108  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
109  if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
110  return FALSE;
111  }
112 
113  if (flags.bit (CHECK_SPACES) &&
114  (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
115  if (tessedit_adaption_debug) tprintf("word contains spaces\n");
116  return FALSE;
117  }
118 
119 // if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
120  if (flags.bit (CHECK_AMBIG_WERD) &&
121  !getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
122  if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
123  return FALSE;
124  }
125 
126  // Do not adapt to words that are composed from fragments if
127  // tessedit_adapt_to_char_fragments is false.
129  const char *fragment_lengths = word->best_choice->fragment_lengths();
130  if (fragment_lengths != NULL && *fragment_lengths != '\0') {
131  for (int i = 0; i < word->best_choice->length(); ++i) {
132  if (fragment_lengths[i] > 1) {
133  if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
134  return false; // found a character composed from fragments
135  }
136  }
137  }
138  }
139 
141  tprintf("returning status %d\n", status);
142  }
143  return status;
144 }
int length() const
Definition: ratngs.h:214
const STRING & unichar_string() const
Definition: ratngs.h:395
float certainty() const
Definition: ratngs.h:234
unsigned char BOOL8
Definition: host.h:113
#define NULL
Definition: host.h:144
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
Definition: reject.cpp:456
#define FALSE
Definition: capi.h:28
BOOL8 tess_would_adapt
Definition: pageres.h:418
const char * fragment_lengths() const
Definition: ratngs.h:224
Dict & getDict()
Definition: classify.h:62
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
Definition: stopper.cpp:581
uinT8 permuter() const
Definition: ratngs.h:237
const char * string() const
Definition: strngs.cpp:156
CMD_EVENTS mode
Definition: pgedit.cpp:115
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
BOOL8 tess_accepted
Definition: pageres.h:417
Definition: bits16.h:25
float rating() const
Definition: ratngs.h:231
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::word_blank_and_set_display ( BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

Definition at line 711 of file pgedit.cpp.

712  {
715  return word_set_display(block, row, word_res);
716 }
TBOX bounding_box()
Definition: werd.cpp:164
BOOL8 word_set_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:931
void plot(ScrollView *fd) const
Definition: rect.h:278
WERD * word
Definition: pageres.h:334
ScrollView * image_win
Definition: pgedit.cpp:106
BOOL8 tesseract::Tesseract::word_bln_display ( BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

word_bln_display()

Normalize word and display in word window

Definition at line 724 of file pgedit.cpp.

724  {
725  TWERD *bln_word = word_res->chopped_word;
726  if (bln_word == NULL) {
727  word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
729  row, block);
730  bln_word = word_res->chopped_word;
731  }
734  1.0, 0.0f, -1000.0f, 1000.0f);
735  bln_word->plot(bln_word_window_handle());
737  return TRUE;
738 }
bool SetupForTessRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK *block)
Definition: pageres.cpp:272
void Clear()
Definition: scrollview.cpp:590
#define NULL
Definition: host.h:144
Definition: blobs.h:233
#define f(xc, yc)
Definition: imgscale.cpp:39
void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor, float y_offset, float minx, float maxx)
Definition: pgedit.cpp:209
Pix * BestPix() const
static void Update()
Definition: scrollview.cpp:710
UNICHARSET unicharset
Definition: ccutil.h:72
void plot(ScrollView *window)
Definition: blobs.cpp:522
TWERD * chopped_word
Definition: pageres.h:357
ScrollView * bln_word_window_handle()
Definition: pgedit.cpp:171
#define TRUE
Definition: capi.h:27
inT16 tesseract::Tesseract::word_blob_quality ( WERD_RES * word,
ROW * row 
)

Definition at line 68 of file docqual.cpp.

68  {
69  if (word->bln_boxes == NULL ||
70  word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
71  return 0;
72 
73  DocQualCallbacks cb(word);
75  *word->rebuild_word,
77  return cb.match_count;
78 }
TWERD * rebuild_word
Definition: pageres.h:381
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:229
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
tesseract::BoxWord * bln_boxes
Definition: pageres.h:343
void CountMatchingBlobs(int index)
Definition: docqual.cpp:42
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void tesseract::Tesseract::word_char_quality ( WERD_RES * word,
ROW * row,
inT16 * match_count,
inT16 * accepted_match_count 
)

Definition at line 100 of file docqual.cpp.

103  {
104  if (word->bln_boxes == NULL ||
105  word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
106  return;
107 
108  DocQualCallbacks cb(word);
110  *word->rebuild_word,
112  *match_count = cb.match_count;
113  *accepted_match_count = cb.accepted_match_count;
114 }
TWERD * rebuild_word
Definition: pageres.h:381
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:229
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
tesseract::BoxWord * bln_boxes
Definition: pageres.h:343
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:46
BOOL8 tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 673 of file reject.cpp.

674  {
675  inT16 i;
676  inT16 offset;
677 
678  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
679  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
680  (word_lengths[i] != 1 || word[offset] != '1'))
681  return TRUE;
682  }
683  return FALSE;
684 }
#define FALSE
Definition: capi.h:28
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
UNICHARSET unicharset
Definition: ccutil.h:72
short inT16
Definition: host.h:100
#define TRUE
Definition: capi.h:27
CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES * word,
inT16 & delete_mode 
)

Definition at line 904 of file docqual.cpp.

904  {
905  int word_len = word->reject_map.length ();
906  float rating_per_ch;
907  TBOX box; //BB of word
908 
909  if (word->unlv_crunch_mode == CR_NONE) {
910  delete_mode = 0;
911  return CR_NONE;
912  }
913 
914  if (word_len == 0) {
915  delete_mode = 1;
916  return CR_DELETE;
917  }
918 
919  if (word->rebuild_word != NULL) {
920  // Cube leaves rebuild_word NULL.
921  box = word->rebuild_word->bounding_box();
922  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
923  delete_mode = 4;
924  return CR_DELETE;
925  }
926 
927  if (noise_outlines(word->rebuild_word)) {
928  delete_mode = 5;
929  return CR_DELETE;
930  }
931  }
932 
933  if ((failure_count (word) * 1.5) > word_len) {
934  delete_mode = 2;
935  return CR_LOOSE_SPACE;
936  }
937 
938  if (word->best_choice->certainty () < crunch_del_cert) {
939  delete_mode = 7;
940  return CR_LOOSE_SPACE;
941  }
942 
943  rating_per_ch = word->best_choice->rating () / word_len;
944 
945  if (rating_per_ch > crunch_del_rating) {
946  delete_mode = 8;
947  return CR_LOOSE_SPACE;
948  }
949 
951  delete_mode = 9;
952  return CR_LOOSE_SPACE;
953  }
954 
955  if (box.bottom () >
957  delete_mode = 10;
958  return CR_LOOSE_SPACE;
959  }
960 
961  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
962  delete_mode = 11;
963  return CR_LOOSE_SPACE;
964  }
965 
966  if (box.width () < crunch_del_min_width * kBlnXHeight) {
967  delete_mode = 3;
968  return CR_LOOSE_SPACE;
969  }
970 
971  delete_mode = 0;
972  return CR_NONE;
973 }
TWERD * rebuild_word
Definition: pageres.h:381
const int kBlnXHeight
Definition: normalis.h:27
const int kBlnBaselineOffset
Definition: normalis.h:28
float certainty() const
Definition: ratngs.h:234
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:987
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:975
inT16 top() const
Definition: rect.h:53
TBOX bounding_box() const
Definition: blobs.cpp:483
inT32 length() const
Definition: rejctmap.h:238
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:430
inT16 height() const
Definition: rect.h:97
float rating() const
Definition: ratngs.h:231
inT16 bottom() const
Definition: rect.h:60
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::word_display ( BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

word_display() Word Processor

Display a word according to its display modes

Definition at line 747 of file pgedit.cpp.

747  {
748  WERD* word = word_res->word;
749  TBOX word_bb; // word bounding box
750  int word_height; // ht of word BB
751  BOOL8 displayed_something = FALSE;
752  float shift; // from bot left
753  C_BLOB_IT c_it; // cblob iterator
754 
755  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
756  BoxWord* box_word = word_res->box_word;
757  int length = box_word->length();
758  if (word_res->fontinfo == NULL) return false;
759  const FontInfo& font_info = *word_res->fontinfo;
760  for (int i = 0; i < length; ++i) {
761  ScrollView::Color color = ScrollView::GREEN;
762  switch (color_mode) {
763  case CM_SUBSCRIPT:
764  if (box_word->BlobPosition(i) == SP_SUBSCRIPT)
765  color = ScrollView::RED;
766  break;
767  case CM_SUPERSCRIPT:
768  if (box_word->BlobPosition(i) == SP_SUPERSCRIPT)
769  color = ScrollView::RED;
770  break;
771  case CM_ITALIC:
772  if (font_info.is_italic())
773  color = ScrollView::RED;
774  break;
775  case CM_BOLD:
776  if (font_info.is_bold())
777  color = ScrollView::RED;
778  break;
779  case CM_FIXEDPITCH:
780  if (font_info.is_fixed_pitch())
781  color = ScrollView::RED;
782  break;
783  case CM_SERIF:
784  if (font_info.is_serif())
785  color = ScrollView::RED;
786  break;
787  case CM_SMALLCAPS:
788  if (word_res->small_caps)
789  color = ScrollView::RED;
790  break;
791  case CM_DROPCAPS:
792  if (box_word->BlobPosition(i) == SP_DROPCAP)
793  color = ScrollView::RED;
794  break;
795  // TODO(rays) underline is currently completely unsupported.
796  case CM_UNDERLINE:
797  default:
798  break;
799  }
800  image_win->Pen(color);
801  TBOX box = box_word->BlobBox(i);
802  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
803  }
804  return true;
805  }
806  /*
807  Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color)
808  etc. are to keep the compiler happy.
809  */
810  // display bounding box
811  if (word->display_flag(DF_BOX)) {
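812  word->bounding_box().plot(image_win,
813  (ScrollView::Color)((inT32)
814  editor_image_word_bb_color),
815  (ScrollView::Color)((inT32)
816  editor_image_word_bb_color));
817 
818  ScrollView::Color c = (ScrollView::Color)
819  ((inT32) editor_image_blob_bb_color);
820  image_win->Pen(c);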
821  c_it.set_to_list(word->cblob_list());
822  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
823  c_it.data()->bounding_box().plot(image_win);
824  displayed_something = TRUE;
825  }
826 
827  // display edge steps
828  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
829  word->plot(image_win); // rainbow colors
830  displayed_something = TRUE;
831  }
832 
833  // display poly approx
834  if (word->display_flag(DF_POLYGONAL)) {
835  // need to convert
836  TWERD* tword = TWERD::PolygonalCopy(word);
837  tword->plot(image_win);
838  delete tword;
839  displayed_something = TRUE;
840  }
841 
842  // Display correct text and blamer information.
843  STRING text;
844  STRING blame;
845  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
846  text = word->text();
847  }
848  if (word->display_flag(DF_BLAMER) &&
849  !(word_res->blamer_bundle != NULL &&
850  word_res->blamer_bundle->incorrect_result_reason == IRR_CORRECT)) {
851  text = "";
852  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
853  if (blamer_bundle == NULL) {
854  text += "NULL";
855  } else {
856  for (int i = 0; i < blamer_bundle->truth_text.length(); ++i) {
857  text += blamer_bundle->truth_text[i];
858  }
859  }
860  text += " -> ";
861  STRING best_choice_str;
862  if (word_res->best_choice == NULL) {
863  best_choice_str = "NULL";
864  } else {
865  word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
866  }
867  text += best_choice_str;
868  IncorrectResultReason reason = (blamer_bundle == NULL) ?
869  IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason;
870  ASSERT_HOST(reason < IRR_NUM_REASONS)
871  blame += " [";
872  blame += BlamerBundle::IncorrectReasonName(reason);
873  blame += "]";
874  }
875  if (text.length() > 0) {
876  word_bb = word->bounding_box();
878  word_height = word_bb.height();
879  int text_height = 0.50 * word_height;
880  if (text_height > 20) text_height = 20;
881  image_win->TextAttributes("Arial", text_height, false, false, false);
882  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
883  image_win->Text(word_bb.left() + shift,
884  word_bb.bottom() + 0.25 * word_height, text.string());
885  if (blame.length() > 0) {
886  image_win->Text(word_bb.left() + shift,
887  word_bb.bottom() + 0.25 * word_height - text_height,
888  blame.string());
889  }
890 
891  displayed_something = TRUE;
892  }
893 
894  if (!displayed_something) // display BBox anyway
895  word->bounding_box().plot(image_win,
896  (ScrollView::Color)((inT32) editor_image_word_bb_color),
897  (ScrollView::Color)((inT32)
898  editor_image_word_bb_color));
899  return TRUE;
900 }
const int length() const
Definition: boxword.h:99
TBOX bounding_box()
Definition: werd.cpp:164
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
static TWERD * PolygonalCopy(WERD *src)
Definition: blobs.cpp:405
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
bool is_serif() const
Definition: fontinfo.h:87
const FontInfo * fontinfo
Definition: pageres.h:424
inT32 length() const
Definition: strngs.cpp:151
void Pen(Color color)
Definition: scrollview.cpp:721
unsigned char BOOL8
Definition: host.h:113
bool is_italic() const
Definition: fontinfo.h:84
#define NULL
Definition: host.h:144
Definition: blobs.h:233
inT16 left() const
Definition: rect.h:67
void plot(ScrollView *fd) const
Definition: rect.h:278
int inT32
Definition: host.h:102
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
#define f(xc, yc)
Definition: imgscale.cpp:39
const char * text() const
Definition: werd.h:119
#define FALSE
Definition: capi.h:28
inT16 right() const
Definition: rect.h:74
BOOL8 display_flag(uinT8 flag) const
Definition: werd.h:125
bool is_fixed_pitch() const
Definition: fontinfo.h:86
Definition: werd.h:51
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:636
bool is_bold() const
Definition: fontinfo.h:85
WERD * word
Definition: pageres.h:334
IncorrectResultReason
Definition: pageres.h:45
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:601
GenericVector< STRING > truth_text
Definition: pageres.h:174
int editor_image_word_bb_color
Definition: pgedit.cpp:135
const char * string() const
Definition: strngs.cpp:156
inT16 top() const
Definition: rect.h:53
int editor_image_blob_bb_color
Definition: pgedit.cpp:137
ScrollView * image_win
Definition: pgedit.cpp:106
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:653
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: pageres.cpp:55
Definition: werd.h:55
Definition: strngs.h:40
Definition: werd.h:60
void plot(ScrollView *window)
Definition: blobs.cpp:522
int length() const
Definition: genericvector.h:63
tesseract::BoxWord * box_word
Definition: pageres.h:387
#define ASSERT_HOST(x)
Definition: errcode.h:84
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:294
inT16 height() const
Definition: rect.h:97
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:287
Definition: werd.h:50
BlamerBundle * blamer_bundle
Definition: pageres.h:367
bool small_caps
Definition: pageres.h:420
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359
inT16 bottom() const
Definition: rect.h:60
BOOL8 tesseract::Tesseract::word_dumper ( BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

word_dumper()

Dump members to the debug window

Definition at line 908 of file pgedit.cpp.

908  {
909  if (block != NULL) {
910  tprintf("\nBlock data...\n");
911  block->print(NULL, FALSE);
912  }
913  tprintf("\nRow data...\n");
914  row->print(NULL);
915  tprintf("\nWord data...\n");
916  word_res->word->print();
917  if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
918  word_res->blamer_bundle->incorrect_result_reason != IRR_CORRECT) {
919  tprintf("Current blamer debug: %s\n",
920  word_res->blamer_bundle->debug.string());
921  }
922  return TRUE;
923 }
void print(FILE *fp)
Definition: ocrrow.cpp:157
IncorrectResultReason incorrect_result_reason
Definition: pageres.h:176
void print(FILE *fp, BOOL8 dump)
dump whole table
Definition: ocrblock.cpp:185
#define NULL
Definition: host.h:144
#define FALSE
Definition: capi.h:28
WERD * word
Definition: pageres.h:334
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool wordrec_debug_blamer
Definition: wordrec.h:142
STRING debug
Definition: pageres.h:178
void print()
Definition: werd.cpp:256
BlamerBundle * blamer_bundle
Definition: pageres.h:367
#define TRUE
Definition: capi.h:27
inT16 tesseract::Tesseract::word_outline_errs ( WERD_RES * word)

Definition at line 80 of file docqual.cpp.

80  {
81  inT16 i = 0;
82  inT16 err_count = 0;
83 
84  if (word->rebuild_word != NULL) {
85  TBLOB* blob = word->rebuild_word->blobs;
86  for (; blob != NULL; blob = blob->next) {
87  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
88  blob->NumOutlines());
89  i++;
90  }
91  }
92  return err_count;
93 }
TWERD * rebuild_word
Definition: pageres.h:381
const STRING & unichar_string() const
Definition: ratngs.h:395
int NumOutlines() const
Definition: blobs.cpp:371
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
Definition: blobs.h:174
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:131
short inT16
Definition: host.h:100
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
BOOL8 tesseract::Tesseract::word_set_display ( BLOCK * block,
ROW * row,
WERD_RES * word_res 
)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 931 of file pgedit.cpp.

931  {
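932  WERD* word = word_res->word;
933  word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
934  word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
935  word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
936  word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP));
937  word->set_display_flag(DF_BN_POLYGONAL,
938  word_display_mode.bit(DF_BN_POLYGONAL));
939  word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));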
940  return word_display(block, row, word_res);
941 }
Definition: werd.h:51
WERD * word
Definition: pageres.h:334
BOOL8 word_display(BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: pgedit.cpp:747
Definition: werd.h:55
Definition: werd.h:60
BOOL8 bit(uinT8 bit_num) const
Definition: bits16.h:56
BITS16 word_display_mode
Definition: pgedit.cpp:121
Definition: werd.h:50
void set_display_flag(uinT8 flag, BOOL8 value)
Definition: werd.h:126
inT16 tesseract::Tesseract::worst_noise_blob ( WERD_RES * word_res,
float *  worst_noise_score 
)

Definition at line 764 of file fixspace.cpp.

765  {
766  float noise_score[512];
767  int i;
768  int min_noise_blob; // 1st contender
769  int max_noise_blob; // last contender
770  int non_noise_count;
771  int worst_noise_blob; // Worst blob
772  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
773  float non_noise_limit = kBlnXHeight * 0.8;
774 
775  if (word_res->rebuild_word == NULL)
776  return -1; // Can't handle cube words.
777 
778  TBLOB* blob = word_res->rebuild_word->blobs;
779  // Normalised.
780  int blob_count = word_res->box_word->length();
781  ASSERT_HOST(blob_count <= 512);
782  if (blob_count < 5)
783  return -1; // too short to split
784 
785  /* Get the noise scores for all blobs */
786 
787  #ifndef SECURE_NAMES
788  if (debug_fix_space_level > 5)
789  tprintf("FP fixspace Noise metrics for \"%s\": ",
790  word_res->best_choice->unichar_string().string());
791  #endif
792 
793  for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
794  if (word_res->reject_map[i].accepted())
795  noise_score[i] = non_noise_limit;
796  else
797  noise_score[i] = blob_noise_score(blob);
798 
799  if (debug_fix_space_level > 5)
800  tprintf("%1.1f ", noise_score[i]);
801  }
802  if (debug_fix_space_level > 5)
803  tprintf("\n");
804 
805  /* Now find the worst one which is far enough away from the end of the word */
806 
807  non_noise_count = 0;
808  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
809  if (noise_score[i] >= non_noise_limit) {
810  non_noise_count++;
811  }
812  }
813  if (non_noise_count < fixsp_non_noise_limit)
814  return -1;
815 
816  min_noise_blob = i;
817 
818  non_noise_count = 0;
819  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
820  i--) {
821  if (noise_score[i] >= non_noise_limit) {
822  non_noise_count++;
823  }
824  }
825  if (non_noise_count < fixsp_non_noise_limit)
826  return -1;
827 
828  max_noise_blob = i;
829 
830  if (min_noise_blob > max_noise_blob)
831  return -1;
832 
833  *worst_noise_score = small_limit;
834  worst_noise_blob = -1;
835  for (i = min_noise_blob; i <= max_noise_blob; i++) {
836  if (noise_score[i] < *worst_noise_score) {
837  worst_noise_blob = i;
838  *worst_noise_score = noise_score[i];
839  }
840  }
841  return worst_noise_blob;
842 }
TWERD * rebuild_word
Definition: pageres.h:381
const int kBlnXHeight
Definition: normalis.h:27
const STRING & unichar_string() const
Definition: ratngs.h:395
const int length() const
Definition: boxword.h:99
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:844
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:764
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
TBLOB * blobs
Definition: blobs.h:274
Definition: blobs.h:174
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
tesseract::BoxWord * box_word
Definition: pageres.h:387
#define ASSERT_HOST(x)
Definition: errcode.h:84
TBLOB * next
Definition: blobs.h:228
WERD_CHOICE * best_choice
Definition: pageres.h:359
void tesseract::Tesseract::write_results ( PAGE_RES_IT page_res_it,
char  newline_type,
BOOL8  force_eol 
)

Definition at line 138 of file output.cpp.

140  { // override tilde crunch?
141  WERD_RES *word = page_res_it.word();
142  const UNICHARSET &uchset = *word->uch_set;
143  STRING repetition_code;
144  const STRING *wordstr;
145  STRING wordstr_lengths;
146  int i;
147  char unrecognised = STRING (unrecognised_char)[0];
148  char ep_chars[32]; //Only for unlv_tilde_crunch
149  int ep_chars_index = 0;
150  char txt_chs[32]; //Only for unlv_tilde_crunch
151  char map_chs[32]; //Only for unlv_tilde_crunch
152  int txt_index = 0;
153  BOOL8 need_reject = FALSE;
154  UNICHAR_ID space = uchset.unichar_to_id(" ");
155  if ((word->unlv_crunch_mode != CR_NONE ||
156  word->best_choice->length() == 0) &&
158  if ((word->unlv_crunch_mode != CR_DELETE) &&
159  (!stats_.tilde_crunch_written ||
160  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
161  (word->word->space () > 0) &&
162  !word->word->flag (W_FUZZY_NON) &&
163  !word->word->flag (W_FUZZY_SP)))) {
164  if (!word->word->flag (W_BOL) &&
165  (word->word->space () > 0) &&
166  !word->word->flag (W_FUZZY_NON) &&
167  !word->word->flag (W_FUZZY_SP)) {
168  // Write a space to separate from preceeding good text.
169  txt_chs[txt_index] = ' ';
170  map_chs[txt_index++] = '1';
171  ep_chars[ep_chars_index++] = ' ';
172  stats_.last_char_was_tilde = false;
173  }
174  need_reject = TRUE;
175  }
176  if ((need_reject && !stats_.last_char_was_tilde) ||
177  (force_eol && stats_.write_results_empty_block)) {
178  /* Write a reject char - mark as rejected unless zero_rejection mode */
179  stats_.last_char_was_tilde = TRUE;
180  txt_chs[txt_index] = unrecognised;
181  if (tessedit_zero_rejection || (suspect_level == 0)) {
182  map_chs[txt_index++] = '1';
183  ep_chars[ep_chars_index++] = unrecognised;
184  }
185  else {
186  map_chs[txt_index++] = '0';
187  /*
188  The ep_choice string is a faked reject to allow newdiff to sync the
189  .etx with the .txt and .map files.
190  */
191  ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
192  //dummy reject
193  ep_chars[ep_chars_index++] = 1;
194  //dummy reject
195  ep_chars[ep_chars_index++] = 1;
196  //type
197  ep_chars[ep_chars_index++] = 2;
198  //dummy reject
199  ep_chars[ep_chars_index++] = 1;
200  //dummy reject
201  ep_chars[ep_chars_index++] = 1;
202  }
203  stats_.tilde_crunch_written = true;
204  stats_.last_char_was_newline = false;
205  stats_.write_results_empty_block = false;
206  }
207 
208  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
209  /* Add a new line output */
210  txt_chs[txt_index] = '\n';
211  map_chs[txt_index++] = '\n';
212  //end line
213  ep_chars[ep_chars_index++] = newline_type;
214 
215  //Cos of the real newline
216  stats_.tilde_crunch_written = false;
217  stats_.last_char_was_newline = true;
218  stats_.last_char_was_tilde = false;
219  }
220  txt_chs[txt_index] = '\0';
221  map_chs[txt_index] = '\0';
222  ep_chars[ep_chars_index] = '\0'; // terminate string
223  word->ep_choice = new WERD_CHOICE(ep_chars, uchset);
224 
225  if (force_eol)
226  stats_.write_results_empty_block = true;
227  return;
228  }
229 
230  /* NORMAL PROCESSING of non tilde crunched words */
231 
232  stats_.tilde_crunch_written = false;
233  if (newline_type)
234  stats_.last_char_was_newline = true;
235  else
236  stats_.last_char_was_newline = false;
237  stats_.write_results_empty_block = force_eol; // about to write a real word
238 
239  if (unlv_tilde_crunching &&
240  stats_.last_char_was_tilde &&
241  (word->word->space() == 0) &&
243  (word->best_choice->unichar_id(0) == space)) {
244  /* Prevent adjacent tilde across words - we know that adjacent tildes within
245  words have been removed */
246  word->best_choice->remove_unichar_id(0);
247  if (word->best_choice->blob_choices() != NULL) {
248  BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
249  if (!blob_choices_it.empty()) delete blob_choices_it.extract();
250  }
251  word->reject_map.remove_pos (0);
252  word->box_word->DeleteBox(0);
253  }
254  if (newline_type ||
256  stats_.last_char_was_tilde = false;
257  else {
258  if (word->reject_map.length () > 0) {
259  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
260  stats_.last_char_was_tilde = true;
261  else
262  stats_.last_char_was_tilde = false;
263  }
264  else if (word->word->space () > 0)
265  stats_.last_char_was_tilde = false;
266  /* else it is unchanged as there are no output chars */
267  }
268 
269  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
270 
271  set_unlv_suspects(word);
272  check_debug_pt (word, 120);
274  tprintf ("Dict word: \"%s\": %d\n",
275  word->best_choice->debug_string().string(),
276  dict_word(*(word->best_choice)));
277  }
278  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
279  repetition_code = "|^~R";
280  wordstr_lengths = "\001\001\001\001";
281  repetition_code += uchset.id_to_unichar(get_rep_char(word));
282  wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
283  wordstr = &repetition_code;
284  } else {
286  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
287  for (i = 0; i < word->best_choice->length(); ++i) {
288  if (word->reject_map[i].rejected())
289  word->reject_map[i].setrej_minimal_rej_accept();
290  }
291  }
293  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
294  for (i = 0; i < word->best_choice->length(); ++i) {
295  if ((word->best_choice->unichar_id(i) != space) &&
296  word->reject_map[i].rejected())
297  word->reject_map[i].setrej_minimal_rej_accept();
298  }
299  }
300  }
301 }
int length() const
Definition: ratngs.h:214
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
BLOB_CHOICE_LIST_CLIST * blob_choices()
Definition: ratngs.h:244
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const STRING debug_string() const
Definition: ratngs.h:373
unsigned char BOOL8
Definition: host.h:113
REJMAP reject_map
Definition: pageres.h:408
#define NULL
Definition: host.h:144
uinT8 space()
Definition: werd.h:104
#define FALSE
Definition: capi.h:28
void DeleteBox(int index)
Definition: boxword.cpp:205
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:349
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1388
WERD_RES * word() const
Definition: pageres.h:757
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:122
void remove_pos(inT16 pos)
Definition: rejctmap.cpp:371
WERD * word
Definition: pageres.h:334
const UNICHARSET * uch_set
Definition: pageres.h:348
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:371
Definition: werd.h:35
const char * string() const
Definition: strngs.cpp:156
WERD_CHOICE * ep_choice
Definition: pageres.h:407
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:133
void remove_unichar_id(int index)
Definition: ratngs.h:357
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: strngs.h:40
tesseract::BoxWord * box_word
Definition: pageres.h:387
#define CTRL_INSET
Definition: output.cpp:46
inT32 length() const
Definition: rejctmap.h:238
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: werd.h:36
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:430
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:220
#define TRUE
Definition: capi.h:27
WERD_CHOICE * best_choice
Definition: pageres.h:359

Member Data Documentation

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 693 of file tesseractclass.h.

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"

Definition at line 698 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."

Definition at line 702 of file tesseractclass.h.

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."

Definition at line 705 of file tesseractclass.h.

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 694 of file tesseractclass.h.

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 692 of file tesseractclass.h.

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 787 of file tesseractclass.h.

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 725 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 726 of file tesseractclass.h.

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 727 of file tesseractclass.h.

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 878 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 814 of file tesseractclass.h.

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 823 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 803 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 808 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 809 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 805 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 804 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 806 of file tesseractclass.h.

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating gt this"

Definition at line 802 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 793 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 792 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 817 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Dont pot crunch sensible strings"

Definition at line 816 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Dont crunch words with long lower case strings"

Definition at line 819 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Dont touch sensible strings"

Definition at line 813 of file tesseractclass.h.

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Dont crunch words with long upper case strings"

Definition at line 821 of file tesseractclass.h.

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 822 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 797 of file tesseractclass.h.

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 798 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 801 of file tesseractclass.h.

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 812 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 800 of file tesseractclass.h.

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 799 of file tesseractclass.h.

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 811 of file tesseractclass.h.

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 810 of file tesseractclass.h.

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 795 of file tesseractclass.h.

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 794 of file tesseractclass.h.

int tesseract::Tesseract::cube_debug_level = 1

"Print cube debug info."

Definition at line 751 of file tesseractclass.h.

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 724 of file tesseractclass.h.

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 829 of file tesseractclass.h.

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 723 of file tesseractclass.h.

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 755 of file tesseractclass.h.

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 885 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 828 of file tesseractclass.h.

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 825 of file tesseractclass.h.

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 826 of file tesseractclass.h.

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 884 of file tesseractclass.h.

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 894 of file tesseractclass.h.

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq than this"

Definition at line 879 of file tesseractclass.h.

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 831 of file tesseractclass.h.

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 684 of file tesseractclass.h.

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 877 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 753 of file tesseractclass.h.

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 752 of file tesseractclass.h.

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 680 of file tesseractclass.h.

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 750 of file tesseractclass.h.

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 729 of file tesseractclass.h.

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 732 of file tesseractclass.h.

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 733 of file tesseractclass.h.

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 731 of file tesseractclass.h.

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 728 of file tesseractclass.h.

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 789 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Dont double check"

Definition at line 868 of file tesseractclass.h.

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 867 of file tesseractclass.h.

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 873 of file tesseractclass.h.

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 11l conf. detector"

Definition at line 866 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 871 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 872 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 869 of file tesseractclass.h.

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 870 of file tesseractclass.h.

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 874 of file tesseractclass.h.

bool tesseract::Tesseract::save_blob_choices = false

"Save the results of the recognition step (blob_choices) within the corresponding WERD_CHOICE"

Definition at line 746 of file tesseractclass.h.

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 850 of file tesseractclass.h.

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 848 of file tesseractclass.h.

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 843 of file tesseractclass.h.

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 849 of file tesseractclass.h.

int tesseract::Tesseract::suspect_short_words = 2

"Don't suspect dict wds longer than this"

Definition at line 847 of file tesseractclass.h.

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 845 of file tesseractclass.h.

int tesseract::Tesseract::tessdata_manager_debug_level = 0

"Debug level for TessdataManager functions."

Definition at line 888 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_adapt_to_char_fragments = true

"Adapt to words that contain a character composed from fragments"

Definition at line 689 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 691 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 676 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram correction."

Definition at line 722 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 672 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 674 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 857 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 880 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 840 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 718 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 784 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 717 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 786 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 706 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 773 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 775 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 708 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 662 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_bigram_correction = false

"Enable correction based on the word bigram dictionary."

Definition at line 720 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 716 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 710 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 713 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 861 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 781 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 757 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 875 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is not going to be used for OCR but, say, only for layout analysis."

Definition at line 899 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 890 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 863 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 660 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 741 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 739 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 851 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY

"Which OCR engine(s) to run (Tesseract, Cube, both). Defaults to loading and running only Tesseract (no Cube, no combiner). (Values from OcrEngineMode enum in tesseractclass.h)"

Definition at line 670 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_ok_mode = 5

"Acceptance decision algorithm"

Definition at line 859 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 886 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 882 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block, 5=line, 6=word, 7=char (Values from PageSegMode enum in publictypes.h)"

Definition at line 666 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 827 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 769 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 777 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 771 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 714 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 783 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 762 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 760 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 858 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 764 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Adaption debug"

Definition at line 860 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 654 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 656 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 779 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_tess_adapt_to_rejmap = false

"Use reject map to control Tesseract adaption"

Definition at line 735 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 737 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 740 of file tesseractclass.h.

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 743 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 658 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_training_tess = false

"Call Tess to learn blobs"

Definition at line 707 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 712 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 865 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 758 of file tesseractclass.h.

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects which prevents whole row rejection"

Definition at line 767 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 854 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 836 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 883 of file tesseractclass.h.

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 686 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 838 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 839 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 856 of file tesseractclass.h.

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 852 of file tesseractclass.h.

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 747 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 748 of file tesseractclass.h.

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 749 of file tesseractclass.h.

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 900 of file tesseractclass.h.

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 895 of file tesseractclass.h.

bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE

"Use CJK fixed pitch model"

Definition at line 896 of file tesseractclass.h.

bool tesseract::Tesseract::unlv_tilde_crunching = true

"Mark v.bad words for tilde crunch"

Definition at line 791 of file tesseractclass.h.

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 842 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 833 of file tesseractclass.h.

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 834 of file tesseractclass.h.


The documentation for this class was generated from the following files: