Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (OSResults *, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
void get_script ()
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 91 of file osdetect.h.

Constructor & Destructor Documentation

ScriptDetector::ScriptDetector ( OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 419 of file osdetect.cpp.

419  {
420  osr_ = osr;
421  tess_ = tess;
422  katakana_id_ = tess_->unicharset.add_script(katakana_script);
423  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
424  han_id_ = tess_->unicharset.add_script(han_script);
425  hangul_id_ = tess_->unicharset.add_script(hangul_script);
426  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
427  korean_id_ = tess_->unicharset.add_script(korean_script_);
428  latin_id_ = tess_->unicharset.add_script(latin_script);
429  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
430 }
int add_script(const char *script)
Definition: unicharset.cpp:866
UNICHARSET unicharset
Definition: ccutil.h:72

Member Function Documentation

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 435 of file osdetect.cpp.

435  {
436  bool done[kMaxNumberOfScripts];
437  for (int i = 0; i < 4; ++i) {
438  for (int j = 0; j < kMaxNumberOfScripts; ++j)
439  done[j] = false;
440 
441  BLOB_CHOICE_IT choice_it;
442  choice_it.set_to_list(scores + i);
443 
444  float prev_score = -1;
445  int script_count = 0;
446  int prev_id = -1;
447  int prev_script;
448  int prev_class_id = -1;
449  int prev_fontinfo_id = -1;
450  const char* prev_unichar = "";
451  const char* unichar = "";
452  float next_best_score = -1.0;
453  int next_best_script_id = -1;
454  const char* next_best_unichar = "";
455 
456  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
457  choice_it.forward()) {
458  BLOB_CHOICE* choice = choice_it.data();
459  int id = choice->script_id();
460  // Script already processed before.
461  if (done[id]) continue;
462  done[id] = true;
463 
464  unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
465  // Save data from the first match
466  if (prev_score < 0) {
467  prev_score = -choice->certainty();
468  script_count = 1;
469  prev_id = id;
470  prev_script = choice->script_id();
471  prev_unichar = unichar;
472  prev_class_id = choice->unichar_id();
473  prev_fontinfo_id = choice->fontinfo_id();
474  } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
475  ++script_count;
476  next_best_score = -choice->certainty();
477  next_best_script_id = choice->script_id();
478  next_best_unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
479  }
480 
481  if (strlen(prev_unichar) == 1)
482  if (unichar[0] >= '0' && unichar[0] <= '9')
483  break;
484 
485  // if script_count is >= 2, character is ambiguous, skip other matches
486  // since they are useless.
487  if (script_count >= 2)
488  break;
489  }
490  // Character is non ambiguous
491  if (script_count == 1) {
492  // Update the score of the winning script
493  osr_->scripts_na[i][prev_id] += 1.0;
494 
495  // Workaround for Fraktur
496  if (prev_id == latin_id_) {
497  if (prev_fontinfo_id >= 0) {
498  const tesseract::FontInfo &fi =
499  tess_->get_fontinfo_table().get(prev_fontinfo_id);
500  //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
501  // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
502  // fi.is_serif(), fi.is_fraktur(),
503  // prev_unichar);
504  if (fi.is_fraktur()) {
505  osr_->scripts_na[i][prev_id] -= 1.0;
506  osr_->scripts_na[i][fraktur_id_] += 1.0;
507  }
508  }
509  }
510 
511  // Update Japanese / Korean pseudo-scripts
512  if (prev_id == katakana_id_)
513  osr_->scripts_na[i][japanese_id_] += 1.0;
514  if (prev_id == hiragana_id_)
515  osr_->scripts_na[i][japanese_id_] += 1.0;
516  if (prev_id == hangul_id_)
517  osr_->scripts_na[i][korean_id_] += 1.0;
518  if (prev_id == han_id_)
519  osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
520  if (prev_id == han_id_)
521  osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
522  }
523  } // iterate over each orientation
524 }
const float kHanRatioInKorean
Definition: osdetect.cpp:45
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
bool is_fraktur() const
Definition: fontinfo.h:88
inT16 fontinfo_id() const
Definition: ratngs.h:68
UNICHAR_ID unichar_id() const
Definition: ratngs.h:59
const float kNonAmbiguousMargin
Definition: osdetect.cpp:48
const float kHanRatioInJapanese
Definition: osdetect.cpp:46
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:76
float certainty() const
Definition: ratngs.h:65
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:336
UNICHARSET unicharset
Definition: ccutil.h:72
const int kMaxNumberOfScripts
Definition: osdetect.h:36
int script_id() const
Definition: ratngs.h:74

void ScriptDetector::get_script ( )

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 526 of file osdetect.cpp.

526  {
527  osr_->update_best_script(orientation);
528  return osr_->best_result.sconfidence > 1;
529 }
float sconfidence
Definition: osdetect.h:43
OSBestResult best_result
Definition: osdetect.h:79
void update_best_script(int orientation_id)
Definition: osdetect.cpp:94

The documentation for this class was generated from the following files: