Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::UnicharAmbigs Class Reference

#include <ambigs.h>

Public Member Functions

 UnicharAmbigs ()
 
 ~UnicharAmbigs ()
 
const UnicharAmbigsVector & dang_ambigs () const
 
const UnicharAmbigsVector & replace_ambigs () const
 
void LoadUnicharAmbigs (FILE *ambigs_file, inT64 end_offset, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
 
const UnicharIdVector * OneToOneDefiniteAmbigs (UNICHAR_ID unichar_id) const
 
const UnicharIdVector * AmbigsForAdaption (UNICHAR_ID unichar_id) const
 
const UnicharIdVector * ReverseAmbigsForAdaption (UNICHAR_ID unichar_id) const
 

Detailed Description

Definition at line 141 of file ambigs.h.

Constructor & Destructor Documentation

tesseract::UnicharAmbigs::UnicharAmbigs ( )
inline

Definition at line 143 of file ambigs.h.

143 {}
tesseract::UnicharAmbigs::~UnicharAmbigs ( )
inline

Definition at line 144 of file ambigs.h.

144  {
145  replace_ambigs_.delete_data_pointers();
146  dang_ambigs_.delete_data_pointers();
147  one_to_one_definite_ambigs_.delete_data_pointers();
148  }
void delete_data_pointers()

Member Function Documentation

const UnicharIdVector* tesseract::UnicharAmbigs::AmbigsForAdaption ( UNICHAR_ID  unichar_id) const
inline

Definition at line 178 of file ambigs.h.

179  {
180  if (ambigs_for_adaption_.empty()) return NULL;
181  return ambigs_for_adaption_[unichar_id];
182  }
#define NULL
Definition: host.h:144
const UnicharAmbigsVector& tesseract::UnicharAmbigs::dang_ambigs ( ) const
inline

Definition at line 150 of file ambigs.h.

150 { return dang_ambigs_; }
void tesseract::UnicharAmbigs::LoadUnicharAmbigs ( FILE *  ambigs_file,
inT64  end_offset,
int  debug_level,
bool  use_ambigs_for_adaption,
UNICHARSET *  unicharset 
)

Definition at line 44 of file ambigs.cpp.

48  {
49  int i, j;
50  UnicharIdVector *adaption_ambigs_entry;
51  for (i = 0; i < unicharset->size(); ++i) {
52  replace_ambigs_.push_back(NULL);
53  dang_ambigs_.push_back(NULL);
54  one_to_one_definite_ambigs_.push_back(NULL);
55  if (use_ambigs_for_adaption) {
56  ambigs_for_adaption_.push_back(NULL);
57  reverse_ambigs_for_adaption_.push_back(NULL);
58  }
59  }
60  if (debug_level) tprintf("Reading ambiguities\n");
61 
62  int TestAmbigPartSize;
63  int ReplacementAmbigPartSize;
64  // Maximum line size:
65  // 10 for sizes of ambigs, tabs, ambig type and newline
66  // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
67  // The space for buffer is allocated on the heap to avoid
68  // GCC frame size warning.
69  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
70  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
71  char *buffer = new char[kBufferSize];
72  char ReplacementString[kMaxAmbigStringSize];
73  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
74  int line_num = 0;
75  int type = NOT_AMBIG;
76 
77  // Determine the version of the ambigs file.
78  int version = 0;
79  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
80  strlen(buffer) > 0);
81  if (*buffer == 'v') {
82  version = static_cast<int>(strtol(buffer+1, NULL, 10));
83  ++line_num;
84  } else {
85  rewind(AmbigFile);
86  }
87  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
88  fgets(buffer, kBufferSize, AmbigFile) != NULL) {
89  chomp_string(buffer);
90  if (debug_level > 2) tprintf("read line %s\n", buffer);
91  ++line_num;
92  if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
93  buffer, &TestAmbigPartSize, TestUnicharIds,
94  &ReplacementAmbigPartSize,
95  ReplacementString, &type)) continue;
96  // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
97  AmbigSpec *ambig_spec = new AmbigSpec();
98  InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
99  TestAmbigPartSize, TestUnicharIds,
100  ReplacementAmbigPartSize, ReplacementString, type,
101  ambig_spec, unicharset);
102 
103  // Update one_to_one_definite_ambigs_.
104  if (TestAmbigPartSize == 1 &&
105  ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
106  if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
107  one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
108  }
109  one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
110  ambig_spec->correct_ngram_id);
111  }
112  // Update ambigs_for_adaption_.
113  if (use_ambigs_for_adaption) {
114  for (i = 0; i < TestAmbigPartSize; ++i) {
115  if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
116  ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
117  }
118  adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
119  const char *tmp_ptr = ReplacementString;
120  const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
121  int step = unicharset->step(tmp_ptr);
122  while (step > 0) {
123  UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
124  ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
125  // Add the new unichar id to adaption_ambigs_entry (only if the
126  // vector does not already contain it) keeping it in sorted order.
127  for (j = 0; j < adaption_ambigs_entry->size() &&
128  (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
129  if (j < adaption_ambigs_entry->size()) {
130  if ((*adaption_ambigs_entry)[j] != id_to_insert) {
131  adaption_ambigs_entry->insert(id_to_insert, j);
132  }
133  } else {
134  adaption_ambigs_entry->push_back(id_to_insert);
135  }
136  // Update tmp_ptr and step.
137  tmp_ptr += step;
138  step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
139  }
140  }
141  }
142  }
143  delete[] buffer;
144 
145  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
146  if (use_ambigs_for_adaption) {
147  for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
148  adaption_ambigs_entry = ambigs_for_adaption_[i];
149  if (adaption_ambigs_entry == NULL) continue;
150  for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
151  UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
152  if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
153  reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
154  }
155  reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
156  }
157  }
158  }
159 
160  // Print what was read from the input file.
161  if (debug_level > 1) {
162  for (int tbl = 0; tbl < 2; ++tbl) {
163  const UnicharAmbigsVector &print_table =
164  (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
165  for (i = 0; i < print_table.size(); ++i) {
166  AmbigSpec_LIST *lst = print_table[i];
167  if (lst == NULL) continue;
168  if (!lst->empty()) {
169  tprintf("%s Ambiguities for %s:\n",
170  (tbl == 0) ? "Replaceable" : "Dangerous",
171  unicharset->debug_str(i).string());
172  }
173  AmbigSpec_IT lst_it(lst);
174  for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
175  AmbigSpec *ambig_spec = lst_it.data();
176  tprintf("wrong_ngram:");
177  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
178  tprintf("correct_fragments:");
179  UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
180  }
181  }
182  }
183  if (use_ambigs_for_adaption) {
184  for (int vec_id = 0; vec_id < 2; ++vec_id) {
185  const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
186  ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
187  for (i = 0; i < vec.size(); ++i) {
188  adaption_ambigs_entry = vec[i];
189  if (adaption_ambigs_entry != NULL) {
190  tprintf("%sAmbigs for adaption for %s:\n",
191  (vec_id == 0) ? "" : "Reverse ",
192  unicharset->debug_str(i).string());
193  for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
194  tprintf("%s ", unicharset->debug_str(
195  (*adaption_ambigs_entry)[j]).string());
196  }
197  tprintf("\n");
198  }
199  }
200  }
201  }
202  }
203 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
int size() const
Definition: unicharset.h:264
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:34
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
const int kBufferSize
Definition: svutil.cpp:62
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:97
#define NULL
Definition: host.h:144
int push_back(T object)
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:285
void chomp_string(char *str)
Definition: helpers.h:32
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:139
#define UNICHAR_LEN
Definition: unichar.h:28
int step(const char *str) const
Definition: unicharset.cpp:192
#define ASSERT_HOST(x)
Definition: errcode.h:84
const UnicharIdVector* tesseract::UnicharAmbigs::OneToOneDefiniteAmbigs ( UNICHAR_ID  unichar_id) const
inline

Definition at line 167 of file ambigs.h.

168  {
169  if (one_to_one_definite_ambigs_.empty()) return NULL;
170  return one_to_one_definite_ambigs_[unichar_id];
171  }
#define NULL
Definition: host.h:144
const UnicharAmbigsVector& tesseract::UnicharAmbigs::replace_ambigs ( ) const
inline

Definition at line 151 of file ambigs.h.

151 { return replace_ambigs_; }
const UnicharIdVector* tesseract::UnicharAmbigs::ReverseAmbigsForAdaption ( UNICHAR_ID  unichar_id) const
inline

Definition at line 187 of file ambigs.h.

188  {
189  if (reverse_ambigs_for_adaption_.empty()) return NULL;
190  return reverse_ambigs_for_adaption_[unichar_id];
191  }
#define NULL
Definition: host.h:144

The documentation for this class was generated from the following files: