Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::WordListLangModel Class Reference

#include <word_list_lang_model.h>

Inheritance diagram for tesseract::WordListLangModel:
tesseract::LangModel

Public Member Functions

 WordListLangModel (CubeRecoContext *cntxt)
 
 ~WordListLangModel ()
 
LangModEdge * Root ()
 
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
 
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **edges)
 
bool IsLeadingPunc (char_32 ch)
 
bool IsTrailingPunc (char_32 ch)
 
bool IsDigit (char_32 ch)
 
bool AddString (const char *char_ptr)
 
bool AddString32 (const char_32 *char_32_ptr)
 
- Public Member Functions inherited from tesseract::LangModel
 LangModel ()
 
virtual ~LangModel ()
 
bool OOD ()
 
bool Numeric ()
 
bool WordList ()
 
bool Punc ()
 
void SetOOD (bool ood)
 
void SetNumeric (bool numeric)
 
void SetWordList (bool word_list)
 
void SetPunc (bool punc_enabled)
 

Static Public Member Functions

static void WordVariants (const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LangModel
bool ood_enabled_
 
bool numeric_enabled_
 
bool word_list_enabled_
 
bool punc_enabled_
 

Detailed Description

Definition at line 39 of file word_list_lang_model.h.

Constructor & Destructor Documentation

tesseract::WordListLangModel::WordListLangModel ( CubeRecoContext *  cntxt)
explicit

Definition at line 29 of file word_list_lang_model.cpp.

29  {
30  cntxt_ = cntxt;
31  dawg_ = NULL;
32  init_ = false;
33 }
#define NULL
Definition: host.h:144
tesseract::WordListLangModel::~WordListLangModel ( )

Definition at line 35 of file word_list_lang_model.cpp.

35  {
36  Cleanup();
37 }

Member Function Documentation

bool tesseract::WordListLangModel::AddString ( const char *  char_ptr)

Definition at line 167 of file word_list_lang_model.cpp.

167  {
168  if (!init_ && !Init()) { // initialize if necessary
169  return false;
170  }
171 
172  string_32 str32;
173  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
174  if (str32.length() < 1) {
175  return false;
176  }
177  return AddString32(str32.c_str());
178 }
bool AddString32(const char_32 *char_32_ptr)
basic_string< char_32 > string_32
Definition: string_32.h:41
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:335
bool tesseract::WordListLangModel::AddString32 ( const char_32 *  char_32_ptr)

Definition at line 181 of file word_list_lang_model.cpp.

181  {
182  if (char_32_ptr == NULL) {
183  return false;
184  }
185  // get all the word variants
186  vector<WERD_CHOICE *> word_variants;
187  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
188  char_32_ptr, &word_variants);
189 
190  if (word_variants.size() > 0) {
191  // find the shortest variant
192  int shortest_word = 0;
193  for (int word = 1; word < word_variants.size(); word++) {
194  if (word_variants[shortest_word]->length() >
195  word_variants[word]->length()) {
196  shortest_word = word;
197  }
198  }
199  // only add the shortest grapheme interpretation of string to the word list
200  dawg_->add_word_to_dawg(*word_variants[shortest_word]);
201  }
202  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
203  return true;
204 }
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:173
#define NULL
Definition: host.h:144
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)
CharSet * CharacterSet() const
const UNICHARSET * TessUnicharset() const
LangModEdge ** tesseract::WordListLangModel::GetEdges ( CharAltList *  alt_list,
LangModEdge *  edge,
int *  edge_cnt 
)
virtual

Implements tesseract::LangModel.

Definition at line 71 of file word_list_lang_model.cpp.

73  {
74  // initialize if necessary
75  if (init_ == false) {
76  if (Init() == false) {
77  return false;
78  }
79  }
80 
81  (*edge_cnt) = 0;
82 
83  EDGE_REF edge_ref;
84 
85  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
86 
87  if (tess_lm_edge == NULL) {
88  edge_ref = 0;
89  } else {
90  edge_ref = tess_lm_edge->EndEdge();
91 
92  // advance node
93  edge_ref = dawg_->next_node(edge_ref);
94  if (edge_ref == 0) {
95  return 0;
96  }
97  }
98 
99  // allocate memory for edges
100  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
101  if (edge_array == NULL) {
102  return NULL;
103  }
104 
105  // now get all the emerging edges
106  (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
107  edge_array + (*edge_cnt));
108 
109  return edge_array;
110 }
#define NULL
Definition: host.h:144
inT64 EDGE_REF
Definition: dawg.h:54
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:130
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
bool tesseract::WordListLangModel::IsDigit ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 58 of file word_list_lang_model.h.

58 { return false; } // not yet implemented
bool tesseract::WordListLangModel::IsLeadingPunc ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 56 of file word_list_lang_model.h.

56 { return false; } // not yet implemented
bool tesseract::WordListLangModel::IsTrailingPunc ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 57 of file word_list_lang_model.h.

57 { return false; } // not yet implemented
bool tesseract::WordListLangModel::IsValidSequence ( const char_32 *  sequence,
bool  eow_flag,
LangModEdge **  edges 
)
virtual

Implements tesseract::LangModel.

Definition at line 114 of file word_list_lang_model.cpp.

115  {
116  return false;
117 }
LangModEdge * tesseract::WordListLangModel::Root ( )
virtual

Implements tesseract::LangModel.

Definition at line 66 of file word_list_lang_model.cpp.

66  {
67  return NULL;
68 }
#define NULL
Definition: host.h:144
void tesseract::WordListLangModel::WordVariants ( const CharSet &  char_set,
const UNICHARSET *  uchset,
string_32  str32,
vector< WERD_CHOICE * > *  word_variants 
)
static

Definition at line 154 of file word_list_lang_model.cpp.

156  {
157  for (int i = 0; i < word_variants->size(); i++) {
158  delete (*word_variants)[i];
159  }
160  word_variants->clear();
161  string_32 prefix_str32;
162  WERD_CHOICE word_so_far(uchset);
163  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
164 }
basic_string< char_32 > string_32
Definition: string_32.h:41
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)

The documentation for this class was generated from the following files: