Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
wordlist2dawg.cpp File Reference
#include <stdio.h>
#include "classify.h"
#include "dawg.h"
#include "dict.h"
#include "emalloc.h"
#include "freelist.h"
#include "helpers.h"
#include "serialis.h"
#include "trie.h"
#include "unicharset.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

int main ( int  argc,
char **  argv 
)

Public Function Prototypes

Definition at line 37 of file wordlist2dawg.cpp.

37  {
38  int min_word_length;
39  int max_word_length;
40  if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
41  (argc == 6 && strcmp(argv[1], "-r") == 0) ||
42  (argc == 7 && strcmp(argv[1], "-l") == 0 &&
43  sscanf(argv[2], "%d", &min_word_length) == 1 &&
44  sscanf(argv[3], "%d", &max_word_length) == 1))) {
45  printf("Usage: %s [-t | -r [reverse policy] |"
46  " -l min_len max_len] word_list_file"
47  " dawg_file unicharset_file\n", argv[0]);
48  return 1;
49  }
50  tesseract::Classify *classify = new tesseract::Classify();
51  int argv_index = 0;
52  if (argc == 5) ++argv_index;
53  tesseract::Trie::RTLReversePolicy reverse_policy =
54  tesseract::Trie::RRP_DO_NO_REVERSE;
55  if (argc == 6) {
56  ++argv_index;
57  int tmp_int;
58  sscanf(argv[++argv_index], "%d", &tmp_int);
59  reverse_policy = static_cast<tesseract::Trie::RTLReversePolicy>(tmp_int);
60  tprintf("Set reverse_policy to %s\n",
61  tesseract::Trie::get_reverse_policy_name(reverse_policy));
62  }
63  if (argc == 7) argv_index += 3;
64  const char* wordlist_filename = argv[++argv_index];
65  const char* dawg_filename = argv[++argv_index];
66  const char* unicharset_file = argv[++argv_index];
67  tprintf("Loading unicharset from '%s'\n", unicharset_file);
68  if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {
69  tprintf("Failed to load unicharset from '%s'\n", unicharset_file);
70  delete classify;
71  return 1;
72  }
73  const UNICHARSET &unicharset = classify->getDict().getUnicharset();
74  if (argc == 4 || argc == 6) {
75  tesseract::Trie trie(
76  // the first 3 arguments are not used in this case
77  tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
78  kMaxNumEdges, unicharset.size(),
79  classify->getDict().dawg_debug_level);
80  tprintf("Reading word list from '%s'\n", wordlist_filename);
81  if (!trie.read_word_list(wordlist_filename, unicharset, reverse_policy)) {
82  tprintf("Failed to read word list from '%s'\n", wordlist_filename);
83  exit(1);
84  }
85  tprintf("Reducing Trie to SquishedDawg\n");
86  tesseract::SquishedDawg *dawg = trie.trie_to_dawg();
87  if (dawg != NULL && dawg->NumEdges() > 0) {
88  tprintf("Writing squished DAWG to '%s'\n", dawg_filename);
89  dawg->write_squished_dawg(dawg_filename);
90  } else {
91  tprintf("Dawg is empty, skip producing the output file\n");
92  }
93  delete dawg;
94  } else if (argc == 5) {
95  tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);
96  tesseract::SquishedDawg words(
97  dawg_filename,
98  // these 3 arguments are not used in this case
99  tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
100  classify->getDict().dawg_debug_level);
101  tprintf("Checking word list from '%s'\n", wordlist_filename);
102  words.check_for_words(wordlist_filename, unicharset, true);
103  } else if (argc == 7) {
104  // Place words of different lengths in separate Dawgs.
105  char str[CHARS_PER_LINE];
106  FILE *word_file = fopen(wordlist_filename, "rb");
107  if (word_file == NULL) {
108  tprintf("Failed to open wordlist file %s\n", wordlist_filename);
109  exit(1);
110  }
111  FILE *dawg_file = fopen(dawg_filename, "wb");
112  if (dawg_file == NULL) {
113  tprintf("Failed to open dawg output file %s\n", dawg_filename);
114  exit(1);
115  }
116  tprintf("Reading word list from '%s'\n", wordlist_filename);
117  GenericVector<tesseract::Trie *> trie_vec;
118  int i;
119  for (i = min_word_length; i <= max_word_length; ++i) {
120  trie_vec.push_back(new tesseract::Trie(
121  // the first 3 arguments are not used in this case
122  tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
123  kMaxNumEdges, unicharset.size(),
124  classify->getDict().dawg_debug_level));
125  }
126  while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {
127  chomp_string(str); // remove newline
128  int badpos;
129  if (!unicharset.encodable_string(str, &badpos)) {
130  tprintf("String '%s' not compatible with unicharset. "
131  "Bad chars here: '%s'\n", str, str + badpos);
132  continue;
133  }
134  WERD_CHOICE word(str, unicharset);
135  if ((reverse_policy == tesseract::Trie::RRP_REVERSE_IF_HAS_RTL &&
136  word.has_rtl_unichar_id()) ||
137  reverse_policy == tesseract::Trie::RRP_FORCE_REVERSE) {
138  word.reverse_and_mirror_unichar_ids();
139  }
140  if (word.length() >= min_word_length &&
141  word.length() <= max_word_length &&
142  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
143  tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];
144  if (!curr_trie->word_in_dawg(word)) {
145  if (!curr_trie->add_word_to_dawg(word)) {
146  tprintf("Failed to add the following word to dawg:\n");
147  word.print();
148  exit(1);
149  }
150  if (classify->getDict().dawg_debug_level > 1) {
151  tprintf("Added word %s of length %d\n", str, word.length());
152  }
153  if (!curr_trie->word_in_dawg(word)) {
154  tprintf("Error: word '%s' not in DAWG after adding it\n", str);
155  exit(1);
156  }
157  }
158  }
159  }
160  fclose(word_file);
161  tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);
162  GenericVector<tesseract::SquishedDawg *> dawg_vec;
163  for (i = 0; i <= max_word_length; ++i) {
164  dawg_vec.push_back(i < min_word_length ? NULL :
165  trie_vec[i-min_word_length]->trie_to_dawg());
166  }
167  tesseract::Dict::WriteFixedLengthDawgs(
168  dawg_vec, max_word_length - min_word_length + 1,
169  classify->getDict().dawg_debug_level, dawg_file);
170  fclose(dawg_file);
171  dawg_vec.delete_data_pointers();
172  trie_vec.delete_data_pointers();
173  } else { // should never get here
174  tprintf("Invalid command-line options\n");
175  exit(1);
176  }
177  delete classify;
178  return 0;
179 }
void delete_data_pointers()
int size() const
Definition: unicharset.h:264
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:173
#define NULL
Definition: host.h:144
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:220
const UNICHARSET & getUnicharset() const
Definition: dict.h:100
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:298
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:48
int push_back(T object)
#define CHARS_PER_LINE
Definition: cutil.h:57
Dict & getDict()
Definition: classify.h:62
static void WriteFixedLengthDawgs(const GenericVector< SquishedDawg * > &dawg_vec, int num_dawgs, int debug_level, FILE *output_file)
Definition: dict.cpp:626
void chomp_string(char *str)
Definition: helpers.h:32
int dawg_debug_level
Definition: dict.h:839
void write_squished_dawg(FILE *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:369
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:60
RTLReversePolicy
Definition: trie.h:64