Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
unicharset_extractor.cpp File Reference
#include <stdio.h>
#include <locale.h>
#include "boxread.h"
#include "rect.h"
#include "strngs.h"
#include "tessopt.h"
#include "unichar.h"
#include "unicharset.h"

Go to the source code of this file.

Functions

UNICHAR_ID wc_to_unichar_id (const UNICHARSET &unicharset, int wc)
 
void set_properties (UNICHARSET *unicharset, const char *const c_string)
 
int main (int argc, char **argv)
 

Function Documentation

int main ( int  argc,
char **  argv 
)

Public Function Prototypes

Definition at line 102 of file unicharset_extractor.cpp.

102  {
103  int option;
104  const char* output_directory = ".";
105  STRING unicharset_file_name;
106  UNICHARSET unicharset;
107 
108  setlocale(LC_ALL, "");
109  // Space character needed to represent NIL classification
110  unicharset.unichar_insert(" ");
111 
112  // Print usage
113  if (argc <= 1) {
114  printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
115  exit(1);
116 
117  }
118 
119  // Parse arguments
120  while ((option = tessopt(argc, argv, "D" )) != EOF) {
121  switch (option) {
122  case 'D':
123  output_directory = tessoptarg;
124  ++tessoptind;
125  break;
126  }
127  }
128 
129  // Save file name
130  unicharset_file_name = output_directory;
131  unicharset_file_name += "/";
132  unicharset_file_name += kUnicharsetFileName;
133 
134  // Load box files
135  for (; tessoptind < argc; ++tessoptind) {
136  printf("Extracting unicharset from %s\n", argv[tessoptind]);
137 
138  FILE* box_file = fopen(argv[tessoptind], "rb");
139  if (box_file == NULL) {
140  printf("Cannot open box file %s\n", argv[tessoptind]);
141  return -1;
142  }
143 
144  TBOX box;
145  STRING unichar_string;
146  int line_number = 0;
147  while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
148  unicharset.unichar_insert(unichar_string.string());
149  set_properties(&unicharset, unichar_string.string());
150  }
151  }
152 
153  // Write unicharset file
154  if (unicharset.save_to_file(unicharset_file_name.string())) {
155  printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
156  }
157  else {
158  printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
159  return -1;
160  }
161  return 0;
162 }
int tessoptind
Definition: tessopt.cpp:26
#define NULL
Definition: host.h:144
Definition: rect.h:29
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:511
char * tessoptarg
Definition: tessopt.cpp:27
const char * string() const
Definition: strngs.cpp:156
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:59
void set_properties(UNICHARSET *unicharset, const char *const c_string)
int tessopt(inT32 argc, char *argv[], const char *arglist)
Definition: tessopt.cpp:35
Definition: strngs.h:40
bool save_to_file(const char *const filename) const
Definition: unicharset.h:273
void set_properties ( UNICHARSET *  unicharset,
const char *const  c_string 
)

Definition at line 61 of file unicharset_extractor.cpp.

61  {
62 #ifdef USING_WCTYPE
63  UNICHAR_ID id;
64  int wc;
65 
66  // Convert the string to a unichar id.
67  id = unicharset->unichar_to_id(c_string);
68 
69  // Set the other_case property to be this unichar id by default.
70  unicharset->set_other_case(id, id);
71 
72  int step = UNICHAR::utf8_step(c_string);
73  if (step == 0)
74  return; // Invalid utf-8.
75 
76  // Get the next Unicode code point in the string.
77  UNICHAR ch(c_string, step);
78  wc = ch.first_uni();
79 
80  /* Copy the properties. */
81  if (iswalpha(wc)) {
82  unicharset->set_isalpha(id, 1);
83  if (iswlower(wc)) {
84  unicharset->set_islower(id, 1);
85  unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
86  towupper(wc)));
87  }
88  if (iswupper(wc)) {
89  unicharset->set_isupper(id, 1);
90  unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
91  towlower(wc)));
92  }
93  }
94  if (iswdigit(wc))
95  unicharset->set_isdigit(id, 1);
96  if(iswpunct(wc))
97  unicharset->set_ispunctuation(id, 1);
98 
99 #endif
100 }
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc)
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:336
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:341
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:356
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:351
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:346
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:372
UNICHAR_ID wc_to_unichar_id ( const UNICHARSET &  unicharset,
int  wc 
)

Definition at line 49 of file unicharset_extractor.cpp.

49  {
50  UNICHAR uch(wc);
51  char *unichar = uch.utf8_str();
52  UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
53  delete[] unichar;
54  return unichar_id;
55 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176