Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 ~TessdataManager ()
 
int DebugLevel ()
 
bool Init (const char *data_file_name, int debug_level)
 
FILE * GetDataFilePtr () const
 
bool SeekToStart (TessdataType tessdata_type)
 
inT64 GetEndOffset (TessdataType tessdata_type) const
 
void End ()
 
bool swap () const
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static void WriteMetadata (inT64 *offset_table, FILE *output_file)
 
static bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
static void CopyFile (FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
 
static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type, bool *text_file)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type, bool *text_file)
 

Detailed Description

Definition at line 131 of file tessdatamanager.h.

Constructor & Destructor Documentation

tesseract::TessdataManager::TessdataManager ( )
inline

Definition at line 133 of file tessdatamanager.h.

133  {
134  data_file_ = NULL;
135  actual_tessdata_num_entries_ = 0;
136  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
137  offset_table_[i] = -1;
138  }
139  }
#define NULL
Definition: host.h:144
tesseract::TessdataManager::~TessdataManager ( )
inline

Definition at line 140 of file tessdatamanager.h.

140 {}

Member Function Documentation

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)
static

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 108 of file tessdatamanager.cpp.

110  {
111  int i;
112  inT64 offset_table[TESSDATA_NUM_ENTRIES];
113  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
114  FILE *output_file = fopen(output_filename, "wb");
115  if (output_file == NULL) {
116  tprintf("Error opening %s for writing\n", output_filename);
117  return false;
118  }
119  // Leave some space for recording the offset_table.
120  fseek(output_file,
121  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
122 
124  bool text_file = false;
125  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
126 
127  // Load individual tessdata components from files.
128  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
130  kTessdataFileSuffixes[i], &type, &text_file));
131  STRING filename = language_data_path_prefix;
132  filename += kTessdataFileSuffixes[i];
133  file_ptr[i] = fopen(filename.string(), "rb");
134  if (file_ptr[i] != NULL) {
135  offset_table[type] = ftell(output_file);
136  CopyFile(file_ptr[i], output_file, text_file, -1);
137  fclose(file_ptr[i]);
138  }
139  }
140 
141  // Make sure that the required components are present.
142  if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
143  tprintf("Error opening unicharset file\n");
144  fclose(output_file);
145  return false;
146  }
147  if (file_ptr[TESSDATA_INTTEMP] != NULL &&
148  (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
149  file_ptr[TESSDATA_NORMPROTO] == NULL)) {
150  tprintf("Error opening pffmtable and/or normproto files"
151  " while inttemp file was present\n");
152  fclose(output_file);
153  return false;
154  }
155 
156  WriteMetadata(offset_table, output_file);
157  return true;
158 }
static void WriteMetadata(inT64 *offset_table, FILE *output_file)
long long int inT64
Definition: host.h:108
#define NULL
Definition: host.h:144
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
int inT32
Definition: host.h:102
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
const char * string() const
Definition: strngs.cpp:156
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
STRING
Definition: strngs.h:40
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::TessdataManager::CopyFile ( FILE *  input_file,
FILE *  output_file,
bool  newline_end,
inT64  num_bytes_to_copy 
)
static

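Copies data from the given input file to the output_file provided. If num_bytes_to_copy is >= 0, only num_bytes_to_copy bytes are copied from the input file; otherwise all the data in the input file is copied. If newline_end is true and num_bytes_to_copy is not 0, asserts that the last character copied is a newline.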

Definition at line 68 of file tessdatamanager.cpp.

69  {
70  if (num_bytes_to_copy == 0) return;
71  int buffer_size = 1024;
72  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
73  buffer_size = num_bytes_to_copy;
74  }
75  inT64 num_bytes_copied = 0;
76  char *chunk = new char[buffer_size];
77  int bytes_read;
78  char last_char = 0x0;
79  while ((bytes_read = fread(chunk, sizeof(char),
80  buffer_size, input_file))) {
81  fwrite(chunk, sizeof(char), bytes_read, output_file);
82  last_char = chunk[bytes_read-1];
83  if (num_bytes_to_copy > 0) {
84  num_bytes_copied += bytes_read;
85  if (num_bytes_copied == num_bytes_to_copy) break;
86  if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
87  buffer_size = num_bytes_to_copy - num_bytes_copied;
88  }
89  }
90  }
91  if (newline_end) ASSERT_HOST(last_char == '\n');
92  delete[] chunk;
93 }
long long int inT64
Definition: host.h:108
#define ASSERT_HOST(x)
Definition: errcode.h:84
int tesseract::TessdataManager::DebugLevel ( )
inline

Definition at line 141 of file tessdatamanager.h.

141 { return debug_level_; }
void tesseract::TessdataManager::End ( )
inline

Closes data_file_ (if it was opened by Init()).

Definition at line 187 of file tessdatamanager.h.

187  {
188  if (data_file_ != NULL) {
189  fclose(data_file_);
190  data_file_ = NULL;
191  }
192  }
#define NULL
Definition: host.h:144
bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 233 of file tessdatamanager.cpp.

233  {
235  bool text_file = false;
237  filename, &type, &text_file));
238  if (!SeekToStart(type)) return false;
239 
240  FILE *output_file = fopen(filename, "wb");
241  if (output_file == NULL) {
242  printf("Error openning %s\n", filename);
243  exit(1);
244  }
245  inT64 begin_offset = ftell(GetDataFilePtr());
246  inT64 end_offset = GetEndOffset(type);
248  GetDataFilePtr(), output_file, text_file,
249  end_offset - begin_offset + 1);
250  fclose(output_file);
251  return true;
252 }
long long int inT64
Definition: host.h:108
FILE * GetDataFilePtr() const
#define NULL
Definition: host.h:144
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)
inT64 GetEndOffset(TessdataType tessdata_type) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
FILE* tesseract::TessdataManager::GetDataFilePtr ( ) const
inline

Returns data file pointer.

Definition at line 150 of file tessdatamanager.h.

150 { return data_file_; }
inT64 tesseract::TessdataManager::GetEndOffset ( TessdataType  tessdata_type) const
inline

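Returns the end offset for the given tesseract data file type: one less than the start offset of the next component present in the combined file, or -1 if no later component is present.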

Definition at line 173 of file tessdatamanager.h.

173  {
174  int index = tessdata_type + 1;
175  while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
176  ++index; // skip tessdata types not present in the combined file
177  }
178  if (debug_level_) {
179  tprintf("TessdataManager: end offset for type %d is %lld\n",
180  tessdata_type,
181  (index == actual_tessdata_num_entries_) ? -1
182  : offset_table_[index]);
183  }
184  return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
185  }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool tesseract::TessdataManager::Init ( const char *  data_file_name,
int  debug_level 
)

Opens the given data file and reads the offset table. Returns true on success.

Definition at line 35 of file tessdatamanager.cpp.

35  {
36  int i;
37  debug_level_ = debug_level;
38  data_file_ = fopen(data_file_name, "rb");
39  if (data_file_ == NULL) {
40  tprintf("Error opening data file %s\n", data_file_name);
41  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
42  "to the parent directory of your \"tessdata\" directory.\n");
43  return false;
44  }
45  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
46  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
47  if (swap_) {
48  actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
49  }
50  ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
51  fread(offset_table_, sizeof(inT64),
52  actual_tessdata_num_entries_, data_file_);
53  if (swap_) {
54  for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
55  offset_table_[i] = reverse64(offset_table_[i]);
56  }
57  }
58  if (debug_level_) {
59  tprintf("TessdataManager loaded %d types of tesseract data files.\n",
60  actual_tessdata_num_entries_);
61  for (i = 0; i < actual_tessdata_num_entries_; ++i) {
62  tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
63  }
64  }
65  return true;
66 }
DLLSYM uinT64 reverse64(uinT64 num)
Definition: serialis.cpp:25
long long int inT64
Definition: host.h:108
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
DLLSYM uinT32 reverse32(uinT32 num)
Definition: serialis.cpp:36
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 160 of file tessdatamanager.cpp.

163  {
164  int i;
165  inT64 offset_table[TESSDATA_NUM_ENTRIES];
167  bool text_file = false;
168  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
169  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
170  offset_table[i] = -1;
171  file_ptr[i] = NULL;
172  }
173  FILE *output_file = fopen(new_traineddata_filename, "wb");
174  if (output_file == NULL) {
175  tprintf("Error opening %s for writing\n", new_traineddata_filename);
176  return false;
177  }
178 
179  // Leave some space for recording the offset_table.
180  fseek(output_file,
181  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
182 
183  // Open the files with the new components.
184  for (i = 0; i < num_new_components; ++i) {
185  TessdataTypeFromFileName(component_filenames[i], &type, &text_file);
186  file_ptr[type] = fopen(component_filenames[i], "rb");
187  }
188 
189  // Write updated data to the output traineddata file.
190  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
191  if (file_ptr[i] != NULL) {
192  // Get the data from the opened component file.
193  offset_table[i] = ftell(output_file);
194  CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
195  fclose(file_ptr[i]);
196  } else {
197  // Get this data component from the loaded data file.
198  if (SeekToStart(static_cast<TessdataType>(i))) {
199  offset_table[i] = ftell(output_file);
200  CopyFile(data_file_, output_file, kTessdataFileIsText[i],
201  GetEndOffset(static_cast<TessdataType>(i)) -
202  ftell(data_file_) + 1);
203  }
204  }
205  }
206 
207  WriteMetadata(offset_table, output_file);
208  return true;
209 }
static void WriteMetadata(inT64 *offset_table, FILE *output_file)
long long int inT64
Definition: host.h:108
#define NULL
Definition: host.h:144
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
int inT32
Definition: host.h:102
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
bool SeekToStart(TessdataType tessdata_type)
inT64 GetEndOffset(TessdataType tessdata_type) const
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool tesseract::TessdataManager::SeekToStart ( TessdataType  tessdata_type)
inline

Returns false if there is no data of the given type. Otherwise does a seek on the data_file_ to position the pointer at the start of the data of the given type.

Definition at line 157 of file tessdatamanager.h.

157  {
158  if (debug_level_) {
159  tprintf("TessdataManager: seek to offset %lld - start of tessdata"
160  "type %d (%s))\n", offset_table_[tessdata_type],
161  tessdata_type, kTessdataFileSuffixes[tessdata_type]);
162  }
163  if (offset_table_[tessdata_type] < 0) {
164  return false;
165  } else {
166  ASSERT_HOST(fseek(data_file_,
167  static_cast<size_t>(offset_table_[tessdata_type]),
168  SEEK_SET) == 0);
169  return true;
170  }
171  }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 193 of file tessdatamanager.h.

193  {
194  return swap_;
195  }
bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type,
bool *  text_file 
)
static

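Extracts the file suffix (the text after the last '.') from filename and uses it to determine the tessdata component type. Returns true on success; returns false if filename has no suffix or the suffix is not a known tessdata component suffix.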

Definition at line 225 of file tessdatamanager.cpp.

226  {
227  // Get the file suffix (extension)
228  const char *suffix = strrchr(filename, '.');
229  if (suffix == NULL || *(++suffix) == '\0') return false;
230  return TessdataTypeFromFileSuffix(suffix, type, text_file);
231 }
#define NULL
Definition: host.h:144
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType *  type,
bool *  text_file 
)
static

Fills type with the TessdataType of the tessdata component represented by the given file suffix, which must exactly match one of the known component suffixes. E.g. unicharset (the suffix of tessdata/eng.unicharset) -> TESSDATA_UNICHARSET. Sets *text_file to true if the component is in text format (e.g. unicharset, unichar ambigs, config, etc).

Returns
true if the tessdata component type could be determined from the given suffix.

Definition at line 211 of file tessdatamanager.cpp.

212  {
213  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
214  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
215  *type = static_cast<TessdataType>(i);
216  *text_file = kTessdataFileIsText[i];
217  return true;
218  }
219  }
220  printf("TessdataManager can't determine which tessdata"
221  " component is represented by %s\n", suffix);
222  return false;
223 }
void tesseract::TessdataManager::WriteMetadata ( inT64 *  offset_table,
FILE *  output_file 
)
static

Writes the number of entries and the given offset table to output_file.

Definition at line 95 of file tessdatamanager.cpp.

95  {
96  fseek(output_file, 0, SEEK_SET);
97  inT32 num_entries = TESSDATA_NUM_ENTRIES;
98  fwrite(&num_entries, sizeof(inT32), 1, output_file);
99  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
100  fclose(output_file);
101 
102  tprintf("TessdataManager combined tesseract data files.\n");
103  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
104  tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
105  }
106 }
long long int inT64
Definition: host.h:108
int inT32
Definition: host.h:102
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41

The documentation for this class was generated from the following files: