Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"
 

Functions

 DECLARE_STRING_PARAM_FLAG (D)
 
int main (int argc, char **argv)
 
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
 
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
 
int main (int argc, char *argv[])
 

Variables

CLUSTERCONFIG CNConfig
 

Macro Definition Documentation

#define PROGRAM_FEATURE_TYPE   "cn"

Include Files and Type Defines

Definition at line 41 of file cntraining.cpp.

Function Documentation

DECLARE_STRING_PARAM_FLAG ( )
int main ( int  argc,
char **  argv 
)

Public Function Prototypes

Definition at line 50 of file tesseractmain.cpp.

50  {
51 #ifdef USING_GETTEXT
52  setlocale (LC_ALL, "");
53  bindtextdomain (PACKAGE, LOCALEDIR);
54  textdomain (PACKAGE);
55 #endif
56  if ((argc == 2 && strcmp(argv[1], "-v") == 0) ||
57  (argc == 2 && strcmp(argv[1], "--version") == 0)) {
58  char *versionStrP;
59 
60  fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
61 
62  versionStrP = getLeptonicaVersion();
63  fprintf(stderr, " %s\n", versionStrP);
64  lept_free(versionStrP);
65 
66  versionStrP = getImagelibVersions();
67  fprintf(stderr, " %s\n", versionStrP);
68  lept_free(versionStrP);
69 
70  exit(0);
71  }
72 
73  tesseract::TessBaseAPI api;
74  STRING tessdata_dir;
75  truncate_path(argv[0], &tessdata_dir);
76  int rc = api.Init(tessdata_dir.string(), NULL);
77  if (rc) {
78  fprintf(stderr, _("Could not initialize tesseract.\n"));
79  exit(1);
80  }
81 
82  if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) {
83  GenericVector<STRING> languages;
84  api.GetAvailableLanguagesAsVector(&languages);
85  fprintf(stderr, _("List of available languages (%d):\n"), languages.size());
86  for (int index = 0; index < languages.size(); ++index) {
87  STRING& string = languages[index];
88  fprintf(stderr, "%s\n", string.string());
89  }
90  api.Clear();
91  exit(0);
92  }
93  api.End();
94 
95  // Make the order of args a bit more forgiving than it used to be.
96  const char* lang = "eng";
97  const char* image = NULL;
98  const char* output = NULL;
99  tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
100  int arg = 1;
101  while (arg < argc && (output == NULL || argv[arg][0] == '-')) {
102  if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) {
103  lang = argv[arg + 1];
104  ++arg;
105  } else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) {
106  pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1]));
107  ++arg;
108  } else if (image == NULL) {
109  image = argv[arg];
110  } else if (output == NULL) {
111  output = argv[arg];
112  }
113  ++arg;
114  }
115  if (output == NULL) {
116  fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
117  "[-psm pagesegmode] [configfile...]\n\n"), argv[0]);
118  fprintf(stderr,
119  _("pagesegmode values are:\n"
120  "0 = Orientation and script detection (OSD) only.\n"
121  "1 = Automatic page segmentation with OSD.\n"
122  "2 = Automatic page segmentation, but no OSD, or OCR\n"
123  "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
124  "4 = Assume a single column of text of variable sizes.\n"
125  "5 = Assume a single uniform block of vertically aligned text.\n"
126  "6 = Assume a single uniform block of text.\n"
127  "7 = Treat the image as a single text line.\n"
128  "8 = Treat the image as a single word.\n"
129  "9 = Treat the image as a single word in a circle.\n"
130  "10 = Treat the image as a single character.\n"));
131  fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
132  "configfile.\n\n"));
133  fprintf(stderr, _("Single options:\n"));
134  fprintf(stderr, _(" -v --version: version info\n"));
135  fprintf(stderr, _(" --list-langs: list available languages for tesseract "
136  "engine\n"));
137  exit(1);
138  }
139 
140 
141  api.SetOutputName(output);
142 
143  rc = api.Init(tessdata_dir.string(), lang, tesseract::OEM_DEFAULT,
144  &(argv[arg]), argc - arg, NULL, NULL, false);
145  if (rc) {
146  fprintf(stderr, _("Could not initialize tesseract.\n"));
147  exit(1);
148  }
149 
150  // We have 2 possible sources of pagesegmode: a config file and
151  // the command line. For backwards compatibility reasons, the
152  // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
153  // default for this program is tesseract::PSM_AUTO. We will let
154  // the config file take priority, so the command-line default
155  // can take priority over the tesseract default, so we use the
156  // value from the command line only if the retrieved mode
157  // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
158  // in any config file. Therefore the only way to force
159  // tesseract::PSM_SINGLE_BLOCK is from the command line.
160  // It would be simpler if we could set the value before Init,
161  // but that doesn't work.
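162  if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
163  api.SetPageSegMode(pagesegmode);
164  tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
165  tesseract::TessBaseAPI::Version());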
166 
167 
168  FILE* fin = fopen(image, "rb");
169  if (fin == NULL) {
170  fprintf(stderr, _("Cannot open input file: %s\n"), image);
171  exit(2);
172  }
173  fclose(fin);
174 
175  PIX *pixs;
176  if ((pixs = pixRead(image)) == NULL) {
177  fprintf(stderr, _("Unsupported image type.\n"));
178  exit(3);
179  }
180  pixDestroy(&pixs);
181 
182  STRING text_out;
183  if (!api.ProcessPages(image, NULL, 0, &text_out)) {
184  fprintf(stderr, _("Error during processing.\n"));
185  }
186  bool output_hocr = false;
187  api.GetBoolVariable("tessedit_create_hocr", &output_hocr);
188  bool output_box = false;
189  api.GetBoolVariable("tessedit_create_boxfile", &output_box);
190  STRING outfile = output;
191  outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt";
192  FILE* fout = fopen(outfile.string(), "wb");
193  if (fout == NULL) {
194  fprintf(stderr, _("Cannot create output file %s\n"), outfile.string());
195  exit(1);
196  }
197  fwrite(text_out.string(), 1, text_out.length(), fout);
198  fclose(fout);
199 
200  return 0; // Normal exit
201 }
bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, STRING *text_out)
Definition: baseapi.cpp:803
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:152
inT32 length() const
Definition: strngs.cpp:151
TESS_API void truncate_path(const char *code_path, STRING *trunc_path)
Definition: basedir.cpp:32
struct Pix PIX
Definition: capi.h:69
#define NULL
Definition: host.h:144
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:213
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:173
const char * string() const
Definition: strngs.cpp:156
#define _(x)
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:377
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
Definition: strngs.h:40
int size() const
Definition: genericvector.h:59
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:370
void GetAvailableLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:293
static const char * Version()
Definition: baseapi.cpp:130
void SetOutputName(const char *name)
Definition: baseapi.cpp:146
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:156
#define PACKAGE
Definition: config_auto.h:135
int main ( int  argc,
char *  argv[] 
)

Public Code

Definition at line 89 of file cntraining.cpp.

142 {
143  // Set the global Config parameters before parsing the command line.
144  Config = CNConfig;
145 
146  const char *PageName;
147  FILE *TrainingPage;
148  LIST CharList = NIL_LIST;
149  CLUSTERER *Clusterer = NULL;
150  LIST ProtoList = NIL_LIST;
151  LIST NormProtoList = NIL_LIST;
152  LIST pCharList;
153  LABELEDLIST CharSample;
154  FEATURE_DEFS_STRUCT FeatureDefs;
155  InitFeatureDefs(&FeatureDefs);
156 
157  ParseArguments(&argc, &argv);
158  int num_fonts = 0;
159  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
160  printf("Reading %s ...\n", PageName);
161  TrainingPage = Efopen(PageName, "rb");
162  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
163  100, NULL, TrainingPage, &CharList);
164  fclose(TrainingPage);
165  ++num_fonts;
166  }
167  printf("Clustering ...\n");
168  // To allow an individual font to form a separate cluster,
169  // reduce the min samples:
170  // Config.MinSamples = 0.5 / num_fonts;
171  pCharList = CharList;
172  iterate(pCharList) {
173  //Cluster
174  CharSample = (LABELEDLIST)first_node(pCharList);
175  Clusterer =
176  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
177  float SavedMinSamples = Config.MinSamples;
178  // To disable the tendency to produce a single cluster for all fonts,
179  // make MagicSamples an impossible to achieve number:
180  // Config.MagicSamples = CharSample->SampleCount * 10;
181  Config.MagicSamples = CharSample->SampleCount;
182  while (Config.MinSamples > 0.001) {
183  ProtoList = ClusterSamples(Clusterer, &Config);
184  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
185  break;
186  } else {
187  Config.MinSamples *= 0.95;
188  printf("0 significant protos for %s."
189  " Retrying clustering with MinSamples = %f%%\n",
190  CharSample->Label, Config.MinSamples);
191  }
192  }
193  Config.MinSamples = SavedMinSamples;
194  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
195  }
196  FreeTrainingSamples(CharList);
197  if (Clusterer == NULL) { // To avoid a SIGSEGV
198  fprintf(stderr, "Error: NULL clusterer!\n");
199  return 1;
200  }
201  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
202  FreeNormProtoList(NormProtoList);
203  FreeProtoList(&ProtoList);
204  FreeClusterer(Clusterer);
205  printf ("\n");
206  return 0;
207 } // main
void FreeTrainingSamples(LIST CharList)
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
Definition: cntraining.cpp:215
void ParseArguments(int *argc, char ***argv)
#define NIL_LIST
Definition: oldlist.h:126
struct LABELEDLISTNODE * LABELEDLIST
#define NULL
Definition: host.h:144
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
void FreeNormProtoList(LIST CharList)
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:32
CLUSTERCONFIG Config
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:532
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:41
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:504
FLOAT32 MinSamples
Definition: cluster.h:50
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:560
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int MagicSamples
Definition: cluster.h:55
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:79
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
const char * GetNextFilename(int argc, const char *const *argv)
void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
CLUSTERER *  Clusterer 
)

Private Function Prototypes


Private Code

Definition at line 215 of file cntraining.cpp.

232 {
233  FILE *File;
234  STRING Filename;
235  LABELEDLIST LabeledProto;
236  int N;
237 
238  Filename = "";
239  if (Directory != NULL && Directory[0] != '\0')
240  {
241  Filename += Directory;
242  Filename += "/";
243  }
244  Filename += "normproto";
245  printf ("\nWriting %s ...", Filename.string());
246  File = Efopen (Filename.string(), "wb");
247  fprintf(File,"%0d\n",Clusterer->SampleSize);
248  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
249  iterate(LabeledProtoList)
250  {
251  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
252  N = NumberOfProtos(LabeledProto->List, true, false);
253  if (N < 1) {
254  printf ("\nError! Not enough protos for %s: %d protos"
255  " (%d significant protos"
256  ", %d insignificant protos)\n",
257  LabeledProto->Label, N,
258  NumberOfProtos(LabeledProto->List, 1, 0),
259  NumberOfProtos(LabeledProto->List, 0, 1));
260  exit(1);
261  }
262  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
263  WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
264  }
265  fclose (File);
266 
267 } // WriteNormProtos
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:270
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
PARAM_DESC * ParamDesc
Definition: cluster.h:88
struct LABELEDLISTNODE * LABELEDLIST
#define NULL
Definition: host.h:144
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:32
inT16 SampleSize
Definition: cluster.h:87
const char * string() const
Definition: strngs.cpp:156
Definition: strngs.h:40
void WriteParamDesc(FILE *File, uinT16 N, PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:318
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 270 of file cntraining.cpp.

276 {
277  PROTOTYPE *Proto;
278 
279  // write prototypes
280  iterate(ProtoList)
281  {
282  Proto = (PROTOTYPE *) first_node ( ProtoList );
283  if (( Proto->Significant && WriteSigProtos ) ||
284  ( ! Proto->Significant && WriteInsigProtos ) )
285  WritePrototype( File, N, Proto );
286  }
287 } // WriteProtos
unsigned Significant
Definition: cluster.h:68
#define iterate(l)
Definition: oldlist.h:159
#define first_node(l)
Definition: oldlist.h:139
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:348

Variable Documentation

CLUSTERCONFIG CNConfig
Initial value:
=
{
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Global Data Definitions and Declarations

Definition at line 79 of file cntraining.cpp.