Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
wordseg.cpp File Reference
#include "mfcpch.h"
#include "stderr.h"
#include "blobbox.h"
#include "statistc.h"
#include "drawtord.h"
#include "makerow.h"
#include "pitsync1.h"
#include "tovars.h"
#include "topitch.h"
#include "cjkpitch.h"
#include "textord.h"
#include "fpchop.h"
#include "wordseg.h"

Go to the source code of this file.

Macros

#define EXTERN
 
#define FIXED_WIDTH_MULTIPLE   5
 
#define BLOCK_STATS_CLUSTERS   10
 

Functions

make_single_word

For each row, arrange the blobs into one word. There is no fixed pitch detection.

void make_single_word (bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows)
 
void make_words (tesseract::Textord *textord, ICOORD page_tr, float gradient, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
 
set_row_spaces

Set the min_space and max_nonspace members of the row so that the blobs can be arranged into words.

void set_row_spaces (TO_BLOCK *block, FCOORD rotation, BOOL8 testing_on)
 
row_words

Compute the max nonspace and min space for the row.

inT32 row_words (TO_BLOCK *block, TO_ROW *row, inT32 maxwidth, FCOORD rotation, BOOL8 testing_on)
 
row_words2

Compute the max nonspace and min space for the row.

inT32 row_words2 (TO_BLOCK *block, TO_ROW *row, inT32 maxwidth, FCOORD rotation, BOOL8 testing_on)
 
make_real_words

Convert a TO_BLOCK to a BLOCK.

void make_real_words (tesseract::Textord *textord, TO_BLOCK *block, FCOORD rotation)
 
make_rep_words

Fabricate a real row from only the repeated blob words. Get the xheight from the block as it may be more meaningful.

ROW * make_rep_words (TO_ROW *row, TO_BLOCK *block)
 
make_real_word

Construct a WERD from a given number of adjacent entries in a list of BLOBNBOXs.

WERD * make_real_word (BLOBNBOX_IT *box_it, inT32 blobcount, BOOL8 bol, uinT8 blanks)
 

Variables

EXTERN bool textord_fp_chopping = TRUE
 
EXTERN bool textord_force_make_prop_words = FALSE
 
EXTERN bool textord_chopper_test = FALSE
 

Macro Definition Documentation

#define BLOCK_STATS_CLUSTERS   10

Definition at line 51 of file wordseg.cpp.

#define EXTERN

Definition at line 42 of file wordseg.cpp.

#define FIXED_WIDTH_MULTIPLE   5

Definition at line 50 of file wordseg.cpp.

Function Documentation

WERD* make_real_word ( BLOBNBOX_IT *  box_it,
inT32  blobcount,
BOOL8  bol,
uinT8  blanks 
)

Definition at line 611 of file wordseg.cpp.

615  {
616  C_OUTLINE_IT cout_it;
617  C_BLOB_LIST cblobs;
618  C_BLOB_IT cblob_it = &cblobs;
619  WERD *word; // new word
620  BLOBNBOX *bblob; // current blob
621  inT32 blobindex; // in row
622 
623  for (blobindex = 0; blobindex < blobcount; blobindex++) {
624  bblob = box_it->extract();
625  if (bblob->joined_to_prev()) {
626  if (bblob->cblob() != NULL) {
627  cout_it.set_to_list(cblob_it.data()->out_list());
628  cout_it.move_to_last();
629  cout_it.add_list_after(bblob->cblob()->out_list());
630  delete bblob->cblob();
631  }
632  }
633  else {
634  if (bblob->cblob() != NULL)
635  cblob_it.add_after_then_move(bblob->cblob());
636  }
637  delete bblob;
638  box_it->forward(); // next one
639  }
640 
641  if (blanks < 1)
642  blanks = 1;
643 
644  word = new WERD(&cblobs, blanks, NULL);
645 
646  if (bol)
647  word->set_flag(W_BOL, TRUE);
648  if (box_it->at_first())
649  word->set_flag(W_EOL, TRUE); // at end of line
650 
651  return word;
652 }
C_BLOB * cblob() const
Definition: blobbox.h:245
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:42
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
Definition: werd.h:35
bool joined_to_prev() const
Definition: blobbox.h:233
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:123
Definition: werd.h:60
Definition: werd.h:36
#define TRUE
Definition: capi.h:27
void make_real_words ( tesseract::Textord *  textord,
TO_BLOCK *  block,
FCOORD  rotation 
)

Definition at line 516 of file wordseg.cpp.

520  {
521  TO_ROW *row; //current row
522  TO_ROW_IT row_it = block->get_rows ();
523  ROW *real_row = NULL; //output row
524  ROW_IT real_row_it = block->block->row_list ();
525 
526  if (row_it.empty ())
527  return; //empty block
528  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
529  row = row_it.data ();
530  if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
531  real_row = make_rep_words (row, block);
532  } else if (!row->blob_list()->empty()) {
533  // In a fixed pitch document, some lines may be detected as fixed pitch
534  // while others don't, and will go through different path.
535  // For non-space delimited language like CJK, fixed pitch chop always
536  // leave the entire line as one word. We can force consistent chopping
537  // with force_make_prop_words flag.
538  POLY_BLOCK* pb = block->block->poly_block();
539  if (textord_chopper_test) {
540  real_row = textord->make_blob_words (row, rotation);
541  } else if (textord_force_make_prop_words ||
542  (pb != NULL && !pb->IsText()) ||
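543  row->pitch_decision == PITCH_DEF_PROP ||
544  row->pitch_decision == PITCH_CORR_PROP) {
545  real_row = textord->make_prop_words (row, rotation);
546  } else if (row->pitch_decision == PITCH_DEF_FIXED ||
547  row->pitch_decision == PITCH_CORR_FIXED) {
548  real_row = fixed_pitch_words (row, rotation);
549  } else {
550  ASSERT_HOST(FALSE);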
551  }
552  }
553  if (real_row != NULL) {
554  //put row in block
555  real_row_it.add_after_then_move (real_row);
556  }
557  }
558  block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
559  (inT16) block->space_size,
560  (inT16) block->fixed_pitch);
561  block->block->check_pitch ();
562 }
EXTERN bool textord_chopper_test
Definition: wordseg.cpp:48
EXTERN bool textord_force_make_prop_words
Definition: wordseg.cpp:46
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1183
#define NULL
Definition: host.h:144
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:121
#define FALSE
Definition: capi.h:28
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:886
ROW * make_rep_words(TO_ROW *row, TO_BLOCK *block)
Definition: wordseg.cpp:572
ROW * fixed_pitch_words(TO_ROW *row, FCOORD rotation)
Definition: fpchop.cpp:53
WERD_LIST rep_words
Definition: blobbox.h:640
Definition: ocrrow.h:32
POLY_BLOCK * poly_block() const
Definition: pdblock.h:62
void set_stats(BOOL8 prop, inT16 kern, inT16 space, inT16 ch_pitch)
Definition: ocrblock.h:63
PITCH_TYPE pitch_decision
Definition: blobbox.h:622
short inT16
Definition: host.h:100
float space_size
Definition: blobbox.h:754
float kern_size
Definition: blobbox.h:753
TO_ROW_LIST * get_rows()
Definition: blobbox.h:676
bool IsText() const
Definition: polyblk.h:54
#define ASSERT_HOST(x)
Definition: errcode.h:84
BLOCK * block
Definition: blobbox.h:740
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:571
void check_pitch()
check proportional
Definition: ocrblock.cpp:159
float fixed_pitch
Definition: blobbox.h:752
ROW* make_rep_words ( TO_ROW *  row,
TO_BLOCK *  block 
)

Definition at line 572 of file wordseg.cpp.

575  {
576  inT32 xstarts[2]; //ends of row
577  ROW *real_row; //output row
578  TBOX word_box; //bounding box
579  double coeffs[3]; //spline
580  //iterator
581  WERD_IT word_it = &row->rep_words;
582 
583  if (word_it.empty ())
584  return NULL;
585  word_box = word_it.data ()->bounding_box ();
586  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
587  word_box += word_it.data ()->bounding_box ();
588  xstarts[0] = word_box.left ();
589  xstarts[1] = word_box.right ();
590  coeffs[0] = 0;
591  coeffs[1] = row->line_m ();
592  coeffs[2] = row->line_c ();
593  row->xheight = block->xheight;
594  real_row = new ROW(row,
595  (inT16) block->kern_size, (inT16) block->space_size);
596  word_it.set_to_list (real_row->word_list ());
597  //put words in row
598  word_it.add_list_after (&row->rep_words);
599  real_row->recalc_bounding_box ();
600  return real_row;
601 }
float line_m() const
Definition: blobbox.h:542
float xheight
Definition: blobbox.h:751
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
float line_c() const
Definition: blobbox.h:545
int inT32
Definition: host.h:102
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
void recalc_bounding_box()
Definition: ocrrow.cpp:91
WERD_LIST rep_words
Definition: blobbox.h:640
Definition: ocrrow.h:32
short inT16
Definition: host.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
float space_size
Definition: blobbox.h:754
float kern_size
Definition: blobbox.h:753
float xheight
Definition: blobbox.h:629
void make_single_word ( bool  one_blob,
TO_ROW_LIST *  rows,
ROW_LIST *  real_rows 
)

Definition at line 61 of file wordseg.cpp.

61  {
62  TO_ROW_IT to_row_it(rows);
63  ROW_IT row_it(real_rows);
64  for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list();
65  to_row_it.forward()) {
66  TO_ROW* row = to_row_it.data();
67  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
68  // to create the word.
69  C_BLOB_LIST cblobs;
70  C_BLOB_IT cblob_it(&cblobs);
71  BLOBNBOX_IT box_it(row->blob_list());
72  for (;!box_it.empty(); box_it.forward()) {
73  BLOBNBOX* bblob= box_it.extract();
74  if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
75  if (bblob->cblob() != NULL) {
76  C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
77  cout_it.move_to_last();
78  cout_it.add_list_after(bblob->cblob()->out_list());
79  delete bblob->cblob();
80  }
81  } else {
82  if (bblob->cblob() != NULL)
83  cblob_it.add_after_then_move(bblob->cblob());
84  }
85  delete bblob;
86  }
87  // Convert the TO_ROW to a ROW.
88  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
89  static_cast<inT16>(row->space_size));
90  WERD_IT word_it(real_row->word_list());
91  WERD* word = new WERD(&cblobs, 0, NULL);
92  word->set_flag(W_BOL, TRUE);
93  word->set_flag(W_EOL, TRUE);
94  word->set_flag(W_DONT_CHOP, one_blob);
95  word_it.add_after_then_move(word);
96  row_it.add_after_then_move(real_row);
97  }
98 }
C_BLOB * cblob() const
Definition: blobbox.h:245
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:42
#define NULL
Definition: host.h:144
float space_size
Definition: blobbox.h:639
float kern_size
Definition: blobbox.h:638
Definition: ocrrow.h:32
Definition: werd.h:35
bool joined_to_prev() const
Definition: blobbox.h:233
Definition: werd.h:60
WERD_LIST * word_list()
Definition: ocrrow.h:52
Definition: werd.h:36
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:571
#define TRUE
Definition: capi.h:27
void make_words ( tesseract::Textord *  textord,
ICOORD  page_tr,
float  gradient,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  port_blocks 
)

make_words

Arrange the blobs into words.

Definition at line 105 of file wordseg.cpp.

109  { // output list
110  TO_BLOCK_IT block_it; // iterator
111  TO_BLOCK *block; // current block
112 
113  if (textord->use_cjk_fp_model()) {
114  compute_fixed_pitch_cjk(page_tr, port_blocks);
115  } else {
116  compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
117  !(BOOL8) textord_test_landscape);
118  }
119  textord->to_spacing(page_tr, port_blocks);
120  block_it.set_to_list(port_blocks);
121  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
122  block = block_it.data();
123  make_real_words(textord, block, FCOORD(1.0f, 0.0f));
124  }
125 }
unsigned char BOOL8
Definition: host.h:113
#define f(xc, yc)
Definition: imgscale.cpp:39
bool textord_test_landscape
Definition: makerow.cpp:51
void compute_fixed_pitch(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, BOOL8 testing_on)
Definition: topitch.cpp:75
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:35
Definition: points.h:189
void make_real_words(tesseract::Textord *textord, TO_BLOCK *block, FCOORD rotation)
Definition: wordseg.cpp:516
void compute_fixed_pitch_cjk(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
Definition: cjkpitch.cpp:1057
bool use_cjk_fp_model() const
Definition: textord.h:53
inT32 row_words ( TO_BLOCK *  block,
TO_ROW *  row,
inT32  maxwidth,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 187 of file wordseg.cpp.

193  {
194  BOOL8 testing_row; //contains testpt
195  BOOL8 prev_valid; //if decent size
196  BOOL8 this_valid; //current blob big enough
197  inT32 prev_x; //end of prev blob
198  inT32 min_gap; //min interesting gap
199  inT32 cluster_count; //no of clusters
200  inT32 gap_index; //which cluster
201  inT32 smooth_factor; //for smoothing stats
202  BLOBNBOX *blob; //current blob
203  float lower, upper; //clustering parameters
204  float gaps[3]; //gap clusers
205  ICOORD testpt;
206  TBOX blob_box; //bounding box
207  //iterator
208  BLOBNBOX_IT blob_it = row->blob_list ();
209  STATS gap_stats (0, maxwidth);
210  STATS cluster_stats[4]; //clusters
211 
212  testpt = ICOORD (textord_test_x, textord_test_y);
213  smooth_factor =
214  (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
215  // if (testing_on)
216  // tprintf("Row smooth factor=%d\n",smooth_factor);
217  prev_valid = FALSE;
218  prev_x = -MAX_INT32;
219  testing_row = FALSE;
220  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
221  blob = blob_it.data ();
222  blob_box = blob->bounding_box ();
223  if (blob_box.contains (testpt))
224  testing_row = TRUE;
225  gap_stats.add (blob_box.width (), 1);
226  }
227  min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
228  gap_stats.clear ();
229  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
230  blob = blob_it.data ();
231  if (!blob->joined_to_prev ()) {
232  blob_box = blob->bounding_box ();
233  // this_valid=blob_box.width()>=min_gap;
234  this_valid = TRUE;
235  if (this_valid && prev_valid
236  && blob_box.left () - prev_x < maxwidth) {
237  gap_stats.add (blob_box.left () - prev_x, 1);
238  }
239  prev_x = blob_box.right ();
240  prev_valid = this_valid;
241  }
242  }
243  if (gap_stats.get_total () == 0) {
244  row->min_space = 0; //no evidence
245  row->max_nonspace = 0;
246  return 0;
247  }
248  gap_stats.smooth (smooth_factor);
249  lower = row->xheight * textord_words_initial_lower;
250  upper = row->xheight * textord_words_initial_upper;
251  cluster_count = gap_stats.cluster (lower, upper,
252  textord_spacesize_ratioprop, 3,
253  cluster_stats);
254  while (cluster_count < 2 && ceil (lower) < floor (upper)) {
255  //shrink gap
256  upper = (upper * 3 + lower) / 4;
257  lower = (lower * 3 + upper) / 4;
258  cluster_count = gap_stats.cluster (lower, upper,
259  textord_spacesize_ratioprop, 3,
260  cluster_stats);
261  }
262  if (cluster_count < 2) {
263  row->min_space = 0; //no evidence
264  row->max_nonspace = 0;
265  return 0;
266  }
267  for (gap_index = 0; gap_index < cluster_count; gap_index++)
268  gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
269  //get medians
270  if (cluster_count > 2) {
271  if (testing_on && textord_show_initial_words) {
272  tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
273  row->intercept (),
274  cluster_stats[1].ile (0.5),
275  cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
276  }
277  lower = gaps[0];
278  if (gaps[1] > lower) {
279  upper = gaps[1]; //prefer most frequent
280  if (upper < block->xheight * textord_words_min_minspace
281  && gaps[2] > gaps[1]) {
282  upper = gaps[2];
283  }
284  }
285  else if (gaps[2] > lower
286  && gaps[2] >= block->xheight * textord_words_min_minspace)
287  upper = gaps[2];
288  else if (lower >= block->xheight * textord_words_min_minspace) {
289  upper = lower; //not nice
290  lower = gaps[1];
291  if (testing_on && textord_show_initial_words) {
292  tprintf ("Had to switch most common from lower to upper!!\n");
293  gap_stats.print();
294  }
295  }
296  else {
297  row->min_space = 0; //no evidence
298  row->max_nonspace = 0;
299  return 0;
300  }
301  }
302  else {
303  if (gaps[1] < gaps[0]) {
304  if (testing_on && textord_show_initial_words) {
305  tprintf ("Had to switch most common from lower to upper!!\n");
306  gap_stats.print();
307  }
308  lower = gaps[1];
309  upper = gaps[0];
310  }
311  else {
312  upper = gaps[1];
313  lower = gaps[0];
314  }
315  }
316  if (upper < block->xheight * textord_words_min_minspace) {
317  row->min_space = 0; //no evidence
318  row->max_nonspace = 0;
319  return 0;
320  }
321  if (upper * 3 < block->min_space * 2 + block->max_nonspace
322  || lower * 3 > block->min_space * 2 + block->max_nonspace) {
323  if (testing_on && textord_show_initial_words) {
324  tprintf ("Disagreement between block and row at %g!!\n",
325  row->intercept ());
326  tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
327  gap_stats.print();
328  }
329  }
330  row->min_space =
331  (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
332  row->max_nonspace =
333  (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
334  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
335  row->space_size = upper;
336  row->kern_size = lower;
337  if (testing_on && textord_show_initial_words) {
338  if (testing_row) {
339  tprintf ("GAP STATS\n");
340  gap_stats.print();
341  tprintf ("SPACE stats\n");
342  cluster_stats[2].print_summary();
343  tprintf ("NONSPACE stats\n");
344  cluster_stats[1].print_summary();
345  }
346  tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
347  row->intercept (), row->min_space, upper,
348  row->max_nonspace, lower);
349  }
350  return cluster_stats[2].get_total ();
351 }
int textord_test_y
Definition: makerow.cpp:64
float intercept() const
Definition: blobbox.h:560
const TBOX & bounding_box() const
Definition: blobbox.h:208
bool contains(const FCOORD pt) const
Definition: rect.h:323
inT32 get_total() const
Definition: statistc.h:82
float xheight
Definition: blobbox.h:751
unsigned char BOOL8
Definition: host.h:113
inT16 left() const
Definition: rect.h:67
EXTERN double textord_words_initial_upper
Definition: tovars.cpp:56
int inT32
Definition: host.h:102
inT16 width() const
Definition: rect.h:104
EXTERN double textord_wordstats_smooth_factor
Definition: tovars.cpp:40
Definition: rect.h:29
#define FALSE
Definition: capi.h:28
inT16 right() const
Definition: rect.h:74
float space_size
Definition: blobbox.h:639
double ile(double frac) const
Definition: statistc.cpp:176
EXTERN double textord_words_initial_lower
Definition: tovars.cpp:54
#define MAX_INT32
Definition: host.h:120
float kern_size
Definition: blobbox.h:638
EXTERN double textord_spacesize_ratioprop
Definition: tovars.cpp:81
bool joined_to_prev() const
Definition: blobbox.h:233
EXTERN double textord_words_definite_spread
Definition: tovars.cpp:77
inT32 space_threshold
Definition: blobbox.h:637
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT32 max_nonspace
Definition: blobbox.h:636
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:26
inT32 min_space
Definition: blobbox.h:635
Definition: statistc.h:29
integer coordinate
Definition: points.h:30
int textord_test_x
Definition: makerow.cpp:63
float xheight
Definition: blobbox.h:629
EXTERN double textord_words_width_ile
Definition: tovars.cpp:44
void print_summary() const
Definition: statistc.cpp:472
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:571
inT32 min_space
Definition: blobbox.h:755
EXTERN double textord_words_min_minspace
Definition: tovars.cpp:50
#define TRUE
Definition: capi.h:27
inT32 max_nonspace
Definition: blobbox.h:756
inT32 row_words2 ( TO_BLOCK *  block,
TO_ROW *  row,
inT32  maxwidth,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 360 of file wordseg.cpp.

366  {
367  BOOL8 testing_row; //contains testpt
368  BOOL8 prev_valid; //if decent size
369  BOOL8 this_valid; //current blob big enough
370  inT32 prev_x; //end of prev blob
371  inT32 min_width; //min interesting width
372  inT32 valid_count; //good gaps
373  inT32 total_count; //total gaps
374  inT32 cluster_count; //no of clusters
375  inT32 prev_count; //previous cluster_count
376  inT32 gap_index; //which cluster
377  inT32 smooth_factor; //for smoothing stats
378  BLOBNBOX *blob; //current blob
379  float lower, upper; //clustering parameters
380  ICOORD testpt;
381  TBOX blob_box; //bounding box
382  //iterator
383  BLOBNBOX_IT blob_it = row->blob_list ();
384  STATS gap_stats (0, maxwidth);
385  //gap sizes
386  float gaps[BLOCK_STATS_CLUSTERS];
387  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
388  //clusters
389 
390  testpt = ICOORD (textord_test_x, textord_test_y);
391  smooth_factor =
392  (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
393  // if (testing_on)
394  // tprintf("Row smooth factor=%d\n",smooth_factor);
395  prev_valid = FALSE;
396  prev_x = -MAX_INT16;
397  testing_row = FALSE;
398  //min blob size
399  min_width = (inT32) block->pr_space;
400  total_count = 0;
401  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
402  blob = blob_it.data ();
403  if (!blob->joined_to_prev ()) {
404  blob_box = blob->bounding_box ();
405  this_valid = blob_box.width () >= min_width;
406  this_valid = TRUE;
407  if (this_valid && prev_valid
408  && blob_box.left () - prev_x < maxwidth) {
409  gap_stats.add (blob_box.left () - prev_x, 1);
410  }
411  total_count++; //count possibles
412  prev_x = blob_box.right ();
413  prev_valid = this_valid;
414  }
415  }
416  valid_count = gap_stats.get_total ();
417  if (valid_count < total_count * textord_words_minlarge) {
418  gap_stats.clear ();
419  prev_x = -MAX_INT16;
420  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
421  blob_it.forward ()) {
422  blob = blob_it.data ();
423  if (!blob->joined_to_prev ()) {
424  blob_box = blob->bounding_box ();
425  if (blob_box.left () - prev_x < maxwidth) {
426  gap_stats.add (blob_box.left () - prev_x, 1);
427  }
428  prev_x = blob_box.right ();
429  }
430  }
431  }
432  if (gap_stats.get_total () == 0) {
433  row->min_space = 0; //no evidence
434  row->max_nonspace = 0;
435  return 0;
436  }
437 
438  cluster_count = 0;
439  lower = block->xheight * words_initial_lower;
440  upper = block->xheight * words_initial_upper;
441  gap_stats.smooth (smooth_factor);
442  do {
443  prev_count = cluster_count;
444  cluster_count = gap_stats.cluster (lower, upper,
445  textord_spacesize_ratioprop,
446  BLOCK_STATS_CLUSTERS, cluster_stats);
447  }
448  while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
449  if (cluster_count < 1) {
450  row->min_space = 0;
451  row->max_nonspace = 0;
452  return 0;
453  }
454  for (gap_index = 0; gap_index < cluster_count; gap_index++)
455  gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
456  //get medians
457  if (testing_on) {
458  tprintf ("cluster_count=%d:", cluster_count);
459  for (gap_index = 0; gap_index < cluster_count; gap_index++)
460  tprintf (" %g(%d)", gaps[gap_index],
461  cluster_stats[gap_index + 1].get_total ());
462  tprintf ("\n");
463  }
464 
465  //Try to find proportional non-space and space for row.
466  for (gap_index = 0; gap_index < cluster_count
467  && gaps[gap_index] > block->max_nonspace; gap_index++);
468  if (gap_index < cluster_count)
469  lower = gaps[gap_index]; //most frequent below
470  else {
471  if (testing_on)
472  tprintf ("No cluster below block threshold!, using default=%g\n",
473  block->pr_nonsp);
474  lower = block->pr_nonsp;
475  }
476  for (gap_index = 0; gap_index < cluster_count
477  && gaps[gap_index] <= block->max_nonspace; gap_index++);
478  if (gap_index < cluster_count)
479  upper = gaps[gap_index]; //most frequent above
480  else {
481  if (testing_on)
482  tprintf ("No cluster above block threshold!, using default=%g\n",
483  block->pr_space);
484  upper = block->pr_space;
485  }
486  row->min_space =
487  (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
488  row->max_nonspace =
489  (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
490  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
491  row->space_size = upper;
492  row->kern_size = lower;
493  if (testing_on) {
494  if (testing_row) {
495  tprintf ("GAP STATS\n");
496  gap_stats.print();
497  tprintf ("SPACE stats\n");
498  cluster_stats[2].print_summary();
499  tprintf ("NONSPACE stats\n");
500  cluster_stats[1].print_summary();
501  }
502  tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
503  row->intercept (), row->min_space, upper,
504  row->max_nonspace, lower);
505  }
506  return 1;
507 }
int textord_test_y
Definition: makerow.cpp:64
float intercept() const
Definition: blobbox.h:560
float pr_space
Definition: blobbox.h:759
const TBOX & bounding_box() const
Definition: blobbox.h:208
float xheight
Definition: blobbox.h:751
unsigned char BOOL8
Definition: host.h:113
inT16 left() const
Definition: rect.h:67
#define BLOCK_STATS_CLUSTERS
Definition: wordseg.cpp:51
int inT32
Definition: host.h:102
inT16 width() const
Definition: rect.h:104
EXTERN double textord_wordstats_smooth_factor
Definition: tovars.cpp:40
Definition: rect.h:29
#define FALSE
Definition: capi.h:28
inT16 right() const
Definition: rect.h:74
float space_size
Definition: blobbox.h:639
float pr_nonsp
Definition: blobbox.h:760
EXTERN double words_initial_upper
Definition: tovars.cpp:72
float kern_size
Definition: blobbox.h:638
EXTERN double textord_spacesize_ratioprop
Definition: tovars.cpp:81
bool joined_to_prev() const
Definition: blobbox.h:233
EXTERN double textord_words_definite_spread
Definition: tovars.cpp:77
inT32 space_threshold
Definition: blobbox.h:637
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT32 max_nonspace
Definition: blobbox.h:636
inT32 min_space
Definition: blobbox.h:635
Definition: statistc.h:29
EXTERN double words_initial_lower
Definition: tovars.cpp:71
integer coordinate
Definition: points.h:30
int textord_test_x
Definition: makerow.cpp:63
#define MAX_INT16
Definition: host.h:119
void print_summary() const
Definition: statistc.cpp:472
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:571
EXTERN double textord_words_minlarge
Definition: tovars.cpp:58
#define TRUE
Definition: capi.h:27
inT32 max_nonspace
Definition: blobbox.h:756
void set_row_spaces ( TO_BLOCK *  block,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 135 of file wordseg.cpp.

139  {
140  inT32 maxwidth; //of widest space
141  TO_ROW *row; //current row
142  TO_ROW_IT row_it = block->get_rows ();
143 
144  if (row_it.empty ())
145  return; //empty block
146  maxwidth = (inT32) ceil (block->xheight * textord_words_maxspace);
147  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
148  row = row_it.data ();
149  if (row->fixed_pitch == 0) {
150  // if (!textord_test_mode
151  // && row_words(block,row,maxwidth,rotation,testing_on)==0
152  // || textord_test_mode
153  // && row_words2(block,row,maxwidth,rotation,testing_on)==0)
154  // {
155  row->min_space =
156  (inT32) ceil (row->pr_space -
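157  (row->pr_space -
158  row->pr_nonsp) * textord_words_definite_spread);
159  row->max_nonspace =
160  (inT32) floor (row->pr_nonsp +
161  (row->pr_space -
162  row->pr_nonsp) * textord_words_definite_spread);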
163  if (testing_on && textord_show_initial_words) {
164  tprintf ("Assigning defaults %d non, %d space to row at %g\n",
165  row->max_nonspace, row->min_space, row->intercept ());
166  }
167  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
168  row->space_size = row->pr_space;
169  row->kern_size = row->pr_nonsp;
170  // }
171  }
172 #ifndef GRAPHICS_DISABLED
173  if (textord_show_initial_words && testing_on) {
174  plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
175  }
176 #endif
177  }
178 }
EXTERN ScrollView * to_win
Definition: drawtord.cpp:40
float intercept() const
Definition: blobbox.h:560
float xheight
Definition: blobbox.h:751
void plot_word_decisions(ScrollView *win, inT16 pitch, TO_ROW *row)
Definition: drawtord.cpp:250
int inT32
Definition: host.h:102
float space_size
Definition: blobbox.h:639
float pr_space
Definition: blobbox.h:626
float kern_size
Definition: blobbox.h:638
EXTERN double textord_words_definite_spread
Definition: tovars.cpp:77
inT32 space_threshold
Definition: blobbox.h:637
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT32 max_nonspace
Definition: blobbox.h:636
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:26
short inT16
Definition: host.h:100
inT32 min_space
Definition: blobbox.h:635
TO_ROW_LIST * get_rows()
Definition: blobbox.h:676
EXTERN double textord_words_maxspace
Definition: tovars.cpp:45
float fixed_pitch
Definition: blobbox.h:623
float pr_nonsp
Definition: blobbox.h:627

Variable Documentation

EXTERN bool textord_chopper_test = FALSE

"Chopper is being tested."

Definition at line 48 of file wordseg.cpp.

EXTERN bool textord_force_make_prop_words = FALSE

"Force proportional word segmentation on all rows"

Definition at line 46 of file wordseg.cpp.

EXTERN bool textord_fp_chopping = TRUE

"Do fixed pitch chopping"

Definition at line 44 of file wordseg.cpp.