Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
OL_BUCKETS Class Reference

#include <edgblob.h>

Public Member Functions

 ~OL_BUCKETS ()
 
C_OUTLINE_LIST * start_scan ()
 
C_OUTLINE_LIST * scan_next ()
 
OL_BUCKETS::OL_BUCKETS

Construct an array of buckets for associating outlines into blobs.

 OL_BUCKETS (ICOORD bleft, ICOORD tright)
 
OL_BUCKETS::operator(

Return a pointer to a list of C_OUTLINEs corresponding to the given pixel coordinates.

C_OUTLINE_LIST * operator() (inT16 x, inT16 y)
 
OL_BUCKETS::count_children

Find number of descendants of this outline.

inT32 count_children (C_OUTLINE *outline, inT32 max_count)
 
OL_BUCKETS::outline_complexity

This is the new version of count_child.

The goal of this function is to determine if an outline and its interiors could be part of a character blob. This is done by computing a "complexity" index for the outline, which is the return value of this function, and checking it against a threshold. The max_count is used for short-circuiting the recursion and forcing a rejection that guarantees to fail the threshold test. The complexity F for outline X with N children X[i] is F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild so each layer of nesting increases complexity exponentially. An outline can be rejected as a text blob candidate if its complexity is too high, has too many children(likely a container), or has too many layers of nested inner loops. This has the side-effect of flattening out boxed or reversed video text regions.

inT32 outline_complexity (C_OUTLINE *outline, inT32 max_count, inT16 depth)
 
OL_BUCKETS::extract_children

Find number of descendants of this outline.

void extract_children (C_OUTLINE *outline, C_OUTLINE_IT *it)
 

Detailed Description

Definition at line 33 of file edgblob.h.

Constructor & Destructor Documentation

OL_BUCKETS::OL_BUCKETS ( ICOORD  bleft,
ICOORD  tright 
)

Definition at line 69 of file edgblob.cpp.

71  : bl(bleft), tr(tright) {
72  bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
73  bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
74  // make array
75  buckets = new C_OUTLINE_LIST[bxdim * bydim];
76  index = 0;
77 }
inT16 x() const
access function
Definition: points.h:52
inT16 y() const
access_function
Definition: points.h:56
#define BUCKETSIZE
Definition: edgblob.h:31
OL_BUCKETS::~OL_BUCKETS ( )
inline

Definition at line 40 of file edgblob.h.

40  { //cleanup
41  delete[]buckets;
42  }

Member Function Documentation

inT32 OL_BUCKETS::count_children ( C_OUTLINE outline,
inT32  max_count 
)

Definition at line 184 of file edgblob.cpp.

187  {
188  BOOL8 parent_box; // could it be boxy
189  inT16 xmin, xmax; // coord limits
190  inT16 ymin, ymax;
191  inT16 xindex, yindex; // current bucket
192  C_OUTLINE *child; // current child
193  inT32 child_count; // no of children
194  inT32 grandchild_count; // no of grandchildren
195  inT32 parent_area; // potential box
196  FLOAT32 max_parent_area; // potential box
197  inT32 child_area; // current child
198  inT32 child_length; // current child
199  TBOX olbox;
200  C_OUTLINE_IT child_it; // search iterator
201 
202  olbox = outline->bounding_box();
203  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
204  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
205  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
206  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
207  child_count = 0;
208  grandchild_count = 0;
209  parent_area = 0;
210  max_parent_area = 0;
211  parent_box = TRUE;
212  for (yindex = ymin; yindex <= ymax; yindex++) {
213  for (xindex = xmin; xindex <= xmax; xindex++) {
214  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
215  if (child_it.empty())
216  continue;
217  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
218  child_it.forward()) {
219  child = child_it.data();
220  if (child != outline && *child < *outline) {
221  child_count++;
222  if (child_count <= max_count) {
223  int max_grand =(max_count - child_count) /
225  if (max_grand > 0)
226  grandchild_count += count_children(child, max_grand) *
228  else
229  grandchild_count += count_children(child, 1);
230  }
231  if (child_count + grandchild_count > max_count) {
232  if (edges_debug)
233  tprintf("Discarding parent with child count=%d, gc=%d\n",
234  child_count,grandchild_count);
235  return child_count + grandchild_count;
236  }
237  if (parent_area == 0) {
238  parent_area = outline->outer_area();
239  if (parent_area < 0)
240  parent_area = -parent_area;
241  max_parent_area = outline->bounding_box().area() * edges_boxarea;
242  if (parent_area < max_parent_area)
243  parent_box = FALSE;
244  }
245  if (parent_box &&
246  (!edges_children_fix ||
247  child->bounding_box().height() > edges_min_nonhole)) {
248  child_area = child->outer_area();
249  if (child_area < 0)
250  child_area = -child_area;
251  if (edges_children_fix) {
252  if (parent_area - child_area < max_parent_area) {
253  parent_box = FALSE;
254  continue;
255  }
256  if (grandchild_count > 0) {
257  if (edges_debug)
258  tprintf("Discarding parent of area %d, child area=%d, max%g "
259  "with gc=%d\n",
260  parent_area, child_area, max_parent_area,
261  grandchild_count);
262  return max_count + 1;
263  }
264  child_length = child->pathlength();
265  if (child_length * child_length >
266  child_area * edges_patharea_ratio) {
267  if (edges_debug)
268  tprintf("Discarding parent of area %d, child area=%d, max%g "
269  "with child length=%d\n",
270  parent_area, child_area, max_parent_area,
271  child_length);
272  return max_count + 1;
273  }
274  }
275  if (child_area < child->bounding_box().area() * edges_childarea) {
276  if (edges_debug)
277  tprintf("Discarding parent of area %d, child area=%d, max%g "
278  "with child rect=%d\n",
279  parent_area, child_area, max_parent_area,
280  child->bounding_box().area());
281  return max_count + 1;
282  }
283  }
284  }
285  }
286  }
287  }
288  return child_count + grandchild_count;
289 }
EXTERN int edges_min_nonhole
Definition: edgblob.cpp:55
inT32 area() const
Definition: rect.h:111
EXTERN bool edges_debug
Definition: edgblob.cpp:45
inT16 x() const
access function
Definition: points.h:52
unsigned char BOOL8
Definition: host.h:113
const TBOX & bounding_box() const
Definition: coutln.h:85
inT16 left() const
Definition: rect.h:67
inT32 count_children(C_OUTLINE *outline, inT32 max_count)
Definition: edgblob.cpp:184
int inT32
Definition: host.h:102
Definition: rect.h:29
#define FALSE
Definition: capi.h:28
inT16 right() const
Definition: rect.h:74
float FLOAT32
Definition: host.h:111
inT16 y() const
access_function
Definition: points.h:56
inT32 outer_area()
Definition: coutln.cpp:303
EXTERN int edges_patharea_ratio
Definition: edgblob.cpp:57
EXTERN int edges_children_per_grandchild
Definition: edgblob.cpp:49
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
inT32 pathlength() const
Definition: coutln.h:111
#define BUCKETSIZE
Definition: edgblob.h:31
short inT16
Definition: host.h:100
EXTERN double edges_childarea
Definition: edgblob.cpp:59
EXTERN double edges_boxarea
Definition: edgblob.cpp:61
inT16 height() const
Definition: rect.h:97
EXTERN bool edges_children_fix
Definition: edgblob.cpp:53
#define TRUE
Definition: capi.h:27
inT16 bottom() const
Definition: rect.h:60
void OL_BUCKETS::extract_children ( C_OUTLINE outline,
C_OUTLINE_IT *  it 
)

Definition at line 300 of file edgblob.cpp.

303  {
304  inT16 xmin, xmax; // coord limits
305  inT16 ymin, ymax;
306  inT16 xindex, yindex; // current bucket
307  TBOX olbox;
308  C_OUTLINE_IT child_it; // search iterator
309 
310  olbox = outline->bounding_box();
311  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
312  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
313  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
314  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
315  for (yindex = ymin; yindex <= ymax; yindex++) {
316  for (xindex = xmin; xindex <= xmax; xindex++) {
317  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
318  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
319  child_it.forward()) {
320  if (*child_it.data() < *outline) {
321  it->add_after_then_move(child_it.extract());
322  }
323  }
324  }
325  }
326 }
inT16 x() const
access function
Definition: points.h:52
const TBOX & bounding_box() const
Definition: coutln.h:85
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
inT16 y() const
access_function
Definition: points.h:56
inT16 top() const
Definition: rect.h:53
#define BUCKETSIZE
Definition: edgblob.h:31
short inT16
Definition: host.h:100
inT16 bottom() const
Definition: rect.h:60
C_OUTLINE_LIST * OL_BUCKETS::operator() ( inT16  x,
inT16  y 
)

Definition at line 88 of file edgblob.cpp.

90  {
91  return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
92 }
inT16 x() const
access function
Definition: points.h:52
inT16 y() const
access_function
Definition: points.h:56
#define BUCKETSIZE
Definition: edgblob.h:31
inT32 OL_BUCKETS::outline_complexity ( C_OUTLINE outline,
inT32  max_count,
inT16  depth 
)

Definition at line 115 of file edgblob.cpp.

119  {
120  inT16 xmin, xmax; // coord limits
121  inT16 ymin, ymax;
122  inT16 xindex, yindex; // current bucket
123  C_OUTLINE *child; // current child
124  inT32 child_count; // no of children
125  inT32 grandchild_count; // no of grandchildren
126  C_OUTLINE_IT child_it; // search iterator
127 
128  TBOX olbox = outline->bounding_box();
129  xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
130  xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
131  ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
132  ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
133  child_count = 0;
134  grandchild_count = 0;
135  if (++depth > edges_max_children_layers) // nested loops are too deep
136  return max_count + depth;
137 
138  for (yindex = ymin; yindex <= ymax; yindex++) {
139  for (xindex = xmin; xindex <= xmax; xindex++) {
140  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
141  if (child_it.empty())
142  continue;
143  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
144  child_it.forward()) {
145  child = child_it.data();
146  if (child == outline || !(*child < *outline))
147  continue;
148  child_count++;
149 
150  if (child_count > edges_max_children_per_outline) { // too fragmented
151  if (edges_debug)
152  tprintf("Discard outline on child_count=%d > "
153  "max_children_per_outline=%d\n",
154  child_count,
155  static_cast<inT32>(edges_max_children_per_outline));
156  return max_count + child_count;
157  }
158 
159  // Compute the "complexity" of each child recursively
160  inT32 remaining_count = max_count - child_count - grandchild_count;
161  if (remaining_count > 0)
162  grandchild_count += edges_children_per_grandchild *
163  outline_complexity(child, remaining_count, depth);
164  if (child_count + grandchild_count > max_count) { // too complex
165  if (edges_debug)
166  tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
167  "> max_count=%d\n",
168  child_count, grandchild_count, max_count);
169  return child_count + grandchild_count;
170  }
171  }
172  }
173  }
174  return child_count + grandchild_count;
175 }
inT32 outline_complexity(C_OUTLINE *outline, inT32 max_count, inT16 depth)
Definition: edgblob.cpp:115
EXTERN bool edges_debug
Definition: edgblob.cpp:45
inT16 x() const
access function
Definition: points.h:52
const TBOX & bounding_box() const
Definition: coutln.h:85
inT16 left() const
Definition: rect.h:67
int inT32
Definition: host.h:102
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
inT16 y() const
access_function
Definition: points.h:56
EXTERN int edges_children_per_grandchild
Definition: edgblob.cpp:49
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
#define BUCKETSIZE
Definition: edgblob.h:31
short inT16
Definition: host.h:100
EXTERN int edges_max_children_per_outline
Definition: edgblob.cpp:41
EXTERN int edges_max_children_layers
Definition: edgblob.cpp:43
inT16 bottom() const
Definition: rect.h:60
C_OUTLINE_LIST* OL_BUCKETS::scan_next ( )
inline

Definition at line 53 of file edgblob.h.

53  {
54  for (; buckets[index].empty () && index < bxdim * bydim - 1; index++);
55  return &buckets[index];
56  }
C_OUTLINE_LIST* OL_BUCKETS::start_scan ( )
inline

Definition at line 47 of file edgblob.h.

47  {
48  for (index = 0; buckets[index].empty () && index < bxdim * bydim - 1;
49  index++);
50  return &buckets[index];
51  }

The documentation for this class was generated from the following files: