Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::StructuredTable Class Reference

#include <tablerecog.h>

Public Member Functions

 StructuredTable ()
 
 ~StructuredTable ()
 
void Init ()
 
void set_text_grid (ColPartitionGrid *text)
 
void set_line_grid (ColPartitionGrid *lines)
 
void set_max_text_height (int height)
 
bool is_lined () const
 
int row_count () const
 
int column_count () const
 
int cell_count () const
 
void set_bounding_box (const TBOX &box)
 
const TBOX & bounding_box () const
 
int median_cell_height ()
 
int median_cell_width ()
 
int row_height (int row) const
 
int column_width (int column) const
 
int space_above () const
 
int space_below () const
 
bool FindLinedStructure ()
 
bool FindWhitespacedStructure ()
 
bool DoesPartitionFit (const ColPartition &part) const
 
int CountFilledCells ()
 
int CountFilledCellsInRow (int row)
 
int CountFilledCellsInColumn (int column)
 
int CountFilledCells (int row_start, int row_end, int column_start, int column_end)
 
bool VerifyRowFilled (int row)
 
double CalculateCellFilledPercentage (int row, int column)
 
void Display (ScrollView *window, ScrollView::Color color)
 

Protected Member Functions

void ClearStructure ()
 
bool VerifyLinedTableCells ()
 
bool VerifyWhitespacedTable ()
 
void FindWhitespacedColumns ()
 
void FindWhitespacedRows ()
 
void CalculateMargins ()
 
void UpdateMargins (ColPartitionGrid *grid)
 
int FindVerticalMargin (ColPartitionGrid *grid, int start_x, bool decrease) const
 
int FindHorizontalMargin (ColPartitionGrid *grid, int start_y, bool decrease) const
 
void CalculateStats ()
 
void AbsorbNearbyLines ()
 
int CountVerticalIntersections (int x)
 
int CountHorizontalIntersections (int y)
 
int CountPartitions (const TBOX &box)
 

Static Protected Member Functions

static void FindCellSplitLocations (const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
 

Protected Attributes

ColPartitionGrid * text_grid_
 
ColPartitionGrid * line_grid_
 
TBOX bounding_box_
 
GenericVectorEqEq< int > cell_x_
 
GenericVectorEqEq< int > cell_y_
 
bool is_lined_
 
int space_above_
 
int space_below_
 
int space_left_
 
int space_right_
 
int median_cell_height_
 
int median_cell_width_
 
int max_text_height_
 

Detailed Description

Definition at line 72 of file tablerecog.h.

Constructor & Destructor Documentation

tesseract::StructuredTable::StructuredTable ( )

Definition at line 63 of file tablerecog.cpp.

tesseract::StructuredTable::~StructuredTable ( )

Definition at line 76 of file tablerecog.cpp.

76  {
77 }

Member Function Documentation

void tesseract::StructuredTable::AbsorbNearbyLines ( )
protected

Definition at line 531 of file tablerecog.cpp.

531  {
532  ColPartitionGridSearch gsearch(line_grid_);
533  gsearch.SetUniqueMode(true);
534 
535  // Is the closest line above good? Loop multiple times for tables with
536  // multi-line (sometimes 2) borders. Limit the number of lines by
537  // making sure they stay within a table cell or so.
538  ColPartition* line = NULL;
539  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
540  bounding_box_.top());
541  while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
542  if (!line->IsHorizontalLine())
543  break;
544  TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
545  bounding_box_.right(), line->MidY());
546  if (text_search.height() > median_cell_height_ * 2)
547  break;
548  if (CountPartitions(text_search) > 0)
549  break;
550  bounding_box_.set_top(line->MidY());
551  }
552  // As above, is the closest line below good?
553  line = NULL;
554  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
555  bounding_box_.bottom());
556  while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
557  if (!line->IsHorizontalLine())
558  break;
559  TBOX text_search(bounding_box_.left(), line->MidY(),
560  bounding_box_.right(), bounding_box_.bottom() - 1);
561  if (text_search.height() > median_cell_height_ * 2)
562  break;
563  if (CountPartitions(text_search) > 0)
564  break;
565  bounding_box_.set_bottom(line->MidY());
566  }
567  // TODO(nbeato): vertical lines
568 }
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:681
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
inT16 top() const
Definition: rect.h:53
void set_bottom(int y)
Definition: rect.h:63
void set_top(int y)
Definition: rect.h:56
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
inT16 bottom() const
Definition: rect.h:60
const TBOX & tesseract::StructuredTable::bounding_box ( ) const

Definition at line 106 of file tablerecog.cpp.

106  {
107  return bounding_box_;
108 }
double tesseract::StructuredTable::CalculateCellFilledPercentage ( int  row,
int  column 
)

Definition at line 263 of file tablerecog.cpp.

263  {
264  ASSERT_HOST(0 <= row && row <= row_count());
265  ASSERT_HOST(0 <= column && column <= column_count());
266  const TBOX kCellBox(cell_x_[column], cell_y_[row],
267  cell_x_[column + 1], cell_y_[row + 1]);
268  ASSERT_HOST(!kCellBox.null_box());
269 
270  ColPartitionGridSearch gsearch(text_grid_);
271  gsearch.SetUniqueMode(true);
272  gsearch.StartRectSearch(kCellBox);
273  double area_covered = 0;
274  ColPartition* text = NULL;
275  while ((text = gsearch.NextRectSearch()) != NULL) {
276  if (text->IsTextType())
277  area_covered += text->bounding_box().intersection(kCellBox).area();
278  }
279  return MIN(1.0, area_covered / kCellBox.area());
280 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define NULL
Definition: host.h:144
Definition: rect.h:29
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
#define MIN(x, y)
Definition: ndminx.h:28
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::StructuredTable::CalculateMargins ( )
protected

Definition at line 457 of file tablerecog.cpp.

457  {
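458  space_above_ = MAX_INT32;
459  space_below_ = MAX_INT32;
460  space_right_ = MAX_INT32;
461  space_left_ = MAX_INT32;
462  UpdateMargins(text_grid_);
463  UpdateMargins(line_grid_);
464 }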
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:467
#define MAX_INT32
Definition: host.h:120
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
void tesseract::StructuredTable::CalculateStats ( )
protected

Definition at line 511 of file tablerecog.cpp.

511  {
512  const int kMaxCellHeight = 1000;
513  const int kMaxCellWidth = 1000;
514  STATS height_stats(0, kMaxCellHeight + 1);
515  STATS width_stats(0, kMaxCellWidth + 1);
516 
517  for (int i = 0; i < row_count(); ++i)
518  height_stats.add(row_height(i), column_count());
519  for (int i = 0; i < column_count(); ++i)
520  width_stats.add(column_width(i), row_count());
521 
522  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
523  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
524 }
int row_height(int row) const
Definition: tablerecog.cpp:115
int column_width(int column) const
Definition: tablerecog.cpp:119
Definition: statistc.h:29
int tesseract::StructuredTable::cell_count ( ) const

Definition at line 100 of file tablerecog.cpp.

100  {
101  return row_count() * column_count();
102 }
void tesseract::StructuredTable::ClearStructure ( )
protected

Definition at line 301 of file tablerecog.cpp.

301  {
302  cell_x_.clear();
303  cell_y_.clear();
304  is_lined_ = false;
305  space_above_ = 0;
306  space_below_ = 0;
307  space_left_ = 0;
308  space_right_ = 0;
309  median_cell_height_ = 0;
310  median_cell_width_ = 0;
311 }
virtual void clear()
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int tesseract::StructuredTable::column_count ( ) const

Definition at line 97 of file tablerecog.cpp.

97  {
98  return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
99 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int length() const
Definition: genericvector.h:63
int tesseract::StructuredTable::column_width ( int  column) const

Definition at line 119 of file tablerecog.cpp.

119  {
120  ASSERT_HOST(0 <= column && column < column_count());
121  return cell_x_[column + 1] - cell_x_[column];
122 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define ASSERT_HOST(x)
Definition: errcode.h:84
int tesseract::StructuredTable::CountFilledCells ( )

Definition at line 220 of file tablerecog.cpp.

220  {
221  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
222 }
int tesseract::StructuredTable::CountFilledCells ( int  row_start,
int  row_end,
int  column_start,
int  column_end 
)

Definition at line 229 of file tablerecog.cpp.

230  {
231  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
232  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
233  column_end < column_count());
234  int cell_count = 0;
235  TBOX cell_box;
236  for (int row = row_start; row <= row_end; ++row) {
237  cell_box.set_bottom(cell_y_[row]);
238  cell_box.set_top(cell_y_[row + 1]);
239  for (int col = column_start; col <= column_end; ++col) {
240  cell_box.set_left(cell_x_[col]);
241  cell_box.set_right(cell_x_[col + 1]);
242  if (CountPartitions(cell_box) > 0)
243  ++cell_count;
244  }
245  }
246  return cell_count;
247 }
void set_right(int x)
Definition: rect.h:77
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
Definition: rect.h:29
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:681
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
void set_bottom(int y)
Definition: rect.h:63
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_top(int y)
Definition: rect.h:56
void set_left(int x)
Definition: rect.h:70
int tesseract::StructuredTable::CountFilledCellsInColumn ( int  column)

Definition at line 226 of file tablerecog.cpp.

226  {
227  return CountFilledCells(0, row_count() - 1, column, column);
228 }
int tesseract::StructuredTable::CountFilledCellsInRow ( int  row)

Definition at line 223 of file tablerecog.cpp.

223  {
224  return CountFilledCells(row, row, 0, column_count() - 1);
225 }
int tesseract::StructuredTable::CountHorizontalIntersections ( int  y)
protected

Definition at line 655 of file tablerecog.cpp.

655  {
656  int count = 0;
657  // Make a small box to keep the search time down.
658  const int kGridSize = text_grid_->gridsize();
659  TBOX horizontal_box = bounding_box_;
660  horizontal_box.set_bottom(y - kGridSize);
661  horizontal_box.set_top(y + kGridSize);
662 
663  ColPartitionGridSearch gsearch(text_grid_);
664  gsearch.SetUniqueMode(true);
665  gsearch.StartRectSearch(horizontal_box);
666  ColPartition* text = NULL;
667  while ((text = gsearch.NextRectSearch()) != NULL) {
668  if (!text->IsTextType())
669  continue;
670 
671  const TBOX& box = text->bounding_box();
672  if (box.bottom() < y && y < box.top())
673  ++count;
674  }
675  return count;
676 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int gridsize() const
Definition: bbgrid.h:68
#define NULL
Definition: host.h:144
Definition: rect.h:29
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
inT16 top() const
Definition: rect.h:53
void set_bottom(int y)
Definition: rect.h:63
int count(LIST var_list)
Definition: oldlist.cpp:108
void set_top(int y)
Definition: rect.h:56
inT16 bottom() const
Definition: rect.h:60
int tesseract::StructuredTable::CountPartitions ( const TBOX & box)
protected

Definition at line 681 of file tablerecog.cpp.

681  {
682  ColPartitionGridSearch gsearch(text_grid_);
683  gsearch.SetUniqueMode(true);
684  gsearch.StartRectSearch(box);
685  int count = 0;
686  ColPartition* text = NULL;
687  while ((text = gsearch.NextRectSearch()) != NULL) {
688  if (text->IsTextType())
689  ++count;
690  }
691  return count;
692 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
#define NULL
Definition: host.h:144
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
int count(LIST var_list)
Definition: oldlist.cpp:108
int tesseract::StructuredTable::CountVerticalIntersections ( int  x)
protected

Definition at line 631 of file tablerecog.cpp.

631  {
632  int count = 0;
633  // Make a small box to keep the search time down.
634  const int kGridSize = text_grid_->gridsize();
635  TBOX vertical_box = bounding_box_;
636  vertical_box.set_left(x - kGridSize);
637  vertical_box.set_right(x + kGridSize);
638 
639  ColPartitionGridSearch gsearch(text_grid_);
640  gsearch.SetUniqueMode(true);
641  gsearch.StartRectSearch(vertical_box);
642  ColPartition* text = NULL;
643  while ((text = gsearch.NextRectSearch()) != NULL) {
644  if (!text->IsTextType())
645  continue;
646  const TBOX& box = text->bounding_box();
647  if (box.left() < x && x < box.right())
648  ++count;
649  }
650  return count;
651 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
void set_right(int x)
Definition: rect.h:77
int gridsize() const
Definition: bbgrid.h:68
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
int count(LIST var_list)
Definition: oldlist.cpp:108
void set_left(int x)
Definition: rect.h:70
void tesseract::StructuredTable::Display ( ScrollView * window,
ScrollView::Color  color 
)

Definition at line 282 of file tablerecog.cpp.

282  {
283 #ifndef GRAPHICS_DISABLED
284  window->Brush(ScrollView::NONE);
285  window->Pen(color);
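286  window->Rectangle(bounding_box_.left(), bounding_box_.bottom(),
287  bounding_box_.right(), bounding_box_.top());
288  for (int i = 0; i < cell_x_.length(); i++) {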
289  window->Line(cell_x_[i], bounding_box_.bottom(),
290  cell_x_[i], bounding_box_.top());
291  }
292  for (int i = 0; i < cell_y_.length(); i++) {
293  window->Line(bounding_box_.left(), cell_y_[i],
294  bounding_box_.right(), cell_y_[i]);
295  }
296  window->UpdateWindow();
297 #endif
298 }
void Pen(Color color)
Definition: scrollview.cpp:721
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
void Brush(Color color)
Definition: scrollview.cpp:727
inT16 left() const
Definition: rect.h:67
inT16 right() const
Definition: rect.h:74
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:601
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
inT16 top() const
Definition: rect.h:53
void UpdateWindow()
Definition: scrollview.cpp:705
int length() const
Definition: genericvector.h:63
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:533
inT16 bottom() const
Definition: rect.h:60
bool tesseract::StructuredTable::DoesPartitionFit ( const ColPartition & part) const

Definition at line 208 of file tablerecog.cpp.

208  {
209  const TBOX& box = part.bounding_box();
210  for (int i = 0; i < cell_x_.length(); ++i)
211  if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
212  return false;
213  for (int i = 0; i < cell_y_.length(); ++i)
214  if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
215  return false;
216  return true;
217 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
inT16 top() const
Definition: rect.h:53
int length() const
Definition: genericvector.h:63
inT16 bottom() const
Definition: rect.h:60
void tesseract::StructuredTable::FindCellSplitLocations ( const GenericVector< int > &  min_list,
const GenericVector< int > &  max_list,
int  max_merged,
GenericVector< int > *  locations 
)
static protected

Definition at line 585 of file tablerecog.cpp.

588  {
589  locations->clear();
590  ASSERT_HOST(min_list.length() == max_list.length());
591  if (min_list.length() == 0)
592  return;
593  ASSERT_HOST(min_list.get(0) < max_list.get(0));
594  ASSERT_HOST(min_list.get(min_list.length() - 1) <
595  max_list.get(max_list.length() - 1));
596 
597  locations->push_back(min_list.get(0));
598  int min_index = 0;
599  int max_index = 0;
600  int stacked_partitions = 0;
601  int last_cross_position = MAX_INT32;
602  // max_index will expire after min_index.
603  // However, we can't "increase" the hill size if min_index expired.
604  // So finish processing when min_index expires.
605  while (min_index < min_list.length()) {
606  // Increase the hill count.
607  if (min_list[min_index] < max_list[max_index]) {
608  ++stacked_partitions;
609  if (last_cross_position != MAX_INT32 &&
610  stacked_partitions > max_merged) {
611  int mid = (last_cross_position + min_list[min_index]) / 2;
612  locations->push_back(mid);
613  last_cross_position = MAX_INT32;
614  }
615  ++min_index;
616  } else {
617  // Decrease the hill count.
618  --stacked_partitions;
619  if (last_cross_position == MAX_INT32 &&
620  stacked_partitions <= max_merged) {
621  last_cross_position = max_list[max_index];
622  }
623  ++max_index;
624  }
625  }
626  locations->push_back(max_list.get(max_list.length() - 1));
627 }
virtual void clear()
T & get(int index) const
int push_back(T object)
#define MAX_INT32
Definition: host.h:120
int length() const
Definition: genericvector.h:63
#define ASSERT_HOST(x)
Definition: errcode.h:84
int tesseract::StructuredTable::FindHorizontalMargin ( ColPartitionGrid * grid,
int  start_y,
bool  decrease 
) const
protected

Definition at line 494 of file tablerecog.cpp.

495  {
496  ColPartitionGridSearch gsearch(grid);
497  gsearch.SetUniqueMode(true);
498  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
499  ColPartition* part = NULL;
500  while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
501  if (!part->IsTextType() && !part->IsVerticalLine())
502  continue;
503  int distance = decrease ? border - part->bounding_box().right()
504  : part->bounding_box().left() - border;
505  if (distance >= 0)
506  return distance;
507  }
508  return MAX_INT32;
509 }
#define NULL
Definition: host.h:144
#define MAX_INT32
Definition: host.h:120
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
inT16 top() const
Definition: rect.h:53
inT16 bottom() const
Definition: rect.h:60
bool tesseract::StructuredTable::FindLinedStructure ( )

Definition at line 137 of file tablerecog.cpp.

137  {
138  ClearStructure();
139 
140  // Search for all of the lines in the current box.
141  // Update the cellular structure with the exact lines.
142  ColPartitionGridSearch box_search(line_grid_);
143  box_search.SetUniqueMode(true);
144  box_search.StartRectSearch(bounding_box_);
145  ColPartition* line = NULL;
146 
147  while ((line = box_search.NextRectSearch()) != NULL) {
148  if (line->IsHorizontalLine())
149  cell_y_.push_back(line->MidY());
150  if (line->IsVerticalLine())
151  cell_x_.push_back(line->MidX());
152  }
153 
154  // HasSignificantLines should guarantee cells.
155  // Because that code is a different class, just gracefully
156  // return false. This could be an assert.
157  if (cell_x_.length() < 3 || cell_y_.length() < 3)
158  return false;
159 
160  cell_x_.sort();
161  cell_y_.sort();
162 
163  // Remove duplicates that may have occurred due to split lines.
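164  cell_x_.compact_sorted();
165  cell_y_.compact_sorted();
166 
167  // The border should be the extents of line boxes, not middle.
168  cell_x_[0] = bounding_box_.left();
169  cell_x_[cell_x_.length() - 1] = bounding_box_.right();
170  cell_y_[0] = bounding_box_.bottom();
171  cell_y_[cell_y_.length() - 1] = bounding_box_.top();
172 
173  // Remove duplicates that may have occurred due to moving the borders.
174  cell_x_.compact_sorted();
175  cell_y_.compact_sorted();
176 
177  CalculateMargins();
178  CalculateStats();
179  is_lined_ = VerifyLinedTableCells();
180  return is_lined_;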
181 }
void compact_sorted()
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
int push_back(T object)
inT16 right() const
Definition: rect.h:74
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
inT16 top() const
Definition: rect.h:53
int length() const
Definition: genericvector.h:63
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
inT16 bottom() const
Definition: rect.h:60
int tesseract::StructuredTable::FindVerticalMargin ( ColPartitionGrid * grid,
int  start_x,
bool  decrease 
) const
protected

Definition at line 477 of file tablerecog.cpp.

478  {
479  ColPartitionGridSearch gsearch(grid);
480  gsearch.SetUniqueMode(true);
481  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
482  border);
483  ColPartition* part = NULL;
484  while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
485  if (!part->IsTextType() && !part->IsHorizontalLine())
486  continue;
487  int distance = decrease ? border - part->bounding_box().top()
488  : part->bounding_box().bottom() - border;
489  if (distance >= 0)
490  return distance;
491  }
492  return MAX_INT32;
493 }
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
inT16 right() const
Definition: rect.h:74
#define MAX_INT32
Definition: host.h:120
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
void tesseract::StructuredTable::FindWhitespacedColumns ( )
protected

Definition at line 347 of file tablerecog.cpp.

347  {
348  // Set of the extents of all partitions on the page.
349  GenericVectorEqEq<int> left_sides;
350  GenericVectorEqEq<int> right_sides;
351 
352  // Look at each text partition. We want to find the partitions
353  // that have extremal left/right sides. These will give us a basis
354  // for the table columns.
355  ColPartitionGridSearch gsearch(text_grid_);
356  gsearch.SetUniqueMode(true);
357  gsearch.StartRectSearch(bounding_box_);
358  ColPartition* text = NULL;
359  while ((text = gsearch.NextRectSearch()) != NULL) {
360  if (!text->IsTextType())
361  continue;
362 
363  ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
364  int spacing = static_cast<int>(text->median_width() *
365  kHorizontalSpacing / 2.0 + 0.5);
366  left_sides.push_back(text->bounding_box().left() - spacing);
367  right_sides.push_back(text->bounding_box().right() + spacing);
368  }
369  // It causes disaster below, so avoid it!
370  if (left_sides.length() == 0 || right_sides.length() == 0)
371  return;
372 
373  // Since data may be inserted in grid order, we sort the left/right sides.
374  left_sides.sort();
375  right_sides.sort();
376 
377  // At this point, in the "merged list", we expect to have a left side,
378  // followed by either more left sides or a right side. The last number
379  // should be a right side. We find places where the splits occur by looking
380  // for "valleys". If we want to force gap sizes or allow overlap, change
381  // the spacing above. If you want to let lines "slice" partitions as long
382  // as it is infrequent, change the following function.
383  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
384  &cell_x_);
385 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
#define NULL
Definition: host.h:144
int push_back(T object)
const double kHorizontalSpacing
Definition: tablerecog.cpp:29
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:36
int length() const
Definition: genericvector.h:63
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:585
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::StructuredTable::FindWhitespacedRows ( )
protected

Definition at line 392 of file tablerecog.cpp.

392  {
393  // Set of the extents of all partitions on the page.
394  GenericVectorEqEq<int> bottom_sides;
395  GenericVectorEqEq<int> top_sides;
396  // We will be "shrinking" partitions, so keep the min/max around to
397  // make sure the bottom/top lines do not intersect text.
398  int min_bottom = MAX_INT32;
399  int max_top = MIN_INT32;
400 
401  // Look at each text partition. We want to find the partitions
402  // that have extremal bottom/top sides. These will give us a basis
403  // for the table rows. Because the textlines can be skewed and close due
404  // to warping, the height of the partitions is toned down a little bit.
405  ColPartitionGridSearch gsearch(text_grid_);
406  gsearch.SetUniqueMode(true);
407  gsearch.StartRectSearch(bounding_box_);
408  ColPartition* text = NULL;
409  while ((text = gsearch.NextRectSearch()) != NULL) {
410  if (!text->IsTextType())
411  continue;
412 
413  ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
414  min_bottom = MIN(min_bottom, text->bounding_box().bottom());
415  max_top = MAX(max_top, text->bounding_box().top());
416 
417  // Ignore "tall" text partitions, as these are usually false positive
418  // vertical text or multiple lines pulled together.
419  if (text->bounding_box().height() > max_text_height_)
420  continue;
421 
422  int spacing = static_cast<int>(text->bounding_box().height() *
423  kVerticalSpacing / 2.0 + 0.5);
424  int bottom = text->bounding_box().bottom() - spacing;
425  int top = text->bounding_box().top() + spacing;
426  // For horizontal text, the factor can be negative. This should
427  // probably cause a warning or failure. I haven't actually checked if
428  // it happens.
429  if (bottom >= top)
430  continue;
431 
432  bottom_sides.push_back(bottom);
433  top_sides.push_back(top);
434  }
435  // It causes disaster below, so avoid it!
436  if (bottom_sides.length() == 0 || top_sides.length() == 0)
437  return;
438 
439  // Since data may be inserted in grid order, we sort the bottom/top sides.
440  bottom_sides.sort();
441  top_sides.sort();
442 
443  // At this point, in the "merged list", we expect to have a bottom side,
444  // followed by either more bottom sides or a top side. The last number
445  // should be a top side. We find places where the splits occur by looking
446  // for "valleys". If we want to force gap sizes or allow overlap, change
447  // the spacing above. If you want to let lines "slice" partitions as long
448  // as it is infrequent, change the following function.
449  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
450  &cell_y_);
451 
452  // Recover the min/max correctly since it was shifted.
453  cell_y_[0] = min_bottom;
454  cell_y_[cell_y_.length() - 1] = max_top;
455 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
#define NULL
Definition: host.h:144
int push_back(T object)
#define MAX_INT32
Definition: host.h:120
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
const double kVerticalSpacing
Definition: tablerecog.cpp:32
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:35
int length() const
Definition: genericvector.h:63
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
static void FindCellSplitLocations(const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)
Definition: tablerecog.cpp:585
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define MIN_INT32
Definition: host.h:128
bool tesseract::StructuredTable::FindWhitespacedStructure ( )

Definition at line 184 of file tablerecog.cpp.

184  {
185  ClearStructure();
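186  FindWhitespacedColumns();
187  FindWhitespacedRows();
188 
189  if (!VerifyWhitespacedTable()) {
190  return false;
191  } else {
192  bounding_box_.set_left(cell_x_[0]);
193  bounding_box_.set_right(cell_x_[cell_x_.length() - 1]);
194  bounding_box_.set_bottom(cell_y_[0]);
195  bounding_box_.set_top(cell_y_[cell_y_.length() - 1]);
196  AbsorbNearbyLines();
197  CalculateMargins();
198  CalculateStats();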
199  return true;
200  }
201 }
void set_right(int x)
Definition: rect.h:77
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int length() const
Definition: genericvector.h:63
void set_bottom(int y)
Definition: rect.h:63
void set_top(int y)
Definition: rect.h:56
void set_left(int x)
Definition: rect.h:70
void tesseract::StructuredTable::Init ( )

Definition at line 79 of file tablerecog.cpp.

79  {
80 }
bool tesseract::StructuredTable::is_lined ( ) const

Definition at line 91 of file tablerecog.cpp.

91  {
92  return is_lined_;
93 }
int tesseract::StructuredTable::median_cell_height ( )

Definition at line 109 of file tablerecog.cpp.

109  {
110  return median_cell_height_;
111 }
int tesseract::StructuredTable::median_cell_width ( )

Definition at line 112 of file tablerecog.cpp.

112  {
113  return median_cell_width_;
114 }
int tesseract::StructuredTable::row_count ( ) const

Definition at line 94 of file tablerecog.cpp.

94  {
95  return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
96 }
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int length() const
Definition: genericvector.h:63
int tesseract::StructuredTable::row_height ( int  row) const

Definition at line 115 of file tablerecog.cpp.

115  {
116  ASSERT_HOST(0 <= row && row < row_count());
117  return cell_y_[row + 1] - cell_y_[row];
118 }
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::StructuredTable::set_bounding_box ( const TBOX & box)

Definition at line 103 of file tablerecog.cpp.

103  {
104  bounding_box_ = box;
105 }
void tesseract::StructuredTable::set_line_grid ( ColPartitionGrid * lines)

Definition at line 85 of file tablerecog.cpp.

85  {
86  line_grid_ = line_grid;
87 }
ColPartitionGrid * line_grid_
Definition: tablerecog.h:238
void tesseract::StructuredTable::set_max_text_height ( int  height)

Definition at line 88 of file tablerecog.cpp.

88  {
89  max_text_height_ = height;
90 }
void tesseract::StructuredTable::set_text_grid ( ColPartitionGrid * text)

Definition at line 82 of file tablerecog.cpp.

82  {
83  text_grid_ = text_grid;
84 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:237
int tesseract::StructuredTable::space_above ( ) const

Definition at line 123 of file tablerecog.cpp.

123  {
124  return space_above_;
125 }
int tesseract::StructuredTable::space_below ( ) const

Definition at line 126 of file tablerecog.cpp.

126  {
127  return space_below_;
128 }
void tesseract::StructuredTable::UpdateMargins ( ColPartitionGrid * grid)
protected

Definition at line 467 of file tablerecog.cpp.

467  {
468  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
469  space_below_ = MIN(space_below_, below);
470  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
471  space_above_ = MIN(space_above_, above);
472  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
473  space_left_ = MIN(space_left_, left);
474  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
475  space_right_ = MIN(space_right_, right);
476 }
inT16 left() const
Definition: rect.h:67
inT16 right() const
Definition: rect.h:74
inT16 top() const
Definition: rect.h:53
#define MIN(x, y)
Definition: ndminx.h:28
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:477
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:494
inT16 bottom() const
Definition: rect.h:60
bool tesseract::StructuredTable::VerifyLinedTableCells ( )
protected

Definition at line 315 of file tablerecog.cpp.

315  {
316  // Function only called when lines exist.
317  ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
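318  for (int i = 0; i < cell_y_.length(); ++i) {
319  if (CountHorizontalIntersections(cell_y_[i]) > 0)
320  return false;
321  }
322  for (int i = 0; i < cell_x_.length(); ++i) {
323  if (CountVerticalIntersections(cell_x_[i]) > 0)
324  return false;
325  }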
326  return true;
327 }
GenericVectorEqEq< int > cell_x_
Definition: tablerecog.h:243
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:655
GenericVectorEqEq< int > cell_y_
Definition: tablerecog.h:244
int length() const
Definition: genericvector.h:63
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:631
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::StructuredTable::VerifyRowFilled ( int  row)

Definition at line 252 of file tablerecog.cpp.

252  {
253  for (int i = 0; i < column_count(); ++i) {
254  double area_filled = CalculateCellFilledPercentage(row, i);
255  if (area_filled >= kMinFilledArea)
256  return true;
257  }
258  return false;
259 }
const double kMinFilledArea
Definition: tablerecog.cpp:57
double CalculateCellFilledPercentage(int row, int column)
Definition: tablerecog.cpp:263
bool tesseract::StructuredTable::VerifyWhitespacedTable ( )
protected

Definition at line 337 of file tablerecog.cpp.

337  {
338  // criteria for a table, must be at least 2x3 or 3x2
339  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
340 }

Member Data Documentation

TBOX tesseract::StructuredTable::bounding_box_
protected

Definition at line 242 of file tablerecog.h.

GenericVectorEqEq<int> tesseract::StructuredTable::cell_x_
protected

Definition at line 243 of file tablerecog.h.

GenericVectorEqEq<int> tesseract::StructuredTable::cell_y_
protected

Definition at line 244 of file tablerecog.h.

bool tesseract::StructuredTable::is_lined_
protected

Definition at line 245 of file tablerecog.h.

ColPartitionGrid* tesseract::StructuredTable::line_grid_
protected

Definition at line 238 of file tablerecog.h.

int tesseract::StructuredTable::max_text_height_
protected

Definition at line 254 of file tablerecog.h.

int tesseract::StructuredTable::median_cell_height_
protected

Definition at line 251 of file tablerecog.h.

int tesseract::StructuredTable::median_cell_width_
protected

Definition at line 252 of file tablerecog.h.

int tesseract::StructuredTable::space_above_
protected

Definition at line 247 of file tablerecog.h.

int tesseract::StructuredTable::space_below_
protected

Definition at line 248 of file tablerecog.h.

int tesseract::StructuredTable::space_left_
protected

Definition at line 249 of file tablerecog.h.

int tesseract::StructuredTable::space_right_
protected

Definition at line 250 of file tablerecog.h.

ColPartitionGrid* tesseract::StructuredTable::text_grid_
protected

Definition at line 237 of file tablerecog.h.


The documentation for this class was generated from the following files: