Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TableFinder Class Reference

#include <tablefind.h>

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORDbleft () const
 
const ICOORDtright () const
 
ScrollViewMakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColSegmentGrid (ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
 
void WriteToPix (const FCOORD &reskew)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 131 of file tablefind.h.

Constructor & Destructor Documentation

tesseract::TableFinder::TableFinder ( )

Definition at line 168 of file tablefind.cpp.

tesseract::TableFinder::~TableFinder ( )

Definition at line 176 of file tablefind.cpp.

176  {
177  // ColPartitions and ColSegments created by this class for storage in grids
178  // need to be deleted explicitly.
179  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
180  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
181  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
182  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
183  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
184 }
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:464

Member Function Documentation

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1485 of file tablefind.cpp.

1485  {
1486  // Iterate the table regions in the grid
1487  ColSegment_CLIST adjusted_tables;
1488  ColSegment_C_IT it(&adjusted_tables);
1490  gsearch.StartFullSearch();
1491  ColSegment* table = NULL;
1492  while ((table = gsearch.NextFullSearch()) != NULL) {
1493  const TBOX& table_box = table->bounding_box();
1494  TBOX grown_box = table_box;
1495  GrowTableBox(table_box, &grown_box);
1496  // To prevent a table from expanding again, do not insert the
1497  // modified box back to the grid. Instead move it to a list and
1498  // and remove it from the grid. The list is moved later back to the grid.
1499  if (!grown_box.null_box()) {
1500  ColSegment* col = new ColSegment();
1501  col->InsertBox(grown_box);
1502  it.add_after_then_move(col);
1503  }
1504  gsearch.RemoveBBox();
1505  delete table;
1506  }
1507  // clear table grid to move final tables in it
1508  // TODO(nbeato): table_grid_ should already be empty. The above loop
1509  // removed everything. Maybe just assert it is empty?
1510  table_grid_.Clear();
1511  it.move_to_first();
1512  // move back final tables to table_grid_
1513  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1514  ColSegment* seg = it.extract();
1515  table_grid_.InsertBBox(true, true, seg);
1516  }
1517 }
bool null_box() const
Definition: rect.h:45
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1519
Definition: rect.h:29
void Clear()
Definition: bbgrid.h:455
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX blob) const
protected

Definition at line 502 of file tablefind.cpp.

502  {
503  const TBOX& box = blob.bounding_box();
504  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
505  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
506  const int median_area = global_median_xheight_ * global_median_blob_width_;
507  const double kAreaRequired = median_area * kAllowBlobArea;
508  // Keep comparisons strictly greater to disallow 0!
509  return box.height() > kHeightRequired &&
510  box.width() > kWidthRequired &&
511  box.area() > kAreaRequired;
512 }
inT32 area() const
Definition: rect.h:111
const double kAllowBlobWidth
Definition: tablefind.cpp:59
const TBOX & bounding_box() const
Definition: blobbox.h:208
inT16 width() const
Definition: rect.h:104
Definition: rect.h:29
const double kAllowBlobArea
Definition: tablefind.cpp:60
const double kAllowBlobHeight
Definition: tablefind.cpp:58
inT16 height() const
Definition: rect.h:97
bool tesseract::TableFinder::AllowTextPartition ( const ColPartition part) const
protected

Definition at line 489 of file tablefind.cpp.

489  {
490  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
491  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
492  const int median_area = global_median_xheight_ * global_median_blob_width_;
493  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
494  // Keep comparisons strictly greater to disallow 0!
495  return part.median_size() > kHeightRequired &&
496  part.median_width() > kWidthRequired &&
497  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
498 }
const double kAllowTextArea
Definition: tablefind.cpp:53
const double kAllowTextWidth
Definition: tablefind.cpp:52
const double kAllowTextHeight
Definition: tablefind.cpp:51
bool tesseract::TableFinder::BelongToOneTable ( const TBOX box1,
const TBOX box2 
)
protected

Definition at line 1444 of file tablefind.cpp.

1444  {
1445  // Check the obvious case. Most likely not true because overlapping boxes
1446  // should already be merged, but seems like a good thing to do in case things
1447  // change.
1448  if (box1.overlap(box2))
1449  return true;
1450  // Check for ColPartitions spanning both table regions
1451  TBOX bbox = box1.bounding_union(box2);
1452  // Start a rect search on bbox
1453  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1454  rectsearch(&clean_part_grid_);
1455  rectsearch.StartRectSearch(bbox);
1456  ColPartition* part = NULL;
1457  while ((part = rectsearch.NextRectSearch()) != NULL) {
1458  const TBOX& part_box = part->bounding_box();
1459  // return true if a colpartition spanning both table regions is found
1460  if (part_box.overlap(box1) && part_box.overlap(box2))
1461  return true;
1462  }
1463  return false;
1464 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:130
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
bool overlap(const TBOX &box) const
Definition: rect.h:345
Definition: rect.h:29
const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 387 of file tablefind.cpp.

387  {
388  return clean_part_grid_.bleft();
389 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const ICOORD & bleft() const
Definition: bbgrid.h:77
bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX b1,
const TBOX b2 
)
protected

Definition at line 568 of file tablefind.cpp.

568  {
569  int x_margin = 20;
570  int y_margin = 5;
571  return (abs(b1.left() - b2.left()) < x_margin) &&
572  (abs(b1.right() - b2.right()) < x_margin) &&
573  (abs(b1.top()-b2.bottom()) < y_margin ||
574  abs(b2.top()-b1.bottom()) < y_margin);
575 }
inT16 left() const
Definition: rect.h:67
inT16 right() const
Definition: rect.h:74
inT16 top() const
Definition: rect.h:53
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1702 of file tablefind.cpp.

1702  {
1703  int page_width = tright().x() - bleft().x();
1704  ASSERT_HOST(page_width > 0);
1705  // create an integer array to hold projection on x-axis
1706  int* table_xprojection = new int[page_width];
1707  // Iterate through all tables in the table grid
1708  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1709  table_search(&table_grid_);
1710  table_search.StartFullSearch();
1711  ColSegment* table;
1712  while ((table = table_search.NextFullSearch()) != NULL) {
1713  TBOX table_box = table->bounding_box();
1714  // reset the projection array
1715  for (int i = 0; i < page_width; i++) {
1716  table_xprojection[i] = 0;
1717  }
1718  // Start a rect search on table_box
1719  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1720  rectsearch(&clean_part_grid_);
1721  rectsearch.SetUniqueMode(true);
1722  rectsearch.StartRectSearch(table_box);
1723  ColPartition* part;
1724  while ((part = rectsearch.NextRectSearch()) != NULL) {
1725  if (!part->IsTextType())
1726  continue; // Do not consider non-text partitions
1727  if (part->flow() == BTFT_LEADER)
1728  continue; // Assume leaders are in tables
1729  TBOX part_box = part->bounding_box();
1730  // Do not consider partitions partially covered by the table
1731  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1732  continue;
1733  BLOBNBOX_CLIST* part_boxes = part->boxes();
1734  BLOBNBOX_C_IT pit(part_boxes);
1735 
1736  // Make sure overlapping blobs don't artificially inflate the number
1737  // of rows in the table. This happens frequently with things such as
1738  // decimals and split characters. Do this by assuming the column
1739  // partition is sorted mostly left to right and just clip
1740  // bounding boxes by the previous box's extent.
1741  int next_position_to_write = 0;
1742 
1743  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1744  BLOBNBOX *pblob = pit.data();
1745  // ignore blob height for the purpose of projection since we
1746  // are only interested in finding valleys
1747  int xstart = pblob->bounding_box().left();
1748  int xend = pblob->bounding_box().right();
1749 
1750  xstart = MAX(xstart, next_position_to_write);
1751  for (int i = xstart; i < xend; i++)
1752  table_xprojection[i - bleft().x()]++;
1753  next_position_to_write = xend;
1754  }
1755  }
1756  // Find largest valley between two reasonable peaks in the table
1757  if (!GapInXProjection(table_xprojection, page_width)) {
1758  table_search.RemoveBBox();
1759  delete table;
1760  }
1761  }
1762  delete[] table_xprojection;
1763 }
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
inT16 x() const
access function
Definition: points.h:52
const TBOX & bounding_box() const
Definition: blobbox.h:208
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
const ICOORD & tright() const
Definition: tablefind.cpp:390
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
const ICOORD & bleft() const
Definition: tablefind.cpp:387
const double kMinOverlapWithTable
Definition: tablefind.cpp:99
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1767
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
#define MAX(x, y)
Definition: ndminx.h:24
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1949 of file tablefind.cpp.

1952  {
1953 #ifndef GRAPHICS_DISABLED
1954  // Iterate the ColPartitions in the grid.
1955  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1956  gsearch(grid);
1957  gsearch.StartFullSearch();
1958  ColPartition* part = NULL;
1959  while ((part = gsearch.NextFullSearch()) != NULL) {
1960  const TBOX& box = part->bounding_box();
1961  int left_x = box.left();
1962  int right_x = box.right();
1963  int top_y = box.top();
1964  int bottom_y = box.bottom();
1965 
1966  ColPartition* upper_part = part->nearest_neighbor_above();
1967  if (upper_part) {
1968  TBOX upper_box = upper_part->bounding_box();
1969  int mid_x = (left_x + right_x) / 2;
1970  int mid_y = (top_y + bottom_y) / 2;
1971  int other_x = (upper_box.left() + upper_box.right()) / 2;
1972  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1973  win->Brush(ScrollView::NONE);
1974  win->Pen(color);
1975  win->Line(mid_x, mid_y, other_x, other_y);
1976  }
1977  ColPartition* lower_part = part->nearest_neighbor_below();
1978  if (lower_part) {
1979  TBOX lower_box = lower_part->bounding_box();
1980  int mid_x = (left_x + right_x) / 2;
1981  int mid_y = (top_y + bottom_y) / 2;
1982  int other_x = (lower_box.left() + lower_box.right()) / 2;
1983  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1984  win->Brush(ScrollView::NONE);
1985  win->Pen(color);
1986  win->Line(mid_x, mid_y, other_x, other_y);
1987  }
1988  }
1989  win->UpdateWindow();
1990 #endif
1991 }
void Pen(Color color)
Definition: scrollview.cpp:721
void Brush(Color color)
Definition: scrollview.cpp:727
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
inT16 top() const
Definition: rect.h:53
void UpdateWindow()
Definition: scrollview.cpp:705
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:533
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::DisplayColPartitions ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1915 of file tablefind.cpp.

1918  {
1919 #ifndef GRAPHICS_DISABLED
1920  ScrollView::Color color = default_color;
1921  // Iterate the ColPartitions in the grid.
1922  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1923  gsearch(grid);
1924  gsearch.StartFullSearch();
1925  ColPartition* part = NULL;
1926  while ((part = gsearch.NextFullSearch()) != NULL) {
1927  color = default_color;
1928  if (part->type() == PT_TABLE)
1929  color = table_color;
1930 
1931  const TBOX& box = part->bounding_box();
1932  int left_x = box.left();
1933  int right_x = box.right();
1934  int top_y = box.top();
1935  int bottom_y = box.bottom();
1936  win->Brush(ScrollView::NONE);
1937  win->Pen(color);
1938  win->Rectangle(left_x, bottom_y, right_x, top_y);
1939  }
1940  win->UpdateWindow();
1941 #endif
1942 }
void Pen(Color color)
Definition: scrollview.cpp:721
void Brush(Color color)
Definition: scrollview.cpp:727
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
Definition: capi.h:62
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:601
inT16 top() const
Definition: rect.h:53
void UpdateWindow()
Definition: scrollview.cpp:705
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::DisplayColPartitions ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1943 of file tablefind.cpp.

1945  {
1946  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1947 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1915
void tesseract::TableFinder::DisplayColSegmentGrid ( ScrollView win,
ColSegmentGrid grid,
ScrollView::Color  color 
)
protected

Definition at line 1890 of file tablefind.cpp.

1891  {
1892 #ifndef GRAPHICS_DISABLED
1893  // Iterate the ColPartitions in the grid.
1894  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1895  gsearch(grid);
1896  gsearch.StartFullSearch();
1897  ColSegment* seg = NULL;
1898  while ((seg = gsearch.NextFullSearch()) != NULL) {
1899  const TBOX& box = seg->bounding_box();
1900  int left_x = box.left();
1901  int right_x = box.right();
1902  int top_y = box.top();
1903  int bottom_y = box.bottom();
1904  win->Brush(ScrollView::NONE);
1905  win->Pen(color);
1906  win->Rectangle(left_x, bottom_y, right_x, top_y);
1907  }
1908  win->UpdateWindow();
1909 #endif
1910 }
void Pen(Color color)
Definition: scrollview.cpp:721
void Brush(Color color)
Definition: scrollview.cpp:727
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:601
inT16 top() const
Definition: rect.h:53
void UpdateWindow()
Definition: scrollview.cpp:705
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::DisplayColSegments ( ScrollView win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1870 of file tablefind.cpp.

1872  {
1873 #ifndef GRAPHICS_DISABLED
1874  win->Pen(color);
1875  win->Brush(ScrollView::NONE);
1876  ColSegment_IT it(segments);
1877  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1878  ColSegment* col = it.data();
1879  const TBOX& box = col->bounding_box();
1880  int left_x = box.left();
1881  int right_x = box.right();
1882  int top_y = box.top();
1883  int bottom_y = box.bottom();
1884  win->Rectangle(left_x, bottom_y, right_x, top_y);
1885  }
1886  win->UpdateWindow();
1887 #endif
1888 }
void Pen(Color color)
Definition: scrollview.cpp:721
void Brush(Color color)
Definition: scrollview.cpp:727
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:601
inT16 top() const
Definition: rect.h:53
void UpdateWindow()
Definition: scrollview.cpp:705
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 988 of file tablefind.cpp.

988  {
991  // TODO(nbeato): Fully justified text as non-table?
992 }
void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1074 of file tablefind.cpp.

1074  {
1075  // Consider top-most text colpartition as header and bottom most as footer
1076  ColPartition* header = NULL;
1077  ColPartition* footer = NULL;
1078  int max_top = MIN_INT32;
1079  int min_bottom = MAX_INT32;
1081  gsearch.StartFullSearch();
1082  ColPartition* part = NULL;
1083  while ((part = gsearch.NextFullSearch()) != NULL) {
1084  if (!part->IsTextType())
1085  continue; // Consider only text partitions
1086  int top = part->bounding_box().top();
1087  int bottom = part->bounding_box().bottom();
1088  if (top > max_top) {
1089  max_top = top;
1090  header = part;
1091  }
1092  if (bottom < min_bottom) {
1093  min_bottom = bottom;
1094  footer = part;
1095  }
1096  }
1097  if (header)
1098  header->clear_table_type();
1099  if (footer)
1100  footer->clear_table_type();
1101 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
#define MAX_INT32
Definition: host.h:120
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
#define MIN_INT32
Definition: host.h:128
void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 994 of file tablefind.cpp.

994  {
995  // Detect last line of paragraph
996  // Iterate the ColPartitions in the grid.
998  gsearch.StartFullSearch();
999  ColPartition* part = NULL;
1000  while ((part = gsearch.NextFullSearch()) != NULL) {
1001  if (part->type() != PT_TABLE)
1002  continue; // Consider only table partitions
1003 
1004  // Paragraph ending should have flowing text above it.
1005  ColPartition* upper_part = part->nearest_neighbor_above();
1006  if (!upper_part)
1007  continue;
1008  if (upper_part->type() != PT_FLOWING_TEXT)
1009  continue;
1010  if (upper_part->bounding_box().width() <
1011  2 * part->bounding_box().width())
1012  continue;
1013  // Check if its the last line of a paragraph.
1014  // In most cases, a paragraph ending should be left-aligned to text line
1015  // above it. Sometimes, it could be a 2 line paragraph, in which case
1016  // the line above it is indented.
1017  // To account for that, check if the partition center is to
1018  // the left of the one above it.
1019  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1020  int upper_mid = (upper_part->bounding_box().left() +
1021  upper_part->bounding_box().right()) / 2;
1022  int current_spacing = 0; // spacing of the current line to margin
1023  int upper_spacing = 0; // spacing of the previous line to the margin
1025  // Left to right languages, use mid - left to figure out the distance
1026  // the middle is from the left margin.
1027  int left = MIN(part->bounding_box().left(),
1028  upper_part->bounding_box().left());
1029  current_spacing = mid - left;
1030  upper_spacing = upper_mid - left;
1031  } else {
1032  // Right to left languages, use right - mid to figure out the distance
1033  // the middle is from the right margin.
1034  int right = MAX(part->bounding_box().right(),
1035  upper_part->bounding_box().right());
1036  current_spacing = right - mid;
1037  upper_spacing = right - upper_mid;
1038  }
1039  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1040  continue;
1041 
1042  // Paragraphs should have similar fonts.
1043  if (!part->MatchingSizes(*upper_part) ||
1044  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1046  continue;
1047  }
1048 
1049  // The last line of a paragraph should be left aligned.
1050  // TODO(nbeato): This would be untrue if the text was right aligned.
1051  // How often is that?
1052  if (part->space_to_left() >
1053  kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
1054  continue;
1055  // The line above it should be right aligned (assuming justified format).
1056  // Since we can't assume justified text, we compare whitespace to text.
1057  // The above line should have majority spanning text (or the current
1058  // line could have fit on the previous line). So compare
1059  // whitespace to text.
1060  if (upper_part->bounding_box().width() <
1061  kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
1062  continue;
1063 
1064  // Ledding above the line should be less than ledding below
1065  if (part->space_above() >= part->space_below() ||
1066  part->space_above() > 2 * global_median_ledding_)
1067  continue;
1068 
1069  // If all checks failed, it is probably text.
1070  part->clear_table_type();
1071  }
1072 }
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:129
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
Definition: capi.h:62
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:133
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:147
#define MIN(x, y)
Definition: ndminx.h:28
const double kStrokeWidthConstantTolerance
#define MAX(x, y)
Definition: ndminx.h:24
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:139
void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 766 of file tablefind.cpp.

766  {
768  gsearch.StartFullSearch();
769  ColPartition* part = NULL;
770  while ((part = gsearch.NextFullSearch()) != NULL) {
771  // TODO(nbeato): Rename this function, meaning is different now.
772  // IT is finding nearest neighbors its own way
773  //SetVerticalSpacing(part);
774 
775  ColPartition* upper = part->SingletonPartner(true);
776  if (upper)
777  part->set_nearest_neighbor_above(upper);
778 
779  ColPartition* lower = part->SingletonPartner(false);
780  if (lower)
781  part->set_nearest_neighbor_below(lower);
782  }
783 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1767 of file tablefind.cpp.

1767  {
1768  // Find peak value of the histogram
1769  int peak_value = 0;
1770  for (int i = 0; i < length; i++) {
1771  if (xprojection[i] > peak_value) {
1772  peak_value = xprojection[i];
1773  }
1774  }
1775  // Peak value represents the maximum number of horizontally
1776  // overlapping colpartitions, so this can be considered as the
1777  // number of rows in the table
1778  if (peak_value < kMinRowsInTable)
1779  return false;
1780  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1781  if (peak_value >= kLargeTableRowCount)
1782  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1783  // Threshold the histogram
1784  for (int i = 0; i < length; i++) {
1785  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1786  }
1787  // Find the largest run of zeros between two ones
1788  int largest_gap = 0;
1789  int run_start = -1;
1790  for (int i = 1; i < length; i++) {
1791  // detect start of a run of zeros
1792  if (xprojection[i - 1] && !xprojection[i]) {
1793  run_start = i;
1794  }
1795  // detect end of a run of zeros and update the value of largest gap
1796  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1797  int gap = i - run_start;
1798  if (gap > largest_gap)
1799  largest_gap = gap;
1800  run_start = -1;
1801  }
1802  }
1803  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1804 }
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:109
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:108
const int kMinRowsInTable
Definition: tablefind.cpp:114
const int kLargeTableRowCount
Definition: tablefind.cpp:111
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:143
void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 523 of file tablefind.cpp.

524  {
525  for (int i = 0; i < gridheight(); ++i) {
526  ColPartitionSet* columns = all_columns[i];
527  if (columns != NULL) {
528  ColSegment_LIST new_blocks;
529  // Get boxes from the current vertical position on the grid
530  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
531  // Merge the new_blocks boxes into column_blocks if they are well-aligned
532  GroupColumnBlocks(&new_blocks, column_blocks);
533  }
534  }
535 }
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:538
#define NULL
Definition: host.h:144
int gridsize() const
Definition: tablefind.cpp:378
int gridheight() const
Definition: tablefind.cpp:384
void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1273 of file tablefind.cpp.

1273  {
1274  ColSegment_IT it(table_columns);
1275  // Iterate the ColPartitions in the grid.
1276  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1277  gsearch(&clean_part_grid_);
1278  gsearch.StartFullSearch();
1279  ColPartition* part;
1280  while ((part = gsearch.NextFullSearch()) != NULL) {
1281  if (part->inside_table_column() || part->type() != PT_TABLE)
1282  continue; // prevent a partition to be assigned to multiple columns
1283  const TBOX& box = part->bounding_box();
1284  ColSegment* col = new ColSegment();
1285  col->InsertBox(box);
1286  part->set_inside_table_column(true);
1287  // Start a search below the current cell to find bottom neighbours
1288  // Note: a full search will always process things above it first, so
1289  // this should be starting at the highest cell and working its way down.
1290  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1291  vsearch(&clean_part_grid_);
1292  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1293  ColPartition* neighbor = NULL;
1294  bool found_neighbours = false;
1295  while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
1296  // only consider neighbors not assigned to any column yet
1297  if (neighbor->inside_table_column())
1298  continue;
1299  // Horizontal lines should not break the flow
1300  if (neighbor->IsHorizontalLine())
1301  continue;
1302  // presence of a non-table neighbor marks the end of current
1303  // table column
1304  if (neighbor->type() != PT_TABLE)
1305  break;
1306  // add the neighbor partition to the table column
1307  const TBOX& neighbor_box = neighbor->bounding_box();
1308  col->InsertBox(neighbor_box);
1309  neighbor->set_inside_table_column(true);
1310  found_neighbours = true;
1311  }
1312  if (found_neighbours) {
1313  it.add_after_then_move(col);
1314  } else {
1315  part->set_inside_table_column(false);
1316  delete col;
1317  }
1318  }
1319 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
Definition: capi.h:62
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1323 of file tablefind.cpp.

1324  {
1325  ColSegment_IT cit(table_columns);
1326  ColSegment_IT rit(table_regions);
1327  // Iterate through column blocks
1328  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1329  gsearch(&col_seg_grid_);
1330  gsearch.StartFullSearch();
1331  ColSegment* part;
1332  int page_height = tright().y() - bleft().y();
1333  ASSERT_HOST(page_height > 0);
1334  // create a bool array to hold projection on y-axis
1335  bool* table_region = new bool[page_height];
1336  while ((part = gsearch.NextFullSearch()) != NULL) {
1337  TBOX part_box = part->bounding_box();
1338  // reset the projection array
1339  for (int i = 0; i < page_height; i++) {
1340  table_region[i] = false;
1341  }
1342  // iterate through all table columns to find regions in the current
1343  // page column block
1344  cit.move_to_first();
1345  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346  TBOX col_box = cit.data()->bounding_box();
1347  // find intersection region of table column and page column
1348  TBOX intersection_box = col_box.intersection(part_box);
1349  // project table column on the y-axis
1350  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1351  table_region[i - bleft().y()] = true;
1352  }
1353  }
1354  // set x-limits of table regions to page column width
1355  TBOX current_table_box;
1356  current_table_box.set_left(part_box.left());
1357  current_table_box.set_right(part_box.right());
1358  // go through the y-axis projection to find runs of table
1359  // regions. Each run makes one table region.
1360  for (int i = 1; i < page_height; i++) {
1361  // detect start of a table region
1362  if (!table_region[i - 1] && table_region[i]) {
1363  current_table_box.set_bottom(i + bleft().y());
1364  }
1365  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1366  // detect end of a table region
1367  if (table_region[i - 1] && !table_region[i]) {
1368  current_table_box.set_top(i + bleft().y());
1369  if (!current_table_box.null_box()) {
1370  ColSegment* seg = new ColSegment();
1371  seg->InsertBox(current_table_box);
1372  rit.add_after_then_move(seg);
1373  }
1374  }
1375  }
1376  }
1377  delete[] table_region;
1378 }
void set_right(int x)
Definition: rect.h:77
bool null_box() const
Definition: rect.h:45
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
const ICOORD & tright() const
Definition: tablefind.cpp:390
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
inT16 y() const
access_function
Definition: points.h:56
const ICOORD & bleft() const
Definition: tablefind.cpp:387
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
inT16 top() const
Definition: rect.h:53
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:88
void set_bottom(int y)
Definition: rect.h:63
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_top(int y)
Definition: rect.h:56
void set_left(int x)
Definition: rect.h:70
inT16 bottom() const
Definition: rect.h:60
int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 384 of file tablefind.cpp.

384  {
385  return clean_part_grid_.gridheight();
386 }
int gridheight() const
Definition: bbgrid.h:74
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1195 of file tablefind.cpp.

1195  {
1196  int margin = gridsize();
1197 
1198  // Iterate the Column Blocks in the grid.
1199  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1200  gsearch(&col_seg_grid_);
1201  gsearch.StartFullSearch();
1202  ColSegment* seg;
1203  while ((seg = gsearch.NextFullSearch()) != NULL) {
1204  if (seg->type() != COL_TEXT)
1205  continue; // only consider text blocks for split detection
1206  bool neighbor_found = false;
1207  bool modified = false; // Modified at least once
1208  // keep expanding current box as long as neighboring table columns
1209  // are found above or below it.
1210  do {
1211  TBOX box = seg->bounding_box();
1212  // slightly expand the search region vertically
1213  int top_range = MIN(box.top() + margin, tright().y());
1214  int bottom_range = MAX(box.bottom() - margin, bleft().y());
1215  box.set_top(top_range);
1216  box.set_bottom(bottom_range);
1217  neighbor_found = false;
1218  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1219  rectsearch(&col_seg_grid_);
1220  rectsearch.StartRectSearch(box);
1221  ColSegment* neighbor = NULL;
1222  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1223  if (neighbor == seg)
1224  continue;
1225  const TBOX& neighbor_box = neighbor->bounding_box();
1226  // If the neighbor box significantly overlaps with the current
1227  // box (due to the expansion of the current box in the
1228  // previous iteration of this loop), remove the neighbor box
1229  // and expand the current box to include it.
1230  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1231  seg->InsertBox(neighbor_box);
1232  modified = true;
1233  rectsearch.RemoveBBox();
1234  gsearch.RepositionIterator();
1235  delete neighbor;
1236  continue;
1237  }
1238  // Only expand if the neighbor box is of table type
1239  if (neighbor->type() != COL_TABLE)
1240  continue;
1241  // Insert the neighbor box into the current column block
1242  if (neighbor_box.major_x_overlap(box) &&
1243  !box.contains(neighbor_box)) {
1244  seg->InsertBox(neighbor_box);
1245  neighbor_found = true;
1246  modified = true;
1247  rectsearch.RemoveBBox();
1248  gsearch.RepositionIterator();
1249  delete neighbor;
1250  }
1251  }
1252  } while (neighbor_found);
1253  if (modified) {
1254  // Because the box has changed, it has to be removed first.
1255  gsearch.RemoveBBox();
1256  col_seg_grid_.InsertBBox(true, true, seg);
1257  gsearch.RepositionIterator();
1258  }
1259  }
1260 }
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
const ICOORD & tright() const
Definition: tablefind.cpp:390
int gridsize() const
Definition: tablefind.cpp:378
Definition: rect.h:29
inT16 y() const
access_function
Definition: points.h:56
const ICOORD & bleft() const
Definition: tablefind.cpp:387
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
inT16 top() const
Definition: rect.h:53
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
void set_bottom(int y)
Definition: rect.h:63
void set_top(int y)
Definition: rect.h:56
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1386 of file tablefind.cpp.

1386  {
1387  // Iterate the table regions in the grid.
1388  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1389  gsearch(&table_grid_);
1390  gsearch.StartFullSearch();
1391  ColSegment* seg = NULL;
1392  while ((seg = gsearch.NextFullSearch()) != NULL) {
1393  bool neighbor_found = false;
1394  bool modified = false; // Modified at least once
1395  do {
1396  // Start a rectangle search x-bounded by the image and y by the table
1397  const TBOX& box = seg->bounding_box();
1398  TBOX search_region(box);
1399  search_region.set_left(bleft().x());
1400  search_region.set_right(tright().x());
1401  neighbor_found = false;
1402  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
1403  rectsearch(&table_grid_);
1404  rectsearch.StartRectSearch(search_region);
1405  ColSegment* neighbor = NULL;
1406  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1407  if (neighbor == seg)
1408  continue;
1409  const TBOX& neighbor_box = neighbor->bounding_box();
1410  // Check if a neighbor box has a large overlap with the table
1411  // region. This may happen as a result of merging two table
1412  // regions in the previous iteration.
1413  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1414  seg->InsertBox(neighbor_box);
1415  rectsearch.RemoveBBox();
1416  gsearch.RepositionIterator();
1417  delete neighbor;
1418  modified = true;
1419  continue;
1420  }
1421  // Check if two table regions belong together based on a common
1422  // horizontal ruling line
1423  if (BelongToOneTable(box, neighbor_box)) {
1424  seg->InsertBox(neighbor_box);
1425  neighbor_found = true;
1426  modified = true;
1427  rectsearch.RemoveBBox();
1428  gsearch.RepositionIterator();
1429  delete neighbor;
1430  }
1431  }
1432  } while (neighbor_found);
1433  if (modified) {
1434  // Because the box has changed, it has to be removed first.
1435  gsearch.RemoveBBox();
1436  table_grid_.InsertBBox(true, true, seg);
1437  gsearch.RepositionIterator();
1438  }
1439  }
1440 }
ColSegmentGrid table_grid_
Definition: tablefind.h:428
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1444
#define NULL
Definition: host.h:144
const ICOORD & tright() const
Definition: tablefind.cpp:390
Definition: rect.h:29
const ICOORD & bleft() const
Definition: tablefind.cpp:387
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 378 of file tablefind.cpp.

378  {
379  return clean_part_grid_.gridsize();
380 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridsize() const
Definition: bbgrid.h:68
int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 381 of file tablefind.cpp.

381  {
382  return clean_part_grid_.gridwidth();
383 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
int gridwidth() const
Definition: bbgrid.h:71
void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 538 of file tablefind.cpp.

539  {
540  ColSegment_IT src_it(new_blocks);
541  ColSegment_IT dest_it(column_blocks);
542  // iterate through the source list
543  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
544  ColSegment* src_seg = src_it.data();
545  TBOX src_box = src_seg->bounding_box();
546  bool match_found = false;
547  // iterate through the destination list to find a matching column block
548  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
549  ColSegment* dest_seg = dest_it.data();
550  TBOX dest_box = dest_seg->bounding_box();
551  if (ConsecutiveBoxes(src_box, dest_box)) {
552  // If matching block is found, insert the current block into it
553  // and delete the soure block
554  dest_seg->InsertBox(src_box);
555  match_found = true;
556  delete src_it.extract();
557  break;
558  }
559  }
560  // If no match is found, just append the source block to column_blocks
561  if (!match_found) {
562  dest_it.add_after_then_move(src_it.extract());
563  }
564  }
565 }
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:568
Definition: rect.h:29
void tesseract::TableFinder::GrowTableBox ( const TBOX table_box,
TBOX result_box 
)
protected

Definition at line 1519 of file tablefind.cpp.

1519  {
1520  // TODO(nbeato): The growing code is a bit excessive right now.
1521  // By removing these lines, the partitions considered need
1522  // to have some overlap or be special cases. These lines could
1523  // be added again once a check is put in place to make sure that
1524  // growing tables don't stomp on a lot of non-table partitions.
1525 
1526  // search for horizontal ruling lines within the vertical margin
1527  // int vertical_margin = kRulingVerticalMargin * gridsize();
1528  TBOX search_box = table_box;
1529  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1530  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1531  // search_box.set_top(top);
1532  // search_box.set_bottom(bottom);
1533 
1534  GrowTableToIncludePartials(table_box, search_box, result_box);
1535  GrowTableToIncludeLines(table_box, search_box, result_box);
1536  IncludeLeftOutColumnHeaders(result_box);
1537 }
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1569
Definition: rect.h:29
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1541
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1663
void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX table_box,
const TBOX search_range,
TBOX result_box 
)
protected

Definition at line 1569 of file tablefind.cpp.

1571  {
1573  rsearch.SetUniqueMode(true);
1574  rsearch.StartRectSearch(search_range);
1575  ColPartition* part = NULL;
1576  while ((part = rsearch.NextRectSearch()) != NULL) {
1577  // TODO(nbeato) This should also do vertical, but column
1578  // boundaries are breaking things. This function needs to be
1579  // updated to allow vertical lines as well.
1580  if (!part->IsLineType())
1581  continue;
1582  // Avoid the following function call if the result of the
1583  // function is irrelevant.
1584  const TBOX& part_box = part->bounding_box();
1585  if (result_box->contains(part_box))
1586  continue;
1587  // Include a partially overlapping horizontal line only if the
1588  // extra ColPartitions that will be included due to expansion
1589  // have large side spacing w.r.t. columns containing them.
1590  if (HLineBelongsToTable(*part, table_box))
1591  *result_box = result_box->bounding_union(part_box);
1592  // TODO(nbeato): Vertical
1593  }
1594 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:130
bool contains(const FCOORD pt) const
Definition: rect.h:323
#define NULL
Definition: host.h:144
Definition: rect.h:29
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1599
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX table_box,
const TBOX search_range,
TBOX result_box 
)
protected

Definition at line 1541 of file tablefind.cpp.

1543  {
1544  // Rulings are in a different grid, so search 2 grids for rulings, text,
1545  // and table partitions that are not entirely within the new box.
1546  for (int i = 0; i < 2; ++i) {
1547  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1549  ColPartitionGridSearch rectsearch(grid);
1550  rectsearch.StartRectSearch(search_range);
1551  ColPartition* part = NULL;
1552  while ((part = rectsearch.NextRectSearch()) != NULL) {
1553  // Only include text and table types.
1554  if (part->IsImageType())
1555  continue;
1556  const TBOX& part_box = part->bounding_box();
1557  // Include partition in the table if more than half of it
1558  // is covered by the table
1559  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1560  *result_box = result_box->bounding_union(part_box);
1561  continue;
1562  }
1563  }
1564  }
1565 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:130
#define NULL
Definition: host.h:144
Definition: rect.h:29
const double kMinOverlapWithTable
Definition: tablefind.cpp:99
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition part)
protected

Definition at line 946 of file tablefind.cpp.

946  {
947  if (part.flow() == BTFT_LEADER)
948  return true;
949  // Search range is left and right bounded by an offset of the
950  // median xheight. This offset is to allow some tolerance to the
951  // the leaders on the page in the event that the alignment is still
952  // a bit off.
953  const TBOX& box = part.bounding_box();
954  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
955  const int top = box.top() + search_size;
956  const int bottom = box.bottom() - search_size;
958  for (int direction = 0; direction < 2; ++direction) {
959  bool right_to_left = (direction == 0);
960  int x = right_to_left ? box.right() : box.left();
961  hsearch.StartSideSearch(x, bottom, top);
962  ColPartition* leader = NULL;
963  while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
964  // This should not happen, they are in different grids.
965  ASSERT_HOST(&part != leader);
966  // The leader could be a horizontal ruling in the grid.
967  // Make sure it is actually a leader.
968  if (leader->flow() != BTFT_LEADER)
969  continue;
970  // Make sure the leader shares a page column with the partition,
971  // otherwise we are spreading across columns.
972  if (!part.IsInSameColumnAs(*leader))
973  break;
974  // There should be a significant vertical overlap
975  if (!leader->VSignificantCoreOverlap(part))
976  continue;
977  // Leader passed all tests, so it is adjacent.
978  return true;
979  }
980  }
981  // No leaders are adjacent to the given partition.
982  return false;
983 }
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:124
inT16 right() const
Definition: rect.h:74
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
inT16 top() const
Definition: rect.h:53
#define ASSERT_HOST(x)
Definition: errcode.h:84
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
inT16 bottom() const
Definition: rect.h:60
bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition part) const
protected

Definition at line 857 of file tablefind.cpp.

857  {
858  // Should only get text partitions.
859  ASSERT_HOST(part->IsTextType());
860  // Blob access
861  BLOBNBOX_CLIST* part_boxes = part->boxes();
862  BLOBNBOX_C_IT it(part_boxes);
863  // Check if this is a relatively small partition (such as a single word)
864  if (part->bounding_box().width() <
865  kMinBoxesInTextPartition * part->median_size() &&
866  part_boxes->length() < kMinBoxesInTextPartition)
867  return true;
868 
869  // Variables used to compute inter-blob spacing.
870  int current_x0 = -1;
871  int current_x1 = -1;
872  int previous_x1 = -1;
873  // Stores the maximum gap detected.
874  int largest_partition_gap_found = -1;
875  // Text partition gap limits. If this is text (and not a table),
876  // there should be at least one gap larger than min_gap and no gap
877  // larger than max_gap.
878  const double max_gap = kMaxGapInTextPartition * part->median_size();
879  const double min_gap = kMinMaxGapInTextPartition * part->median_size();
880 
881  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
882  BLOBNBOX* blob = it.data();
883  current_x0 = blob->bounding_box().left();
884  current_x1 = blob->bounding_box().right();
885  if (previous_x1 != -1) {
886  int gap = current_x0 - previous_x1;
887 
888  // TODO(nbeato): Boxes may overlap? Huh?
889  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
890  // on the top right of the page are filtered out with this line.
891  // Note 2: Iterating over blobs in a partition, so we are looking for
892  // spacing between the words.
893  if (gap < 0) {
894  // More likely case, the blobs slightly overlap. This can happen
895  // with diacritics (accents) or broken alphabet symbols (characters).
896  // Merge boxes together by taking max of right sides.
897  if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
898  previous_x1 = MAX(previous_x1, current_x1);
899  continue;
900  }
901  // Extreme case, blobs overlap significantly in the same partition...
902  // This should not happen often (if at all), but it does.
903  // TODO(nbeato): investigate cases when this happens.
904  else {
905  // The behavior before was to completely ignore this case.
906  }
907  }
908 
909  // If a large enough gap is found, mark it as a table cell (return true)
910  if (gap > max_gap)
911  return true;
912  if (gap > largest_partition_gap_found)
913  largest_partition_gap_found = gap;
914  }
915  previous_x1 = current_x1;
916  }
917  // Since no large gap was found, return false if the partition is too
918  // long to be a data cell
919  if (part->bounding_box().width() >
920  kMaxBoxesInDataPartition * part->median_size() ||
921  part_boxes->length() > kMaxBoxesInDataPartition)
922  return false;
923 
924  // A partition may be a single blob. In this case, it's an isolated symbol
925  // or non-text (such as a ruling or image).
926  // Detect these as table partitions? Shouldn't this be case by case?
927  // The behavior before was to ignore this, making max_partition_gap < 0
928  // and implicitly return true. Just making it explicit.
929  if (largest_partition_gap_found == -1)
930  return true;
931 
932  // return true if the maximum gap found is smaller than the minimum allowed
933  // max_gap in a text partition. This indicates that there is no signficant
934  // space in the partition, hence it is likely a single word.
935  return largest_partition_gap_found < min_gap;
936 }
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:75
const TBOX & bounding_box() const
Definition: blobbox.h:208
inT16 left() const
Definition: rect.h:67
inT16 right() const
Definition: rect.h:74
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:68
const double kMaxGapInTextPartition
Definition: tablefind.cpp:71
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:65
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:79
#define MAX(x, y)
Definition: ndminx.h:24
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition part,
const TBOX table_box 
)
protected

Definition at line 1599 of file tablefind.cpp.

1600  {
1601  if (!part.IsHorizontalLine())
1602  return false;
1603  const TBOX& part_box = part.bounding_box();
1604  if (!part_box.major_x_overlap(table_box))
1605  return false;
1606  // Do not consider top-most horizontal line since it usually
1607  // originates from noise.
1608  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1609  // have neighbors solved.
1610  // if (!part.nearest_neighbor_above())
1611  // return false;
1612  const TBOX bbox = part_box.bounding_union(table_box);
1613  // In the "unioned table" box (the table extents expanded by the line),
1614  // keep track of how many partitions have significant padding to the left
1615  // and right. If more than half of the partitions covered by the new table
1616  // have significant spacing, the line belongs to the table and the table
1617  // grows to include all of the partitions.
1618  int num_extra_partitions = 0;
1619  int extra_space_to_right = 0;
1620  int extra_space_to_left = 0;
1621  // Rulings are in a different grid, so search 2 grids for rulings, text,
1622  // and table partitions that are introduced by the new box.
1623  for (int i = 0; i < 2; ++i) {
1624  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1626  // Start a rect search on bbox
1627  ColPartitionGridSearch rectsearch(grid);
1628  rectsearch.SetUniqueMode(true);
1629  rectsearch.StartRectSearch(bbox);
1630  ColPartition* extra_part = NULL;
1631  while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
1632  // ColPartition already in table
1633  const TBOX& extra_part_box = extra_part->bounding_box();
1634  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1635  continue;
1636  // Non-text ColPartitions do not contribute
1637  if (extra_part->IsImageType())
1638  continue;
1639  // Consider this partition.
1640  num_extra_partitions++;
1641  // presence of a table cell is a strong hint, so just increment the scores
1642  // without looking at the spacing.
1643  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1644  extra_space_to_right++;
1645  extra_space_to_left++;
1646  continue;
1647  }
1648  int space_threshold = kSideSpaceMargin * part.median_size();
1649  if (extra_part->space_to_right() > space_threshold)
1650  extra_space_to_right++;
1651  if (extra_part->space_to_left() > space_threshold)
1652  extra_space_to_left++;
1653  }
1654  }
1655  // tprintf("%d %d %d\n",
1656  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1657  return (extra_space_to_right > num_extra_partitions / 2) ||
1658  (extra_space_to_left > num_extra_partitions / 2);
1659 }
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:130
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
Definition: rect.h:29
Definition: capi.h:62
const double kMinOverlapWithTable
Definition: tablefind.cpp:99
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
const int kSideSpaceMargin
Definition: tablefind.cpp:104
void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX table_box)
protected

Definition at line 1663 of file tablefind.cpp.

1663  {
1664  // Start a search above the current table to look for column headers
1666  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1667  table_box->top());
1668  ColPartition* neighbor = NULL;
1669  ColPartition* previous_neighbor = NULL;
1670  while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1671  // Max distance to find a table heading.
1672  const int max_distance = kMaxColumnHeaderDistance *
1673  neighbor->median_size();
1674  int table_top = table_box->top();
1675  const TBOX& box = neighbor->bounding_box();
1676  // Do not continue if the next box is way above
1677  if (box.bottom() - table_top > max_distance)
1678  break;
1679  // Unconditionally include partitions of type TABLE or LINE
1680  // TODO(faisal): add some reasonable conditions here
1681  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1682  table_box->set_top(box.top());
1683  previous_neighbor = NULL;
1684  continue;
1685  }
1686  // If there are two text partitions, one above the other, without a table
1687  // cell on their left or right side, consider them a barrier and quit
1688  if (previous_neighbor == NULL) {
1689  previous_neighbor = neighbor;
1690  } else {
1691  const TBOX& previous_box = previous_neighbor->bounding_box();
1692  if (!box.major_y_overlap(previous_box))
1693  break;
1694  }
1695  }
1696 }
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:87
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
Definition: capi.h:62
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
inT16 top() const
Definition: rect.h:53
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
void set_top(int y)
Definition: rect.h:56
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD bottom_left,
const ICOORD top_right 
)

Definition at line 190 of file tablefind.cpp.

191  {
192  // Initialize clean partitions list and grid
193  clean_part_grid_.Init(grid_size, bottom_left, top_right);
194  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
195  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
196  col_seg_grid_.Init(grid_size, bottom_left, top_right);
197  table_grid_.Init(grid_size, bottom_left, top_right);
198 }
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:444
void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 579 of file tablefind.cpp.

579  {
580  FindNeighbors();
581  SetPartitionSpacings(&clean_part_grid_, all_columns);
583 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:586
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:709
void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid grid,
TO_BLOCK block 
)

Definition at line 202 of file tablefind.cpp.

203  {
204  // Calculate stats. This lets us filter partitions in AllowTextPartition()
205  // and filter blobs in AllowBlob().
206  SetGlobalSpacings(grid);
207 
208  // Iterate the ColPartitions in the grid.
209  ColPartitionGridSearch gsearch(grid);
210  gsearch.SetUniqueMode(true);
211  gsearch.StartFullSearch();
212  ColPartition* part = NULL;
213  while ((part = gsearch.NextFullSearch()) != NULL) {
214  // Reject partitions with nothing useful inside of them.
215  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
216  continue;
217  ColPartition* clean_part = part->ShallowCopy();
218  ColPartition* leader_part = NULL;
219  if (part->IsLineType()) {
220  InsertRulingPartition(clean_part);
221  continue;
222  }
223  // Insert all non-text partitions to clean_parts
224  if (!part->IsTextType()) {
225  InsertImagePartition(clean_part);
226  continue;
227  }
228  // Insert text colpartitions after removing noisy components from them
229  // The leaders are split into a separate grid.
230  BLOBNBOX_CLIST* part_boxes = part->boxes();
231  BLOBNBOX_C_IT pit(part_boxes);
232  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
233  BLOBNBOX *pblob = pit.data();
234  // Bad blobs... happens in UNLV set.
235  // news.3G1, page 17 (around x=6)
236  if (!AllowBlob(*pblob))
237  continue;
238  if (pblob->flow() == BTFT_LEADER) {
239  if (leader_part == NULL) {
240  leader_part = part->ShallowCopy();
241  leader_part->set_flow(BTFT_LEADER);
242  }
243  leader_part->AddBox(pblob);
244  } else if (pblob->region_type() != BRT_NOISE) {
245  clean_part->AddBox(pblob);
246  }
247  }
248  clean_part->ComputeLimits();
249  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
250  InsertTextPartition(clean_part);
252  if (leader_part != NULL) {
253  // TODO(nbeato): Note that ComputeLimits does not update the column
254  // information. So the leader may appear to span more columns than it
255  // really does later on when IsInSameColumnAs gets called to test
256  // for adjacent leaders.
257  leader_part->ComputeLimits();
258  InsertLeaderPartition(leader_part);
259  }
260  }
261 
262  // Make the partition partners better for upper and lower neighbors.
265 }
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:421
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:418
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:394
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
BlobRegionType region_type() const
Definition: blobbox.h:260
#define NULL
Definition: host.h:144
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:410
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:709
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:502
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:275
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:436
BlobTextFlowType flow() const
Definition: blobbox.h:272
void RefinePartitionPartners(bool get_desperate)
void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 402 of file tablefind.cpp.

402  {
403  ASSERT_HOST(part != NULL);
404  if (AllowTextPartition(*part)) {
405  fragmented_text_grid_.InsertBBox(true, true, part);
406  } else {
407  delete part;
408  }
409 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:489
#define NULL
Definition: host.h:144
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::TableFinder::InsertImagePartition ( ColPartition part)
protected

Definition at line 421 of file tablefind.cpp.

421  {
422  // NOTE: If images are placed into a different grid in the future,
423  // the function SetPartitionSpacings needs to be updated. It should
424  // be the only thing that cares about image partitions.
425  clean_part_grid_.InsertBBox(true, true, part);
426 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
void tesseract::TableFinder::InsertLeaderPartition ( ColPartition part)
protected

Definition at line 410 of file tablefind.cpp.

410  {
411  ASSERT_HOST(part != NULL);
412  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
413  leader_and_ruling_grid_.InsertBBox(true, true, part);
414  } else {
415  delete part;
416  }
417 }
#define NULL
Definition: host.h:144
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::TableFinder::InsertRulingPartition ( ColPartition part)
protected

Definition at line 418 of file tablefind.cpp.

418  {
419  leader_and_ruling_grid_.InsertBBox(true, true, part);
420 }
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
void tesseract::TableFinder::InsertTextPartition ( ColPartition part)
protected

Definition at line 394 of file tablefind.cpp.

394  {
395  ASSERT_HOST(part != NULL);
396  if (AllowTextPartition(*part)) {
397  clean_part_grid_.InsertBBox(true, true, part);
398  } else {
399  delete part;
400  }
401 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:489
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
#define ASSERT_HOST(x)
Definition: errcode.h:84
void tesseract::TableFinder::LocateTables ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb,
const FCOORD reskew 
)

Definition at line 268 of file tablefind.cpp.

271  {
272  // initialize spacing, neighbors, and columns
273  InitializePartitions(all_columns);
274  if (textord_show_tables) {
275  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
281 
282  table_win = MakeWindow(100, 300, "Fragmented Text");
284  }
285  // mark, filter, and smooth candidate table partitions
287 
288  // Make single-column blocks from good_columns_ partitions. col_segments are
289  // moved to a grid later which takes the ownership
290  ColSegment_LIST column_blocks;
291  GetColumnBlocks(all_columns, &column_blocks);
292  // Set the ratio of candidate table partitions in each column
293  SetColumnsType(&column_blocks);
294 
295  // Move column segments to col_seg_grid_
296  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
297 
298  // Detect split in column layout that might have occurred due to the
299  // presence of a table. In such a case, merge the corresponding columns.
301 
302  // Group horizontally overlapping table partitions into table columns.
303  // table_columns created here get deleted at the end of this method.
304  ColSegment_LIST table_columns;
305  GetTableColumns(&table_columns);
306 
307  // Within each column, mark the range table regions occupy based on the
308  // table columns detected. table_regions are moved to a grid later which
309  // takes the ownership
310  ColSegment_LIST table_regions;
311  GetTableRegions(&table_columns, &table_regions);
312 
314  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
315  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
316  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
317  }
318 
319  // Merge table regions across columns for tables spanning multiple
320  // columns
321  MoveColSegmentsToGrid(&table_regions, &table_grid_);
323 
324  // Adjust table boundaries by including nearby horizontal lines and left
325  // out column headers
328 
330  // Remove false alarms consiting of a single column
332 
333  if (textord_show_tables) {
334  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
336  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
337  table_grid_.DisplayBoxes(table_win);
338  }
339 
340  // Find table grid structure and reject tables that are malformed.
341  RecognizeTables();
343  RecognizeTables();
344 
345  if (textord_show_tables) {
346  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
349  table_grid_.DisplayBoxes(table_win);
350  }
351  } else {
352  // Remove false alarms consiting of a single column
353  // TODO(nbeato): verify this is a NOP after structured table rejection.
354  // Right now it isn't. If the recognize function is doing what it is
355  // supposed to do, this function is obsolete.
357 
358  if (textord_show_tables) {
359  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
362  table_grid_.DisplayBoxes(table_win);
363  }
364  }
365 
367  WriteToPix(reskew);
368 
369  // Merge all colpartitions in table regions to make them a single
370  // colpartition and revert types of isolated table cells not
371  // assigned to any table to their original types.
372  MakeTableBlocks(grid, all_columns, width_cb);
373 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:579
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1273
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1176
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1323
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:613
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:1949
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1915
bool textord_tablefind_recognize_tables
Definition: tablefind.cpp:157
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1870
bool textord_dump_table_images
Definition: tablefind.cpp:150
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:523
bool textord_tablefind_show_mark
Definition: tablefind.cpp:153
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1143
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:426
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void WriteToPix(const FCOORD &reskew)
Definition: tablefind.cpp:1997
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
Definition: tablefind.cpp:2070
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
bool textord_show_tables
Definition: tablefind.cpp:151
void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback width_cb 
)
protected

Definition at line 2070 of file tablefind.cpp.

2072  {
2073  // Since we have table blocks already, remove table tags from all
2074  // colpartitions
2075  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2076  gsearch(grid);
2077  gsearch.StartFullSearch();
2078  ColPartition* part = NULL;
2079 
2080  while ((part = gsearch.NextFullSearch()) != NULL) {
2081  if (part->type() == PT_TABLE) {
2082  part->clear_table_type();
2083  }
2084  }
2085  // Now make a single colpartition out of each table block and remove
2086  // all colpartitions contained within a table
2087  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2088  table_search(&table_grid_);
2089  table_search.StartFullSearch();
2090  ColSegment* table;
2091  while ((table = table_search.NextFullSearch()) != NULL) {
2092  TBOX table_box = table->bounding_box();
2093  // Start a rect search on table_box
2094  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2095  rectsearch(grid);
2096  rectsearch.StartRectSearch(table_box);
2097  ColPartition* part;
2098  ColPartition* table_partition = NULL;
2099  while ((part = rectsearch.NextRectSearch()) != NULL) {
2100  // Do not consider image partitions
2101  if (!part->IsTextType())
2102  continue;
2103  TBOX part_box = part->bounding_box();
2104  // Include partition in the table if more than half of it
2105  // is covered by the table
2106  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2107  rectsearch.RemoveBBox();
2108  if (table_partition) {
2109  table_partition->Absorb(part, width_cb);
2110  } else {
2111  table_partition = part;
2112  }
2113  }
2114  }
2115  // Insert table colpartition back to part_grid_
2116  if (table_partition) {
2117  // To match the columns used when transforming to blocks, the new table
2118  // partition must have its first and last column set at the grid y that
2119  // corresponds to its bottom.
2120  const TBOX& table_box = table_partition->bounding_box();
2121  int grid_x, grid_y;
2122  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2123  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2124  table_partition->set_table_type();
2125  table_partition->set_blob_type(BRT_TEXT);
2126  table_partition->set_flow(BTFT_CHAIN);
2127  table_partition->SetBlobTypes();
2128  grid->InsertBBox(true, true, table_partition);
2129  }
2130  }
2131 }
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
Definition: capi.h:62
const double kMinOverlapWithTable
Definition: tablefind.cpp:99
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
inT16 bottom() const
Definition: rect.h:60
ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 518 of file tablefind.cpp.

518  {
519  return clean_part_grid_.MakeWindow(x, y, window_name);
520 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:589
void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 827 of file tablefind.cpp.

827  {
828  // Iterate the ColPartitions in the grid.
829  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
830  gsearch(&clean_part_grid_);
831  gsearch.StartFullSearch();
832  ColPartition* part = NULL;
833  while ((part = gsearch.NextFullSearch()) != NULL) {
834  if (!part->IsTextType()) // Only consider text partitions
835  continue;
836  // Only consider partitions in dominant font size or smaller
837  if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
838  continue;
839  // Mark partitions with a large gap, or no significant gap as
840  // table partitions.
841  // Comments: It produces several false alarms at:
842  // - last line of a paragraph (fixed)
843  // - single word section headings
844  // - page headers and footers
845  // - numbered equations
846  // - line drawing regions
847  // TODO(faisal): detect and fix above-mentioned cases
848  if (HasWideOrNoInterWordGap(part) ||
849  HasLeaderAdjacent(*part)) {
850  part->set_table_type();
851  }
852  }
853 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:946
#define NULL
Definition: host.h:144
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:857
const double kMaxTableCellXheight
Definition: tablefind.cpp:83
void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 789 of file tablefind.cpp.

789  {
792  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
796  }
799  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
803  }
806  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
810  }
813  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
817  }
818 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:827
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1915
bool textord_tablefind_show_mark
Definition: tablefind.cpp:153
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
bool textord_show_tables
Definition: tablefind.cpp:151
void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid col_seg_grid 
)
protected

Definition at line 1176 of file tablefind.cpp.

1177  {
1178  ColSegment_IT it(segments);
1179  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1180  ColSegment* seg = it.extract();
1181  col_seg_grid->InsertBBox(true, true, seg);
1182  }
1183 }
void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1816 of file tablefind.cpp.

1816  {
1817  ScrollView* table_win = NULL;
1818  if (textord_show_tables) {
1819  table_win = MakeWindow(0, 0, "Table Structure");
1822  // table_grid_.DisplayBoxes(table_win);
1823  }
1824 
1825 
1826  TableRecognizer recognizer;
1827  recognizer.Init();
1828  recognizer.set_line_grid(&leader_and_ruling_grid_);
1829  recognizer.set_text_grid(&fragmented_text_grid_);
1830  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1831  recognizer.set_min_height(1.5 * gridheight());
1832  // Loop over all of the tables and try to fit them.
1833  // Store the good tables here.
1834  ColSegment_CLIST good_tables;
1835  ColSegment_C_IT good_it(&good_tables);
1836 
1838  gsearch.StartFullSearch();
1839  ColSegment* found_table = NULL;
1840  while ((found_table = gsearch.NextFullSearch()) != NULL) {
1841  gsearch.RemoveBBox();
1842 
1843  // The goal is to make the tables persistent in a list.
1844  // When that happens, this will move into the search loop.
1845  const TBOX& found_box = found_table->bounding_box();
1846  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1847 
1848  // Process a table. Good tables are inserted into the grid again later on
1849  // We can't change boxes in the grid while it is running a search.
1850  if (table_structure != NULL) {
1851  if (textord_show_tables) {
1852  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1853  }
1854  found_table->set_bounding_box(table_structure->bounding_box());
1855  delete table_structure;
1856  good_it.add_after_then_move(found_table);
1857  } else {
1858  delete found_table;
1859  }
1860  }
1861  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1862 
1863  // At this point, the grid is empty. We can safely insert the good tables
1864  // back into grid.
1865  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1866  table_grid_.InsertBBox(true, true, good_it.extract());
1867 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
ColSegmentGrid table_grid_
Definition: tablefind.h:428
#define NULL
Definition: host.h:144
Definition: rect.h:29
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1915
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:420
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:486
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:424
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:121
int gridheight() const
Definition: tablefind.cpp:384
bool textord_show_tables
Definition: tablefind.cpp:151
void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 759 of file tablefind.cpp.

759  {
761 }
void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 762 of file tablefind.cpp.

762  {
763  global_median_ledding_ = ledding;
764 }
void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 756 of file tablefind.cpp.

756  {
757  global_median_xheight_ = xheight;
758 }
void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 186 of file tablefind.cpp.

186  {
187  left_to_right_language_ = order;
188 }
void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 138 of file tablefind.h.

138  {
139  resolution_ = resolution;
140  }
void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1143 of file tablefind.cpp.

1143  {
1144  ColSegment_IT it(column_blocks);
1145  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1146  ColSegment* seg = it.data();
1147  TBOX box = seg->bounding_box();
1148  int num_table_cells = 0;
1149  int num_text_cells = 0;
1150  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
1151  rsearch(&clean_part_grid_);
1152  rsearch.SetUniqueMode(true);
1153  rsearch.StartRectSearch(box);
1154  ColPartition* part = NULL;
1155  while ((part = rsearch.NextRectSearch()) != NULL) {
1156  if (part->type() == PT_TABLE) {
1157  num_table_cells++;
1158  } else if (part->type() == PT_FLOWING_TEXT) {
1159  num_text_cells++;
1160  }
1161  }
1162  // If a column block has no text or table partition in it, it is not needed
1163  // for table detection.
1164  if (!num_table_cells && !num_text_cells) {
1165  delete it.extract();
1166  } else {
1167  seg->set_num_table_cells(num_table_cells);
1168  seg->set_num_text_cells(num_text_cells);
1169  // set column type based on the ratio of table to text cells
1170  seg->set_type();
1171  }
1172  }
1173 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
Definition: rect.h:29
Definition: capi.h:62
void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid grid)
protected

Definition at line 709 of file tablefind.cpp.

709  {
710  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
711  STATS width_stats(0, kMaxBlobWidth + 1);
712  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
713  // Iterate the ColPartitions in the grid.
714  ColPartitionGridSearch gsearch(grid);
715  gsearch.SetUniqueMode(true);
716  gsearch.StartFullSearch();
717  ColPartition* part = NULL;
718  while ((part = gsearch.NextFullSearch()) != NULL) {
719  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
720  // ComputeLimits needs to get called somewhere outside of TableFinder
721  // to make sure the partitions are properly initialized.
722  // When this is called, SmoothPartitionPartners dies in an assert after
723  // table find runs. Alternative solution.
724  // part->ComputeLimits();
725  if (part->IsTextType()) {
726  // xheight_stats.add(part->median_size(), part->boxes_count());
727  // width_stats.add(part->median_width(), part->boxes_count());
728 
729  // This loop can be removed when above issues are fixed.
730  // Replace it with the 2 lines commented out above.
731  BLOBNBOX_C_IT it(part->boxes());
732  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733  xheight_stats.add(it.data()->bounding_box().height(), 1);
734  width_stats.add(it.data()->bounding_box().width(), 1);
735  }
736 
737  ledding_stats.add(part->space_above(), 1);
738  ledding_stats.add(part->space_below(), 1);
739  }
740  }
741  // Set estimates based on median of statistics obtained
742  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
743  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
744  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
745  #ifndef GRAPHICS_DISABLED
747  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
748  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
749  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
750  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
751  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
752  }
753  #endif // GRAPHICS_DISABLED
754 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:518
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:756
#define NULL
Definition: host.h:144
const int kMaxVerticalSpacing
Definition: tablefind.cpp:40
bool textord_tablefind_show_stats
Definition: tablefind.cpp:155
const int kMaxBlobWidth
Definition: tablefind.cpp:42
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
Definition: statistc.h:29
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:762
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:759
void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid grid,
ColPartitionSet **  all_columns 
)
staticprotected

Definition at line 586 of file tablefind.cpp.

587  {
588  // Iterate the ColPartitions in the grid.
589  ColPartitionGridSearch gsearch(grid);
590  gsearch.StartFullSearch();
591  ColPartition* part = NULL;
592  while ((part = gsearch.NextFullSearch()) != NULL) {
593  ColPartitionSet* columns = all_columns[gsearch.GridY()];
594  TBOX box = part->bounding_box();
595  int y = part->MidY();
596  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
597  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
598  // set distance from left column as space to the left
599  if (left_column) {
600  int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
601  part->set_space_to_left(left_space);
602  }
603  // set distance from right column as space to the right
604  if (right_column) {
605  int right_space = MAX(0, right_column->RightAtY(y) - box.right());
606  part->set_space_to_right(right_space);
607  }
608 
609  // Look for images that may be closer.
610  // NOTE: used to be part_grid_, might cause issues now
611  ColPartitionGridSearch hsearch(grid);
612  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
613  ColPartition* neighbor = NULL;
614  while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
615  if (neighbor->type() == PT_PULLOUT_IMAGE ||
616  neighbor->type() == PT_FLOWING_IMAGE ||
617  neighbor->type() == PT_HEADING_IMAGE) {
618  int right = neighbor->bounding_box().right();
619  if (right < box.left()) {
620  int space = MIN(box.left() - right, part->space_to_left());
621  part->set_space_to_left(space);
622  }
623  }
624  }
625  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
626  neighbor = NULL;
627  while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
628  if (neighbor->type() == PT_PULLOUT_IMAGE ||
629  neighbor->type() == PT_FLOWING_IMAGE ||
630  neighbor->type() == PT_HEADING_IMAGE) {
631  int left = neighbor->bounding_box().left();
632  if (left > box.right()) {
633  int space = MIN(left - box.right(), part->space_to_right());
634  part->set_space_to_right(space);
635  }
636  }
637  }
638 
639  ColPartition* upper_part = part->SingletonPartner(true);
640  if (upper_part) {
641  int space = MAX(0, upper_part->bounding_box().bottom() -
642  part->bounding_box().bottom());
643  part->set_space_above(space);
644  } else {
645  // TODO(nbeato): What constitutes a good value?
646  // 0 is the default value when not set, explicitly noting it needs to
647  // be something else.
648  part->set_space_above(MAX_INT32);
649  }
650 
651  ColPartition* lower_part = part->SingletonPartner(false);
652  if (lower_part) {
653  int space = MAX(0, part->bounding_box().bottom() -
654  lower_part->bounding_box().bottom());
655  part->set_space_below(space);
656  } else {
657  // TODO(nbeato): What constitutes a good value?
658  // 0 is the default value when not set, explicitly noting it needs to
659  // be something else.
660  part->set_space_below(MAX_INT32);
661  }
662  }
663 }
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
#define MAX_INT32
Definition: host.h:120
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
inT16 top() const
Definition: rect.h:53
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::SetVerticalSpacing ( ColPartition part)
protected

Definition at line 666 of file tablefind.cpp.

666  {
667  TBOX box = part->bounding_box();
668  int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
669  int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
670  box.set_top(top_range);
671  box.set_bottom(bottom_range);
672 
673  TBOX part_box = part->bounding_box();
674  // Start a rect search
675  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
676  rectsearch(&clean_part_grid_);
677  rectsearch.StartRectSearch(box);
678  ColPartition* neighbor;
679  int min_space_above = kMaxVerticalSpacing;
680  int min_space_below = kMaxVerticalSpacing;
681  ColPartition* above_neighbor = NULL;
682  ColPartition* below_neighbor = NULL;
683  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
684  if (neighbor == part)
685  continue;
686  TBOX neighbor_box = neighbor->bounding_box();
687  if (neighbor_box.major_x_overlap(part_box)) {
688  int gap = abs(part->median_bottom() - neighbor->median_bottom());
689  // If neighbor is below current partition
690  if (neighbor_box.top() < part_box.bottom() &&
691  gap < min_space_below) {
692  min_space_below = gap;
693  below_neighbor = neighbor;
694  } // If neighbor is above current partition
695  else if (part_box.top() < neighbor_box.bottom() &&
696  gap < min_space_above) {
697  min_space_above = gap;
698  above_neighbor = neighbor;
699  }
700  }
701  }
702  part->set_space_above(min_space_above);
703  part->set_space_below(min_space_below);
704  part->set_nearest_neighbor_above(above_neighbor);
705  part->set_nearest_neighbor_below(below_neighbor);
706 }
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
const ICOORD & tright() const
Definition: tablefind.cpp:390
Definition: rect.h:29
const int kMaxVerticalSpacing
Definition: tablefind.cpp:40
inT16 y() const
access_function
Definition: points.h:56
const ICOORD & bleft() const
Definition: tablefind.cpp:387
inT16 top() const
Definition: rect.h:53
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
void set_bottom(int y)
Definition: rect.h:63
void set_top(int y)
Definition: rect.h:56
inT16 bottom() const
Definition: rect.h:60
void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1108 of file tablefind.cpp.

1108  {
1109  // Iterate the ColPartitions in the grid.
1111  gsearch.StartFullSearch();
1112  ColPartition* part = NULL;
1113  while ((part = gsearch.NextFullSearch()) != NULL) {
1114  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1115  continue; // Consider only text partitions
1116  ColPartition* upper_part = part->nearest_neighbor_above();
1117  ColPartition* lower_part = part->nearest_neighbor_below();
1118  if (!upper_part || !lower_part)
1119  continue;
1120  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1121  part->set_table_type();
1122  }
1123 
1124  // Pass 2, do the opposite. If both the upper and lower neighbors
1125  // exist and are not tables, this probably shouldn't be a table.
1126  gsearch.StartFullSearch();
1127  part = NULL;
1128  while ((part = gsearch.NextFullSearch()) != NULL) {
1129  if (part->type() != PT_TABLE)
1130  continue; // Consider only text partitions
1131  ColPartition* upper_part = part->nearest_neighbor_above();
1132  ColPartition* lower_part = part->nearest_neighbor_below();
1133 
1134  // table can't be by itself
1135  if ((upper_part && upper_part->type() != PT_TABLE) &&
1136  (lower_part && lower_part->type() != PT_TABLE)) {
1137  part->clear_table_type();
1138  }
1139  }
1140 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
#define NULL
Definition: host.h:144
Definition: capi.h:62
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:895
void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 436 of file tablefind.cpp.

436  {
437  ASSERT_HOST(part != NULL);
438  // Bye bye empty partitions!
439  if (part->boxes()->empty()) {
440  delete part;
441  return;
442  }
443 
444  // The AllowBlob function prevents this.
445  ASSERT_HOST(part->median_width() > 0);
446  const double kThreshold = part->median_width() * kSplitPartitionSize;
447 
448  ColPartition* right_part = part;
449  bool found_split = true;
450  while (found_split) {
451  found_split = false;
452  BLOBNBOX_C_IT box_it(right_part->boxes());
453  // Blobs are sorted left side first. If blobs overlap,
454  // the previous blob may have a "more right" right side.
455  // Account for this by always keeping the largest "right"
456  // so far.
457  int previous_right = MIN_INT32;
458 
459  // Look for the next split in the partition.
460  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461  const TBOX& box = box_it.data()->bounding_box();
462  if (previous_right != MIN_INT32 &&
463  box.left() - previous_right > kThreshold) {
464  // We have a split position. Split the partition in two pieces.
465  // Insert the left piece in the grid and keep processing the right.
466  int mid_x = (box.left() + previous_right) / 2;
467  ColPartition* left_part = right_part;
468  right_part = left_part->SplitAt(mid_x);
469 
471  found_split = true;
472  break;
473  }
474 
475  // The right side of the previous blobs.
476  previous_right = MAX(previous_right, box.right());
477  }
478  }
479  // When a split is not found, the right part is minimized
480  // as much as possible, so process it.
481  InsertFragmentedTextPartition(right_part);
482 }
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:402
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
const double kSplitPartitionSize
Definition: tablefind.cpp:46
#define MAX(x, y)
Definition: ndminx.h:24
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define MIN_INT32
Definition: host.h:128
const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 390 of file tablefind.cpp.

390  {
391  return clean_part_grid_.tright();
392 }
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
const ICOORD & tright() const
Definition: bbgrid.h:80
void tesseract::TableFinder::WriteToPix ( const FCOORD reskew)
protected

Definition at line 1997 of file tablefind.cpp.

1997  {
1998  // Input file must be named test1.tif
1999  PIX* pix = pixRead("test1.tif");
2000  if (!pix) {
2001  tprintf("Input file test1.tif not found.\n");
2002  return;
2003  }
2004  int img_height = pixGetHeight(pix);
2005  int img_width = pixGetWidth(pix);
2006  // Maximum number of text or table partitions
2007  int num_boxes = 10;
2008  BOXA* text_box_array = boxaCreate(num_boxes);
2009  BOXA* table_box_array = boxaCreate(num_boxes);
2010  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
2011  gsearch(&clean_part_grid_);
2012  gsearch.StartFullSearch();
2013  ColPartition* part;
2014  // load colpartitions into text_box_array and table_box_array
2015  while ((part = gsearch.NextFullSearch()) != NULL) {
2016  TBOX box = part->bounding_box();
2017  box.rotate_large(reskew);
2018  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2019  box.right() - box.left(),
2020  box.top() - box.bottom());
2021  if (part->type() == PT_TABLE)
2022  boxaAddBox(table_box_array, lept_box, L_INSERT);
2023  else
2024  boxaAddBox(text_box_array, lept_box, L_INSERT);
2025  }
2026  // draw colpartitions on the output image
2027  PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
2028  out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
2029 
2030  BOXA* table_array = boxaCreate(num_boxes);
2031  // text file containing detected table bounding boxes
2032  FILE* fptr = fopen("tess-table.txt", "wb");
2033  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
2034  table_search(&table_grid_);
2035  table_search.StartFullSearch();
2036  ColSegment* table;
2037  // load table boxes to table_array and write them to text file as well
2038  while ((table = table_search.NextFullSearch()) != NULL) {
2039  TBOX box = table->bounding_box();
2040  box.rotate_large(reskew);
2041  // Since deskewing introduces negative coordinates, reskewing
2042  // might not completely recover from that since both steps enlarge
2043  // the actual box. Hence a box that undergoes deskewing/reskewing
2044  // may go out of image boundaries. Crop a table box if needed to
2045  // contain it inside the image dimensions.
2046  box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
2047  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2048  box.right() - box.left(),
2049  box.top() - box.bottom());
2050  boxaAddBox(table_array, lept_box, L_INSERT);
2051  fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
2052  img_height - box.top(), box.right(), img_height - box.bottom());
2053  }
2054  fclose(fptr);
2055  // paint table boxes on the debug image
2056  out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
2057 
2058  pixWrite("out.png", out, IFF_PNG);
2059  // memory cleanup
2060  boxaDestroy(&text_box_array);
2061  boxaDestroy(&table_box_array);
2062  boxaDestroy(&table_array);
2063  pixDestroy(&pix);
2064  pixDestroy(&out);
2065 }
struct Boxa BOXA
Definition: capi.h:70
ColSegmentGrid table_grid_
Definition: tablefind.h:428
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:418
struct Pix PIX
Definition: capi.h:69
#define NULL
Definition: host.h:144
inT16 left() const
Definition: rect.h:67
void rotate_large(const FCOORD &vec)
Definition: rect.cpp:73
Definition: rect.h:29
inT16 right() const
Definition: rect.h:74
Definition: capi.h:62
inT16 top() const
Definition: rect.h:53
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:88
inT16 bottom() const
Definition: rect.h:60

Member Data Documentation

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 418 of file tablefind.h.

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 426 of file tablefind.h.

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 424 of file tablefind.h.

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 412 of file tablefind.h.

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 414 of file tablefind.h.

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 410 of file tablefind.h.

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 420 of file tablefind.h.

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 430 of file tablefind.h.

int tesseract::TableFinder::resolution_
protected

Definition at line 408 of file tablefind.h.

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 428 of file tablefind.h.


The documentation for this class was generated from the following files: