Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
tesseract::LTRResultIterator tesseract::PageIterator tesseract::MutableIterator

Public Member Functions

virtual ~ResultIterator ()
 
virtual void Begin ()
 
virtual bool Next (PageIteratorLevel level)
 
virtual bool IsAtBeginningOf (PageIteratorLevel level) const
 
virtual bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const
 
virtual char * GetUTF8Text (PageIteratorLevel level) const
 
bool ParagraphIsLtr () const
 
- Public Member Functions inherited from tesseract::LTRResultIterator
 LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~LTRResultIterator ()
 
char * GetUTF8Text (PageIteratorLevel level) const
 
void SetLineSeparator (const char *new_line)
 
void SetParagraphSeparator (const char *new_para)
 
float Confidence (PageIteratorLevel level) const
 
const char * WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
 
const char * WordRecognitionLanguage () const
 
StrongScriptDirection WordDirection () const
 
bool WordIsFromDictionary () const
 
bool WordIsNumeric () const
 
bool HasBlamerInfo () const
 
void * GetParamsTrainingBundle () const
 
const char * GetBlamerDebug () const
 
const char * GetBlamerMisadaptionDebug () const
 
char * WordTruthUTF8Text () const
 
const char * WordLattice (int *lattice_size) const
 
bool SymbolIsSuperscript () const
 
bool SymbolIsSubscript () const
 
bool SymbolIsDropcap () const
 
- Public Member Functions inherited from tesseract::PageIterator
 PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~PageIterator ()
 
 PageIterator (const PageIterator &src)
 
const PageIterator & operator= (const PageIterator &src)
 
bool PositionedAtSameWord (const PAGE_RES_IT *other) const
 
virtual void RestartParagraph ()
 
bool IsWithinFirstTextlineOfParagraph () const
 
virtual void RestartRow ()
 
int Cmp (const PageIterator &other) const
 
bool BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool Empty (PageIteratorLevel level) const
 
PolyBlockType BlockType () const
 
Pix * GetBinaryImage (PageIteratorLevel level) const
 
Pix * GetImage (PageIteratorLevel level, int padding, int *left, int *top) const
 
bool Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
 
void Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
 
void ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const
 

Static Public Member Functions

static ResultIterator * StartOfParagraph (const LTRResultIterator &resit)
 
static void CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
 

Static Public Attributes

static const int kMinorRunStart = -1
 
static const int kMinorRunEnd = -2
 
static const int kComplexWord = -3
 

Protected Member Functions

TESS_LOCAL ResultIterator (const LTRResultIterator &resit)
 
- Protected Member Functions inherited from tesseract::PageIterator
TESS_LOCAL void BeginWord (int offset)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LTRResultIterator
const char * line_separator_
 
const char * paragraph_separator_
 
- Protected Attributes inherited from tesseract::PageIterator
PAGE_RES * page_res_
 
Tesseract * tesseract_
 
PAGE_RES_IT * it_
 
WERD * word_
 
int word_length_
 
int blob_index_
 
C_BLOB_IT * cblob_it_
 
int scale_
 
int scaled_yres_
 
int rect_left_
 
int rect_top_
 
int rect_width_
 
int rect_height_
 

Detailed Description

Definition at line 37 of file resultiterator.h.

Constructor & Destructor Documentation

virtual tesseract::ResultIterator::~ResultIterator ( )
inline virtual

ResultIterator is copy constructible! The default copy constructor works just fine for us.

Definition at line 45 of file resultiterator.h.

45 {}
tesseract::ResultIterator::ResultIterator ( const LTRResultIterator & resit)
explicit protected

We presume the data associated with the given iterator will outlive us. NB: This is protected (rather than public) because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 33 of file resultiterator.cpp.

34  : LTRResultIterator(resit) {
35  in_minor_direction_ = false;
36  at_beginning_of_minor_run_ = false;
37  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
38  MoveToLogicalStartOfTextline();
39 }
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)

Member Function Documentation

void tesseract::ResultIterator::Begin ( )
virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 408 of file resultiterator.cpp.

408  {
410  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
411  in_minor_direction_ = false;
412  at_beginning_of_minor_run_ = false;
413  MoveToLogicalStartOfTextline();
414 }
void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const GenericVector< StrongScriptDirection > &  word_dirs,
GenericVectorEqEq< int > *  reading_order 
)
static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values:
kMinorRunStart: Start of minor direction text.
kMinorRunEnd: End of minor direction text.
kComplexWord: The next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 248 of file resultiterator.cpp.

251  {
252  reading_order->truncate(0);
253  if (word_dirs.size() == 0) return;
254 
255  // Take all of the runs of minor direction words and insert them
256  // in reverse order.
257  int minor_direction, major_direction, major_step, start, end;
258  if (paragraph_is_ltr) {
259  start = 0;
260  end = word_dirs.size();
261  major_step = 1;
262  major_direction = DIR_LEFT_TO_RIGHT;
263  minor_direction = DIR_RIGHT_TO_LEFT;
264  } else {
265  start = word_dirs.size() - 1;
266  end = -1;
267  major_step = -1;
268  major_direction = DIR_RIGHT_TO_LEFT;
269  minor_direction = DIR_LEFT_TO_RIGHT;
270  // Special rule: if there are neutral words at the right most side
271  // of a line adjacent to a left-to-right word in the middle of the
272  // line, we interpret the end of the line as a single LTR sequence.
273  if (word_dirs[start] == DIR_NEUTRAL) {
274  int neutral_end = start;
275  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
276  neutral_end--;
277  }
278  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
279  // LTR followed by neutrals.
280  // Scan for the beginning of the minor left-to-right run.
281  int left = neutral_end;
282  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
283  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
284  }
285  reading_order->push_back(kMinorRunStart);
286  for (int i = left; i < word_dirs.size(); i++) {
287  reading_order->push_back(i);
288  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
289  }
290  reading_order->push_back(kMinorRunEnd);
291  start = left - 1;
292  }
293  }
294  }
295  for (int i = start; i != end;) {
296  if (word_dirs[i] == minor_direction) {
297  int j = i;
298  while (j != end && word_dirs[j] != major_direction)
299  j += major_step;
300  if (j == end) j -= major_step;
301  while (j != i && word_dirs[j] != minor_direction)
302  j -= major_step;
303  // [j..i] is a minor direction run.
304  reading_order->push_back(kMinorRunStart);
305  for (int k = j; k != i; k -= major_step) {
306  reading_order->push_back(k);
307  }
308  reading_order->push_back(i);
309  reading_order->push_back(kMinorRunEnd);
310  i = j + major_step;
311  } else {
312  reading_order->push_back(i);
313  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
314  i += major_step;
315  }
316  }
317 }
static const int kMinorRunStart
static const int kComplexWord
int push_back(T object)
int size() const
Definition: genericvector.h:59
static const int kMinorRunEnd
virtual void truncate(int size)
char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const
virtual

Returns the null-terminated, UTF-8 encoded text string for the current object at the given level. Use delete[] to free after use.

Definition at line 551 of file resultiterator.cpp.

551  {
552  if (it_->word() == NULL) return NULL; // Already at the end!
553  STRING text;
554  switch (level) {
555  case RIL_BLOCK:
556  {
557  ResultIterator pp(*this);
558  do {
559  pp.AppendUTF8ParagraphText(&text);
560  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
561  }
562  break;
563  case RIL_PARA:
564  AppendUTF8ParagraphText(&text);
565  break;
566  case RIL_TEXTLINE:
567  {
568  ResultIterator it(*this);
569  it.MoveToLogicalStartOfTextline();
570  it.IterateAndAppendUTF8TextlineText(&text);
571  }
572  break;
573  case RIL_WORD:
574  AppendUTF8WordText(&text);
575  break;
576  case RIL_SYMBOL:
577  {
578  bool reading_direction_is_ltr =
579  current_paragraph_is_ltr_ ^ in_minor_direction_;
580  if (at_beginning_of_minor_run_) {
581  text += reading_direction_is_ltr ? kLRM : kRLM;
582  }
583  text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
584  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
585  }
586  break;
587  }
588  int length = text.length() + 1;
589  char* result = new char[length];
590  strncpy(result, text.string(), length);
591  return result;
592 }
inT32 length() const
Definition: strngs.cpp:151
const char *const BestUTF8(int blob_index, bool in_rtl_context) const
Definition: pageres.h:477
#define NULL
Definition: host.h:144
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
const char * kRLM
Definition: unicodes.cpp:28
const char * string() const
Definition: strngs.cpp:156
Definition: strngs.h:40
const char * kLRM
Definition: unicodes.cpp:27
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const
virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to PageIterator's left-to-right, top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.

Reimplemented from tesseract::PageIterator.

Definition at line 491 of file resultiterator.cpp.

491  {
492  if (it_->block() == NULL) return false; // Already at the end!
493  if (it_->word() == NULL) return true; // In an image block.
494  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
495 
496  bool at_word_start = IsAtFirstSymbolOfWord();
497  if (level == RIL_WORD) return at_word_start;
498 
499  ResultIterator line_start(*this);
500  // move to the first word in the line...
501  line_start.MoveToLogicalStartOfTextline();
502 
503  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
504  if (level == RIL_TEXTLINE) return at_textline_start;
505 
506  // now we move to the left-most word...
507  line_start.RestartRow();
508  bool at_block_start = at_textline_start &&
509  line_start.it_->block() != line_start.it_->prev_block();
510  if (level == RIL_BLOCK) return at_block_start;
511 
512  bool at_para_start = at_block_start ||
513  (at_textline_start &&
514  line_start.it_->row()->row->para() !=
515  line_start.it_->prev_row()->row->para());
516  if (level == RIL_PARA) return at_para_start;
517 
518  ASSERT_HOST(false); // shouldn't happen.
519  return false;
520 }
#define NULL
Definition: host.h:144
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
#define ASSERT_HOST(x)
Definition: errcode.h:84
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const
virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 527 of file resultiterator.cpp.

528  {
529  if (Empty(element)) return true; // Already at the end!
530  // The result is true if we step forward by element and find we are
531  // at the end of the page or at beginning of *all* levels in:
532  // [level, element).
533  // When there is more than one level difference between element and level,
534  // we could for instance move forward one symbol and still be at the first
535  // word on a line, so we also have to be at the first symbol in a word.
536  ResultIterator next(*this);
537  next.Next(element);
538  if (next.Empty(element)) return true; // Reached the end of the page.
539  while (element > level) {
540  element = static_cast<PageIteratorLevel>(element - 1);
541  if (!next.IsAtBeginningOf(element))
542  return false;
543  }
544  return true;
545 }
bool Empty(PageIteratorLevel level) const
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)
bool tesseract::ResultIterator::Next ( PageIteratorLevel  level)
virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 416 of file resultiterator.cpp.

416  {
417  if (it_->block() == NULL) return false; // already at end!
418  switch (level) {
419  case RIL_BLOCK: // explicit fall-through
420  case RIL_PARA: // explicit fall-through
421  case RIL_TEXTLINE:
422  if (!PageIterator::Next(level)) return false;
424  // if we've advanced to a new paragraph,
425  // recalculate current_paragraph_is_ltr_
426  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
427  }
428  in_minor_direction_ = false;
429  MoveToLogicalStartOfTextline();
430  return it_->block() != NULL;
431  case RIL_SYMBOL:
432  {
433  GenericVector<int> blob_order;
434  CalculateBlobOrder(&blob_order);
435  int next_blob = 0;
436  while (next_blob < blob_order.size() &&
437  blob_index_ != blob_order[next_blob])
438  next_blob++;
439  next_blob++;
440  if (next_blob < blob_order.size()) {
441  // we're in the same word; simply advance one blob.
442  BeginWord(blob_order[next_blob]);
443  at_beginning_of_minor_run_ = false;
444  return true;
445  }
446  level = RIL_WORD; // we've fallen through to the next word.
447  }
448  case RIL_WORD: // explicit fall-through.
449  {
450  if (it_->word() == NULL) return Next(RIL_BLOCK);
451  GenericVectorEqEq<int> word_indices;
452  int this_word_index = LTRWordIndex();
453  CalculateTextlineOrder(current_paragraph_is_ltr_,
454  *this,
455  &word_indices);
456  int final_real_index = word_indices.size() - 1;
457  while (final_real_index > 0 && word_indices[final_real_index] < 0)
458  final_real_index--;
459  for (int i = 0; i < final_real_index; i++) {
460  if (word_indices[i] == this_word_index) {
461  int j = i + 1;
462  for (; j < final_real_index && word_indices[j] < 0; j++) {
463  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
464  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
465  }
466  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
467  // awesome, we move to word_indices[j]
468  if (BidiDebug(3)) {
469  tprintf("Next(RIL_WORD): %d -> %d\n",
470  this_word_index, word_indices[j]);
471  }
473  for (int k = 0; k < word_indices[j]; k++) {
475  }
476  MoveToLogicalStartOfWord();
477  return true;
478  }
479  }
480  if (BidiDebug(3)) {
481  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
482  }
483  // we're going off the end of the text line.
484  return Next(RIL_TEXTLINE);
485  }
486  }
487  ASSERT_HOST(false); // shouldn't happen.
488  return false;
489 }
static const int kMinorRunStart
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
#define NULL
Definition: host.h:144
WERD_RES * word() const
Definition: pageres.h:757
BLOCK_RES * block() const
Definition: pageres.h:763
virtual bool Next(PageIteratorLevel level)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
bool IsWithinFirstTextlineOfParagraph() const
TESS_LOCAL void BeginWord(int offset)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int size() const
Definition: genericvector.h:59
static const int kMinorRunEnd
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 46 of file resultiterator.cpp.

46  {
47  return current_paragraph_is_ltr_;
48 }
ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator & resit)
static

Definition at line 41 of file resultiterator.cpp.

42  {
43  return new ResultIterator(resit);
44 }
TESS_LOCAL ResultIterator(const LTRResultIterator &resit)

Member Data Documentation

const int tesseract::ResultIterator::kComplexWord = -3
static

Definition at line 129 of file resultiterator.h.

const int tesseract::ResultIterator::kMinorRunEnd = -2
static

Definition at line 128 of file resultiterator.h.

const int tesseract::ResultIterator::kMinorRunStart = -1
static

Definition at line 127 of file resultiterator.h.


The documentation for this class was generated from the following files: