scantools 1.0.8
Graphics manipulation with a view towards scanned documents
HOCRTextBox.h
1/*
2 * Copyright © 2016--2018 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3 *
4 * This program is free software: you can redistribute it and/or modify it under
5 * the terms of the GNU General Public License as published by the Free Software
6 * Foundation, either version 3 of the License, or (at your option) any later
7 * version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12 * details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18
19#ifndef HOCRTEXTBOX
20#define HOCRTEXTBOX 1
21
22#include <QPainter>
23#include <QSet>
24#include <QXmlStreamReader>
25
26#include "resolution.h"
27
28class HOCRDocument;
29
30
43
45{
46 public:
49
// Decides whether the text box contains non-trivial text.
// NOTE(review): the exact criterion for "non-trivial" lives in the
// implementation file, which is not visible here.
55 bool hasText() const;
56
// Text angle of this box: returns the stored _angle member by value. That
// member is zero when no angle was specified in the HOCR file or inherited.
62 qreal angle() const { return _angle; }
63
// Attributes of this text box, as read from the HOCR file. Returns a copy of
// the stored _attributes member.
69 QXmlStreamAttributes attributes() const { return _attributes; }
70
// Base line as a polynomial. Returns a copy of _baselinePolynomial, which is
// empty when no base line was specified in the HOCR file or inherited.
77 QVector<qreal> baselinePolynomial() const { return _baselinePolynomial; }
78
// Base line reference point. Returns a copy of _baselineReferencePoint; the
// value is meaningless when no base line polynomial is specified.
85 QPoint baselineReferencePoint() const { return _baselineReferencePoint; }
86
// Bounding box, as specified in the HOCR file. Returns a copy of
// _boundingBox, which is empty when no bounding box was specified.
92 QRect boundingBox() const { return _boundingBox; }
93
// Class of this text box.
// NOTE(review): presumably reports the HOCR element class kept in _class
// (e.g. "ocr_page", "ocr_line", "ocrx_word") -- confirm in the implementation.
101 QString classType() const;
102
// Confidence level of the corresponding HOCR element. Returns _confidence,
// which holds -1 when no confidence level is specified.
108 int confidence() const { return _confidence; }
109
// Text flow direction. Returns a copy of _direction: 'ltr' means
// left-to-right, 'rtl' means right-to-left, any other value means undefined.
116 QString direction() const { return _direction; }
117
// Font size specified in the corresponding HOCR element. Returns _fontSize,
// which holds 0.0 when no font size is specified.
123 qreal fontSize() const { return _fontSize; }
124
// Name of an image associated with the content of this text box. Returns a
// copy of _imageName, which is empty when the HOCR file specifies nothing.
131 QString imageName() const { return _imageName; }
132
// Language of the content of this text box. Returns a copy of _language,
// which is empty when the HOCR file specifies nothing.
138 QString language() const { return _language; }
139
// Paints the contents of the text box to the given painter.
150 void render(QPainter &painter) const;
151
// Exports this text box as an image. The image format defaults to
// QImage::Format_Grayscale8.
// NOTE(review): overrideFont is presumably the font used for drawing the
// text -- confirm in the implementation.
164 QImage toImage(const QFont &overrideFont, QImage::Format format=QImage::Format_Grayscale8) const;
165
// Returns raw PDF text rendering commands for this text box, using the given
// font. deltaX and deltaY default to a default-constructed length.
// NOTE(review): xRes/yRes are presumably the resolutions of the underlying
// image and deltaX/deltaY presumably placement offsets -- confirm in the
// implementation.
191 QByteArray toRawPDFContentStream(const QFont &font, resolution xRes, resolution yRes, length deltaX=length(), length deltaY=length()) const;
192
// Exports this text box as text.
198 QString toText() const;
199
// Estimates how well the given font fits the text box.
// NOTE(review): the scale of the returned qint64 (whether larger or smaller
// means a better fit) is not visible in this header -- confirm before use.
213 qint64 estimateFit(const QFont &font) const;
214
// Suggests a font for this text box.
// NOTE(review): the selection criteria live in the implementation file.
225 QFont suggestFont() const;
226
// Text content of this text box. Returns a copy of _text, which is empty
// when the HOCR file specifies nothing.
232 QString text() const { return _text; }
233
234 private:
235 // Specifies how and where the text box's content should be drawn. Values of
// this type are produced by getRenderingHints() for a given font.
236 struct renderingHints {
// Font size to use when drawing the text.
238 int fontSize;
239
// Point from which the text of the text box is drawn.
241 QPoint referencePoint;
242
// Horizontal stretch factor applied to the text.
// NOTE(review): the scale (e.g. 1.0 vs. 100 for "unstretched") is not
// visible in this header -- confirm in the implementation.
244 qreal horizontalStretchFactor;
245 };
246
247 // For a given font, this method computes rendering hints so that the text
248 // fits the bounding box best.
249 //
250 // - If the HOCR file specifies a font size, this size is taken. Otherwise,
251 // - the font metric is used to find a size that best fits the bounding
252 // - box. As a last resort, a standard value is returned.
253 //
254 // - The font metric is used to suggest a point from which to draw the text of
255 // - this text box, and a horizontal stretch factor
256 renderingHints getRenderingHints(const QFont &font) const;
257
258 // This is to ensure that text boxes can be constructed from HOCR documents.
259 friend HOCRDocument;
260
261 // Constructs a textbox by reading in an xml file. This constructor expects
262 // that the QXmlStreamReader points to a start element. It will read the file
263 // until it reaches the corresponding end element. When the method returns,
264 // the QXmlStreamReader points to this end element. In case of problems,
265 // warnings are added to 'warnings'.
//
// 'parent' is optional and defaults to the null pointer.
// NOTE(review): presumably the enclosing text box from which values such as
// the text angle and base line are inherited -- confirm in the implementation.
266 HOCRTextBox(QXmlStreamReader &xml, QSet<QString> &warnings, HOCRTextBox *parent=nullptr);
267
268 // Interprets _attributes and fills in the members _baseLine, _boundingBox,
269 // _class, _confidence, _fontSize, _imageName. Problems encountered in the
270 // interpretation are added to the set 'warnings'. This method is called only
271 // in the constructor. The code is not part of the constructor to keep the
272 // source readable.
273 void interpretAttributes(QSet<QString> &warnings, qint64 line, qint64 column);
274
275 // Attributes, as read from the HOCR file
276 QXmlStreamAttributes _attributes;
277
278 // List of sub boxes, as read from the HOCR file
279 QList<HOCRTextBox> _subBoxes;
280
281 /*
282 * Attributes extracted from the HOCR file
283 */
284
285 // Textangle, as specified in the HOCR file or inherited from parent. If no
286 // angle was specified, this number is zero.
287 qreal _angle;
288
289 // Base line as a polynomial, as specified in the HOCR file or inherited from
290 // parent. If no base line was specified, this vector is empty.
291 QVector<qreal> _baselinePolynomial;
292
293 // Base line reference point, as specified in the HOCR file or inherited from
294 // parent. If no base line polynomial is specified, this member is
295 // meaningless.
296 QPoint _baselineReferencePoint;
297
298 // Bounding box, as specified in the HOCR file. If no bounding box was
299 // specified, this box is empty.
300 QRect _boundingBox;
301
302 // Contains the class of the corresponding element in the HOCR file. Typical
303 // values are "ocr_page", "ocr_carea", "ocr_par", "ocr_line" or "ocrx_word".
304 QString _class;
305
306 // Contains the confidence level of the corresponding element in the HOCR
307 // file. If no confidence level is specified, this member contains '-1'.
308 int _confidence;
309
310 // Contains the text flow direction of the corresponding element in the HOCR
311 // file. The value 'ltr' means left-to-right, 'rtl' means right-to-left. Any
312 // other value means 'undefined'.
313 QString _direction;
314
315 // Contains the font size specified in the corresponding element in the HOCR
316 // file. If no font size is specified, this member contains '0.0'.
317 qreal _fontSize;
318
319 // Contains the name of an image associated with the content of this text
320 // box. If nothing is specified in the HOCR file, this string is empty.
321 QString _imageName;
322
323 // Language of the content of this text box. If nothing is specified in the
324 // HOCR file, this string is empty.
325 QString _language;
326
327 // Contains the text of this text box. If nothing is specified in the HOCR
328 // file, this string is empty.
329 QString _text;
330
331 /*
332 * Helper functions
333 */
334
335 // Expects a string of the form "blabla int int int …" and returns a vector
336 // containing the integers
337 QVector<int> getIntegers(const QString& spec) const;
338
339 // Expects a string of the form "blabla qreal qreal qreal …" and returns a
340 // vector containing the qreals
341 QVector<qreal> getFloats(const QString& spec) const;
342
343 // Trivial method that writes out a floating point number in ASCII, with up to
344 // four decimal places of precision. Trailing zeroes are deleted for brevity's
345 // sake. It seems that Qt cannot do that.
346 static QByteArray toNumber(qreal x);
347
348 // Internal method that actually does the work for the user method with the
349 // same name. This method is applied recursively over all sub-boxes, and the
350 // results are joined. It differs from the user method in that it takes two
351 // additional arguments: 'height' is the height of the bounding box for which
352 // the user method was called; this is necessary for correct text
353 // placement. The argument "currentFontSize" is the font size last set; this is
354 // used to avoid setting and re-setting the same size time and again. The parameter
355 // 'codec' is a pointer to the "Windows-1252" QTextCodec.
356 QByteArray toRawPDFContentStream(const QFont &font, resolution xRes, resolution yRes, length deltaX, length deltaY, quint16 height, qreal &currentFontSize, QTextCodec *codec) const;
357};
358
359
360#endif
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition sy...
HOCRTextBox()
Constructs an empty text box.
void render(QPainter &painter) const
Paint the contents of the text box to a painter.
qint64 estimateFit(const QFont &font) const
Estimate how well a given font fits the textbox.
QString classType() const
Class of this textBox.
QByteArray toRawPDFContentStream(const QFont &font, resolution xRes, resolution yRes, length deltaX=length(), length deltaY=length()) const
Return raw PDF text rendering commands.
QString toText() const
Export this text box as text.
qreal angle() const
Text angle.
Definition HOCRTextBox.h:62
qreal fontSize() const
Font size.
QPoint baselineReferencePoint() const
Base line reference point.
Definition HOCRTextBox.h:85
QRect boundingBox() const
Bounding box.
Definition HOCRTextBox.h:92
QString text() const
Text content of the text box.
bool hasText() const
Decide if the text box contains non-trivial text.
QString imageName() const
Image associated with content of this text box.
int confidence() const
Confidence level.
QXmlStreamAttributes attributes() const
Returns the attributes of the textBox.
Definition HOCRTextBox.h:69
QVector< qreal > baselinePolynomial() const
Base line as a polynomial.
Definition HOCRTextBox.h:77
QFont suggestFont() const
Suggest font.
QImage toImage(const QFont &overrideFont, QImage::Format format=QImage::Format_Grayscale8) const
Export this text box as an image.
QString language() const
Language of the content of this text box.
QString direction() const
Text flow direction.
The length stores a length and converts between units.
Definition length.h:38
The resolution class stores a resolution and converts between units.
Definition resolution.h:40