scantools 1.0.8
Graphics manipulation with a view towards scanned documents
HOCRDocument.h
/*
 * Copyright © 2016-2018 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
17
18
19#ifndef HOCRDOCUMENT
20#define HOCRDOCUMENT 1
21
22
23#include "HOCRTextBox.h"
24#include "resolution.h"
25#include <QPageSize>
26#include <QSet>
27
28
40
42{
43 public:
46
54 explicit HOCRDocument(QIODevice *device) {read(device);};
55
63 explicit HOCRDocument(QString fileName) {read(fileName);};
64
75 HOCRDocument(const QImage &image, QStringList languages=QStringList()) {read(image,languages);};
76
/** Resets the document. */
void clear();
82
88 bool hasError() const {return !_error.isEmpty();};
89
96 QString error() const {return _error; };
97
105 bool hasWarnings() const {return !_warnings.isEmpty();};
106
111 QSet<QString> warnings() const {return _warnings;};
112
119 QSet<QString> system() const {return _OCRSystem;};
120
129 QSet<QString> capabilities() const {return _OCRCapabilities;};
130
136 QList<HOCRTextBox> pages() const {return _pages;};
137
142 bool isEmpty() const {return _pages.isEmpty();};
143
144
152 bool hasText() const;
153
163 if (_pages.size() > 0)
164 return _pages.takeFirst();
165 else
166 return HOCRTextBox();
167 };
168
182 void read(QIODevice *device);
183
191 void read(const QString& fileName);
192
210 void read(const QImage &image, const QStringList& languages=QStringList());
211
222 QFont suggestFont() const;
223
248 QString toPDF(const QString& fileName, resolution _resolution, const QString& title=QString(), const QPageSize& overridePageSize=QPageSize(), QFont *overrideFont=0) const;
249
264 QList<QImage> toImages(QFont *overrideFont=0, QImage::Format format=QImage::Format_Grayscale8) const;
265
272 QString toText() const;
273
281 void append(const HOCRDocument &other);
282
289 static QStringList tesseractLanguages();
290
300 static bool areLanguagesSupportedByTesseract(const QStringList& lingos);
301
302 private:
303 // This is a convenience method that suggests a page size for a given page of
304 // the document, taking resolution and overridePageSize into account. The
305 // reason for the existence of this method is that the computation is needed
306 // in two different places in the method exportToPDF, and I wanted to avoid
307 // duplicated code.
308 QPageSize findPageSize(int pageNumber, resolution _resolution, const QPageSize &overridePageSize) const;
309
310 // Error
311 QString _error;
312
313 // System(s) that generated this file, as specified in a meta tag of the HOCR
314 // file
315 QSet<QString> _OCRSystem;
316
317 // OCR capabilites used in this file, as specified in a meta tag of the HOCR
318 // file
319 QSet<QString> _OCRCapabilities;
320
321 // Pages of the document
322 QList<HOCRTextBox> _pages;
323
324 // Warnings
325 QSet<QString> _warnings;
326};
327
328#endif
HOCRDocument(QString fileName)
Constructs an HOCR document from a file.
HOCRDocument(QIODevice *device)
Constructs an HOCR document from a QIODevice.
QString toText() const
Export this document as text.
static bool areLanguagesSupportedByTesseract(const QStringList &lingos)
Check if languages are supported by tesseract.
void read(const QImage &image, const QStringList &languages=QStringList())
Generates an HOCR document by running the tesseract OCR engine.
void read(QIODevice *device)
Reads an HOCR document from a QIODevice.
QList< HOCRTextBox > pages() const
Pages in the document.
bool isEmpty() const
Returns true if the document contains no pages.
QString error() const
Error message.
QSet< QString > capabilities() const
OCR capabilities.
HOCRDocument()
Constructs an empty HOCR document.
HOCRTextBox takeFirstPage()
Removes the first page of the document and returns it.
void read(const QString &fileName)
Reads an HOCR document from a file.
QFont suggestFont() const
Suggest font.
QSet< QString > warnings() const
Warning messages.
QList< QImage > toImages(QFont *overrideFont=0, QImage::Format format=QImage::Format_Grayscale8) const
Export to images.
bool hasText() const
Check if the document does contain text.
HOCRDocument(const QImage &image, QStringList languages=QStringList())
Constructs an HOCR document by running the tesseract OCR engine.
void append(const HOCRDocument &other)
Appends other HOCRDocument.
static QStringList tesseractLanguages()
List of languages supported by tesseract.
bool hasWarnings() const
Warning status.
bool hasError() const
Error status.
QString toPDF(const QString &fileName, resolution _resolution, const QString &title=QString(), const QPageSize &overridePageSize=QPageSize(), QFont *overrideFont=0) const
Export to PDF.
void clear()
Resets the document.
QSet< QString > system() const
System(s) that generated this file.
Text box, as defined in an HOCR file.
Definition HOCRTextBox.h:45
The resolution class stores a resolution and converts between units.
Definition resolution.h:40