scantools 1.0.8
Graphics manipulation with a view towards scanned documents
PDFAWriter.h
1/*
2 * Copyright © 2016 - 2020 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3 *
4 * This program is free software: you can redistribute it and/or modify it under
5 * the terms of the GNU General Public License as published by the Free Software
6 * Foundation, either version 3 of the License, or (at your option) any later
7 * version.
8 *
9 * This program is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12 * details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18
19#ifndef PDFDOCUMENT
20#define PDFDOCUMENT 1
21
22#include <QFuture>
23#include <QList>
24#include <QReadWriteLock>
25#include <QString>
26
27#include "HOCRDocument.h"
28#include "JBIG2Document.h"
29#include "paperSize.h"
30#include "resolution.h"
31
32
// Simple generator for PDF/A-2b compliant documents.
//
// NOTE(review): this is a rendered source listing. The leading numbers on each
// line are the listing's own line numbers, and that numbering has gaps.
// Declarations that the Q_PROPERTY lines refer to (e.g. pageSize(), the
// resolutionOverride accessors, the *Changed signals) are not visible here and
// presumably sit in those gaps -- confirm against the original header.
127class PDFAWriter : public QObject
128{
129 Q_OBJECT
    // Qt properties. Each line names the property's type, its READ accessor,
    // its WRITE setter and its NOTIFY signal.
130 Q_PROPERTY(QString author READ author WRITE setAuthor NOTIFY authorChanged)
131 Q_PROPERTY(QString keywords READ keywords WRITE setKeywords NOTIFY keywordsChanged)
132 Q_PROPERTY(QString subject READ subject WRITE setSubject NOTIFY subjectChanged)
133 Q_PROPERTY(QString title READ title WRITE setTitle NOTIFY titleChanged)
134 Q_PROPERTY(paperSize pageSize READ pageSize WRITE setPageSize NOTIFY pageSizeChanged)
135 Q_PROPERTY(resolution resolutionOverrideHorizontal READ resolutionOverrideHorizontal WRITE setResolutionOverrideHorizontal NOTIFY resolutionOverrideHorizontalChanged)
136 Q_PROPERTY(resolution resolutionOverrideVertical READ resolutionOverrideVertical WRITE setResolutionOverrideVertical NOTIFY resolutionOverrideVerticalChanged)
137 Q_PROPERTY(bool autoOCR READ autoOCR WRITE setAutoOCR NOTIFY autoOCRChanged)
138 Q_PROPERTY(QStringList autoOCRLanguages READ autoOCRLanguages WRITE setAutoOCRLanguages NOTIFY autoOCRLanguagesChanged)
139
140 public:
147
    // Constructor.
    //   bestCompression: defaults to false. NOTE(review): presumably stored in
    //                    the private member of the same name ("use zopfli
    //                    compression for bitmap graphics") -- confirm in the
    //                    implementation.
    //   parent:          QObject parent, defaults to nullptr.
170 explicit PDFAWriter(bool bestCompression=false, QObject* parent=nullptr);
171
    // Metadata: author string.
175 QString author();
176
    // Sets the author string in the PDF/A meta data.
181 void setAuthor(const QString &author);
182
    // Metadata: keywords.
186 QString keywords();
187
    // Sets the keywords string in the PDF/A meta data.
192 void setKeywords(const QString &keywords);
193
    // Metadata: subject string.
198 QString subject();
199
    // Sets the subject string in the PDF/A meta data.
204 void setSubject(const QString &subject);
205
    // Metadata: title string.
209 QString title();
210
    // Sets the title string in the PDF/A meta data.
215 void setTitle(const QString &title);
216
222
    // Sets the page size, effective for future calls of the addPages() methods.
227 void setPageSize(const paperSize size);
228
234
240
248
254
262
    // Sets the graphic resolution override (horizontal and vertical) for future
    // calls of the addPages() methods.
276 void setResolutionOverride(resolution horizontal, resolution vertical);
277
    // Inline body of the single-argument overload
    // setResolutionOverride(resolution res), whose signature line is not shown
    // in this listing: it forwards 'res' as both the horizontal and the
    // vertical resolution.
283 {
284 setResolutionOverride(res, res);
285 }
286
292
    // Returns the autoOCR setting (see setAutoOCR()).
296 bool autoOCR();
297
    // Specifies if the tesseract OCR engine should be run automatically.
307 void setAutoOCR(bool autoOCR);
308
    // Returns the list of languages used for OCR.
313 QStringList autoOCRLanguages();
314
    // Specifies the languages used by the tesseract OCR engine.
    // NOTE(review): returns a QString, presumably an error message that is
    // empty on success -- confirm in the implementation.
332 QString setAutoOCRLanguages(const QStringList& nOCRLanguages);
333
350
358
364
    // Adds an image to the PDF document.
    //   warnings: optional pointer, defaults to a null pointer (0).
    // NOTE(review): the QString return value is presumably an error message
    // (empty on success) and 'warnings' presumably collects non-fatal
    // messages -- confirm in the implementation. The same applies to the two
    // overloads below.
398 QString addPages(const QImage &image, QStringList *warnings=0);
399
    // Adds JBIG2 images to the PDF document.
419 QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0);
420
    // Adds images, read from the file 'imageFileName', to the PDF document.
489 QString addPages(const QString &imageFileName, QStringList *warnings=0);
490
    // Conversion to QByteArray.
    // NOTE(review): presumably yields the complete PDF file contents -- confirm
    // in the implementation.
502 operator QByteArray();
503
504 public slots:
517
518 signals:
521
524
527
530
533
536
539
542
545
    // Emitted just before waitForWorkerThreads() returns.
553 void finished();
554
    // Progress indicator.
567 void progress(qreal percentage);
568
569 private:
570 // Meta data
571 QString _author, _keywords, _subject, _title;
572
573 // Paper size
574 paperSize _pageSize;
575
576 // HOCR Document
577 HOCRDocument userSpecifiedOCRData;
578 QStringList OCRLanguages;
579 bool _autoOCR;
580
581 // Override resolutions
582 resolution horizontalResolutionOverride;
583 resolution verticalResolutionOverride;
584
585 // This private method adds a JBIG2 image to the PDF document. It differs from
586 // the generic method addPages() only in its arguments: it expects the name of
587 // a JBIG2 file instead of an arbitrary graphics file.
588 //
589 // The image will be embedded in the PDF without re-encoding. The method does
590 // not check in detail if the file complies with the JBIG2 standard. If
591 // invalid input data is fed into this method, then the resulting PDF file
592 // might not comply with the PDF/A standard.
593 QString addJBIG2(const QString &fileName, QStringList *warnings=0);
594
595 // This private method adds a JPEG image to the PDF document. It differs from
596 // the generic method addPages() only in its arguments: it expects the name of
597 // a JPEG file instead of an arbitrary graphics file.
598 //
599 // The image will be embedded in the PDF without re-encoding. The method does
600 // not check in detail if the file complies with the JPEG standard. If
601 // invalid input data is fed into this method, then the resulting PDF file
602 // might not comply with the PDF/A standard.
603 QString addJPEG(const QString &fileName);
604
605 // This private method adds a JPEG2000 (ISO/IEC 15444-2) image to the PDF
606 // document. The method expects a JPX or JPF file, and NOT a JP2 file. It
607 // differs from the generic method addPages() only in its arguments: it
608 // expects the name of a JPEG2000 file instead of an arbitrary graphics file.
609 //
610 // The image will be embedded in the PDF without re-encoding. The method does
611 // not check in detail if the file complies with the JPEG2000 standard. If
612 // invalid input data is fed into this method, then the resulting PDF file
613 // might not comply with the PDF/A standard.
614 QString addJPX(const QString &fileName);
615
616 // This private method adds a TIFF image to the PDF document. The method
617 // exists because QImageReader cannot handle multi-page TIFF files. The method
618 // reads all images contained in the file, and calls addImage() to add them to
619 // the PDF.
    // NOTE(review): no addImage() is declared in this listing; possibly
    // addPages(const QImage&, ...) is meant -- confirm.
620 QString addTIFF(const QString &fileName);
621
622 // This private method is used internally to generate a page containing a
623 // given graphicObject, and optionally a text overlay. This method assumes
624 // that the arguments have been checked and are correct. It also assumes that
625 // the PDFAWriter has been locked for writing.
626 void addGFXPage(quint32 graphicObjectIndex, const imageInfo& bInfo, const QImage& imageForOCR = QImage());
627
628 // Lock used to provide thread-safety
629 QReadWriteLock lock;
630
631 // PDF protoObject. This is either a QByteArray or QFuture<QByteArray>.
    // It holds the bytes of one PDF object, either already available in 'data'
    // or still pending in 'future'. Converting a protoObject to QByteArray
    // resolves a pending future (see the conversion operator below).
632 class protoObject {
633 public:
    // Implicit conversion from bytes that are already available. 'future' is
    // left default-constructed.
634 // cppcheck-suppress noExplicitConstructor
635 protoObject(QByteArray _data) : data(_data) {
636 ;
637 };
638
    // Implicit conversion from a future that will deliver the bytes. 'data' is
    // left default-constructed until the conversion operator runs.
639 // cppcheck-suppress noExplicitConstructor
640 protoObject(QFuture<QByteArray> _future) : future(_future) {
641 ;
642 };
643
    // Returns the object's bytes. If 'future' does not report isCanceled(), its
    // result() is stored in 'data' and 'future' is replaced by a
    // default-constructed QFuture; otherwise 'data' is returned unchanged.
    // NOTE(review): this relies on a default-constructed QFuture reporting
    // isCanceled() == true, and on result() waiting until the result is
    // available -- both per the Qt documentation; confirm for the Qt version
    // in use.
644 inline operator QByteArray() {
645 if (!future.isCanceled()) {
646 data = future.result();
647 future = QFuture<QByteArray>();
648 }
649 return data;
650 };
651
    // Free-text description; not set by either constructor.
652 QString description;
    // Bytes of the object, valid once available (see the conversion operator).
653 QByteArray data;
    // Pending computation of the bytes, if any.
654 QFuture<QByteArray> future;
655 };
656
657 // List of PDF objects
658 QList<protoObject> objects;
659
660 // Index of the PDF object in the 'objects' list that contains …
661 quint32 catalogObjectIndex; // … the catalog of the PDF file
662 quint32 metaDataObjectIndex; // … the meta data
663 quint32 infoObjectIndex; // … the info object
664 quint32 pageDirectoryObjectIndex; // … the page directory
665 quint32 colorProfileObjectIndex; // … the color profile
666 quint32 fontObjectIndex; // … the font object itself
667
668 // Use zopfli compression for bitmap graphics
669 bool bestCompression;
670
671 // Indices of the PDF page objects in the 'objects' list
672 QList<quint32> pageIndices;
673
674 // Reads file content into QByteArray
675 static QByteArray readFile(const QString& fileName);
676
677 // Constructs a page directory object
678 QByteArray generatePageDirectoryObject() const;
679
680 // Takes data from input, checks if zlib compression actually shrinks the
681 // data, and then generates a stream object, either unencoded or zlib encoded.
682 static QByteArray generateStreamObject(const QByteArray &input);
683
684 // Returns the index of a font object for Times-Roman. Creates the object, if necessary
685 quint32 getFontObjectIndex();
686
687 // Assumes that the image is black-and-white, as returned by
688 // imageOperations::optimizedFormat(), and returns a QByteArray containing a
689 // PDF object containing the FAX G4 compressed image.
690 static QByteArray createImageObject_bw_G4(const QImage &image);
691
692 // Assumes that the image is bitonal, as returned by
693 // imageOperations::optimizedFormat(), and returns a QByteArray containing a
694 // PDF object containing the FAX G4 compressed image.
695 static QByteArray createImageObject_bitonal_G4(const QImage &image);
696
697 // Assumes that the image is grayscale, as returned by
698 // imageOperations::optimizedFormat(), and returns a QByteArray containing a
699 // PDF object containing the zlib/zopfli compressed image.
700 static QByteArray createImageObject_gray_zlib(const QImage &image, bool bestCompression);
701
702 // Assumes that the image has an indexed palette, as returned by
703 // imageOperations::optimizedFormat(), and returns a QByteArray containing a
704 // PDF object containing the zlib/zopfli compressed image.
705 static QByteArray createImageObject_indexed_zlib(const QImage &image, bool bestCompression);
706
707 // Assumes that the image is full color, as returned by
708 // imageOperations::optimizedFormat(), and returns a QByteArray containing a
709 // PDF object containing the zlib/zopfli compressed image.
710 static QByteArray createImageObject_rgb_zlib(const QImage &image, bool bestCompression);
711
712 // Internal method. The method takes a page content stream and generates a
713 // well-compressed pageContent object, using the textBox to create a text
714 // overlay.
715 static QByteArray completePageContentObject_a(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const HOCRTextBox& textBox);
716
717 // Internal method. The method runs the tesseract OCR engine to create a
718 // HOCRTextBox and then calls completePageContentObject_a
719 static QByteArray completePageContentObject_b(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const QImage& image, const QStringList& OCRLanguages);
720};
721
722#endif
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition systems.
Text box, as defined in an HOCR file.
Definition HOCRTextBox.h:45
Reads, writes and renders JBIG2 files, and chops them into pieces for inclusion into a PDF document.
Simple generator for PDF/A-2b compliant documents.
Definition PDFAWriter.h:128
void setResolutionOverride(resolution horizontal, resolution vertical)
Sets graphic resolution for future calls of the methods addPages()
void setSubject(const QString &subject)
Set the subject string in the PDF/A meta data.
void setResolutionOverrideVertical(resolution vertical)
Set vertical resolution.
void setPageSize(const paperSize size)
Sets page size, effective for future calls of the methods addPages()
void subjectChanged()
Emitted when subject changes.
QStringList autoOCRLanguages()
List of languages used for OCR.
void waitForWorkerThreads()
Waits for all worker threads to finish.
void setResolutionOverrideHorizontal(resolution horizontal)
Set horizontal resolution.
void pageSizeChanged()
Emitted when pageSize changes.
void resolutionOverrideVerticalChanged()
Emitted when resolutionOverrideVertical changes.
void progress(qreal percentage)
Progress indicator.
void setAutoOCR(bool autoOCR)
Specify if the tesseract OCR engine should be run automatically.
void clearOCRData()
Delete all pages from the internal HOCRDocument.
void resolutionOverrideHorizontalChanged()
Emitted when resolutionOverrideHorizontal changes.
~PDFAWriter()
Destructor.
void autoOCRLanguagesChanged()
Emitted when autoOCRLanguages change.
void setResolutionOverride(resolution res)
Overloaded method that sets horizontal and vertical resolution to the same value.
Definition PDFAWriter.h:282
paperSize pageSize()
Page Size.
QString title()
Metadata: Title String.
PDFAWriter(bool bestCompression=false, QObject *parent=nullptr)
Constructor.
void titleChanged()
Emitted when title changes.
void setPageSize(paperSize::format size=paperSize::empty)
Sets page size, effective for future calls of the methods addPages()
resolution resolutionOverrideHorizontal()
Horizontal resolution.
QString addPages(const QString &imageFileName, QStringList *warnings=0)
Add images to the PDF document.
void setTitle(const QString &title)
Set the title string in the PDF/A meta data.
void finished()
Emitted just before waitForWorkerThreads() returns.
QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0)
Add JBIG2 images to the PDF document.
QString keywords()
Metadata: Keywords.
bool autoOCR()
AutoOCR.
void authorChanged()
Emitted when author changes.
void setAuthor(const QString &author)
Set the author string in the PDF/A meta data.
HOCRDocument OCRData()
Return a copy of the internal HOCRDocument.
QString addPages(const QImage &image, QStringList *warnings=0)
Add an image to the PDF document.
void autoOCRChanged()
Emitted when autoOCR changes.
QString author()
Metadata: Author.
QString setAutoOCRLanguages(const QStringList &nOCRLanguages)
Specify languages used by the tesseract OCR engine.
void keywordsChanged()
Emitted when keywords change.
QString subject()
Metadata: Subject string.
void clearResolutionOverride()
Set horizontal and vertical override resolution to zero.
Definition PDFAWriter.h:288
resolution resolutionOverrideVertical()
Vertical resolution.
void setKeywords(const QString &keywords)
Set the keywords string in the PDF/A meta data.
void appendToOCRData(const HOCRDocument &doc)
Specify pre-processed OCR data.
Trivial class to store elementary info about bitmap graphics.
Definition imageInfo.h:31
The length stores a length and converts between units.
Definition length.h:38
The paperSize class identifies and stores paper sizes.
Definition paperSize.h:32
format
List of supported standard sizes.
Definition paperSize.h:35
@ empty
0x0mm
Definition paperSize.h:38
The resolution class stores a resolution and converts between units.
Definition resolution.h:40