Node.js module for rendering pdf pages to images, svgs, html files, text files and json metadata.
npm install pdf-extractorPdf-extractor is a wrapper around pdf.js to generate
images, svgs, html files, text files and json files from a pdf on node.js.
- Image: A DOM Canvas is used to render and export the graphical layer of the pdf.
Canvas exports .png as a default but can be extended to export to other file types like .jpg.
- SVG: Pdf objects are converted to svg using the
SVGGraphics parser of pdf.js.
- HTML: Pdf text is converted to HTML. This can be used as a (transparent) layer over the image
to enable text selection.
- Text: Pdf text is extracted to a text file for different usages (e.g. indexing the text).
``javascript
const PdfExtractor = require('pdf-extractor').PdfExtractor;
let outputDir = '/path/to/output',
pdfExtractor = new PdfExtractor(outputDir, {
viewportScale: (width, height) => {
//dynamic zoom based on rendering a page to a fixed page size
if (width > height) {
//landscape: 1100px wide
return 1100 / width;
}
//portrait: 800px wide
return 800 / width;
},
pageRange: [1,5],
});
pdfExtractor.parse('/path/to/dummy.pdf').then(function () {
console.log('# End of Document');
}).catch(function (err) {
console.error('Error: ' + err);
});
`
This results in these generated files:
``
info.json
page-1.png
page-2.png
page-3.png
page-4.png
page-5.png
stylesheet.css
text-1.html
text-1.txt
text-2.html
text-2.txt
text-3.html
text-3.txt
text-4.html
text-4.txt
text-5.html
text-5.txt
`javascript
const PdfExtractor = require('pdf-extractor').PdfExtractor;
const CanvasRenderer = require('pdf-extractor').CanvasRenderer;
const SvgRenderer = require('pdf-extractor').SvgRenderer;
const FileWriter = require('pdf-extractor').FileWriter;
class JPGWriter extends FileWriter
{
getFilePathForPage(page) {
return super.getPagePath(page.pageNumber, 'png');
}
writeCanvasPage(page, viewport, canvas) {
return this.writeStreamToFile(canvas.jpgStream(), this.getFilePathForPage(page))
}
}
class JPGCanvasRenderer extends CanvasRenderer
{
getWriters(writerOptions) {
let writers = super.getWriters(writerOptions);
writers.push(new JPGWriter(this.outputDir, writerOptions));
return writers;
}
}
let outputDir = '/path/to/output',
pdfExtractor = new PdfExtractor(outputDir, {
renderers: [
new JPGCanvasRenderer(outputDir, rendererOptions),
new SvgRenderer(outputDir, rendererOptions)
]
});
pdfExtractor.parse('/path/to/dummy.pdf').then(function () {
console.log('# End of Document');
}).catch(function (err) {
console.error('Error: ' + err);
});
``
This adds jpg images to the generated files.