Pure javascript cross-platform module to extract text from PDFs.
Pure JavaScript cross-platform module to extract text from PDFs.




This repository is a fork of https://gitlab.com/autokent/pdf-parse. The original is no longer maintained and has multiple issues; see https://gitlab.com/autokent/pdf-parse/-/issues/24. This fork aims to patch these issues and republish the package.
Similar packages:

- pdf2json: buggy, no longer supported, leaks memory, throws non-catchable fatal errors
- j-pdfjson: fork of pdf2json
- pdf-parser: buggy, no tests
- pdfreader: uses pdf2json
- pdf-extract: not cross-platform, uses xpdf
npm install pdf-parse-fork
```js
const fs = require("fs");
const pdf = require("pdf-parse-fork");

let dataBuffer = fs.readFileSync("path to PDF file...");

pdf(dataBuffer).then(function (data) {
  // number of pages
  console.log(data.numpages);
  // number of rendered pages
  console.log(data.numrender);
  // PDF info
  console.log(data.info);
  // PDF metadata
  console.log(data.metadata);
  // PDF.js version
  // check https://mozilla.github.io/pdf.js/getting_started/
  console.log(data.version);
  // PDF text
  console.log(data.text);
});
```
You can also use crawler-request, which uses pdf-parse.
Handle parsing errors by attaching a `.catch` handler:

```js
const fs = require("fs");
const pdf = require("pdf-parse-fork");

let dataBuffer = fs.readFileSync("path to PDF file...");

pdf(dataBuffer)
  .then(function (data) {
    // use data
  })
  .catch(function (error) {
    // handle exceptions
  });
```
- v1.0.9 and above introduce a breaking change to the pagerender callback; see the changelog
- If you need another format like json, you can change page render behaviour with a callback
- Check out https://mozilla.github.io/pdf.js/
```js
// default render callback
function render_page(pageData) {
  // check documents https://mozilla.github.io/pdf.js/
  let render_options = {
    // replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
    normalizeWhitespace: false,
    // do not attempt to combine same line TextItem's. The default value is `false`.
    disableCombineTextItems: false,
  };

  return pageData.getTextContent(render_options).then(function (textContent) {
    let lastY,
      text = "";
    for (let item of textContent.items) {
      if (lastY == item.transform[5] || !lastY) {
        text += item.str;
      } else {
        text += "\n" + item.str;
      }
      lastY = item.transform[5];
    }
    return text;
  });
}

let options = {
  pagerender: render_page,
};

let dataBuffer = fs.readFileSync("path to PDF file...");

pdf(dataBuffer, options).then(function (data) {
  // use new format
});
```
```js
const DEFAULT_OPTIONS = {
  // internal page parser callback
  // you can set this option, if you need another format except raw text
  pagerender: render_page,
  // max page number to parse
  max: 0,
  // check https://mozilla.github.io/pdf.js/getting_started/
  version: "v1.10.100",
};
```
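- `pagerender`: page parser callback. Set it if you need a format other than raw text.
- `max`: maximum number of pages to parse. If the value is less than or equal to 0, the parser renders all pages.
- `version`: the pdf.js version to use; check pdf.js. Supported values: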
- 'default'
- 'v1.9.426'
- 'v1.10.100'
- 'v1.10.88'
- 'v2.0.550'
-
> _default_ version is _v1.10.100_
> mozilla.github.io/pdf.js
- Run `mocha` or `npm test`
- Check test folder and quickstart.js for extra usages.
I use this package actively myself, so it has my top priority. You can chat on WhatsApp about any information, ideas, and suggestions.

If you find a bug or a mistake, you can help by submitting an issue to the GitLab repository.
GitLab calls it merge request instead of pull request.
- A Guide for First-Timers
- How to create a merge request
- Check Contributing Guide
MIT licensed, and all its dependencies are MIT or BSD licensed.