A simple and powerful Node.js PDF text extractor
## Installation
```bash
npm install node-pdf-extractor
```
Or install globally for CLI usage:
```bash
npm install -g node-pdf-extractor
```
## Usage
### Basic Usage
```javascript
const { extractText, extractFromPath, PDFExtractor } = require('node-pdf-extractor');
// Simple text extraction
const text = await extractText('document.pdf');
console.log(text);
// Full extraction with metadata
const result = await extractFromPath('document.pdf');
console.log(result.text); // Extracted text
console.log(result.numPages); // Number of pages
console.log(result.info); // PDF info (title, author, etc.)
// Using the class
const extractor = new PDFExtractor();
const data = await extractor.extract('document.pdf');
console.log(data.text);
```
### Extracting from a Buffer
```javascript
const fs = require('fs');
const { extractFromBuffer } = require('node-pdf-extractor');
const buffer = fs.readFileSync('document.pdf');
const result = await extractFromBuffer(buffer);
console.log(result.text);
```
### Express.js File Upload
```javascript
const express = require('express');
const multer = require('multer');
const { extractFromBuffer } = require('node-pdf-extractor');
const app = express();
const upload = multer({ storage: multer.memoryStorage() });
app.post('/extract', upload.single('pdf'), async (req, res) => {
  try {
    const result = await extractFromBuffer(req.file.buffer);
    res.json(result);
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
app.listen(3000);
```
### CLI Usage
```bash
# Extract and print to console
pdf-extract document.pdf
# Extract and save to file
pdf-extract document.pdf output.txt
```
## API Reference
### extractText(input, options)
Returns just the text string from a PDF.
- input - File path (string) or Buffer
- options - Optional parsing options
### extractFromPath(filePath)
Extracts text and metadata from a PDF file.
- filePath - Path to the PDF file
- Returns: { text, numPages, info, metadata, version }
### extractFromBuffer(buffer)
Extracts text and metadata from a PDF buffer.
- buffer - PDF file as Buffer
- Returns: { text, numPages, info, metadata, version }
### Extracting specific pages
Extracts text from specific pages.
- input - File path or Buffer
- startPage - Starting page (1-indexed)
- endPage - Ending page (1-indexed)
### Saving text to a file
Saves text to a file.
- text - Text content to save
- outputPath - Output file path
### PDFExtractor class
OOP interface with the same methods:
- extract(filePath) - Extract from path
- extractBuffer(buffer) - Extract from buffer
- getText(input) - Get text only
- getPages(input, start, end) - Extract specific pages
- save(text, path) - Save to file