node-pdf-extractor

A simple and powerful Node.js PDF text extractor.

Installation

bash

npm install node-pdf-extractor





Or install globally for CLI usage:

bash

npm install -g node-pdf-extractor





Usage



Basic usage

javascript

const { extractText, extractFromPath, PDFExtractor } = require('node-pdf-extractor');



// Simple text extraction

const text = await extractText('document.pdf');

console.log(text);



// Full extraction with metadata

const result = await extractFromPath('document.pdf');

console.log(result.text);       // Extracted text

console.log(result.numPages);   // Number of pages

console.log(result.info);       // PDF info (title, author, etc.)



// Using the class

const extractor = new PDFExtractor();

const data = await extractor.extract('document.pdf');

console.log(data.text);

Extracting from a buffer

javascript

const fs = require('fs');

const { extractFromBuffer } = require('node-pdf-extractor');



const buffer = fs.readFileSync('document.pdf');

const result = await extractFromBuffer(buffer);

console.log(result.text);

Using with Express

javascript

const express = require('express');

const multer = require('multer');

const { extractFromBuffer } = require('node-pdf-extractor');



const app = express();

const upload = multer({ storage: multer.memoryStorage() });



app.post('/extract', upload.single('pdf'), async (req, res) => {

    try {

        const result = await extractFromBuffer(req.file.buffer);

        res.json(result);

    } catch (error) {

        res.status(500).json({ error: error.message });

    }

});



app.listen(3000);

CLI usage

bash

# Extract and print to console

pdf-extract document.pdf



# Extract and save to file

pdf-extract document.pdf output.txt





API Reference



extractText(input, options)

Returns just the text string from a PDF.

-

input

 - File path (string) or Buffer

-

options

 - Optional parsing options



extractFromPath(filePath)

Extracts text and metadata from a PDF file.

-

filePath

 - Path to the PDF file

- Returns:

{ text, numPages, info, metadata, version }





extractFromBuffer(buffer)

Extracts text and metadata from a PDF buffer.

-

buffer

 - PDF file as Buffer

- Returns:

{ text, numPages, info, metadata, version }





Page-range extraction (input, startPage, endPage)

Extracts text from specific pages.

-

input

 - File path or Buffer

-

startPage

 - Starting page (1-indexed)

-

endPage

 - Ending page (1-indexed)



Saving text to a file (text, outputPath)

Saves text to a file.

-

text

 - Text content to save

-

outputPath

 - Output file path



PDFExtractor class

OOP interface with the same methods:

-

extract(filePath)

 - Extract from path

-

extractBuffer(buffer)

 - Extract from buffer

-

getText(input)

 - Get text only

-

getPages(input, start, end)

 - Extract specific pages

-

save(text, path) - Save to file

License

MIT

node-pdf-extractor

A simple and powerful Node.js PDF text extractor.

Installation

bash

npm install node-pdf-extractor





Or install globally for CLI usage:

bash

npm install -g node-pdf-extractor





Usage



Basic usage

javascript

const { extractText, extractFromPath, PDFExtractor } = require('node-pdf-extractor');



// Simple text extraction

const text = await extractText('document.pdf');

console.log(text);



// Full extraction with metadata

const result = await extractFromPath('document.pdf');

console.log(result.text);       // Extracted text

console.log(result.numPages);   // Number of pages

console.log(result.info);       // PDF info (title, author, etc.)



// Using the class

const extractor = new PDFExtractor();

const data = await extractor.extract('document.pdf');

console.log(data.text);

Extracting from a buffer

javascript

const fs = require('fs');

const { extractFromBuffer } = require('node-pdf-extractor');



const buffer = fs.readFileSync('document.pdf');

const result = await extractFromBuffer(buffer);

console.log(result.text);

Using with Express

javascript

const express = require('express');

const multer = require('multer');

const { extractFromBuffer } = require('node-pdf-extractor');



const app = express();

const upload = multer({ storage: multer.memoryStorage() });



app.post('/extract', upload.single('pdf'), async (req, res) => {

    try {

        const result = await extractFromBuffer(req.file.buffer);

        res.json(result);

    } catch (error) {

        res.status(500).json({ error: error.message });

    }

});



app.listen(3000);

CLI usage

bash

# Extract and print to console

pdf-extract document.pdf



# Extract and save to file

pdf-extract document.pdf output.txt





API Reference



extractText(input, options)

Returns just the text string from a PDF.

-

input

 - File path (string) or Buffer

-

options

 - Optional parsing options



extractFromPath(filePath)

Extracts text and metadata from a PDF file.

-

filePath

 - Path to the PDF file

- Returns:

{ text, numPages, info, metadata, version }





extractFromBuffer(buffer)

Extracts text and metadata from a PDF buffer.

-

buffer

 - PDF file as Buffer

- Returns:

{ text, numPages, info, metadata, version }





Page-range extraction (input, startPage, endPage)

Extracts text from specific pages.

-

input

 - File path or Buffer

-

startPage

 - Starting page (1-indexed)

-

endPage

 - Ending page (1-indexed)



Saving text to a file (text, outputPath)

Saves text to a file.

-

text

 - Text content to save

-

outputPath

 - Output file path



PDFExtractor class

OOP interface with the same methods:

-

extract(filePath)

 - Extract from path

-

extractBuffer(buffer)

 - Extract from buffer

-

getText(input)

 - Get text only

-

getPages(input, start, end)

 - Extract specific pages

-

save(text, path) - Save to file

License

MIT