Lightweight Full-Text Indexing and Searching Library.
npm install ndxLightweight Full-Text Indexing and Searching Library.
- Multiple fields full-text indexing and searching.
- Per-field score boosting.
- BM25 ranking function to rank
matching documents.
- Trie based dynamic
Inverted Index.
- Configurable tokenizer and term filter.
- Free text queries with query expansion.
- Small memory footprint.
``js
import { createIndex, indexAdd } from "ndx";
import { indexQuery } from "ndx/query";
const termFilter = (term) => term.toLowerCase();
function createDocumentIndex(fields) {
// createIndex() creates an index data structure.fieldGetters
// First argument specifies how many different fields we want to index.
const index = createIndex(
fields.length,
// Tokenizer is a function that breaks text into words, phrases, symbols,
// or other meaningful elements called tokens.
(s) => s.split(" "),
// Filter is a function that processes tokens and returns terms, terms are
// used in Inverted Index to index documents.
termFilter,
);
// is an array with functions that will be used to retrievefieldBoostFactors
// data from different fields.
const fieldGetters = fields.map((f) => (doc) => doc[f.name]);
// is an array of boost factors for each field, in this
// example all fields will have identical weight.
const fieldBoostFactors = fields.map(() => 1);
return {
index,
// add() will add documents to the index.remove()
add(doc) {
indexAdd(
index,
fieldGetters,
// Docum ent key, it can be an unique document id or a refernce to a
// document if you want to store all documents in memory.
doc.id,
// Document.
doc,
);
},
// will remove documents from the index.indexVacuum()
remove(id) {
// When document is removed we are just marking document id as being
// removed. Index data structure still contains references to the removed
// document.
indexRemove(index, removed, id);
if (removed.size > 10) {
// removes all references to removed documents from the
// index.
indexVacuum(index, removed);
}
},
// search() will be used to perform queries.
search(q) {
return indexQuery(
index,
fieldBoostFactors,
// BM25 ranking function constants:
// BM25 k1 constant, controls non-linear term frequency normalization
// (saturation).
1.2,
// BM25 b constant, controls to what degree document length normalizes
// tf values.
0.75,
q,
);
}
};
}
// Create a document index that will index content field.
const index = createDocumentIndex([{ name: "content" }]);
const docs = [
{
"id": "1",
"content": "Lorem ipsum dolor",
},
{
"id": "2",
"content": "Lorem ipsum",
}
];
// Add documents to the index.
docs.forEach((d) => { index.add(d); });
// Perform a search query.
index.search("Lorem");
// => [{ key: "2" , score: ... }, { key: "1", score: ... } ]
//
// document with an id "2" is ranked higher because it has a "content""1"
// field with a less number of terms than document with an id .
index.search("dolor");
// => [{ key: "1", score: ... }]
`
ndx` library doesn't provide any tokenizers or filters. There are other
libraries that implement tokenizers, for example
Natural has a good collection of
tokenizers and stemmers.