SimHash implementation for detecting near-duplicate text, with a choice of underlying hash function: SipHash, MD5, or SHA256.
```bash
npm install @counterrealist/simhash
# or, using the short alias:
npm i @counterrealist/simhash
```
```typescript
import {SimHash, BitArray, HashFunction} from "@counterrealist/simhash"
const simHash = new SimHash({
  ngramSize: 3, // Default to 3
  hashFunction: HashFunction.SIPHASH // Default to SIPHASH, Options: SIPHASH, MD5, SHA256
});
const text1: string = "Hello, world!";
const text2: string = "Hell's world";
const text1_bitarray: BitArray = simHash.compute_bitarray(text1); // [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...]
const text1_buffer: Buffer = simHash.compute_buffer(text1); // <Buffer 5f 4c d1 d8 77 30 f4 e5 fd af ec b7 58 c7 9c 5b>
const text1_hex: string = simHash.compute_hex(text1); // 5f4cd1d87730f4e5fdafecb758c79c5b
const text2_bitarray: BitArray = simHash.compute_bitarray(text2); // [0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, ...]
const text2_buffer: Buffer = simHash.compute_buffer(text2); // <Buffer 19 64 42 58 f7 48 04 e4 5d 03 0c 2f 50 d5 8e 4d>
const text2_hex: string = simHash.compute_hex(text2); // 19644258f74804e45d030c2f50d58e4d
const textSimilarity: number = simHash.similarity(text1, text2); // 0.6875
const textSimilarityFromHex: number = simHash.similarityFromHex(text1_hex, text2_hex); // 0.6875
const textSimilarityFromBuffers: number = simHash.similarityFromBuffers(text1_buffer, text2_buffer); // 0.6875
const textSimilarityFromBits: number = simHash.similarityFromBits(text1_bitarray, text2_bitarray); // 0.6875
```