XML Stream parser (native js - no dependency)




A simple js parser for XML for nodejs and the browser.
It smoothly handles line breaks, spaces, chunks (from a stream), namespaces...
It is fast and optimized for Buffer or Uint8Array of encoded strings (utf8) but will take strings too.
Parses some (or all) XML elements. Uses the tokenizer below but facilitates state management (in between 2 writes).
To get started.
#### Example 1
```javascript
const aString = '<a id="a0">some text a0</a>'
const docParser = new DocumentParser(new Tokenizer())
// parsing the whole document (this might not be the best use case for this library)
docParser.onRoot(XmlElementParser()) // parser preserving child node order
docParser.write(aString)
const a = docParser.next() // XmlElement
a.toString() // produces back this xml string
a.getAttribute('id') === 'a0'
```
#### Example 2
```javascript
const a0String = '<a id="a0"><aa id="aa0"/></a>'
const a1String = '<a id="a1"><aa id="aa1"/></a>'
const b0String = '<b id="b0">some text b0</b>'
const docParser = new DocumentParser(new Tokenizer())
docParser.on('root/a', XmlToObject())
docParser.on('root/b', XmlElementParser())
docParser.write(`<root>${a0String}${a1String}${b0String}</root>`)
const a0 = docParser.next() // object
const a1 = docParser.next() // object
const b = docParser.next() // XmlElement
const u = docParser.next() // undefined
a0.id === 'a0'
a0.aa[0].id === 'aa0'
```
#### Example 3
```javascript
const xmlStr =
  '<root><a title="...">some text a0<aa id="aa00" label="item aa00"/><aa id="aa01" label="item aa01"/></a></root>'
const docParser = new DocumentParser(new Tokenizer())
docParser.on('root/a', {
  onStart(startTag) {
    // object representing docParser.next()
    return {
      title: startTag.getAttribute('title'),
      items: [],
    }
  },
  onText(text, a) {
    if (!a.firstText) a.firstText = text.textContent.trim()
  },
  onEnd: (a) => a, // this is the object returned by aa
  onChild(startTag) {
    switch (startTag.tagName) {
      case 'aa': {
        // returning a new parser for 'aa'
        return {
          onStart(startTag, parentCtx) {
            parentCtx.items.push({
              name: startTag.getAttribute('id'),
              label: startTag.getAttribute('label'),
              type: 'aa',
            })
            return false // skipping child nodes of aa
          },
        }
      }
      // only interested in children of `a`
      default:
        return false
    }
  },
})
docParser.write(xmlStr)
console.log(docParser.next())
// logs:
{
  title: '...',
  firstText: 'some text a0',
  items: [
    { type: 'aa', name: 'aa00', label: 'item aa00' },
    { type: 'aa', name: 'aa01', label: 'item aa01' },
  ],
}
```
```javascript
import { Tokenizer } from '@jerp/xml-stream-js'
const tokenizer = new Tokenizer()
let token // any of StartTag | EndTag | Text | CDATA | undefined (undefined meaning end-of-chunk)
const tokens = [] // collected tokens
try {
  // write the first chunk of the xml string
  tokenizer.write('<a><b b1="value b1">some text for the tokeniz')
  while ((token = tokenizer.nextToken())) {
    tokens.push(token)
  }
  // write the last chunk of the xml string
  tokenizer.write('er</b></a>')
  while ((token = tokenizer.nextToken())) {
    tokens.push(token)
  }
} catch (e) {
  // will not happen in this case, but will if xml string is corrupted
}
tokens[0].tagName === 'a'
tokens[1].getAttribute('b1') === 'value b1'
tokens.join('') === '<a><b b1="value b1">some text for the tokenizer</b></a>'
tokenizer.exhausted === true // the whole string has been consumed
tokens.join('') // === '<a><b b1="value b1">some text for the tokenizer</b></a>'
```