transform stream to tokenize html

``` js
var fs = require('fs');
var tokenize = require('html-tokenize');
var through = require('through2');

fs.createReadStream(__dirname + '/table.html')
    .pipe(tokenize())
    .pipe(through.obj(function (row, enc, next) {
        row[1] = row[1].toString();
        console.log(row);
        next();
    }))
;
```
this html:

``` html
<table>
  <tr>blah blah blah</tr>
  <tr><td>there</td></tr>
  <tr><td>it</td></tr>
  <tr><td>is</td></tr>
</table>
```

generates this output:
```
[ 'open', '<table>' ]
[ 'text', '\n  ' ]
[ 'open', '<tr>' ]
[ 'text', 'blah blah blah' ]
[ 'close', '</tr>' ]
[ 'text', '\n  ' ]
[ 'open', '<tr>' ]
[ 'open', '<td>' ]
[ 'text', 'there' ]
[ 'close', '</td>' ]
[ 'close', '</tr>' ]
[ 'text', '\n  ' ]
[ 'open', '<tr>' ]
[ 'open', '<td>' ]
[ 'text', 'it' ]
[ 'close', '</td>' ]
[ 'close', '</tr>' ]
[ 'text', '\n  ' ]
[ 'open', '<tr>' ]
[ 'open', '<td>' ]
[ 'text', 'is' ]
[ 'close', '</td>' ]
[ 'close', '</tr>' ]
[ 'text', '\n' ]
[ 'close', '</table>' ]
[ 'text', '\n' ]
```
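
Since every row is tagged with its name, filtering on that name is enough to do useful things with the stream. As a small sketch (not part of the module itself), this keeps only the 'text' rows to pull the text content out of the same file:

``` js
var fs = require('fs');
var tokenize = require('html-tokenize');
var through = require('through2');

fs.createReadStream(__dirname + '/table.html')
    .pipe(tokenize())
    .pipe(through.obj(function (row, enc, next) {
        // keep only the text between tags, skipping 'open' and 'close' rows
        if (row[0] === 'text') this.push(row[1]);
        next();
    }))
    .pipe(process.stdout)
;
```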
``` js
var tokenize = require('html-tokenize');
var t = tokenize();
```

tokenize() returns a transform stream t that takes html input and produces rows
of output. The output rows are of the form:
* [ name, buffer ]
The input stream maps completely onto the buffers from the object stream.
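
One consequence of that property: concatenating every buffer reproduces the input byte-for-byte. A minimal sketch of that round trip:

``` js
var fs = require('fs');
var tokenize = require('html-tokenize');
var through = require('through2');

var parts = [];
fs.createReadStream(__dirname + '/table.html')
    .pipe(tokenize())
    .pipe(through.obj(function (row, enc, next) {
        parts.push(row[1]); // each buffer is a verbatim slice of the input
        next();
    }, function (end) {
        // same bytes as table.html
        process.stdout.write(Buffer.concat(parts));
        end();
    }))
;
```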
The types of names are:
* open
* close
* text
cdata, comments, and scripts all use 'open' with their contents appearing in
subsequent 'text' rows.
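
Because t is an ordinary transform stream, you can also write a string straight into it instead of piping from a file. A sketch of that (the exact rows emitted are an assumption based on the description above; the script body should arrive as 'text' rows after the script's 'open' row rather than being parsed for tags):

``` js
var tokenize = require('html-tokenize');
var through = require('through2');

var t = tokenize();
t.pipe(through.obj(function (row, enc, next) {
    console.log([ row[0], row[1].toString() ]);
    next();
}));

// the '<' inside the script body is part of a 'text' row, not a new tag
t.end('<script>if (x < 2) go();</script>');
```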
There is an html-tokenize command too.
```
usage: html-tokenize {FILE}

Tokenize FILE into newline-separated json arrays for each tag.
If FILE is not specified, use stdin.
```
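
For example, either of these tokenizes table.html, the second by way of stdin:

```
html-tokenize table.html
cat table.html | html-tokenize
```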
With npm, to get the library do:

```
npm install html-tokenize
```

or to get the command do:

```
npm install -g html-tokenize
```

MIT