用js解析html字符串
目标用js将html字符串解析为一个类似于虚拟dom的对象
const htmlStr = `<html>
<head></head>
<body>
<h1>我是标签</h1>
<div>我是div标签</div>
<span id="root" style="color:red">我是span标签</span>
</body>
</html>`;
htmlTransform(htmlStr);
// 期望结果格式:
// { nodeName: 'html', children: [ ...,{ nodeName: 'body', id: 'xxxx', }, .... ] }
开发的htmlstr-parser-n插件
npm i htmlstr-parser-n 使用
const { htmlObjParser, htmlStrParser } = require("html-parser-n");
const fs = require("fs");
fs.writeFileSync('./demo.json', JSON.stringify(
htmlStrParser(`
<html>
<body>
<span id="root" style="color:red;">我是span标签</span>
</body>
</html>
`)
))
console.log(htmlObjParser(require("./demo.json")))
实现原理
- 状态机记录执行状态
let sign_enum = {
SIGN_END: "SIGN_END", // 结束标签读取 如 </xxxxx>
SIGN_END_OK: "SIGN_EN_OK", // 结束标签读取完成
SIGN_START: "SIGN_START", // 开始标签读取 如 <xxxxx>
SIGN_START_OK: "SIGN_START_OK", // 开始标签读取完成
};
- 字符串轮训读取,根据特殊符号< 、</、>来标注状态
- 标记每次读取的内容 sign
- 用浅拷贝来标记每次操作的节点
完整代码
let sign_enum = {
SIGN_END: "SIGN_END", // 结束标签读取 如 </xxxxx>
SIGN_END_OK: "SIGN_EN_OK", // 结束标签读取完成
SIGN_START: "SIGN_START", // 开始标签读取 如 <xxxxx>
SIGN_START_OK: "SIGN_START_OK", // 开始标签读取完成
};
function htmlStrParser(htmlStr) {
const str = htmlStr.replace(/\n/g, "");
let result = { nodeName: "root", children: [] };
// 默认 result.children[0]插入, ,这里记录调试用的栈信息
let use_line = [0];
let current_index = 0; // 记录当前插入children的下标
let node = result; // 当前操作的节点
let sign = ""; // 标记标签字符串(可能包含属性字符)、文本信息
let status = ""; // 当前状态,为空的时候我们认为是在读取当前节点(node)的文本信息
for (var i = 0; i < str.length; i++) {
var current = str.charAt(i);
var next = str.charAt(i + 1);
if (current === "<") {
// 在开始标签完成后记录文本信息到当前节点
if (sign && status === sign_enum.SIGN_START_OK) {
node.text = sign;
sign = "";
}
// 根据“</”来区分是 结束标签的(</xxx>)读取中 还是开始的标签(<xxx>) 读取中
if (next === "/") {
status = sign_enum.SIGN_END;
} else {
status = sign_enum.SIGN_START;
}
} else if (current === ">") {
// (<xxx>) 读取中,遇到“>”, (<xxx>) 读取中完成
if (status === sign_enum.SIGN_START) {
// 记录当前node所在的位置,并更改node
node = result;
use_line.map((_, index) => {
if (!node.children) node.children = [];
if (index === use_line.length - 1) {
sign = sign.replace(/^\s*/g, "").replace(/\"/g, "");
let mark = sign.match(/^[a-zA-Z0-9]*\s*/)[0].replace(/\s/g, ""); // 记录标签
// 标签上定义的属性获取
let attributeStr = sign.replace(mark, '').replace(/\s+/g, ",").split(",");
let attrbuteObj = {};
let style = {};
attributeStr.map(attr => {
if (attr) {
let value = attr.split("=")[1];
let key = attr.split("=")[0];
if (key === "style") {
value.split(";").map(s => {
if (s) {
style[s.split(":")[0]] = s.split(":")[1]
}
})
return attrbuteObj[key] = style;
}
attrbuteObj[key] = value;
}
})
node.children.push({ nodeName: mark, children: [], ...attrbuteObj })
}
current_index = node.children.length - 1;
node = node.children[current_index];
});
use_line.push(current_index);
sign = "";
status = sign_enum.SIGN_START_OK;
}
// (</xxx>) 读取中,遇到“>”, (</xxx>) 读取中完成
if (status === sign_enum.SIGN_END) {
use_line.pop();
node = result;
// 重新寻找操作的node
use_line.map((i) => {
node = node.children[i];
});
sign = "";
status = sign_enum.SIGN_END_OK;
}
} else {
sign = sign + current;
}
}
return result;
}
console.dir(htmlStrParser(htmlStr))
fs.writeFileSync("htmlObj.text", JSON.stringify(htmlStrParser(htmlStr)))
格式化查看
{
"nodeName":"root",
"children":[
{
"nodeName":"html",
"children":[
{
"nodeName":"head",
"children":[]
},
{
"nodeName":"body",
"children":[
{
"nodeName":"h1",
"children":[],
"text":"我是标签"
},
{
"nodeName":"div",
"children":[],
"text":"我是div标签"
},
{
"nodeName":"span",
"children":[],
"id":"root",
"style":{
"color":"red"
},
"text":"我是span标签"
}
],
"text":" "
}
]
}
]
}
用js解析html对象
实现html的增删查改可以先转成对象数组的形式,然后操作对象数组,操作完成后再转成字符串
function htmlObjParser(obj) {
let htmlStr = "";
function work(obj) {
const children = obj.children;
let attrStr = "";
Object.keys(obj).map(key => {
if (key !== 'nodeName' && key !== 'text' && key !== "children") {
if (key !== 'style') {
attrStr += ` ${key}=${obj[key]}`
} else if (key === 'style') {
let styleStr = '';
Object.keys(obj[key]).map(k => {
styleStr += ` ${k}:${obj[key][k]};`
})
attrStr += styleStr;
}
}
})
htmlStr += `<${obj.nodeName}${attrStr}>${obj.text ? obj.text : ''}`;
if (children && children.length) {
children.map(c => {
work(c)
});
}
htmlStr += `</${obj.nodeName}>`;
}
work(obj);
return htmlStr;
}
htmlObjParser(require("demo.text"))