陆君风 2019-07-01
请支持正版
https://time.geekbang.org/col...
字符流 -> 状态机 -> 词token -> 栈 -> dom
构建 DOM 的过程是:从父到子,从先到后,一个一个节点构造,并且挂载到DOM树上
把response拿到的字符流通过状态机解析成一个个的词(token)
词(token)是如何被拆分的
eg:
<p class="a">text text text</p>
状态机
Node 类,所有的节点都会是这个 Node 类的实例。不一样的 HTML 节点对应了不同的 Node 的子类,此处的实现,我们进行简化,只把 Node 分为 Element 和 Text
// Simplified Node subclasses used throughout the notes:
// Element is a container node, Text is a leaf holding character data.
function Element () {
  // Children get attached here while the DOM tree is being built.
  this.childNodes = []
}
function Text (value) {
  // Any falsy input (undefined, null, "") falls back to the empty string.
  this.value = value || ""
}
规则:
构建过程:
(默认:源代码完全遵循 xhtml,HTML 具有很强的容错能力,奥妙在于当 tag end 跟栈顶的 start tag 不匹配的时候如何处理,暂时不考虑)
完整的语法和词法分析代码
词法分析
每一个状态是一个函数,通过“if else”来区分下一个字符做状态迁移。这里所谓的状态迁移,就是当前状态函数返回下一个状态函数。
// Lexer: receives a character stream, drives a state machine, and hands each
// parsed token (tag object or character/reference string) to the syntaxer.
const EOF = void 0 // end-of-input marker (unused in this simplified demo)

// Lexical parser entry point. `syntaxer` must implement receiveInput(token).
function HTMLLexicalParser (syntaxer) {
  let state = data
  let token = null
  let attribute = null
  let characterReference = ''

  this.receiveInput = function (char) {
    if (state == null) {
      throw new Error('there is an error')
    } else {
      // The current state function consumes the char and returns the next state.
      state = state(char)
    }
  }

  this.reset = function () {
    state = data
  }

  // State machine. Each state takes the current character `c` and returns
  // the next state function.
  function data (c) {
    switch (c) {
      case '&':
        return characterReferenceInData
      // tagOpen is the state entered after '<' that decides the tag kind.
      case '<':
        return tagOpen
      // '\0' and EOF handling are intentionally omitted in this demo;
      // every other character is emitted as a one-character token.
      default:
        emitToken(c)
        return data
    }
  }

  // Only well-formed character references are handled. Note the leading '&'
  // is NOT included in the emitted reference (original simplification kept).
  function characterReferenceInData (c) {
    if (c === ';') {
      characterReference += c
      emitToken(characterReference)
      characterReference = ''
      return data
    } else {
      characterReference += c
      return characterReferenceInData
    }
  }

  function tagOpen (c) {
    if (c === '/') {
      return endTagOpen
    }
    if (/[a-zA-Z]/.test(c)) {
      token = new StartTagToken()
      token.name = c.toLowerCase()
      return tagName
    }
    // '?' (bogus comment) intentionally not handled.
    return error(c)
  }

  function tagName (c) {
    if (c === '/') {
      return selfClosingTag
    }
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeName
    }
    if (c === '>') {
      emitToken(token)
      return data
    }
    // FIX: also accept digits so tags like <h1> parse; the original regex
    // only allowed letters, which left the state machine undefined on '1'.
    if (/[a-zA-Z0-9]/.test(c)) {
      token.name += c.toLowerCase()
      return tagName
    }
  }

  function beforeAttributeName (c) {
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeName
    }
    if (c === '/') {
      return selfClosingTag
    }
    if (c === '>') {
      emitToken(token)
      return data
    }
    if (/["'<]/.test(c)) {
      return error(c)
    }
    attribute = new Attribute()
    attribute.name = c.toLowerCase()
    attribute.value = ''
    return attributeName
  }

  function attributeName (c) {
    if (c === '/') {
      token[attribute.name] = attribute.value
      return selfClosingTag
    }
    if (c === '=') {
      return beforeAttributeValue
    }
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeName
    }
    attribute.name += c.toLowerCase()
    return attributeName
  }

  function beforeAttributeValue (c) {
    if (c === '"') {
      return attributeValueDoubleQuoted
    }
    if (c === "'") {
      return attributeValueSingleQuoted
    }
    // FIX: the original regex was /\t \f\n/ (a literal three-char sequence,
    // which can never match one char); it must be a character class to skip
    // a single whitespace character.
    if (/[\t \f\n]/.test(c)) {
      return beforeAttributeValue
    }
    attribute.value += c
    return attributeValueUnquoted
  }

  function attributeValueDoubleQuoted (c) {
    if (c === '"') {
      token[attribute.name] = attribute.value
      return beforeAttributeName
    }
    attribute.value += c
    return attributeValueDoubleQuoted
  }

  function attributeValueSingleQuoted (c) {
    if (c === "'") {
      token[attribute.name] = attribute.value
      return beforeAttributeName
    }
    attribute.value += c
    return attributeValueSingleQuoted
  }

  function attributeValueUnquoted (c) {
    if (/[\t \f\n]/.test(c)) {
      token[attribute.name] = attribute.value
      return beforeAttributeName
    }
    attribute.value += c
    return attributeValueUnquoted
  }

  function selfClosingTag (c) {
    if (c === '>') {
      emitToken(token)
      // FIX: `endToken` was an implicit global in the original (a
      // ReferenceError in strict mode); declare it locally.
      const endToken = new EndTagToken()
      endToken.name = token.name
      emitToken(endToken)
      return data
    }
  }

  function endTagOpen (c) {
    if (/[a-zA-Z]/.test(c)) {
      token = new EndTagToken()
      token.name = c.toLowerCase()
      return tagName
    }
    if (c === '>') {
      return error(c)
    }
  }

  // Forward a finished token to the syntaxer.
  function emitToken (token) {
    syntaxer.receiveInput(token)
  }

  // Error "state": logs and returns undefined, so the next receiveInput throws.
  function error (c) {
    console.log(`warn: unexpected char '${c}'`)
  }
}

class StartTagToken {}
class EndTagToken {}
class Attribute {}

// FIX: guarded so these pasted notes also load where CommonJS `module`
// does not exist (e.g. an ES-module environment).
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { HTMLLexicalParser, StartTagToken, EndTagToken }
}

// Usage demo. In the original this lived in a separate file and did
// require('./lexer'); here the class defined above is used directly.
const testHTML = `<html maaa=a > <head> <title>cool</title> </head> <body> <img src="a" /> </body> </html>`
const dummySyntaxer = {
  receiveInput: (token) => {
    if (typeof token === 'string') {
      console.log(`String(${token.replace(/\n/, '\\n').replace(/ /, '<whitespace>')})`)
    } else {
      console.log(token)
    }
  }
}
const lexer = new HTMLLexicalParser(dummySyntaxer)
for (let c of testHTML) {
  lexer.receiveInput(c)
}

// For intuition, the driving loop boils down to this (pseudo-code —
// `getInput` is not defined here, so it is kept as a comment):
//   var state = data
//   var char
//   while (char = getInput()) state = state(char)
语法分析
// Simple sketch of the syntactic parser, kept as the pseudo-code it was
// labeled to be (the real implementation follows below):
//   function HTMLSyntaticalParser () {
//     var stack = [new HTMLDocument]
//     this.receiveInput = function (token) { /* ... */ }
//     this.getOutput = function () { return stack[0] }
//   }

// Token classes come from the lexer module. FIX: guarded so this notes
// snippet still loads when './lexer' (or `require` itself) is unavailable;
// the fallback classes are only used in that standalone case.
let StartTagToken, EndTagToken
try {
  ({ StartTagToken, EndTagToken } = require('./lexer'))
} catch {
  StartTagToken = class StartTagToken {}
  EndTagToken = class EndTagToken {}
}

// Root of the tree under construction; always the bottom of the stack.
class HTMLDocument {
  constructor () {
    this.isDocument = true
    this.childNodes = []
  }
}

// Simplified node hierarchy: only Element and Text.
class Node {}

class Element extends Node {
  constructor (token) {
    super()
    // Copy the tag name and every attribute from the start-tag token.
    for (const key in token) {
      this[key] = token[key]
    }
    this.childNodes = []
  }

  // FIX: was a regular method; Symbol.toStringTag must be a getter for
  // Object.prototype.toString to pick it up.
  get [Symbol.toStringTag] () {
    return `Element<${this.name}>`
  }
}

class Text extends Node {
  constructor (value) {
    super()
    this.value = value || ''
  }
}

// Builds the DOM tree from the token stream produced by the lexer.
function HTMLSyntaticalParser () {
  const stack = [new HTMLDocument()]

  // receiveInput consumes one token (string or tag object) per call.
  this.receiveInput = function (token) {
    if (typeof token === 'string') {
      // Merge consecutive character tokens into a single Text node.
      if (getTop(stack) instanceof Text) {
        getTop(stack).value += token
      } else {
        const t = new Text(token)
        getTop(stack).childNodes.push(t)
        stack.push(t)
      }
    } else if (getTop(stack) instanceof Text) {
      // Any tag token ends the current text run.
      stack.pop()
    }

    // Match start and end tags (mismatched end tags are not handled — see
    // the note above about assuming well-formed XHTML input).
    if (token instanceof StartTagToken) {
      const e = new Element(token)
      getTop(stack).childNodes.push(e)
      return stack.push(e)
    }
    if (token instanceof EndTagToken) {
      return stack.pop()
    }
  }

  // The HTMLDocument at the bottom of the stack is the finished tree.
  this.getOutput = () => stack[0]
}

function getTop (stack) {
  return stack[stack.length - 1]
}

// Guarded so these pasted notes also load where CommonJS `module` is absent.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { HTMLSyntaticalParser }
}

// Usage. Originally a separate file; kept as a comment here because the
// `const ... = require(...)` lines would redeclare the names defined above.
//   const { HTMLSyntaticalParser } = require('./syntaxer')
//   const { HTMLLexicalParser } = require('./lexer')
//   const syntaxer = new HTMLSyntaticalParser()
//   const lexer = new HTMLLexicalParser(syntaxer)
//   const testHTML = `<html maaa=a > <head> <title>cool</title> </head> <body> <img src="a" /> </body> </html>`
//   for (let c of testHTML) { lexer.receiveInput(c) }
//   console.log(JSON.stringify(syntaxer.getOutput(), null, 2))
扩展阅读:从Chrome源码看浏览器如何构建DOM树
https://zhuanlan.zhihu.com/p/...
Vue和React是数据驱动视图,如何有效控制DOM操作?能不能把计算,更多的转移为js计算?因为js执行速度很快。patch函数-->patch,对比tag,对比tag与key,对比children