/**
 * @author: Tobias Nickel
 * @date: 06.04.2015
 * I needed a small XML parser that can be used in a worker.
 */

/**
 * Parse XML / HTML into a DOM-like object, with no validation and some failure tolerance.
 *
 * Every element becomes `{tagName, attributes?, children?}`:
 *   - `attributes` is only present when the tag has at least one attribute;
 *     attributes without a quoted value are stored as `null`.
 *   - `children` is absent for self-closing tags (`<br/>`); otherwise it is an
 *     array of child nodes and text strings (whitespace-only text is dropped).
 *   - the content of `<script>` and `<style>` is kept as one raw text child.
 * Comments and `<!...>` declarations are skipped. Truncated input (unterminated
 * tag, comment, attribute value, ...) is parsed as far as possible instead of
 * hanging.
 *
 * @param S {string} the XML / HTML to parse
 * @param options {object} all other options:
 *    searchId {string} the id of a single element that should be returned.
 *             Only that element is parsed, which is much faster. The result is
 *             the node itself, or null when no element carries that id.
 *             `filter` and `simplify` are not applied in this mode.
 *    filter {function} filter method, as you know it from Array.filter, but it
 *             goes through the whole DOM (see tXml.filter).
 *    simplify {bool} to use tXml.simplify on the result.
 * @returns {Array|Object|null} list of top-level nodes; a single node (or null)
 *          when `searchId` is used; a plain object when `simplify` is used.
 */
function tXml(S, options) {
    "use strict";
    options = options || {};

    var pos = 0;

    var openBracket = "<";
    var openBracketCC = "<".charCodeAt(0);
    var closeBracket = ">";
    var closeBracketCC = ">".charCodeAt(0);
    var minusCC = "-".charCodeAt(0);
    var slashCC = "/".charCodeAt(0);
    var exclamationCC = "!".charCodeAt(0);
    var singleQuoteCC = "'".charCodeAt(0);
    var doubleQuoteCC = '"'.charCodeAt(0);

    /**
     * true for the char codes of A-Z and a-z
     */
    function isLetter(code) {
        return (code > 64 && code < 91) || (code > 96 && code < 123);
    }

    /**
     * parsing a list of entries, until the closing tag of the parent (or the end of S).
     * On return `pos` is on the '>' of that closing tag.
     */
    function parseChildren() {
        var children = [];
        while (pos < S.length) {
            if (S.charCodeAt(pos) === openBracketCC) {
                var next = S.charCodeAt(pos + 1);
                if (next === slashCC) {
                    // closing tag of the parent: stop on its '>'
                    pos = S.indexOf(closeBracket, pos);
                    // a truncated closing tag must not reset the cursor to -1
                    if (pos === -1) pos = S.length;
                    return children;
                }
                if (next === exclamationCC) {
                    if (S.charCodeAt(pos + 2) === minusCC) {
                        // comment support: skip to the '>' of the next '-->'
                        var commentEnd = S.indexOf('-->', pos + 2);
                        pos = commentEnd === -1 ? S.length : commentEnd + 2;
                    } else {
                        // doctype support: skip to the next '>'
                        var declarationEnd = S.indexOf(closeBracket, pos + 2);
                        pos = declarationEnd === -1 ? S.length : declarationEnd;
                    }
                    pos++;
                    continue;
                }
                children.push(parseNode());
            } else {
                var text = parseText();
                if (text.trim().length > 0) children.push(text);
            }
            pos++;
        }
        return children;
    }

    /**
     * returns the text outside of tags until the first '<'.
     * On return `pos` is on the last character of the text.
     */
    function parseText() {
        var start = pos;
        pos = S.indexOf(openBracket, pos) - 1;
        if (pos === -2) pos = S.length; // no further tag: the text runs to the end
        return S.slice(start, pos + 1);
    }

    /**
     * returns text until the first character that can not be part of a name
     */
    var nameSpacer = '\r\n\t>/= ';
    function parseName() {
        var start = pos;
        // the length check is needed: at the end of S, S[pos] is undefined and
        // would never be found in nameSpacer
        while (pos < S.length && nameSpacer.indexOf(S[pos]) === -1) {
            pos++;
        }
        return S.slice(start, pos);
    }

    /**
     * reads the raw content of a <script> / <style> element up to closeTag.
     * Expects `pos` on the '>' of the opening tag and leaves it on the '>' of
     * the closing tag (or at the end of S when the closing tag is missing).
     */
    function parseRawText(closeTag) {
        var start = pos + 1;
        var end = S.indexOf(closeTag, start);
        if (end === -1) {
            pos = S.length;
            return S.slice(start);
        }
        pos = end + closeTag.length - 1;
        return S.slice(start, end);
    }

    /**
     * is parsing a node, including tagName, attributes and its children,
     * to parse children it uses the parseChildren again, that makes the parsing recursive.
     * Expects `pos` on the '<' of the opening tag.
     */
    var NoChildNodes = [/*'img', 'br', 'input', 'meta', 'link'*/];
    function parseNode() {
        var node = {};
        pos++;
        node.tagName = parseName();

        // parsing attributes
        var attrFound = false;
        while (pos < S.length && S.charCodeAt(pos) !== closeBracketCC) {
            if (isLetter(S.charCodeAt(pos))) {
                var name = parseName();
                // search beginning of the value string (or the next attribute / end of tag)
                var code = S.charCodeAt(pos);
                while (pos < S.length && code !== singleQuoteCC && code !== doubleQuoteCC &&
                        !isLetter(code) && code !== closeBracketCC) {
                    pos++;
                    code = S.charCodeAt(pos);
                }
                if (!attrFound) {
                    node.attributes = {};
                    attrFound = true;
                }
                var value;
                if (code === singleQuoteCC || code === doubleQuoteCC) {
                    value = parseString();
                } else {
                    // attribute without a quoted value; step back so that the
                    // pos++ below lands on the character that stopped the search
                    value = null;
                    pos--;
                }
                node.attributes[name] = value;
            }
            pos++;
        }

        // optional parsing of children (not for self-closing tags like <br/>)
        if (S.charCodeAt(pos - 1) !== slashCC) {
            if (node.tagName === "script") {
                node.children = [parseRawText('</script>')];
            } else if (node.tagName === "style") {
                node.children = [parseRawText('</style>')];
            } else if (NoChildNodes.indexOf(node.tagName) === -1) {
                pos++;
                node.children = parseChildren();
            }
        }
        return node;
    }

    /**
     * is parsing a string, that starts with a char and ends with the same, usually ' or ".
     * On return `pos` is on the closing quote.
     */
    function parseString() {
        var startChar = S[pos];
        var startpos = ++pos;
        pos = S.indexOf(startChar, startpos);
        if (pos === -1) pos = S.length; // unterminated value: take the rest of S
        return S.slice(startpos, pos);
    }

    /**
     * returns the index of the id attribute with the value options.searchId, or -1
     */
    function findId() {
        // the id is user data, not a pattern: escape regular expression syntax
        var id = String(options.searchId).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // the leading \s avoids matching attributes that only end with "id" (grid="...")
        var match = new RegExp('\\sid\\s*=\\s*[\'"]' + id + '[\'"]').exec(S);
        return match ? match.index : -1;
    }

    var out = null;
    if (options.searchId) {
        pos = findId();
        if (pos !== -1) {
            // go back to the beginning of the tag that carries the id
            pos = S.lastIndexOf('<', pos);
            if (pos !== -1) {
                out = parseNode();
            }
        }
        return out;
    }

    pos = 0;
    out = parseChildren();

    if (options.filter) {
        out = tXml.filter(out, options.filter);
    }

    if (options.simplify) {
        out = tXml.simplify(out);
    }
    return out;
}

/**
 * transform the DomObject to an object that is like the object of PHPs simplexml_load_*() methods.
 * this format helps you to write code that is more likely to keep your program working, even if there are small changes in the XML schema.
 * be aware, that it is not possible to reproduce the original xml from a simplified version, because the order of elements is not saved.
 * therefore your program will be more flexible and easier to read.
 *
 * A list with a single text child becomes `{value: text}`. Otherwise the
 * children are grouped by tagName; a group with one entry is stored directly,
 * a group with several entries as an array. Attributes are stored under
 * `_attributes`. Text strings inside mixed content have no tagName and are
 * therefore collected under the key "undefined".
 *
 * @param {array} children the childrenList
 * @returns {object} the simplified object
 */
tXml.simplify = function simplify(children) {
    var hasOwn = Object.prototype.hasOwnProperty;
    var out = {};
    if (children == null) return out;
    if (children.length === 1 && typeof children[0] === 'string') {
        return {value: children[0]};
    }
    // map each object
    children.forEach(function(child) {
        var key = child.tagName;
        // own-property check: a tag named like an inherited member
        // ("constructor", "toString", ...) must still get its own list
        if (!hasOwn.call(out, key)) out[key] = [];
        if (typeof child === 'object') {
            var kids = tXml.simplify(child.children);
            out[key].push(kids);
            if (child.attributes) {
                kids._attributes = child.attributes;
            }
        } else {
            out[key].push(child);
        }
    });

    for (var i in out) {
        if (hasOwn.call(out, i) && out[i].length === 1) {
            out[i] = out[i][0];
        }
    }

    return out;
};

/**
 * behaves the same way as Array.filter, if the filter method returns true, the element is in the resultList.
 * The whole tree is visited in document order, so matching descendants are
 * returned as well, all in one flat list. Text strings are never passed to f.
 * @param children {Array} the children of a node
 * @param f {function} the filter method
 * @returns {Array} all nodes for which f returned a truthy value
 */
tXml.filter = function(children, f) {
    var out = [];
    children.forEach(function(child) {
        if (typeof child === 'object' && f(child)) out.push(child);
        if (child.children) {
            var kids = tXml.filter(child.children, f);
            out = out.concat(kids);
        }
    });
    return out;
};

/*
 Browser-only benchmark scratch code, kept for reference. It is never executed.

console.clear();
tXml(d,'content');
 //some testCode
var s = document.body.innerHTML.toLowerCase();
var start = new Date().getTime();
var o = tXml(s,'content');
var end = new Date().getTime();
//console.log(JSON.stringify(o,undefined,'\t'));
console.log("MILLISECONDS",end-start);
var nodeCount=document.querySelectorAll('*').length;
console.log('node count',nodeCount);
console.log("speed:",(1000/(end-start))*nodeCount,'Nodes / second')
//console.log(JSON.stringify(tXml('testPage

TestPage

this is a testpage

'),undefined,'\t'));
var p = new DOMParser();
var s2=''+s+''
var start2= new Date().getTime();
var o2 = p.parseFromString(s2,'text/html').querySelector('#content')
var end2=new Date().getTime();
console.log("MILLISECONDS",end2-start2);
// */