今天一直在看两个纯js的parser,天哪我还是水平不够,继续努力。

第一个是 John Resig 写的比较完善的版本

 

  1 /*
  2  * HTML Parser By John Resig (ejohn.org)
  3  * Original code by Erik Arvidsson, Mozilla Public License
  4  * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  5  *
  6  * // Use like so:
  7  * HTMLParser(htmlString, {
  8  *     start: function(tag, attrs, unary) {},
  9  *     end: function(tag) {},
 10  *     chars: function(text) {},
 11  *     comment: function(text) {}
 12  * });
 13  *
 14  * // or to get an XML string:
 15  * HTMLtoXML(htmlString);
 16  *
 17  * // or to get an XML DOM Document
 18  * HTMLtoDOM(htmlString);
 19  *
 20  * // or to inject into an existing document/DOM node
 21  * HTMLtoDOM(htmlString, document);
 22  * HTMLtoDOM(htmlString, document.body);
 23  *
 24  */
 25 
 26 (function(){
 27 
 28     // Regular Expressions for parsing tags and attributes
 29     var startTag = /^<([-A-Za-z0-9_]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
 30         endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/,
 31         attr = /([-A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;
 32         
 33     // Empty Elements - HTML 4.01
 34     var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");
 35 
 36     // Block Elements - HTML 4.01
 37     var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
 38 
 39     // Inline Elements - HTML 4.01
 40     var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
 41 
 42     // Elements that you can, intentionally, leave open
 43     // (and which close themselves)
 44     var closeSelf = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
 45 
 46     // Attributes that have their values filled in disabled="disabled"
 47     var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
 48 
 49     // Special Elements (can contain anything)
 50     var special = makeMap("script,style");
 51 
 52     var HTMLParser = this.HTMLParser = function( html, handler ) {
 53         var index, chars, match, stack = [], last = html;
 54         stack.last = function(){
 55             return this[ this.length - 1 ];
 56         };
 57 
 58         while ( html ) {
 59             chars = true;
 60 
 61             // Make sure we're not in a script or style element
 62             if ( !stack.last() || !special[ stack.last() ] ) {
 63 
 64                 // Comment
 65                 if ( html.indexOf("<!--") == 0 ) {
 66                     index = html.indexOf("-->");
 67     
 68                     if ( index >= 0 ) {
 69                         if ( handler.comment )
 70                             handler.comment( html.substring( 4, index ) );
 71                         html = html.substring( index + 3 );
 72                         chars = false;
 73                     }
 74     
 75                 // end tag
 76                 } else if ( html.indexOf("</") == 0 ) {
 77                     match = html.match( endTag );
 78     
 79                     if ( match ) {
 80                         html = html.substring( match[0].length );
 81                         match[0].replace( endTag, parseEndTag );
 82                         chars = false;
 83                     }
 84     
 85                 // start tag
 86                 } else if ( html.indexOf("<") == 0 ) {
 87                     match = html.match( startTag );
 88     
 89                     if ( match ) {
 90                         html = html.substring( match[0].length );
 91                         match[0].replace( startTag, parseStartTag );
 92                         chars = false;
 93                     }
 94                 }
 95 
 96                 if ( chars ) {
 97                     index = html.indexOf("<");
 98                     
 99                     var text = index < 0 ? html : html.substring( 0, index );
100                     html = index < 0 ? "" : html.substring( index );
101                     
102                     if ( handler.chars )
103                         handler.chars( text );
104                 }
105 
106             } else {
107                 html = html.replace(new RegExp("(.*)<\/" + stack.last() + "[^>]*>"), function(all, text){
108                     text = text.replace(/<!--(.*?)-->/g, "$1")
109                         .replace(/<!\[CDATA\[(.*?)]]>/g, "$1");
110 
111                     if ( handler.chars )
112                         handler.chars( text );
113 
114                     return "";
115                 });
116 
117                 parseEndTag( "", stack.last() );
118             }
119 
120             if ( html == last )
121                 throw "Parse Error: " + html;
122             last = html;
123         }
124         
125         // Clean up any remaining tags
126         parseEndTag();
127 
128         function parseStartTag( tag, tagName, rest, unary ) {
129             tagName = tagName.toLowerCase();
130 
131             if ( block[ tagName ] ) {
132                 while ( stack.last() && inline[ stack.last() ] ) {
133                     parseEndTag( "", stack.last() );
134                 }
135             }
136 
137             if ( closeSelf[ tagName ] && stack.last() == tagName ) {
138                 parseEndTag( "", tagName );
139             }
140 
141             unary = empty[ tagName ] || !!unary;
142 
143             if ( !unary )
144                 stack.push( tagName );
145             
146             if ( handler.start ) {
147                 var attrs = [];
148     
149                 rest.replace(attr, function(match, name) {
150                     var value = arguments[2] ? arguments[2] :
151                         arguments[3] ? arguments[3] :
152                         arguments[4] ? arguments[4] :
153                         fillAttrs[name] ? name : "";
154                     
155                     attrs.push({
156                         name: name,
157                         value: value,
158                         escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
159                     });
160                 });
161     
162                 if ( handler.start )
163                     handler.start( tagName, attrs, unary );
164             }
165         }
166 
167         function parseEndTag( tag, tagName ) {
168             // If no tag name is provided, clean shop
169             if ( !tagName )
170                 var pos = 0;
171                 
172             // Find the closest opened tag of the same type
173             else
174                 for ( var pos = stack.length - 1; pos >= 0; pos-- )
175                     if ( stack[ pos ] == tagName )
176                         break;
177             
178             if ( pos >= 0 ) {
179                 // Close all the open elements, up the stack
180                 for ( var i = stack.length - 1; i >= pos; i-- )
181                     if ( handler.end )
182                         handler.end( stack[ i ] );
183                 
184                 // Remove the open elements from the stack
185                 stack.length = pos;
186             }
187         }
188     };
189     
190     this.HTMLtoXML = function( html ) {
191         var results = "";
192         
193         HTMLParser(html, {
194             start: function( tag, attrs, unary ) {
195                 results += "<" + tag;
196         
197                 for ( var i = 0; i < attrs.length; i++ )
198                     results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
199         
200                 results += (unary ? "/" : "") + ">";
201             },
202             end: function( tag ) {
203                 results += "</" + tag + ">";
204             },
205             chars: function( text ) {
206                 results += text;
207             },
208             comment: function( text ) {
209                 results += "<!--" + text + "-->";
210             }
211         });
212         
213         return results;
214     };
215     
216     this.HTMLtoDOM = function( html, doc ) {
217         // There can be only one of these elements
218         var one = makeMap("html,head,body,title");
219         
220         // Enforce a structure for the document
221         var structure = {
222             link: "head",
223             base: "head"
224         };
225     
226         if ( !doc ) {
227             if ( typeof DOMDocument != "undefined" )
228                 doc = new DOMDocument();
229             else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument )
230                 doc = document.implementation.createDocument("", "", null);
231             else if ( typeof ActiveX != "undefined" )
232                 doc = new ActiveXObject("Msxml.DOMDocument");
233             
234         } else
235             doc = doc.ownerDocument ||
236                 doc.getOwnerDocument && doc.getOwnerDocument() ||
237                 doc;
238         
239         var elems = [],
240             documentElement = doc.documentElement ||
241                 doc.getDocumentElement && doc.getDocumentElement();
242                 
243         // If we're dealing with an empty document then we
244         // need to pre-populate it with the HTML document structure
245         if ( !documentElement && doc.createElement ) (function(){
246             var html = doc.createElement("html");
247             var head = doc.createElement("head");
248             head.appendChild( doc.createElement("title") );
249             html.appendChild( head );
250             html.appendChild( doc.createElement("body") );
251             doc.appendChild( html );
252         })();
253         
254         // Find all the unique elements
255         if ( doc.getElementsByTagName )
256             for ( var i in one )
257                 one[ i ] = doc.getElementsByTagName( i )[0];
258         
259         // If we're working with a document, inject contents into
260         // the body element
261         var curParentNode = one.body;
262         
263         HTMLParser( html, {
264             start: function( tagName, attrs, unary ) {
265                 // If it's a pre-built element, then we can ignore
266                 // its construction
267                 if ( one[ tagName ] ) {
268                     curParentNode = one[ tagName ];
269                     if ( !unary ) {
270                         elems.push( curParentNode );
271                     }
272                     return;
273                 }
274             
275                 var elem = doc.createElement( tagName );
276                 
277                 for ( var attr in attrs )
278                     elem.setAttribute( attrs[ attr ].name, attrs[ attr ].value );
279                 
280                 if ( structure[ tagName ] && typeof one[ structure[ tagName ] ] != "boolean" )
281                     one[ structure[ tagName ] ].appendChild( elem );
282                 
283                 else if ( curParentNode && curParentNode.appendChild )
284                     curParentNode.appendChild( elem );
285                     
286                 if ( !unary ) {
287                     elems.push( elem );
288                     curParentNode = elem;
289                 }
290             },
291             end: function( tag ) {
292                 elems.length -= 1;
293                 
294                 // Init the new parentNode
295                 curParentNode = elems[ elems.length - 1 ];
296             },
297             chars: function( text ) {
298                 curParentNode.appendChild( doc.createTextNode( text ) );
299             },
300             comment: function( text ) {
301                 // create comment node
302             }
303         });
304         
305         return doc;
306     };
307 
308     function makeMap(str){
309         var obj = {}, items = str.split(",");
310         for ( var i = 0; i < items.length; i++ )
311             obj[ items[i] ] = true;
312         return obj;
313     }
314 })();
htmlparser

 

用法:

// Usage example: rebuild well-formed markup from sloppy HTML by collecting
// the parser's callbacks into a string.
var results = "";

HTMLParser("<p id=test>hello <i>world", {
  // Opening tag: re-emit the name and each attribute with a quoted value.
  start: function( tag, attrs, unary ) {
    results += "<" + tag;

    for ( var i = 0; i < attrs.length; i++ )
      results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';

    results += (unary ? "/" : "") + ">";
  },
  // Closing tag (also reported for elements the parser closes implicitly).
  end: function( tag ) {
    results += "</" + tag + ">";
  },
  // Text content is copied through unchanged.
  chars: function( text ) {
    results += text;
  },
  comment: function( text ) {
    results += "<!--" + text + "-->";
  }
});

// Expected result (the original line mixed ' and " and was a syntax error):
results == '<p id="test">hello <i>world</i></p>';

然后John提到他是在 Erik Arvidsson 的基础上做的,

又去看了erik的版本

  1 // Copyright 2004 Erik Arvidsson. All Rights Reserved.
  2 //
  3 // This code is triple licensed using Apache Software License 2.0,
  4 // Mozilla Public License or GNU Public License
  5 //
  6 ///////////////////////////////////////////////////////////////////////////////
  7 //
  8 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  9 // use this file except in compliance with the License.  You may obtain a copy
 10 // of the License at http://www.apache.org/licenses/LICENSE-2.0
 11 //
 12 ///////////////////////////////////////////////////////////////////////////////
 13 //
 14 // The contents of this file are subject to the Mozilla Public License
 15 // Version 1.1 (the "License"); you may not use this file except in
 16 // compliance with the License. You may obtain a copy of the License at
 17 // http://www.mozilla.org/MPL/
 18 //
 19 // Software distributed under the License is distributed on an "AS IS"
 20 // basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 21 // License for the specific language governing rights and limitations
 22 // under the License.
 23 //
 24 // The Original Code is Simple HTML Parser.
 25 //
 26 // The Initial Developer of the Original Code is Erik Arvidsson.
 27 // Portions created by Erik Arvidssson are Copyright (C) 2004. All Rights
 28 // Reserved.
 29 //
 30 ///////////////////////////////////////////////////////////////////////////////
 31 //
 32 // This program is free software; you can redistribute it and/or
 33 // modify it under the terms of the GNU General Public License
 34 // as published by the Free Software Foundation; either version 2
 35 // of the License, or (at your option) any later version.
 36 //
 37 // This program is distributed in the hope that it will be useful,
 38 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 39 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 40 // GNU General Public License for more details.
 41 //
 42 // You should have received a copy of the GNU General Public License
 43 // along with this program; if not, write to the Free Software
 44 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 45 //
 46 ///////////////////////////////////////////////////////////////////////////////
 47 
 48 /*
 49 var handler ={
 50     startElement:   function (sTagName, oAttrs) {},
 51     endElement:     function (sTagName) {},
 52     characters:        function (s) {},
 53     comment:        function (s) {}
 54 };
 55 */
 56 
 57 function SimpleHtmlParser()
 58 {
 59 }
 60 
 61 SimpleHtmlParser.prototype = {
 62 
 63     handler:    null,
 64 
 65     // regexps
 66 
 67     startTagRe:    /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/m,
 68     endTagRe:    /^<\/([^>\s]+)[^>]*>/m,
 69     attrRe:        /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/gm,
 70 
 71     parse:    function (s, oHandler)
 72     {
 73         if (oHandler)
 74             this.contentHandler = oHandler;
 75 
 76         var i = 0;
 77         var res, lc, lm, rc, index;
 78         var treatAsChars = false;
 79         var oThis = this;
 80         while (s.length > 0)
 81         {
 82             // Comment
 83             if (s.substring(0, 4) == "<!--")
 84             {
 85                 index = s.indexOf("-->");
 86                 if (index != -1)
 87                 {
 88                     this.contentHandler.comment(s.substring(4, index));
 89                     s = s.substring(index + 3);
 90                     treatAsChars = false;
 91                 }
 92                 else
 93                 {
 94                     treatAsChars = true;
 95                 }
 96             }
 97 
 98             // end tag
 99             else if (s.substring(0, 2) == "</")
100             {
101                 if (this.endTagRe.test(s))
102                 {
103                     lc = RegExp.leftContext;
104                     lm = RegExp.lastMatch;
105                     rc = RegExp.rightContext;
106 
107                     lm.replace(this.endTagRe, function ()
108                     {
109                         return oThis.parseEndTag.apply(oThis, arguments);
110                     });
111 
112                     s = rc;
113                     treatAsChars = false;
114                 }
115                 else
116                 {
117                     treatAsChars = true;
118                 }
119             }
120             // start tag
121             else if (s.charAt(0) == "<")
122             {
123                 if (this.startTagRe.test(s))
124                 {
125                     lc = RegExp.leftContext;
126                     lm = RegExp.lastMatch;
127                     rc = RegExp.rightContext;
128 
129                     lm.replace(this.startTagRe, function ()
130                     {
131                         return oThis.parseStartTag.apply(oThis, arguments);
132                     });
133 
134                     s = rc;
135                     treatAsChars = false;
136                 }
137                 else
138                 {
139                     treatAsChars = true;
140                 }
141             }
142 
143             if (treatAsChars)
144             {
145                 index = s.indexOf("<");
146                 if (index == -1)
147                 {
148                      this.contentHandler.characters(s);
149                     s = "";
150                 }
151                 else
152                 {
153                     this.contentHandler.characters(s.substring(0, index));
154                     s = s.substring(index);
155                 }
156             }
157 
158             treatAsChars = true;
159         }
160     },
161 
162     parseStartTag:    function (sTag, sTagName, sRest)
163     {
164         var attrs = this.parseAttributes(sTagName, sRest);
165         this.contentHandler.startElement(sTagName, attrs);
166     },
167 
168     parseEndTag:    function (sTag, sTagName)
169     {
170         this.contentHandler.endElement(sTagName);
171     },
172 
173     parseAttributes:    function (sTagName, s)
174     {
175         var oThis = this;
176         var attrs = [];
177         s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6)
178         {
179             attrs.push(oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6));
180         });
181         return attrs;
182     },
183 
184     parseAttribute: function (sTagName, sAttribute, sName)
185     {
186         var value = "";
187         if (arguments[7])
188             value = arguments[8];
189         else if (arguments[5])
190             value = arguments[6];
191         else if (arguments[3])
192             value = arguments[4];
193 
194         var empty = !value && !arguments[3];
195         return {name: sName, value: empty ? null : value};
196     }
197 };
erikparser

让我再折腾折腾。。。待续

 

posted on 2014-09-05 16:53  meeming  阅读(552)  评论(0)  编辑  收藏  举报



Fork me on GitHub