一个C#分析html的包，从老外的一个Spider程序里提取出来的

这是一个用C#分析html的包。以前思考如何分析html时总是毫无头绪：情况太多，太复杂。后来在一个老外写的Spider程序里发现了它，非常好用，屡试不爽。现在才知道自己写程序还停留在一个很低的层次，尤其是对面向对象思想的理解。利用继承来分解问题，利用多个类来应对多种情况的变化，这才是OO思想；而自己很多时候只是为了OO而OO，其实水平还停留在面向过程里。好好分析一下这些代码，受益颇多。

/// <summary>
/// Holds a single attribute as it is normally stored in an HTML or XML
/// file: a name, a value and the delimiter that surrounded the value.
///
/// This spider is copyright 2003 by Jeff Heaton. However, it is
/// released under a Limited GNU Public License (LGPL). You may
/// use it freely in your own programs. For the latest version visit
/// http://www.jeffheaton.com.
/// </summary>
public class Attribute: ICloneable
{
    /// <summary>Backing store for <see cref="Name"/>.</summary>
    private string _name;

    /// <summary>Backing store for <see cref="Value"/>.</summary>
    private string _value;

    /// <summary>
    /// Backing store for <see cref="Delim"/> (i.e. " or '); the NUL
    /// character means "no delimiter".
    /// </summary>
    private char _delim;

    /// <summary>
    /// Creates a blank attribute: empty name, empty value, no delimiter.
    /// </summary>
    public Attribute():this("", "", '\0')
    {
    }

    /// <summary>
    /// Creates an attribute whose value has no delimiter.
    /// </summary>
    /// <param name="name">The name of this attribute.</param>
    /// <param name="value">The value of this attribute.</param>
    public Attribute(String name,String value):this(name, value, '\0')
    {
    }

    /// <summary>
    /// Creates an attribute with every property specified.
    /// </summary>
    /// <param name="name">The name of this attribute.</param>
    /// <param name="value">The value of this attribute.</param>
    /// <param name="delim">The delimiter character for the value.</param>
    public Attribute(string name,string value,char delim)
    {
        _delim = delim;
        _value = value;
        _name = name;
    }

    /// <summary>
    /// The name of this attribute.
    /// </summary>
    public string Name
    {
        get { return _name; }
        set { _name = value; }
    }

    /// <summary>
    /// The value of this attribute.
    /// </summary>
    public string Value
    {
        get { return _value; }
        set { _value = value; }
    }

    /// <summary>
    /// The delimiter that surrounds the value of this attribute.
    /// </summary>
    public char Delim
    {
        get { return _delim; }
        set { _delim = value; }
    }

    #region ICloneable Members

    /// <summary>
    /// Creates a new <see cref="Attribute"/> carrying the same name,
    /// value and delimiter as this one.
    /// </summary>
    /// <returns>The newly created copy.</returns>
    public virtual object Clone()
    {
        Attribute copy = new Attribute(_name, _value, _delim);
        return copy;
    }

    #endregion
}
以上是一个基础类，用来表示（保存）html标签里的一个属性：名称、值和定界符。

/// <summary>
/// An ordered list of <see cref="Attribute"/> objects. Because it derives
/// from <see cref="Attribute"/>, the list itself also carries a name,
/// value and delimiter of its own.
/// </summary>
public class AttributeList:Attribute
{
    /// <summary>
    /// An internally used list. It contains the entire list of attributes.
    /// </summary>
    protected ArrayList m_list;

    /// <summary>
    /// Create a new, empty, attribute list.
    /// </summary>
    public AttributeList():base("","")
    {
        m_list = new ArrayList();
    }

    /// <summary>
    /// Make a copy of this object using the cloneable interface. Every
    /// contained attribute is cloned, and the list's own name, value and
    /// delimiter are copied as well.
    /// </summary>
    /// <returns>A new AttributeList that is a clone of this one.</returns>
    public override object Clone()
    {
        AttributeList rtn = new AttributeList();

        // The list is itself an Attribute; without copying these the clone
        // would silently lose its own name/value/delimiter.
        rtn.Name = Name;
        rtn.Value = Value;
        rtn.Delim = Delim;

        foreach ( Attribute item in m_list )
            rtn.Add( (Attribute)item.Clone() );
        return rtn;
    }

    /// <summary>
    /// Add the specified attribute to the list of attributes.
    /// </summary>
    /// <param name="a">An attribute to add to this AttributeList.</param>
    public void Add(Attribute a)
    {
        m_list.Add(a);
    }

    /// <summary>
    /// Clear all attributes from this AttributeList and return it
    /// to an empty state.
    /// </summary>
    public void Clear()
    {
        m_list.Clear();
    }

    /// <summary>
    /// Returns true if this AttributeList is empty, with no attributes.
    /// </summary>
    /// <returns>True if this AttributeList is empty, false otherwise.</returns>
    public bool IsEmpty()
    {
        return m_list.Count<=0;
    }

    /// <summary>
    /// If there is already an attribute with the specified name,
    /// then it will have its value changed to match the specified value.
    /// If there is no Attribute with the specified name, then one will
    /// be created. This method is case-insensitive.
    /// </summary>
    /// <param name="name">The name of the Attribute to edit or create.
    /// Case-insensitive. A null name is ignored.</param>
    /// <param name="value">The value to be held in this attribute;
    /// null is stored as an empty string.</param>
    public void Set(string name,string value)
    {
        if ( name==null )
            return;
        if ( value==null )
            value="";

        Attribute a = this[name];
        if ( a==null )
        {
            a = new Attribute(name,value);
            Add(a);
        }
        else
            a.Value = value;
    }

    /// <summary>
    /// How many attributes are in this AttributeList.
    /// </summary>
    public int Count
    {
        get { return m_list.Count; }
    }

    /// <summary>
    /// The underlying list of the attributes in this AttributeList.
    /// </summary>
    public ArrayList List
    {
        get { return m_list; }
    }

    /// <summary>
    /// Access the individual attributes by position.
    /// </summary>
    /// <param name="index">Zero-based position of the attribute.</param>
    /// <returns>The attribute at that position, or null when the index
    /// is outside the list (negative or past the end).</returns>
    public Attribute this[int index]
    {
        get
        {
            if ( index>=0 && index<m_list.Count )
                return (Attribute)m_list[index];
            return null;
        }
    }

    /// <summary>
    /// Access the individual attributes by name. The comparison ignores
    /// case and is ordinal, so it does not depend on the current culture.
    /// </summary>
    /// <param name="index">The attribute name to look for.</param>
    /// <returns>The first attribute with that name, or null when there is
    /// none or when <paramref name="index"/> is null.</returns>
    public Attribute this[string index]
    {
        get
        {
            if ( index==null )
                return null;

            foreach ( Attribute item in m_list )
            {
                if ( item!=null &&
                     string.Equals(item.Name,index,StringComparison.OrdinalIgnoreCase) )
                    return item;
            }
            return null;
        }
    }
}
以上是一个属性列表。

/// <summary>
/// Low-level cursor over a source string, with helpers for reading
/// attribute names and values. Parsed attributes are added to the
/// attribute list this class inherits from.
/// </summary>
public class Parse:AttributeList
{
    /// <summary>
    /// The source text that is being parsed.
    /// </summary>
    private string m_source;

    /// <summary>
    /// The current position inside of the text that
    /// is being parsed.
    /// </summary>
    private int m_idx;

    /// <summary>
    /// The most recently parsed attribute delimiter.
    /// </summary>
    private char m_parseDelim;

    /// <summary>
    /// The most recently parsed attribute name.
    /// </summary>
    private string m_parseName;

    /// <summary>
    /// The most recently parsed attribute value.
    /// </summary>
    private string m_parseValue;

    /// <summary>
    /// The most recently parsed tag.
    /// </summary>
    public string m_tag;

    /// <summary>
    /// Determine if the specified character is whitespace or not.
    /// Only tab, line feed, carriage return and space count.
    /// </summary>
    /// <param name="ch">A character to check</param>
    /// <returns>true if the character is whitespace</returns>
    public static bool IsWhiteSpace(char ch)
    {
        return "\t\n\r ".IndexOf(ch) != -1;
    }

    /// <summary>
    /// Advance the index until past any whitespace.
    /// </summary>
    public void EatWhiteSpace()
    {
        while ( !Eof() && IsWhiteSpace(GetCurrentChar()) )
            m_idx++;
    }

    /// <summary>
    /// Determine if the end of the source text has been
    /// reached.
    /// </summary>
    /// <returns>True if the end of the source text has been
    /// reached, or if no source text has been assigned.</returns>
    public bool Eof()
    {
        return m_source==null || m_idx>=m_source.Length;
    }

    /// <summary>
    /// Parse the attribute name: skip leading whitespace, append every
    /// character up to the next whitespace, '=' or '>' to
    /// <see cref="ParseName"/>, then skip trailing whitespace.
    /// </summary>
    public void ParseAttributeName()
    {
        EatWhiteSpace();

        int start = m_idx;
        while ( !Eof() )
        {
            char ch = GetCurrentChar();
            if ( IsWhiteSpace(ch) || ch=='=' || ch=='>' )
                break;
            m_idx++;
        }

        // Append the whole run at once instead of one character at a time.
        if ( m_idx>start )
            m_parseName += m_source.Substring(start,m_idx-start);

        EatWhiteSpace();
    }

    /// <summary>
    /// Parse the attribute value. Does nothing when a delimiter has
    /// already been recorded or when the current character is not '='.
    /// A value may be quoted with ' or " (the quote is stored in
    /// <see cref="ParseDelim"/>) or unquoted, in which case it ends at
    /// whitespace or '>'. The text read is appended to
    /// <see cref="ParseValue"/>.
    /// </summary>
    public void ParseAttributeValue()
    {
        if ( m_parseDelim!=0 )
            return;
        if ( GetCurrentChar()!='=' )
            return;

        m_idx++;
        EatWhiteSpace();

        char first = GetCurrentChar();
        int start;
        if ( first=='\'' || first=='\"' )
        {
            m_parseDelim = first;
            m_idx++;
            start = m_idx;

            // Stop at end of input too: GetCurrentChar() yields (char)0
            // there, which never equals the delimiter, so an unterminated
            // quote would otherwise loop forever.
            while ( !Eof() && GetCurrentChar()!=m_parseDelim )
                m_idx++;

            if ( m_idx>start )
                m_parseValue += m_source.Substring(start,m_idx-start);

            // Step over the closing quote when there is one.
            if ( !Eof() )
                m_idx++;
        }
        else
        {
            start = m_idx;
            while ( !Eof() && !IsWhiteSpace(GetCurrentChar()) && GetCurrentChar()!='>' )
                m_idx++;

            if ( m_idx>start )
                m_parseValue += m_source.Substring(start,m_idx-start);
        }

        EatWhiteSpace();
    }

    /// <summary>
    /// Add a parsed attribute to the collection, built from the current
    /// <see cref="ParseName"/>, <see cref="ParseValue"/> and
    /// <see cref="ParseDelim"/>.
    /// </summary>
    public void AddAttribute()
    {
        Attribute a = new Attribute(m_parseName,m_parseValue,m_parseDelim);
        Add(a);
    }

    /// <summary>
    /// Get the current character that is being parsed.
    /// </summary>
    /// <returns>The character at the current position, or (char)0 at
    /// the end of the source text.</returns>
    public char GetCurrentChar()
    {
        return GetCurrentChar(0);
    }

    /// <summary>
    /// Get a few characters ahead of the current character.
    /// </summary>
    /// <param name="peek">How many characters to peek ahead for.</param>
    /// <returns>The character that was retrieved, or (char)0 when the
    /// requested position lies outside the source text.</returns>
    public char GetCurrentChar(int peek)
    {
        int pos = m_idx+peek;
        if ( m_source!=null && pos>=0 && pos<m_source.Length )
            return m_source[pos];
        return (char)0;
    }

    /// <summary>
    /// Obtain the current character and advance the index by one.
    /// The position is not bounds-checked here, so check
    /// <see cref="Eof"/> before calling.
    /// </summary>
    /// <returns>The character that was at the current position.</returns>
    public char AdvanceCurrentChar()
    {
        return m_source[m_idx++];
    }

    /// <summary>
    /// Move the index forward by one.
    /// </summary>
    public void Advance()
    {
        m_idx++;
    }

    /// <summary>
    /// The last attribute name that was encountered.
    /// </summary>
    public string ParseName
    {
        get { return m_parseName; }
        set { m_parseName = value; }
    }

    /// <summary>
    /// The last attribute value that was encountered.
    /// </summary>
    public string ParseValue
    {
        get { return m_parseValue; }
        set { m_parseValue = value; }
    }

    /// <summary>
    /// The last attribute delimiter that was encountered.
    /// </summary>
    public char ParseDelim
    {
        get { return m_parseDelim; }
        set { m_parseDelim = value; }
    }

    /// <summary>
    /// The text that is to be parsed. Assigning a new text does not
    /// move the current parse position.
    /// </summary>
    public string Source
    {
        get { return m_source; }
        set { m_source = value; }
    }
}
上面的是分析标签的，并且将标签的属性Parse进属性列表里。

/// <summary>
/// Splits HTML source into plain characters and tags. Call
/// <see cref="Parse()"/> repeatedly: it returns either the next text
/// character, or (char)0 after a tag has been read, in which case the tag
/// name is in m_tag and its attributes are in this object's list.
/// </summary>
public class ParseHTML:Parse
{
    /// <summary>
    /// Build a stand-alone copy of the most recently parsed tag.
    /// </summary>
    /// <returns>A new AttributeList whose Name is the tag name and whose
    /// items are clones of the current attributes.</returns>
    public AttributeList GetTag()
    {
        AttributeList tag = new AttributeList();
        tag.Name = m_tag;
        foreach ( Attribute x in List )
        {
            tag.Add((Attribute)x.Clone());
        }
        return tag;
    }

    /// <summary>
    /// Rebuild the text of the current tag from m_tag and the attribute
    /// list. An attribute with a null value is written as its (optionally
    /// delimited) name only; any other attribute is written as
    /// name=value with the value wrapped in its delimiter, if it has one.
    /// </summary>
    /// <returns>The tag text, starting with '&lt;' and ending with '&gt;'.</returns>
    public String BuildTag()
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder("<");
        buffer.Append(m_tag);

        for ( int i=0; this[i]!=null; i++ )
        {
            Attribute attr = this[i];
            buffer.Append(' ');

            if ( attr.Value==null )
            {
                if ( attr.Delim!=0 )
                    buffer.Append(attr.Delim);
                buffer.Append(attr.Name);
                if ( attr.Delim!=0 )
                    buffer.Append(attr.Delim);
            }
            else
            {
                // NOTE(review): an attribute parsed without a value has
                // Value == "" rather than null, so it is emitted as "name=".
                buffer.Append(attr.Name);
                buffer.Append('=');
                if ( attr.Delim!=0 )
                    buffer.Append(attr.Delim);
                buffer.Append(attr.Value);
                if ( attr.Delim!=0 )
                    buffer.Append(attr.Delim);
            }
        }

        buffer.Append('>');
        return buffer.ToString();
    }

    /// <summary>
    /// Parse one tag. The current position must be just after the opening
    /// '&lt;'. On return m_tag holds the tag name (or the comment text for
    /// a comment), the attribute list holds the tag's attributes, and the
    /// position is just past the closing '&gt;'.
    /// </summary>
    protected void ParseTag()
    {
        m_tag = "";
        Clear();

        System.Text.StringBuilder name = new System.Text.StringBuilder();

        // Is it a comment?
        if ( (GetCurrentChar()=='!') && (GetCurrentChar(1)=='-') && (GetCurrentChar(2)=='-') )
        {
            while ( !Eof() )
            {
                if ( (GetCurrentChar()=='-') && (GetCurrentChar(1)=='-') && (GetCurrentChar(2)=='>') )
                    break;
                // Carriage returns are dropped from the stored comment text.
                if ( GetCurrentChar()!='\r' )
                    name.Append(GetCurrentChar());
                Advance();
            }
            name.Append("--");
            m_tag = name.ToString();

            // Step over the "-->" terminator; skipped when the comment
            // ran to the end of the input without one.
            if ( !Eof() )
            {
                Advance();
                Advance();
                Advance();
            }
            ParseDelim = (char)0;
            return;
        }

        // Find the tag name
        while ( !Eof() )
        {
            if ( IsWhiteSpace(GetCurrentChar()) || (GetCurrentChar()=='>') )
                break;
            name.Append(GetCurrentChar());
            Advance();
        }
        m_tag = name.ToString();

        EatWhiteSpace();

        // Get the attributes. The Eof() test matters: at end of input
        // GetCurrentChar() returns (char)0, never '>', so an unterminated
        // tag would otherwise add empty attributes forever.
        while ( !Eof() && GetCurrentChar()!='>' )
        {
            ParseName = "";
            ParseValue = "";
            ParseDelim = (char)0;

            ParseAttributeName();

            if ( GetCurrentChar()=='>' )
            {
                AddAttribute();
                break;
            }

            // Get the value (if any)
            ParseAttributeValue();
            AddAttribute();
        }
        Advance();
    }

    /// <summary>
    /// Read the next item from the source. When the current position is
    /// at '&lt;' followed by an ASCII letter, '!' or '/', the whole tag is
    /// parsed and (char)0 is returned. Otherwise the current character is
    /// returned and the position moves forward by one, so a '&lt;' that
    /// does not start a tag is returned as ordinary text.
    /// Check Eof() before calling; there is no character to return at the
    /// end of the source.
    /// </summary>
    /// <returns>(char)0 if a tag was parsed, otherwise the character read.</returns>
    public char Parse()
    {
        if ( GetCurrentChar()=='<' )
        {
            // Peek instead of advancing, so a '<' that does not start a
            // tag is not lost. The ASCII range test is culture-independent.
            char ch = GetCurrentChar(1);
            bool isLetter = ((ch>='A') && (ch<='Z')) || ((ch>='a') && (ch<='z'));
            if ( isLetter || (ch=='!') || (ch=='/') )
            {
                Advance();
                ParseTag();
                return (char)0;
            }
        }
        return AdvanceCurrentChar();
    }
}
最后一个类是用来从HTML里分离出Tag，进而适合分析属性时使用。

通过以上4个类，一个原本很难分析的问题就被很轻松地解决了。其实这里体现出一种分析问题、解决问题的思考过程。我想程序员应该养成的习惯，就是将一个复杂的问题逐步分解成许多细小的问题；小问题解决了，这个复杂的问题也就解决了。而这些小问题之间的逻辑联系如何组织，如何做到耦合性最低，是很值得思考的。现在回想以前在大学里学的软件工程，似乎渐渐明白了什么是健壮，什么是可复用。

发表于 2011-10-27 17:12 卡莱阅读(371) 评论(1) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

一个C#分析html的包，从老外的一个Spider程序里提取出来的

公告

导航