C# 解析html —— 将html转为XHTML,然后利用Xml解析
呵呵,由于正则不熟,所以另谋出路——利用XML去解析html。
要想将抓取到的数据(直接抓取到的是byte[]) 转为XML文档(即XMLDocument对象),有两个要点:
一、判断编码(http头 charset 在某些网站上是不准确的)
我利用的是第三方的一个开源项目(即下文代码中的 UniversalDetector)去判断编码的,效果还不错:链接 。
二、将html转为XHTML
我利用的是 : SgmlReaderDll.dll ,微软提供的,虽然不是100%的准确,但是足以满足 轻量级的商业需求 。
核心代码如下:
/// <summary>
/// Turns raw HTML bytes into an <see cref="XmlDocument"/>: the bytes are decoded
/// with a detected charset, normalised to well-formed markup through
/// <see cref="HtmlReader"/>/<see cref="HtmlWriter"/>, and the resulting
/// <c>&lt;body&gt;</c> fragment is loaded as XML.
/// </summary>
public class XHtmlTools
{
    // Greedy inner match so the fragment runs up to the LAST </body> in the text.
    private const string RegBody = @"<body[\s\S]*?>(?<body>[\s\S]*)</body>";

    // XML prolog plus an internal DTD subset declaring the HTML named entities that
    // may survive conversion; without it LoadXml rejects references such as &nbsp;.
    // Built once as a constant instead of being re-assembled on every call.
    private const string XmlHead =
        "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" +
        "<!DOCTYPE ARTICLE[" +
        "<!ENTITY nbsp \" \"><!ENTITY iexcl \"¡\"><!ENTITY cent \"¢\"><!ENTITY pound \"£\"><!ENTITY curren \"¤\"><!ENTITY yen \"¥\">" +
        "<!ENTITY brvbar \"¦\"><!ENTITY sect \"§\"><!ENTITY uml \"¨\"><!ENTITY copy \"©\"><!ENTITY ordf \"ª\"><!ENTITY laquo \"«\">" +
        "<!ENTITY not \"¬\"><!ENTITY shy \"-\"><!ENTITY reg \"®\"><!ENTITY macr \"¯\"><!ENTITY deg \"°\"><!ENTITY plusmn \"±\">" +
        "<!ENTITY sup2 \"²\"><!ENTITY sup3 \"³\"><!ENTITY acute \"´\"><!ENTITY micro \"µ\"><!ENTITY para \"¶\"><!ENTITY middot \"·\">" +
        "<!ENTITY cedil \"¸\"><!ENTITY sup1 \"¹\"><!ENTITY ordm \"º\"><!ENTITY raquo \"»\"><!ENTITY frac14 \"¼\"><!ENTITY frac12 \"½\">" +
        "<!ENTITY frac34 \"¾\"><!ENTITY iquest \"¿\"><!ENTITY times \"×\"><!ENTITY divide \"÷\"><!ENTITY Agrave \"À\"><!ENTITY Aacute \"Á\">" +
        "<!ENTITY Acirc \"Â\"><!ENTITY Atilde \"Ã\"><!ENTITY Auml \"Ä\"><!ENTITY Aring \"Å\"><!ENTITY AElig \"Æ\"><!ENTITY Ccedil \"Ç\">" +
        "<!ENTITY Egrave \"È\"><!ENTITY Eacute \"É\"><!ENTITY Ecirc \"Ê\"><!ENTITY Euml \"Ë\"><!ENTITY Igrave \"Ì\"><!ENTITY Iacute \"Í\">" +
        "<!ENTITY Icirc \"Î\"><!ENTITY Iuml \"Ï\"><!ENTITY ETH \"Ð\"><!ENTITY Ntilde \"Ñ\"><!ENTITY Ograve \"Ò\"><!ENTITY Oacute \"Ó\">" +
        "<!ENTITY Ocirc \"Ô\"><!ENTITY Otilde \"Õ\"><!ENTITY Ouml \"Ö\"><!ENTITY Oslash \"Ø\"><!ENTITY Ugrave \"Ù\"><!ENTITY Uacute \"Ú\">" +
        "<!ENTITY Ucirc \"Û\"><!ENTITY Uuml \"Ü\"><!ENTITY Yacute \"Ý\"><!ENTITY THORN \"Þ\"><!ENTITY szlig \"ß\"><!ENTITY agrave \"à\">" +
        "<!ENTITY aacute \"á\"><!ENTITY acirc \"â\"><!ENTITY atilde \"ã\"><!ENTITY auml \"ä\"><!ENTITY aring \"å\"><!ENTITY aelig \"æ\">" +
        "<!ENTITY ccedil \"ç\"><!ENTITY egrave \"è\"><!ENTITY eacute \"é\"><!ENTITY ecirc \"ê\"><!ENTITY euml \"ë\"><!ENTITY igrave \"ì\">" +
        "<!ENTITY iacute \"í\"><!ENTITY icirc \"î\"><!ENTITY iuml \"ï\"><!ENTITY eth \"ð\"><!ENTITY ntilde \"ñ\"><!ENTITY ograve \"ò\">" +
        "<!ENTITY oacute \"ó\"><!ENTITY ocirc \"ô\"><!ENTITY otilde \"õ\"><!ENTITY ouml \"ö\"><!ENTITY oslash \"ø\"><!ENTITY ugrave \"ù\">" +
        "<!ENTITY uacute \"ú\"><!ENTITY ucirc \"û\"><!ENTITY uuml \"ü\"><!ENTITY yacute \"ý\"><!ENTITY thorn \"þ\"><!ENTITY yuml \"ÿ\">" +
        "<!ENTITY lsquo \"‘\"><!ENTITY rsquo \"’\"><!ENTITY ldquo \"“\"><!ENTITY rdquo \"”\"><!ENTITY sbquo \"'\"><!ENTITY mdash \"—\">" +
        "<!ENTITY Prime \"′\"><!ENTITY hellip \"…\">" +
        "]>";

    /// <summary>
    /// Builds an XML document from raw HTML bytes.
    /// </summary>
    /// <param name="html">Raw page bytes as downloaded; may be null.</param>
    /// <returns>
    /// The parsed document, or null when <paramref name="html"/> is null, the
    /// conversion yields nothing, or the converted markup is not well-formed XML.
    /// </returns>
    public XmlDocument GetXmlDocument(byte[] html)
    {
        if (html == null)
        {
            return null;
        }

        string xml = Convert(html);
        if (string.IsNullOrEmpty(xml))
        {
            return null;
        }

        try
        {
            XmlDocument xmlDoc = new XmlDocument();
            // No resolver: nothing external is fetched while loading the DTD subset.
            xmlDoc.XmlResolver = null;
            xmlDoc.LoadXml(XmlHead + xml);
            return xmlDoc;
        }
        catch (XmlException)
        {
            // Malformed output is reported to the caller as "no document".
            return null;
        }
    }

    /// <summary>
    /// Converts HTML bytes to an XML string containing only the body element.
    /// </summary>
    /// <param name="html">Raw page bytes.</param>
    /// <returns>
    /// The <c>&lt;body&gt;…&lt;/body&gt;</c> fragment when one is found, the whole
    /// converted text when no body element matches, or <c>&lt;body&gt;&lt;/body&gt;</c>
    /// when conversion produced nothing.
    /// </returns>
    public string Convert(byte[] html)
    {
        string xml = string.Empty;
        try
        {
            using (HtmlReader reader = new HtmlReader(GetString(html)))
            {
                StringBuilder sb = new StringBuilder();
                using (HtmlWriter writer = new HtmlWriter(sb))
                {
                    while (!reader.EOF)
                    {
                        writer.WriteNode(reader, true);
                    }
                }

                xml = sb.ToString();
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: any conversion failure falls through to the
            // empty-body result below rather than propagating to the caller.
        }

        Match match = Regex.Match(xml, RegBody, RegexOptions.IgnoreCase);
        if (match.Success)
        {
            xml = match.Value;
        }

        if (string.IsNullOrEmpty(xml))
        {
            xml = "<body></body>";
        }

        return xml;
    }

    /// <summary>
    /// Detects the character encoding of <paramref name="buffer"/> and decodes it.
    /// </summary>
    /// <param name="buffer">Raw bytes; may be null or empty.</param>
    /// <returns>
    /// The decoded text, or an empty string when the buffer is null/empty, no
    /// charset is detected, or the detected charset name is not a known encoding.
    /// </returns>
    public string GetString(byte[] buffer)
    {
        string result = string.Empty;
        if (buffer == null || buffer.Length == 0)
        {
            return result;
        }

        UniversalDetector det = new UniversalDetector(null);
        using (MemoryStream msTemp = new MemoryStream(buffer))
        {
            byte[] detectBuff = new byte[4096];
            int detLen;
            while ((detLen = msTemp.Read(detectBuff, 0, detectBuff.Length)) > 0 && !det.IsDone())
            {
                // Feed only the bytes actually read; the tail of the scratch buffer
                // holds stale data from the previous chunk (or zeros).
                det.HandleData(detectBuff, 0, detLen);
            }
        }

        det.DataEnd();

        string charset = det.GetDetectedCharset();
        if (charset != null)
        {
            try
            {
                result = System.Text.Encoding.GetEncoding(charset).GetString(buffer);
            }
            catch (ArgumentException)
            {
                // Charset name not supported on this platform: keep the empty result.
            }
        }

        return result;
    }
}

/// <summary>
/// SgmlReader preconfigured for the HTML doctype that skips <c>head</c> and
/// <c>script</c> elements while reading.
/// </summary>
public class HtmlReader : Sgml.SgmlReader
{
    /// <summary>Reads HTML from an existing text reader.</summary>
    /// <param name="reader">Source of the HTML text.</param>
    public HtmlReader(TextReader reader) : base()
    {
        base.InputStream = reader;
        base.DocType = "HTML";
    }

    /// <summary>Reads HTML from a string, HTML-decoding it first.</summary>
    /// <param name="content">HTML text.</param>
    public HtmlReader(string content) : base()
    {
        // NOTE(review): decoding entities before parsing turns escaped text such as
        // "&lt;b&gt;" into real markup; kept as-is because callers may depend on it.
        base.InputStream = new StringReader(System.Web.HttpUtility.HtmlDecode(content));
        base.DocType = "HTML";
    }

    /// <summary>
    /// Advances to the next node, skipping the content of head and script elements.
    /// </summary>
    /// <returns>
    /// The result of the underlying read; false when the underlying reader throws
    /// (the message is written to the console).
    /// </returns>
    public override bool Read()
    {
        bool status = false;
        try
        {
            status = base.Read();
            if (status && base.NodeType == XmlNodeType.Element && IsSkippedElement(base.Name))
            {
                base.Skip();
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }

        return status;
    }

    // Tag names are identifiers, so compare ordinally: a culture-sensitive
    // comparison fails to match "SCRIPT" to "script" under e.g. Turkish culture.
    private static bool IsSkippedElement(string name)
    {
        return string.Equals(name, "head", StringComparison.OrdinalIgnoreCase)
            || string.Equals(name, "script", StringComparison.OrdinalIgnoreCase);
    }
}

/// <summary>
/// XML writer that sanitises what the HTML reader produces: drops CDATA,
/// comments and most whitespace, strips prefixes and illegal characters from
/// element/attribute names, and lower-cases them.
/// </summary>
public class HtmlWriter : XmlTextWriter
{
    // Characters removed from element and attribute names.
    // NOTE(review): the last two entries look identical here; one of them may
    // originally have been a non-breaking or full-width space — verify against
    // the original source archive.
    private static readonly char[] chArrFilter = new char[] { '\'', '=', '?', '\"', '.', ';', ':', ')', '(', ' ', ' ' };

    /// <summary>Writes to the given text writer.</summary>
    public HtmlWriter(TextWriter writer) : base(writer) { }

    /// <summary>Writes into the given string builder.</summary>
    public HtmlWriter(StringBuilder builder) : base(new StringWriter(builder)) { }

    /// <summary>Writes to a stream using the given encoding.</summary>
    public HtmlWriter(Stream stream, Encoding enc) : base(stream, enc) { }

    /// <summary>CDATA sections are intentionally discarded.</summary>
    public override void WriteCData(string text)
    {
    }

    /// <summary>Comments are intentionally discarded.</summary>
    public override void WriteComment(string text)
    {
    }

    /// <summary>
    /// Writes whitespace unless it contains a CR-LF pair or a tab, or is exactly
    /// one space.
    /// </summary>
    public override void WriteWhitespace(string ws)
    {
        if (ws.IndexOf("\r\n", StringComparison.Ordinal) > -1 || ws.IndexOf('\t') > -1)
        {
            return;
        }

        if (ws != " ")
        {
            base.WriteWhitespace(ws);
        }
    }

    /// <summary>
    /// Starts an element with a cleaned, lower-cased, namespace-free name.
    /// Nothing is written when <paramref name="localName"/> is empty.
    /// </summary>
    public override void WriteStartElement(string prefix, string localName, string ns)
    {
        if (localName == "")
        {
            return;
        }

        // Guard against names that still carry a prefix ("o:p" -> "p").
        int index = localName.LastIndexOf(':');
        if (index > -1)
        {
            localName = localName.Substring(index + 1);
        }

        // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a
        // dotless 'ı' under Turkish culture and would corrupt tag names.
        localName = string.Join("", localName.Split(chArrFilter)).ToLowerInvariant();
        base.WriteStartElement("", localName, "");
    }

    /// <summary>
    /// Copies the attributes at the reader's current position, dropping prefixed
    /// or xml-reserved names and stripping illegal characters from the rest.
    /// </summary>
    /// <param name="reader">Reader positioned on an element, XML declaration or attribute.</param>
    /// <param name="defattr">Passed through on the recursive call; otherwise unused here.</param>
    public override void WriteAttributes(XmlReader reader, bool defattr)
    {
        if (reader.NodeType == XmlNodeType.Element || reader.NodeType == XmlNodeType.XmlDeclaration)
        {
            if (reader.MoveToFirstAttribute())
            {
                // Re-enter on the first attribute, then restore the position.
                this.WriteAttributes(reader, defattr);
                reader.MoveToElement();
            }

            return;
        }

        if (reader.NodeType != XmlNodeType.Attribute)
        {
            return;
        }

        do
        {
            string localName = reader.LocalName.ToLowerInvariant();

            // Skip prefixed and xml* attributes, with xml:space as the one exception.
            if (localName != "xml:space"
                && (localName.LastIndexOf(':') > -1 || localName.StartsWith("xml", StringComparison.Ordinal)))
            {
                continue;
            }

            localName = string.Join("", localName.Split(chArrFilter));
            if (localName == "")
            {
                continue;
            }

            this.WriteStartAttribute("", localName, "");
            while (reader.ReadAttributeValue())
            {
                string value = reader.Value;
                if (value == "")
                {
                    continue;
                }

                this.WriteString(value);
            }

            this.WriteEndAttribute();
        }
        while (reader.MoveToNextAttribute());
    }
}
上述源码及DLL : https://files.cnblogs.com/08shiyan/XHtmlTools.zip
下面再说一下解析XML,我利用的是 XPath:
XPath 和 jQuery所支持的选择器有一定的相似之处,借助jQuery所支持的选择器去理解XPath会更容易一些。
http://www.cnblogs.com/08shiyan/archive/2013/05/02/3055078.html
续:
由 imfunny 分享的 HtmlAgilityPack,开源的力量很强大!
HtmlAgilityPack地址:http://htmlagilitypack.codeplex.com/