利用SgmlReader获取网页源代码,并通过XPath提取所需内容
1. 根据SgmlReader类获得完整的HTML代码
/// <summary>
/// Downloads the page at <paramref name="uri"/> and re-serializes it through
/// SgmlReader into indented XML text.
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>
/// The converted markup; <c>null</c> when <paramref name="uri"/> is null or empty;
/// or the exception message when the download or conversion fails. Callers that
/// pass the result on to an XML parser should be aware that a failure message is
/// not markup.
/// </returns>
private string GetWellFormedHTML(string uri)
{
    // Guard before any resource is created; also covers null, which previously
    // slipped past the String.Empty comparison.
    if (string.IsNullOrEmpty(uri))
    {
        return null;
    }

    try
    {
        string strWebContent;
        using (WebClient webclient = new WebClient())
        {
            // NOTE(review): the page is always decoded as UTF-8; pages served in
            // another encoding will come out garbled - confirm this is acceptable.
            webclient.Encoding = Encoding.UTF8;
            strWebContent = webclient.DownloadString(uri);
        }

        // 'using' releases every reader/writer on both the success and the failure
        // path, so the catch block no longer has to touch possibly-null references.
        using (SgmlReader reader = new SgmlReader())
        using (StringWriter sw = new StringWriter())
        using (XmlTextWriter writer = new XmlTextWriter(sw))
        {
            reader.DocType = "HTML";
            reader.InputStream = new StringReader(strWebContent);
            writer.Formatting = System.Xml.Formatting.Indented;

            // NOTE(review): WriteNode itself advances the reader past the node it
            // copies, so the following Read() may skip a top-level node that is not
            // separated by whitespace - verify against XmlWriter.WriteNode docs.
            while (reader.Read())
            {
                if (reader.NodeType != XmlNodeType.Whitespace)
                {
                    writer.WriteNode(reader, true);
                }
            }

            // Push any pending output into the StringWriter before reading it back.
            writer.Flush();
            return sw.ToString();
        }
    }
    catch (Exception exp)
    {
        // Report the real failure. The previous cleanup code here dereferenced a
        // reader that was never assigned and always threw NullReferenceException.
        return exp.Message;
    }
}
2. 根据XPath规则进行查找
/// <summary>
/// Loads the given markup into an XPathDocument and gathers every node selected
/// by an XPath expression.
/// </summary>
/// <param name="htmlStr">Markup to query; it is parsed by XPathDocument.</param>
/// <param name="xpath">XPath expression selecting the wanted nodes.</param>
/// <returns>
/// For each selected node, in iteration order, one line holding its inner XML
/// followed by one line holding its text value; an empty string when nothing is selected.
/// </returns>
private string GetResult(string htmlStr, string xpath)
{
    StringBuilder output = new StringBuilder();
    XPathNavigator root = new XPathDocument(new StringReader(htmlStr)).CreateNavigator();

    foreach (XPathNavigator match in root.Select(xpath))
    {
        // First the markup of the node's children, then its plain text value.
        output.AppendLine(match.InnerXml).AppendLine(match.Value);
    }

    return output.ToString();
}
完!