这两天写了一个采集新蛋网手机信息的小程序,网页内容通过 WebClient 类下载。使用时调用 Gather(startIndex, endIndex) 方法即可。希望能有帮助。
代码如下:
/*
 * Created By ChinaAgan 2012-1-18
 */
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using CnBlogCollector.Properties;

namespace CnBlogCollector
{
    /// <summary>
    /// Data collector: downloads numbered listing pages from newegg.com.cn,
    /// extracts product names and prices with regular expressions, and writes
    /// one text file per page into the configured output folder.
    /// </summary>
    public class Collector
    {
        #region Fields

        // URL template of the listing pages; {0} is replaced by the page index.
        private string cnblogMain = "http://www.newegg.com.cn/SubCategory/1043-{0}.htm";

        private WebClient wc = new WebClient();

        // The patterns are constant, so each regex is built once and reused for every page.
        private static readonly Regex NameRegex = new Regex(
            @"<p\s+class=""info""><a\s+href=(?<url>.+?)\s+title=""(?<title>.+?)"">(?<content>.+?)</a>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly Regex OldPriceRegex = new Regex(
            @"<p\s+class=""bypast""><span>¥(?<OldPrice>.+?)</span></p>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        private static readonly Regex NewPriceRegex = new Regex(
            @"<p\s+class=""current""><strong\s+class=""price""><span>¥</span>(?<NewPrice>\d+?\..+?)</strong></p>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        #endregion

        #region Create folder

        /// <summary>
        /// Ensures that a directory exists, creating it when it does not.
        /// </summary>
        /// <param name="path">Directory path, absolute or relative.</param>
        /// <returns>The full path of the directory as returned by <see cref="Path.GetFullPath(string)"/>.</returns>
        public string CreateFolderIfNot(string path)
        {
            string fullPath = Path.GetFullPath(path);
            if (!Directory.Exists(fullPath))
            {
                Directory.CreateDirectory(fullPath);
            }
            return fullPath;
        }

        #endregion

        #region Gather page data

        /// <summary>
        /// Downloads every listing page with an index in [startIndex, endIndex) and
        /// writes the extracted "name, old price, current price" lines to
        /// "&lt;index&gt;.txt" inside the folder named by <c>Settings.Default.OutPath</c>.
        /// An existing file for the same index is overwritten.
        /// </summary>
        /// <param name="startIndex">First page index to download (inclusive).</param>
        /// <param name="endIndex">Page index at which to stop (exclusive).</param>
        public void Gather(int startIndex, int endIndex)
        {
            // NOTE(review): the proxy host and credentials are hard-coded placeholders;
            // they should come from configuration rather than source code.
            WebProxy webProxy = new WebProxy("proxy.cn1.global.***.com:8080");
            webProxy.Credentials = new System.Net.NetworkCredential("user", "password");
            wc.Proxy = webProxy;

            for (int i = startIndex; i < endIndex; i++)
            {
                // The page bytes are decoded as GB2312 and the output file is written
                // with the same encoding.
                Encoding gb2312 = Encoding.GetEncoding("GB2312");

                // Line breaks are stripped so that each pattern can match markup
                // that spans several source lines.
                string url = string.Format(cnblogMain, i.ToString());
                string mainData = gb2312.GetString(wc.DownloadData(url)).Replace("\r\n", "");

                List<string> nameList = CollectGroup(NameRegex, mainData, "content");
                List<string> oldPriceList = CollectGroup(OldPriceRegex, mainData, "OldPrice");
                List<string> newPriceList = CollectGroup(NewPriceRegex, mainData, "NewPrice");

                StringBuilder outContent = new StringBuilder();
                for (int iLen = 0; iLen < nameList.Count; iLen++)
                {
                    // The three lists are matched independently, so a price list can be
                    // shorter than the name list; a missing entry is written as empty
                    // text instead of throwing ArgumentOutOfRangeException.
                    // NOTE(review): when an item in the middle of the page lacks a price,
                    // later prices shift up by one row; fixing that needs per-item parsing.
                    outContent.Append(String.Format(
                        "手机名称:{0},原价:{1},现价:{2}",
                        nameList[iLen],
                        ValueAtOrEmpty(oldPriceList, iLen),
                        ValueAtOrEmpty(newPriceList, iLen)));
                    outContent.Append("\r\n");
                }

                // TODO (from the original author): post-process the current price and
                // character entities such as "&32;".

                // Path.Combine inserts the directory separator when the configured folder
                // does not end with one, so the file always lands inside the folder.
                string pth = Path.Combine(CreateFolderIfNot(Settings.Default.OutPath), i.ToString() + ".txt");

                // WriteAllText creates the file or replaces its previous content.
                File.WriteAllText(pth, outContent.ToString(), gb2312);
            }
        }

        /// <summary>
        /// Returns the value of one named group for every match of a regex in the input.
        /// </summary>
        private static List<string> CollectGroup(Regex regex, string input, string groupName)
        {
            List<string> values = new List<string>();
            foreach (Match match in regex.Matches(input))
            {
                values.Add(match.Groups[groupName].Value);
            }
            return values;
        }

        /// <summary>
        /// Returns the list element at the index, or an empty string when the index is past the end.
        /// </summary>
        private static string ValueAtOrEmpty(List<string> values, int index)
        {
            return index < values.Count ? values[index] : String.Empty;
        }

        #endregion
    }
}
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步