Web Scraping: A cnblogs Highlights Client
Learning on cnblogs is already convenient, but it would be even better as a client that fetches exactly the data you want!
So what are the requirements?
1. Only show posts with more than 2 recommendations.
2. Only show posts about C# or Java.
3. Show news items, again only those with more than 2 recommendations.
4. Search for posts, and only keep results with more than 2 recommendations.
Let's take a quick look at the finished product first.
In the list, the left column is the recommendation count (I read the highly recommended ones first anyway), the middle is the title, and the right is the date. As for the other details, er, I don't really care about them; clicking a row simply opens the post in the browser.
Er, that's roughly the plan. So what does it take to build all this? First I need to prepare some general-purpose classes: a helper for web requests, a helper for compressing and decompressing Gzip-encoded pages, and a helper for parsing HTML strings.
- Web request helper: WebHelper
```csharp
public class WebHelper
{
    public readonly WebClient Web = new WebClient();
    // number of retry attempts after an error
    private int _tryTimes;

    public Encoding Encoding
    {
        set { Web.Encoding = value; }
    }

    public WebHelper()
    {
        Web.Encoding = Encoding.UTF8;
    }

    public WebHelper(Encoding encoding)
    {
        Web.Encoding = encoding;
    }

    /// <summary>
    /// Download the requested resource
    /// </summary>
    /// <param name="url">URL</param>
    /// <returns></returns>
    public string DownloadString(string url)
    {
        try
        {
            return Web.DownloadString(url);
        }
        catch (WebException e)
        {
            if (e.Message.Contains("404")
                || e.Status == WebExceptionStatus.ConnectFailure
                || e.Status == WebExceptionStatus.ProtocolError
                || _tryTimes == 2)
            {
                _tryTimes = 0;
                return null;
            }
            _tryTimes++;
            return DownloadString(url);
        }
    }

    /// <summary>
    /// Upload the given string to the given resource
    /// </summary>
    /// <param name="address">address</param>
    /// <param name="data">form data</param>
    /// <returns></returns>
    public string UploadString(string address, string data)
    {
        Web.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
        try
        {
            return Web.UploadString(address, "POST", data);
        }
        catch
        {
            if (_tryTimes == 2)
            {
                _tryTimes = 0;
                return null;
            }
            _tryTimes++;
            return UploadString(address, data);
        }
    }

    /// <summary>
    /// Download the requested resource (served with Gzip compression)
    /// </summary>
    /// <param name="url">URL</param>
    /// <param name="encoding">page encoding</param>
    /// <returns></returns>
    public string DownloadGzipString(string url, Encoding encoding)
    {
        Web.Headers.Add("Accept-Encoding", "gzip");
        try
        {
            return encoding.GetString(ZipHelper.GzipDecompress(Web.DownloadData(url)));
        }
        catch (WebException e)
        {
            if (e.Message.Contains("404")
                || e.Status == WebExceptionStatus.ConnectFailure
                || e.Status == WebExceptionStatus.ProtocolError
                || _tryTimes == 2)
            {
                _tryTimes = 0;
                return null;
            }
            _tryTimes++;
            return DownloadGzipString(url, encoding);
        }
        finally
        {
            Web.Headers.Remove("Accept-Encoding");
        }
    }
}
```
There are three methods here. DownloadString and UploadString behave just like the corresponding methods on the .NET Framework WebClient, and the extra DownloadGzipString is for GETting a page that is served Gzip-compressed. I wrapped DownloadString and UploadString mostly out of laziness: sometimes a request throws an exception even though the page is perfectly fetchable, and retrying gets it, so the wrappers automatically attempt the request up to 3 times and return null only if all 3 attempts fail.

There is also the case where a proxy is needed. Since few places need one, and proxy IP/port pairs usually have to be paid for, I won't paste the proxied-request code here. I did once buy proxies for a couple of days to play with; my approach back then was to add a ProxyPool class. The pool fetches the currently usable proxies from a proxy site, usually a dozen or so at a time, and stores them; when a site needs a proxy, you take one from the pool, set WebClient.Proxy = new WebProxy(host, port);, and then make the request. Of course proxies aren't always reliable, so don't lose heart when one fails: try another, and one of them will work. When you need to request pages from multiple threads, just new up several WebHelper instances; they can all share the same ProxyPool. A rough sketch of that idea is below.
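The project doesn't include the proxy code, so the following is only a minimal sketch of the ProxyPool idea described above. The class shape, the vendor URL, and the "host:port" response format are my assumptions for illustration, not part of the original project.

```csharp
using System;
using System.Collections.Generic;
using System.Net;

// Minimal sketch of the ProxyPool idea (not from the original project).
// The vendor endpoint and its "host:port" line format are hypothetical placeholders.
public class ProxyPool
{
    private readonly object _lock = new object();
    private readonly Queue<WebProxy> _proxies = new Queue<WebProxy>();

    /// <summary>Fetch a fresh batch of proxies (typically a dozen or so) into the pool.</summary>
    public void Refill()
    {
        string text;
        using (var web = new WebClient())
        {
            // Hypothetical proxy-vendor API returning one "host:port" per line.
            text = web.DownloadString("http://example-proxy-vendor/api/get?count=15");
        }
        lock (_lock)
        {
            foreach (var line in text.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries))
            {
                var parts = line.Split(':');
                _proxies.Enqueue(new WebProxy(parts[0], int.Parse(parts[1])));
            }
        }
    }

    /// <summary>Take one proxy from the pool, refilling first if it has run dry.</summary>
    public WebProxy Take()
    {
        lock (_lock)
        {
            if (_proxies.Count == 0)
            {
                Refill(); // the lock is reentrant for the same thread, so this is safe, if a bit blocking
            }
            return _proxies.Count > 0 ? _proxies.Dequeue() : null;
        }
    }
}
```

With a shared pool like this, each WebHelper could set Web.Proxy = pool.Take(); before a request and swap in a different proxy when one fails, which matches the retry spirit of the class above.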
- Gzip compression and decompression helper: ZipHelper
```csharp
public class ZipHelper
{
    /// <summary>
    /// Gzip compression
    /// </summary>
    /// <param name="cbytes">data to compress</param>
    /// <returns></returns>
    public static byte[] GzipCompress(byte[] cbytes)
    {
        using (MemoryStream cms = new MemoryStream())
        {
            using (GZipStream gzip = new GZipStream(cms, CompressionMode.Compress))
            {
                // writing to the underlying stream compresses the data on the fly
                gzip.Write(cbytes, 0, cbytes.Length);
            }
            return cms.ToArray();
        }
    }

    /// <summary>
    /// Gzip decompression
    /// </summary>
    /// <param name="cbytes">data to decompress</param>
    /// <returns></returns>
    public static byte[] GzipDecompress(byte[] cbytes)
    {
        using (MemoryStream dms = new MemoryStream())
        {
            using (MemoryStream cms = new MemoryStream(cbytes))
            {
                using (GZipStream gzip = new GZipStream(cms, CompressionMode.Decompress))
                {
                    byte[] bytes = new byte[1024];
                    int len = 0;
                    // reading from the compressed stream decompresses the data on the fly
                    while ((len = gzip.Read(bytes, 0, bytes.Length)) > 0)
                    {
                        dms.Write(bytes, 0, len);
                    }
                    return dms.ToArray();
                }
            }
        }
    }
}
```
- HTML string parsing helper: StringHelper
```csharp
public class StringHelper
{
    /// <summary>
    /// Walk through str and extract every value between startStr and endStr
    /// </summary>
    /// <param name="str">input string</param>
    /// <param name="startStr">start marker</param>
    /// <param name="endStr">end marker</param>
    /// <param name="remove">whether to strip the start and end markers from each result</param>
    /// <returns></returns>
    public static List<string> GetList(string str, string startStr, string endStr, bool remove = true)
    {
        var lst = new List<string>();
        int startIndex = 0;
        while (true)
        {
            string v = GetVal(str, startStr, endStr, remove, ref startIndex);
            if (startIndex == -1)
            {
                break;
            }
            lst.Add(v);
        }
        return lst;
    }

    public static string GetVal(string str, string startStr, string endStr, bool remove = true, int startIndex = 0)
    {
        return GetVal(str, startStr, endStr, remove, ref startIndex);
    }

    private static string GetVal(string str, string startStr, string endStr, bool remove, ref int startIndex)
    {
        int istart = str.IndexOf(startStr, startIndex, StringComparison.CurrentCulture);
        if (istart == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        int iend = str.IndexOf(endStr, istart + startStr.Length, StringComparison.Ordinal);
        if (iend == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        startIndex = iend + endStr.Length;
        if (remove)
        {
            istart += startStr.Length;
            return str.Substring(istart, iend - istart);
        }
        return str.Substring(istart, startIndex - istart);
    }

    /// <summary>
    /// Walk through str and extract every value that follows startStr
    /// </summary>
    /// <param name="str">input string</param>
    /// <param name="startStr">start marker</param>
    /// <param name="needLength">number of characters to take (not counting the start marker)</param>
    /// <param name="remove">whether to strip the start marker from each result</param>
    /// <returns></returns>
    public static List<string> GetList(string str, string startStr, int needLength, bool remove = true)
    {
        var lst = new List<string>();
        int startIndex = 0;
        while (true)
        {
            string v = GetVal(str, startStr, needLength, remove, ref startIndex);
            if (startIndex == -1)
            {
                break;
            }
            lst.Add(v);
        }
        return lst;
    }

    public static string GetVal(string str, string startStr, int needLength, bool remove = true, int startIndex = 0)
    {
        return GetVal(str, startStr, needLength, remove, ref startIndex);
    }

    public static string GetVal(string str, string startStr, int needLength, bool remove, ref int startIndex)
    {
        int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
        if (istart == -1)
        {
            startIndex = -1;
            return string.Empty;
        }
        startIndex = istart + startStr.Length + needLength;
        if (startIndex > str.Length)
        {
            startIndex = -1;
            return string.Empty;
        }
        return remove
            ? str.Substring(istart + startStr.Length, needLength)
            : str.Substring(istart, startStr.Length + needLength);
    }

    /// <summary>
    /// Get every href link in the string
    /// </summary>
    /// <param name="str">input string</param>
    /// <returns></returns>
    public static List<string> GetUrls(string str)
    {
        return GetList(str, "href=\"", "\"");
    }

    /// <summary>
    /// Get the first href link in the string
    /// </summary>
    /// <param name="str"></param>
    /// <returns></returns>
    public static string GetUrl(string str)
    {
        return GetVal(str, "href=\"", "\"");
    }

    public static string ToGB2312(string str)
    {
        string r = "";
        MatchCollection mc = Regex.Matches(str, @"\\u([\w]{2})([\w]{2})",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);
        var bts = new byte[2];
        foreach (Match m in mc)
        {
            bts[0] = (byte)int.Parse(m.Groups[2].Value, NumberStyles.HexNumber);
            bts[1] = (byte)int.Parse(m.Groups[1].Value, NumberStyles.HexNumber);
            r += Encoding.Unicode.GetString(bts);
        }
        return r;
    }

    /// <summary>
    /// Strip all HTML tags from the string
    /// </summary>
    /// <param name="html"></param>
    /// <returns></returns>
    public static string RemoveHTMLTags(string html)
    {
        Regex regex = new Regex(@"<[^>]+>|</[^>]+>");
        return regex.Replace(html, "");
    }
}
```
This class mainly provides GetList, GetVal, and RemoveHTMLTags; they do all of the scraper's parsing, and you will see how they are used below.
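To make their behavior concrete before moving on, here is a tiny usage sketch; the sample HTML string is made up.

```csharp
// Tiny usage sketch for StringHelper; the sample HTML is made up.
string html = "<a href=\"https://example.com/a\">First</a><a href=\"https://example.com/b\">Second</a>";

// Every value between "href=\"" and the next quote: both URLs.
var urls = StringHelper.GetUrls(html);                // ["https://example.com/a", "https://example.com/b"]

// The value between the first "\">" and the next "<": the first link's text.
var first = StringHelper.GetVal(html, "\">", "<");    // "First"

// Strip every tag, leaving only the text.
var text = StringHelper.RemoveHTMLTags(html);         // "FirstSecond"
```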
That about covers the general-purpose classes; now for the actual construction.
- First, the key methods for fetching posts, fetching news, and searching posts
```csharp
/// <summary>
/// Fetch blog posts with more than 2 recommendations
/// </summary>
/// <param name="pageIndex"></param>
private bool AddPost(int pageIndex)
{
    var url = "https://www.cnblogs.com/mvc/AggSite/PostList.aspx";
    var html = _web.UploadString(url, GetUrl() + pageIndex);
    var posts = StringHelper.GetList(html, "\"post_item", "\"article_comment");
    if (posts.Count == 0)
    {
        return false;
    }
    foreach (var item in posts)
    {
        var n = StringHelper.GetVal(item, "\"diggnum", "/span>");
        var diggnum = Convert.ToInt32(StringHelper.GetVal(n, ">", "<"));
        if (diggnum < 3)
        {
            continue;
        }
        var t = StringHelper.GetVal(item, "\"titlelnk", "/a>");
        var title = StringHelper.GetVal(t, ">", "<");
        var time = StringHelper.GetVal(item, "发布于 ", 16);
        _urls.Add(StringHelper.GetUrl(t));
        lstPost.Items.Add($"{diggnum} {title} {time}");
    }
    return true;
}

/// <summary>
/// Add blog posts from search results
/// </summary>
/// <param name="pageIndex">page number</param>
private bool AddSearchPost(int pageIndex)
{
    var url = $"http://zzk.cnblogs.com/s/blogpost?Keywords={txtSearch.Text.Trim()}&pageindex={pageIndex}";
    var html = _web.DownloadGzipString(url, Encoding.UTF8);
    var posts = StringHelper.GetList(html, "\"searchItem", "\"searchItemInfo-comments");
    if (posts.Count == 0)
    {
        return false;
    }
    foreach (var item in posts)
    {
        var diggnum = StringHelper.GetVal(item, ">推荐(", ")");
        var n = StringHelper.GetVal(item, "searchItemTitle\">", "</h3>");
        var title = StringHelper.RemoveHTMLTags(StringHelper.GetVal(n, "\">", "</a>"));
        var date = StringHelper.GetVal(item, "searchItemInfo-publishDate\">", "</span>");
        _urls.Add(StringHelper.GetUrl(n));
        lstPost.Items.Add($"{diggnum} {title} {date}");
    }
    return true;
}

/// <summary>
/// Fetch news items with more than 2 recommendations
/// </summary>
/// <param name="pageIndex"></param>
private bool AddNews(int pageIndex)
{
    var url = "https://www.cnblogs.com/mvc/AggSite/NewsList.aspx";
    var html = _web.UploadString(url,
        "CategoryId=-1&CategoryType=News&ItemListActionName=NewsList&ItemListActionName=NewsList&PageIndex=" + pageIndex);
    var posts = StringHelper.GetList(html, "\"post_item", "\"article_comment");
    if (posts.Count == 0)
    {
        return false;
    }
    foreach (var item in posts)
    {
        var n = StringHelper.GetVal(item, "\"diggnum", "/span>");
        var diggnum = Convert.ToInt32(StringHelper.GetVal(n, ">", "<"));
        if (diggnum < 3)
        {
            continue;
        }
        var t = StringHelper.GetVal(item, "\"titlelnk", "/a>");
        var title = StringHelper.GetVal(t, ">", "<");
        var time = StringHelper.GetVal(item, "发布于 ", 16);
        var link = StringHelper.GetUrl(t);
        if (!link.Contains("http"))
        {
            link = "https:" + link;
        }
        _urls.Add(link);
        lstPost.Items.Add($"{diggnum} {title} {time}");
    }
    return true;
}
```
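Each method pushes a post's URL into _urls in the same order the rows are added to lstPost, so opening a clicked row in the browser (as promised earlier) only needs something like the sketch below. The handler name and the event it is wired to are my assumption rather than code from the project.

```csharp
// Hypothetical click handler: open the clicked row's post in the default browser.
// _urls is filled in the same order as lstPost.Items by AddPost/AddSearchPost/AddNews.
private void lstPost_Click(object sender, EventArgs e)
{
    int i = lstPost.SelectedIndex;
    if (i >= 0 && i < _urls.Count)
    {
        System.Diagnostics.Process.Start(_urls[i]);
    }
}
```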
Better to teach a man to fish than to hand him a fish, so what is actually going on in those methods?
Open the cnblogs home page, press F12, click "next page", and watch the requests: a quick look shows that PostList.aspx is where the data comes from. Among its parameters, CategoryId is the category ID. CategoryType is the category kind; so far I have seen two values, SiteHome and TopSiteCategory: clicking a parent category sends TopSiteCategory, clicking a sub-category sends SiteHome. PageIndex is, obviously, the current page. ParentCategoryId is the parent category's ID, and it only needs to be filled in when a sub-category is clicked. All of that is still just the endpoint for fetching posts; the search endpoint and the news endpoint work much the same way, so I'll leave them for you to explore. The code above also calls a GetUrl method, which exists purely to fill in these parameters, so here it is too:
```csharp
private string GetUrl()
{
    string categoryId = "808";
    string categoryType = "SiteHome";
    string parentCategoryId = "0";
    switch (cbbCate.SelectedIndex)
    {
        case 0:
            parentCategoryId = "108698";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "18156"; break;
                case 1: categoryId = "108699"; break;
                case 2: categoryId = "108700"; break;
                case 3: categoryId = "108760"; break;
                case 4: categoryId = "108716"; break;
                case 5: categoryId = "108717"; break;
                case 6: categoryId = "108718"; break;
                case 7: categoryId = "108719"; break;
                case 8: categoryId = "108720"; break;
                case 9: categoryId = "108728"; break;
                case 10: categoryId = "108729"; break;
                case 11: categoryId = "108730"; break;
                case 12: categoryId = "108738"; break;
                case 13: categoryId = "108739"; break;
                case 14: categoryId = "108758"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 1:
            parentCategoryId = "2";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "106876"; break;
                case 1: categoryId = "106880"; break;
                case 2: categoryId = "106882"; break;
                case 3: categoryId = "106877"; break;
                case 4: categoryId = "108696"; break;
                case 5: categoryId = "106894"; break;
                case 6: categoryId = "108735"; break;
                case 7: categoryId = "108746"; break;
                case 8: categoryId = "108748"; break;
                case 9: categoryId = "108751"; break;
                case 10: categoryId = "108752"; break;
                case 11: categoryId = "108753"; break;
                case 12: categoryId = "108742"; break;
                case 13: categoryId = "108754"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 2:
            parentCategoryId = "108701";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "106892"; break;
                case 1: categoryId = "108702"; break;
                case 2: categoryId = "106884"; break;
                case 3: categoryId = "108750"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 3:
            parentCategoryId = "108703";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "106883"; break;
                case 1: categoryId = "106893"; break;
                case 2: categoryId = "108731"; break;
                case 3: categoryId = "108737"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 4:
            parentCategoryId = "108704";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "78111"; break;
                case 1: categoryId = "50349"; break;
                case 2: categoryId = "106878"; break;
                case 3: categoryId = "108732"; break;
                case 4: categoryId = "108734"; break;
                case 5: categoryId = "108747"; break;
                case 6: categoryId = "108749"; break;
                case 7: categoryId = "3"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 5:
            parentCategoryId = "108705";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "108706"; break;
                case 1: categoryId = "108707"; break;
                case 2: categoryId = "108736"; break;
                case 3: categoryId = "108708"; break;
                case 4: categoryId = "106886"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 6:
            parentCategoryId = "108709";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "108710"; break;
                case 1: categoryId = "106891"; break;
                case 2: categoryId = "106889"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 7:
            parentCategoryId = "108712";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "108713"; break;
                case 1: categoryId = "108714"; break;
                case 2: categoryId = "108715"; break;
                case 3: categoryId = "108743"; break;
                case 4: categoryId = "108756"; break;
                case 5: categoryId = "106881"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 8:
            parentCategoryId = "108724";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "108721"; break;
                case 1: categoryId = "108725"; break;
                case 2: categoryId = "108726"; break;
                case 3: categoryId = "108755"; break;
                case 4: categoryId = "108757"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
        case 9:
            parentCategoryId = "4";
            categoryType = "SiteCategory";
            switch (cbbType.SelectedIndex)
            {
                case 0: categoryId = "807"; break;
                case 1: categoryId = "106879"; break;
                case 2: categoryId = "33909"; break;
                case 3: categoryId = "106885"; break;
                case 4: categoryId = "106895"; break;
                case 5: categoryId = "108759"; break;
                default: categoryId = parentCategoryId; categoryType = "TopSiteCategory"; parentCategoryId = "0"; break;
            }
            break;
    }
    return $"CategoryId={categoryId}&CategoryType={categoryType}&ParentCategoryId={parentCategoryId}&ItemListActionName=PostList&PageIndex=";
}
```
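For completeness, here is a hypothetical sketch of how these pieces could be wired to a "next page" action; the button, the checkbox, and the _pageIndex field are names I invented for illustration and are not taken from the project.

```csharp
// Hypothetical "next page" wiring; btnNextPage, chkNews, and _pageIndex are invented names.
private int _pageIndex = 1;

private void btnNextPage_Click(object sender, EventArgs e)
{
    lstPost.Items.Clear();
    _urls.Clear();
    _pageIndex++;

    bool hasData;
    if (txtSearch.Text.Trim().Length > 0)
    {
        hasData = AddSearchPost(_pageIndex);  // search results
    }
    else if (chkNews.Checked)                 // hypothetical "news" checkbox
    {
        hasData = AddNews(_pageIndex);
    }
    else
    {
        hasData = AddPost(_pageIndex);        // regular post list, filtered via GetUrl()
    }

    if (!hasData)
    {
        _pageIndex--;                         // no more data: stay on the last non-empty page
    }
}
```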
That covers the main features. There is one small bonus at the end: the notification box. How do you pop up a message in WinForms that disappears on its own after a little while? Like this
It's actually not hard: a timer does the trick, as the sketch below shows.
One thing to watch out for: how do you make sure the popup shows on top, so it isn't buried where you can't see it? Just set TopMost to true. Also set ShowIcon, ShowInTaskbar, MaximizeBox, and MinimizeBox to false, and StartPosition to CenterScreen, so it looks professional.
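Here is a minimal sketch of such a self-dismissing tip form using those settings; the class name, the label, and the 2-second lifetime are placeholders of mine, not the project's code.

```csharp
using System.Drawing;
using System.Windows.Forms;

// Minimal sketch of a self-dismissing tip form; names and the 2-second lifetime are placeholders.
public class TipForm : Form
{
    private readonly Timer _timer = new Timer();

    public TipForm(string message)
    {
        // the property settings mentioned above
        TopMost = true;
        ShowIcon = false;
        ShowInTaskbar = false;
        MaximizeBox = false;
        MinimizeBox = false;
        StartPosition = FormStartPosition.CenterScreen;
        Size = new Size(260, 80);

        Controls.Add(new Label
        {
            Text = message,
            Dock = DockStyle.Fill,
            TextAlign = ContentAlignment.MiddleCenter
        });

        // close the form automatically after 2 seconds
        _timer.Interval = 2000;
        _timer.Tick += (s, e) => { _timer.Stop(); Close(); };
        _timer.Start();
    }
}

// usage: new TipForm("Done!").Show();
```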
Since this was only just put together, there are bound to be oversights and bugs; please point them out when you spot them. The project is open source: https://github.com/ihambert/test.