控制台爬取小说(大王饶命)
// Fetch and print the first chapter page; `next` receives the relative path of the page after it.
var url = GetWBJokeUrl("/book/1719.html");
string next;
GetContent(url, out next);

// Key loop: E quits, RightArrow loads the following page, any other key is reported by name.
while (true)
{
    ConsoleKeyInfo info = System.Console.ReadKey();
    switch (info.Key)
    {
        case ConsoleKey.E:
            Environment.Exit(0);
            break;
        case ConsoleKey.RightArrow:
            if (string.IsNullOrEmpty(next))
            {
                // Without a next-page path GetWBJokeUrl would yield the bare site root,
                // which is not a chapter page, so do not request it.
                System.Console.WriteLine("No next page available.");
            }
            else
            {
                GetContent(GetWBJokeUrl(next), out next);
            }
            break;
        default:
            System.Console.WriteLine(info.Key);
            break;
    }
}
/// <summary>
/// Downloads one chapter page, prints the chapter text to the console and
/// reports the relative path of the following page.
/// </summary>
/// <param name="firstUrl">Absolute URL of the page to download.</param>
/// <param name="nexturl">
/// Receives the path taken from the "下一页" (next page) link, or an empty
/// string when the page could not be downloaded or contains no such link.
/// </param>
private static void GetContent(string firstUrl, out string nexturl)
{
    var html = GetUrlContent(firstUrl);
    if (string.IsNullOrEmpty(html))
    {
        // GetUrlContent returns null on failure; there is nothing to parse or print.
        nexturl = "";
        return;
    }

    // First isolate the whole "next page" anchor, then pull the "/....html" path out of it.
    var nextLinkPattern = @"<a href=([^>]+?)>下一页</a>";
    var pathPattern = "/.+html";
    nexturl = MatchReg(pathPattern, MatchReg(nextLinkPattern, html));

    // The chapter text is the content of the div with id "BookText".
    var divContent = @"(?m)<div id=""BookText""[^>]*>(?<div>(?:\w|\W)*?)</div[^>]*>";
    var text = MatchReg(divContent, html, "div").Trim().Replace("<br />", "");

    // Strip the first <h4>...</h4> element if there is one.
    var delh4 = @"<h4>([\s\S]*?)</h4>";
    var heading = MatchReg(delh4, text);
    if (heading.Length > 0)
    {
        // string.Replace throws ArgumentException when the search string is empty.
        text = text.Replace(heading, "");
    }

    Console.WriteLine(text);
}

/// <summary>
/// Runs a regular expression (multiline, case-insensitive) against a string and
/// returns the value of one group of the first match.
/// </summary>
/// <param name="regStr">Regular expression pattern.</param>
/// <param name="html">Text to search; null is treated as "no match".</param>
/// <param name="input">Name or number of the group to return; "0" is the whole match.</param>
/// <returns>The group's value, or an empty string when there is no match.</returns>
public static string MatchReg(string regStr, string html, string input = "0")
{
    if (html == null)
    {
        // Regex.Match(null) throws ArgumentNullException.
        return "";
    }

    var reg = new Regex(regStr, RegexOptions.Multiline | RegexOptions.IgnoreCase);
    var mc = reg.Match(html);
    return mc.Success ? mc.Groups[input].Value : "";
}

/// <summary>
/// Base address of the site being crawled.
/// </summary>
const string qsbkMainUrl = "http://www.dawangraoming.com";

/// <summary>
/// Builds an absolute URL by appending a site-relative path to the base address.
/// </summary>
/// <param name="firsturl">Path to append, e.g. "/book/1719.html".</param>
/// <returns>The base address followed by <paramref name="firsturl"/>.</returns>
private static string GetWBJokeUrl(string firsturl)
{
    return qsbkMainUrl + firsturl;
}

/// <summary>
/// Performs an HTTP GET with a browser User-Agent header and returns the
/// response body decoded as UTF-8.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The response body, or null when the request fails for any reason.</returns>
private static string GetUrlContent(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36";
        request.Method = "GET";
        request.ContentType = "text/html;charset=UTF-8";

        // using blocks release the connection and streams even if reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best effort: callers treat null as "page unavailable", but report the cause
        // on stderr instead of discarding it.
        Console.Error.WriteLine("Request to " + url + " failed: " + ex.Message);
        return null;
    }
}
好好学习,天天向上。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步