没事虫子爬个书...
x
看到程序猿写爬虫的故事……一个无聊的周末,也想用 Jumony 爬点书囤起来……仓鼠症……
using Ivony.Html;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Windows.Forms;

namespace BookGet
{
    /// <summary>
    /// Main form of the book downloader. Clicking the button walks the paged
    /// book listing, collects every book title/URL pair, then downloads each
    /// book chapter by chapter into a text file under <see cref="OutputDirectory"/>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Site root; prepended to the "start reading" link of a book.</summary>
        private const string Host = "https://m.xxx.net/";

        /// <summary>Base URL of the paged book listing.</summary>
        private const string ListBaseUrl = "https://m.xxx.net/wapsort/";

        /// <summary>First listing page, relative to <see cref="ListBaseUrl"/>.</summary>
        private const string FirstListPage = "11_1.html";

        /// <summary>Directory that receives one .txt file per book.</summary>
        private const string OutputDirectory = "D:\\yuzhaiwu";

        /// <summary>Upper bound on the number of listing pages visited in one run.</summary>
        private const int MaxListPages = 20;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Collects the book list and downloads every book. A failure while
        /// downloading one book is reported in a message box and does not stop
        /// the remaining books.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            Dictionary<string, string> bookInfoDic = CollectBooks();

            foreach (var item in bookInfoDic)
            {
                try
                {
                    DownloadBook(item.Key, item.Value);
                }
                catch (Exception ex)
                {
                    // Covers I/O errors as well as parsing/navigation errors;
                    // both were reported the same way.
                    MessageBox.Show(ex.Message);
                }
            }

            MessageBox.Show("全部成功!");
        }

        /// <summary>
        /// Walks the listing pages (at most <see cref="MaxListPages"/>) and
        /// gathers the books found on them.
        /// </summary>
        /// <returns>
        /// Book title mapped to the href of its link; when a title occurs more
        /// than once, only the first occurrence is kept.
        /// </returns>
        private static Dictionary<string, string> CollectBooks()
        {
            var books = new Dictionary<string, string>();
            string nextUrl = ListBaseUrl + FirstListPage;
            int visitedPages = 0;

            while (!string.IsNullOrEmpty(nextUrl) && visitedPages < MaxListPages)
            {
                visitedPages++;
                string url = nextUrl;
                try
                {
                    var doc = new Ivony.Html.Parser.JumonyParser().LoadDocument(url);

                    // Every book link on this listing page.
                    IEnumerable<IHtmlElement> bookList = doc.Find("#nr_body div div.common-bookele h3 a");
                    foreach (var bookItem in bookList)
                    {
                        string bookName = bookItem.InnerText();
                        string bookUrl = bookItem.Attribute("href").Value();
                        if (!books.ContainsKey(bookName))
                        {
                            books.Add(bookName, bookUrl);
                        }
                    }

                    var domNext = doc.FindFirst("#nr_body div#page a.next");
                    nextUrl = domNext.Attribute("href").Value();

                    // NOTE(review): the element was selected as "a.next", so a class
                    // value of "prev none" looks unreachable; paging then ends via the
                    // catch below or the page cap. Confirm the intended class name.
                    if (domNext.Attribute("class").Value() == "prev none")
                    {
                        nextUrl = "";
                    }
                }
                catch (Exception ex)
                {
                    // Any failure on a listing page ends the paging loop; the books
                    // gathered so far are still returned.
                    Console.WriteLine(string.Format("{0}没有成功", url));
                    Console.WriteLine(ex.Message);
                    nextUrl = "";
                }
            }

            return books;
        }

        /// <summary>
        /// Downloads all chapters of one book and appends them to
        /// "&lt;OutputDirectory&gt;\&lt;bookName&gt;.txt" (the file is created when missing).
        /// Text collected before a failure is still written; the exception then
        /// propagates to the caller.
        /// </summary>
        /// <param name="bookName">Book title, used for the output file name.</param>
        /// <param name="bookUrl">
        /// URL of the book's main page; a "next chapter" link equal to this value
        /// marks the end of the book.
        /// </param>
        private static void DownloadBook(string bookName, string bookUrl)
        {
            Directory.CreateDirectory(OutputDirectory);
            string bookPath = Path.Combine(OutputDirectory, ToSafeFileName(bookName) + ".txt");
            var bookText = new StringBuilder();

            // One stream/writer pair per book, disposed as soon as the book is done,
            // so no handle outlives its book and no write lands in another book's file.
            using (var fs = new FileStream(bookPath, FileMode.Append))
            using (var sw = new StreamWriter(fs, Encoding.UTF8))
            {
                try
                {
                    // Main page of the book -> "start reading" link.
                    var mainPage = new Ivony.Html.Parser.JumonyParser().LoadDocument(bookUrl);
                    var beginReadEle = mainPage.FindFirst("#novelMain a.btn");
                    string nextTextPage = Host + beginReadEle.Attribute("href").Value();

                    // Follow the "next" link chapter by chapter.
                    while (!string.IsNullOrEmpty(nextTextPage))
                    {
                        var chapterPage = new Ivony.Html.Parser.JumonyParser().LoadDocument(nextTextPage, Encoding.UTF8, true);

                        string chapterTitle = chapterPage.FindFirst("#nr_title").InnerText();

                        // Empty paragraphs must be removed BEFORE the individual tags
                        // are replaced, otherwise "<p></p>" can no longer match.
                        string chapterText = chapterPage.FindFirst("#nr1").InnerHtml()
                            .Replace("<p></p>", "")
                            .Replace("<p>", "\n")
                            .Replace("</p>", "\n");

                        bookText.AppendFormat("\r\n\r\n\r\n{0}\r\n{1}", chapterTitle, chapterText);

                        var nextPageEle = chapterPage.FindFirst("#nr_body a#pb_next");
                        nextTextPage = nextPageEle.Attribute("href").Value();

                        // A link back to the book's main page means this was the last chapter.
                        if (nextTextPage == bookUrl)
                        {
                            nextTextPage = "";
                        }
                    }
                }
                finally
                {
                    // Persist whatever was collected, even when a chapter failed.
                    sw.WriteLine(bookText.ToString());
                }
            }
        }

        /// <summary>
        /// Replaces every character that is not allowed in a file name with '_'.
        /// </summary>
        /// <param name="name">Raw name, e.g. a scraped book title.</param>
        /// <returns>The name with invalid file-name characters replaced.</returns>
        private static string ToSafeFileName(string name)
        {
            var safe = new StringBuilder(name);
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                safe.Replace(invalid, '_');
            }
            return safe.ToString();
        }
    }
}
x
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了