批量下载小说网站文章

 

 

 1 using HtmlAgilityPack;
 2 using System;
 3 using System.Collections.Generic;
 4 using System.IO;
 5 using System.Net;
 6 using System.Text;
 7 
 namespace DownloadNovel
 {
     /// <summary>
     /// Console tool that walks a novel's table-of-contents page, downloads every
     /// linked chapter and appends the chapter title and text to a dated text file.
     /// </summary>
     class Program
     {
         /// <summary>
         /// Entry point: downloads the novel configured by the two hard-coded URLs below.
         /// </summary>
         /// <param name="args">Command-line arguments (not used).</param>
         static void Main(string[] args)
         {
             // Root URL of the novel site; chapter hrefs are appended to it.
             string webSiteUrl = "http://www.biqugew.com";
             // URL of the table-of-contents page of the novel to download.
             string NovelUrl = "http://www.biqugew.com/book/10/";

             DownNovel(webSiteUrl, NovelUrl);
         }

         /// <summary>
         /// Downloads the table-of-contents page at <paramref name="NovelUrl"/>, then
         /// every chapter it links to, printing each chapter title to the console and
         /// appending title plus filtered text to the output file via <see cref="WriteLog"/>.
         /// </summary>
         /// <param name="webSiteUrl">Site root that is prepended to each chapter href.</param>
         /// <param name="NovelUrl">URL of the novel's table-of-contents page.</param>
         private static void DownNovel(string webSiteUrl, string NovelUrl)
         {
             // Separators used to break the chapter HTML into text fragments.
             string[] split = { "<br>", "\r\n" };
             // XPath of the chapter entries on the table-of-contents page.
             string TableXpath = "/body[1]/div[1]/div[5]/div[1]/dl[1]/dd";
             // XPath of the chapter title on a chapter page.
             string TitleXpath = "/html[1]/body[1]/div[1]/div[3]/div[1]/div[2]/h1[1]";
             // XPath of the chapter text on a chapter page.
             string ContentsXpath = "/html/body/div/div[3]/div/div[3]";

             // WebClient is disposable; release it once all pages are fetched.
             using (WebClient client = new WebClient { Encoding = Encoding.GetEncoding("GB2312") })
             {
                 // Fetch and parse the table-of-contents page.
                 HtmlDocument indexDoc = new HtmlDocument();
                 indexDoc.LoadHtml(client.DownloadString(NovelUrl));
                 HtmlNodeCollection nodes = indexDoc.DocumentNode.SelectNodes(TableXpath);

                 // SelectNodes yields null (not an empty collection) when nothing matches.
                 if (nodes == null)
                 {
                     Console.Error.WriteLine("No chapter entries found at " + NovelUrl);
                     return;
                 }

                 foreach (HtmlNode node in nodes)
                 {
                     // Entries without a usable link cannot be downloaded; skip them.
                     HtmlNode link = node.SelectSingleNode("a");
                     HtmlAttribute href = link == null ? null : link.Attributes["href"];
                     if (href == null)
                     {
                         continue;
                     }

                     // Build the chapter URL and fetch the whole chapter page.
                     string url = webSiteUrl + href.Value;
                     HtmlDocument doc = new HtmlDocument();
                     doc.LoadHtml(client.DownloadString(url));

                     HtmlNode titleNode = doc.DocumentNode.SelectSingleNode(TitleXpath);
                     HtmlNode contentNode = doc.DocumentNode.SelectSingleNode(ContentsXpath);
                     if (titleNode == null || contentNode == null)
                     {
                         Console.Error.WriteLine("Skipping " + url + ": title or content not found");
                         continue;
                     }

                     string title = titleNode.InnerHtml;
                     string str = contentNode.InnerHtml.Replace("&nbsp;", "");

                     // Drop the separators and any fragment that contains an anchor tag;
                     // the remaining fragments are joined without a separator.
                     StringBuilder article = new StringBuilder();
                     foreach (string txt in str.Split(split, StringSplitOptions.RemoveEmptyEntries))
                     {
                         if (!txt.Contains("<a"))
                         {
                             article.Append(txt);
                         }
                     }

                     Console.WriteLine(title);
                     WriteLog(title + Environment.NewLine + article);
                 }
             }
         }

         /// <summary>
         /// Appends <paramref name="msg"/> followed by an empty line to
         /// "novel/yyyy-MM-dd.txt" under the current directory, creating the
         /// directory when it does not exist yet.
         /// </summary>
         /// <param name="msg">Text to append to the output file.</param>
         static void WriteLog(string msg)
         {
             string path = Path.Combine(Environment.CurrentDirectory, "novel");
             // CreateDirectory does nothing when the directory already exists.
             Directory.CreateDirectory(path);

             string fileName = DateTime.Now.ToString("yyyy-MM-dd");
             string filepath = Path.Combine(path, fileName + ".txt");

             // using blocks close the writer and stream even if a write throws.
             using (Stream fileStream = File.Open(filepath, FileMode.Append, FileAccess.Write, FileShare.Write))
             using (StreamWriter writeAdapter = new StreamWriter(fileStream, Encoding.Default))
             {
                 writeAdapter.WriteLine(msg);
                 writeAdapter.WriteLine();
             }
         }
     }
 }

 

posted @ 2019-01-22 18:17  落霞秋水  阅读(2044)  评论(0)  编辑  收藏  举报