C#备份博客园随笔文章和图片----使用HtmlAgilityPack解析html

之前用.NET做网页采集时，一直采用正则表达式去匹配解析，比较繁琐，花费时间较多，若是HTML复杂的话真是欲哭无泪。

很早就听说过HtmlAgilityPack，它是.NET下用XPath来解析HTML的一个类库（包）。但是一直没时间尝试，简单了解了下HtmlAgilityPack的API后，发现真是HTML解析利器，于是花些时间做一个例子记录下。

本次是以下载博客园随笔分类文章为例，采用两部分实现，第一部分是将采集到的文章放到集合变量中，第二部分是通过操作集合变量将文章下载到本地，

这样做效率较低，因为可以直接边采集文章边下载。暂时没有考虑效率问题，仅仅只是实现功能。下面简单阐述下。

获取随笔分类

根据输入的博客名取得对应的随笔分类。

   /// <summary>
        /// Reads the post categories listed in a blog's sidebar.
        /// </summary>
        /// <param name="uname">Blog name; it is used both as the URL path segment and as the blogApp query value.</param>
        /// <returns>
        /// One <see cref="BlogType"/> per category link that carries an href, or <c>null</c> when the
        /// sidebar could not be fetched or contains no category links.
        /// </returns>
        private static List<BlogType> GettBlogTypeList(string uname)
        {
            string url = "http://www.cnblogs.com/" + uname + "/mvc/blog/sidecolumn.aspx?blogApp=" + uname;
            string htmlStr = CommonHelper.GetRequestStr(url);

            // HtmlDocument.LoadHtml rejects null, so a missing response is reported like an empty sidebar.
            if (string.IsNullOrEmpty(htmlStr))
                return null;

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlStr);
            var nodes = doc.DocumentNode.SelectNodes("//div[@id='sidebar_postcategory']//a"); // category links
            if (nodes == null || nodes.Count <= 0)
                return null;

            List<BlogType> list = new List<BlogType>();
            foreach (var node in nodes)
            {
                // An anchor without href cannot be crawled later, so it is skipped instead of dereferenced.
                var hrefAttr = node.Attributes["href"];
                if (hrefAttr == null)
                    continue;

                var name = node.InnerText;
                list.Add(new BlogType()
                {
                    BlogTypeUrl = hrefAttr.Value,
                    // Keep only the text before the first '(' (Split returns the whole text when there is none).
                    BlogTypeName = name.Split('(')[0],
                    BlogTypeNameShow = name
                });
            }
            return list;
        }

 

  /// <summary>
  /// One post category taken from a blog's sidebar (filled in by GettBlogTypeList).
  /// </summary>
  public class BlogType
    {
        /// <summary>href of the category link.</summary>
        public string BlogTypeUrl { get; set; }
        /// <summary>Link text with everything from the first '(' onward removed.</summary>
        public string BlogTypeName { get; set; }
        /// <summary>Link text exactly as read from the sidebar.</summary>
        public string BlogTypeNameShow { get; set; }
    }

如获取到的随笔分类如下：

采集分类的文章

采用两步实现，第一步获取只包含标题和url的文章，第二步再获取文章内容。

 /// <summary>
        /// Collects the title and URL of every post listed on each category page.
        /// Post bodies are not fetched here.
        /// </summary>
        /// <param name="blogTypes">Categories to crawl; <c>null</c> is treated as "no categories".</param>
        /// <param name="useTime">Elapsed time of the crawl in milliseconds.</param>
        /// <returns>
        /// A dictionary with one entry per category; the list is empty when the category page could
        /// not be fetched or lists no posts.
        /// </returns>
        public static Dictionary<BlogType, List<BlogInfo>> GetBlogsByType(List<BlogType> blogTypes, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            Dictionary<BlogType, List<BlogInfo>> dic = new Dictionary<BlogType, List<BlogInfo>>();

            // GettBlogTypeList returns null when it finds no categories, so null must not reach foreach.
            if (blogTypes != null)
            {
                HtmlDocument doc = new HtmlDocument();
                foreach (var blogType in blogTypes)
                {
                    List<BlogInfo> list = new List<BlogInfo>();
                    string html = CommonHelper.GetRequestStr(blogType.BlogTypeUrl);

                    // LoadHtml rejects null; a category whose page is missing simply keeps an empty list.
                    if (!string.IsNullOrEmpty(html))
                    {
                        doc.LoadHtml(html);

                        // Fall back to the sidebar name when the page has no heading node.
                        var typeNameNode = doc.DocumentNode.SelectSingleNode("//div[@class='entrylist']/h1");
                        string typeName = typeNameNode != null ? typeNameNode.InnerText : blogType.BlogTypeName;

                        var listPosttitleNodes = doc.DocumentNode.SelectNodes("//div[@class='entrylistPosttitle']/a");
                        if (listPosttitleNodes != null)
                        {
                            foreach (var titleNode in listPosttitleNodes)
                            {
                                var hrefAttr = titleNode.Attributes["href"];
                                if (hrefAttr == null)
                                    continue; // nothing to download without a URL

                                Console.WriteLine("正在爬取文章【{0}】...", titleNode.InnerText);
                                list.Add(new BlogInfo()
                                {
                                    BlogUrl = hrefAttr.Value,
                                    BlogTitle = titleNode.InnerText,
                                    BlogTypeName = typeName
                                });
                            }
                        }
                    }

                    dic.Add(blogType, list);
                }
            }

            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return dic;
        }

 

 

     /// <summary>
        /// Downloads each post page and fills in its body, publish time and author text.
        /// </summary>
        /// <param name="dic">Posts grouped by category; the <see cref="BlogInfo"/> objects are updated in place.</param>
        /// <param name="useTime">Elapsed time in milliseconds.</param>
        /// <returns>The same dictionary instance that was passed in.</returns>
        /// <remarks>
        /// When a page or one of its nodes is missing, the corresponding property receives a
        /// "获取失败" placeholder text instead of the method throwing.
        /// </remarks>
        public static Dictionary<BlogType, List<BlogInfo>> GetBlogDetail(Dictionary<BlogType, List<BlogInfo>> dic, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            HtmlDocument doc = new HtmlDocument();

            // Only the BlogInfo objects are mutated, never the dictionary itself, so it can be
            // enumerated directly instead of indexing Keys with ElementAt on every step.
            foreach (var pair in dic)
            {
                foreach (var blog in pair.Value)
                {
                    Console.WriteLine("正在获取文章【{0}】内容...", blog.BlogTitle);

                    // LoadHtml rejects null; an empty document makes every lookup below return null,
                    // which yields the placeholder texts.
                    doc.LoadHtml(CommonHelper.GetRequestStr(blog.BlogUrl) ?? string.Empty);

                    var bodyNode = doc.DocumentNode.SelectSingleNode("//div[@id='cnblogs_post_body']");
                    var dateNode = doc.DocumentNode.SelectSingleNode("//span[@id='post-date']");
                    var userNode = doc.DocumentNode.SelectSingleNode("//div[@class='postDesc']/a[1]");
                    blog.BlogContent = bodyNode == null ? "内容获取失败" : bodyNode.InnerHtml;
                    blog.BlogPostTime = dateNode == null ? "发布时间获取失败" : dateNode.InnerText;
                    blog.BlogName = userNode == null ? "用户获取失败" : userNode.InnerText;
                }
            }

            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return dic;
        }

 

    /// <summary>
    /// One scraped post. GetBlogsByType sets the URL, title and category name;
    /// GetBlogDetail fills in the content, publish time and name.
    /// </summary>
    public class BlogInfo
    {
        /// <summary>href of the post's title link on the category page.</summary>
        public string BlogUrl { get; set; }
        /// <summary>Text of the first link inside the post's postDesc div (presumably the author; confirm against the page markup).</summary>
        public string BlogName { get; set; }
        /// <summary>Text of the post's title link.</summary>
        public string BlogTitle { get; set; }
        /// <summary>Inner HTML of the cnblogs_post_body div, or a failure placeholder text.</summary>
        public string BlogContent { get; set; }
        /// <summary>Heading text of the category page the post was found on.</summary>
        public string BlogTypeName { get; set; }
        /// <summary>Text of the post-date span, or a failure placeholder text.</summary>
        public string BlogPostTime { get; set; }
    }

下载到本地

根据上面采集到的文章再一步步下载到本地，期间分两步，第一步下载图片，第二步下载文章内容。

 /// <summary>
        /// Writes every collected post to disk as an .html file, one folder per category, after
        /// downloading the images each post references.
        /// </summary>
        /// <param name="dic">Posts grouped by category, with <c>BlogContent</c> already filled in.</param>
        /// <param name="uname">Blog name; used as the output folder name and in each page header.</param>
        /// <param name="useTime">Elapsed time of the whole run in milliseconds.</param>
        /// <returns>The root output folder: base directory + "BlogFiles\" + <paramref name="uname"/>.</returns>
        /// <remarks>
        /// A post that throws is reported through FileHelper.SaveTxtFile ("errorLog.txt") and
        /// skipped; the remaining posts are still processed.
        /// </remarks>
        public static string DowanloadBlog(Dictionary<BlogType, List<BlogInfo>> dic, string uname, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            int countFlag = 0;
            string rootPath = AppDomain.CurrentDomain.BaseDirectory + "BlogFiles\\" + uname;

            foreach (var pair in dic)
            {
                var blogType = pair.Key;
                var blogList = pair.Value;
                var dicPath = rootPath + "\\" + blogType.BlogTypeName;
                Console.WriteLine("<<开始处理分类【{0}】<<", blogType.BlogTypeName);
                FileHelper.CreatePath(dicPath);

                for (int j = 0; j < blogList.Count; j++)
                {
                    countFlag++;

                    // Resolved before the first log line so both the progress message and the
                    // catch handler refer to the post that is actually being processed.
                    var blogModel = blogList[j];
                    try
                    {
                        Console.WriteLine("~~~~开始处理文章{0}【{1}】~~~~", countFlag, blogModel.BlogTitle);
                        var filePath = dicPath + "\\" + FileHelper.FilterInvalidChar(blogModel.BlogTitle, "_") + ".html";
                        HtmlDocument doc = new HtmlDocument();
                        doc.DocumentNode.InnerHtml = blogModel.BlogContent;

                        // Download the images and point the img tags at the local copies.
                        Console.WriteLine("~~开始处理图片");
                        var imgPath = dicPath + "\\images";
                        FileHelper.CreatePath(imgPath);
                        SaveImage(doc, imgPath);
                        Console.WriteLine("~~处理图片完成");

                        // Neutralize links so the offline copy does not navigate away.
                        var aNodes = doc.DocumentNode.SelectNodes("//a");
                        if (aNodes != null)
                        {
                            foreach (var aNode in aNodes)
                            {
                                var hrefAttr = aNode.Attributes["href"];
                                if (hrefAttr != null && hrefAttr.Value != "#")
                                {
                                    // void requires an operand; "void()" is a script syntax error.
                                    hrefAttr.Value = "javascript:void(0)";
                                }
                            }
                        }

                        // The title div is closed before the body div opens; the original markup
                        // opened a second <div> here and left both unclosed.
                        doc.DocumentNode.InnerHtml = "<div id='div_head'>" + uname + " " + blogType.BlogTypeName + "</div><div id='div_title'>" + blogModel.BlogTitle + "</div><div id='div_body'>" + doc.DocumentNode.InnerHtml + "</div>";
                        doc.Save(filePath, Encoding.UTF8);
                        Console.WriteLine("~~~~处理文章{0}【{1}】完毕~~~~", countFlag, blogModel.BlogTitle);
                    }
                    catch (Exception ex)
                    {
                        string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + "\r\n" + "url=" + blogModel.BlogUrl + "\r\n" + "title=" + blogModel.BlogTitle + "\r\n" + "errorMsg=" + ex.Message + "\r\n" + "stackTrace=" + ex.StackTrace + "\r\n\r\n\r\n";
                        Console.WriteLine("error>>处理文章【{0}】出现错误，开始记录错误信息~~", blogModel.BlogTitle);
                        FileHelper.SaveTxtFile(dicPath, "errorLog.txt", errorMsg, false);
                        Console.WriteLine("error>>处理文章【{0}】出现错误，记录错误信息完成~~", blogModel.BlogTitle);
                    }
                }

                Console.WriteLine("<<处理分类【{0}】完成<<", blogType.BlogTypeName);
            }

            // Stop (not Start) so the reported time is a fixed measurement of the work above.
            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return rootPath;
        }

 

 /// <summary>
        /// Downloads every image referenced by <paramref name="doc"/> into <paramref name="filePath"/>
        /// and rewrites the img tag to the local file once its bytes were received.
        /// </summary>
        /// <param name="doc">Parsed post content; the src attributes of its img nodes are modified in place.</param>
        /// <param name="filePath">Existing folder that receives the image files.</param>
        /// <exception cref="Exception">
        /// Any failure is rethrown with the "SaveImage_Error:" prefix; the original exception is
        /// kept as <see cref="Exception.InnerException"/>.
        /// </exception>
        public static void SaveImage(HtmlDocument doc, string filePath)
        {
            var imgNodes = doc.DocumentNode.SelectNodes("//img");
            if (imgNodes == null)
                return;

            foreach (var imgNode in imgNodes)
            {
                try
                {
                    // An img without src, or with a value that has no path separator, is left untouched.
                    var srcAttr = imgNode.Attributes["src"];
                    if (srcAttr == null || srcAttr.Value == null || !srcAttr.Value.Contains("/"))
                        continue;

                    string src = srcAttr.Value;

                    // The last URL segment may contain characters that are illegal in file names
                    // (for example a query string), so it is sanitized like the post titles are.
                    string fileName = FileHelper.FilterInvalidChar(src.Substring(src.LastIndexOf("/") + 1), "_");
                    Console.WriteLine("~~开始下载图片【{0}】~~", fileName);
                    string imgPath = filePath + "\\" + fileName;

                    byte[] imgByte = CommonHelper.GetRequestByteArr(src);
                    if (imgByte != null)
                    {
                        FileHelper.SaveImage(imgPath, imgByte);

                        // Rewrite the tag only after the download succeeded, so a failed image
                        // keeps its original URL instead of pointing at a file that does not exist.
                        srcAttr.Value = imgPath;
                        Console.WriteLine("~~下载图片【{0}】完成~~", fileName);
                    }
                    else
                    {
                        Console.WriteLine("~~下载图片【{0}】失败~~", fileName);
                    }
                }
                catch (Exception ex)
                {
                    // Keep the original exception attached so its stack trace is not lost.
                    throw new Exception("SaveImage_Error:" + ex.Message, ex);
                }
            }
        }

View Code

程序入口

主要代码如下

    // Resolve the blog's categories (GettBlogTypeList returns null when none are found).
    var types = GettBlogTypeList(uname);
                    // Elapsed milliseconds reported by each stage through its out parameter.
                    long time1 = 0;
                    long time2 = 0;
                    long timeDownload = 0;
                    Console.WriteLine("正在爬取，请耐心等待..." );
                    // Crawl in two passes: post titles/URLs per category first, then the post contents.
                    var blogList = GetBlogsByType(types,out time1);
                    var blogDetailList = GetBlogDetail(blogList,out time2);
                    Console.WriteLine("爬取完毕，开始下载..." );
                    // Write the posts and their images to disk; the return value is the output folder.
                    string filePath=DowanloadBlog(blogDetailList, uname,out timeDownload);
                    Console.WriteLine("**处理完毕，爬取用时{0}ms，下载用时{1}ms，{2}" , time1+time2, timeDownload, filePath);
                    // NOTE(review): handlerRight is declared outside this excerpt; presumably it ends a retry loop — confirm.
                    handlerRight = false;

演示效果

文件存储在项目bin目录下，一个用户一个文件夹

按随笔分类生成不同的文件夹

生成.html文件，一个分类的所有图片都存在该分类下的images下。

完整源码放在github下，https://github.com/kungge/CommonTest/tree/dev/WebCollect

欢迎指出程序bug，提出优化意见，(●'◡'●)

出处：https://www.cnblogs.com/kungge/p/5956501.html

==========================================================================

上面的获取文章分类已经无法使用了，查看博客园的API说明文档，参考文章如下，

http://wcf.open.cnblogs.com/blog/help

根据这个里面的提示，我们可以分两步备份文章：先获取文章标题和摘要，然后再下载每一篇文章，保存文章和图片到本地文件夹。

首先我们定义一个博客文章类

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs.Models
{
    /// <summary>
    /// One post as returned by the cnblogs open API. All values are kept as the raw text read
    /// from the feed entry; <see cref="body"/> is filled in by a separate request.
    /// </summary>
    public class BlogInfo
    {
        /// <summary>Text of the entry's id element; also used to build the post-body request URL.</summary>
        public string id { get; set; }

        /// <summary>Text of the entry's title element.</summary>
        public string title { get; set; }

        /// <summary>Text of the entry's summary element.</summary>
        public string summary { get; set; }

        /// <summary>Text of the entry's published element (parsed as a date when the pages are written).</summary>
        public string published { get; set; }

        /// <summary>Text of the entry's updated element.</summary>
        public string updated { get; set; }

        /// <summary>href attribute of the entry's link element.</summary>
        public string link { get; set; }

        /// <summary>Text of the entry's diggs element (presumably the recommendation count; confirm against the API docs).</summary>
        public string diggs { get; set; }

        /// <summary>Text of the entry's views element.</summary>
        public string views { get; set; }

        /// <summary>Text of the entry's comments element.</summary>
        public string comments { get; set; }

        /// <summary>Post HTML, or a failure placeholder text when it could not be fetched.</summary>
        public string body { get; set; }
    }
}

View Code

再定义两个工具类

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs.Common
{
    public static class CommonHelper
    {
        #region HttpClient
        private static HttpClient _httpClient;

        /// <summary>
        /// Shared <see cref="HttpClient"/>. The first read creates it with a four-minute timeout;
        /// assigning a value replaces the shared instance.
        /// </summary>
        public static HttpClient httpClient
        {
            get
            {
                if (_httpClient != null)
                {
                    return _httpClient;
                }

                _httpClient = new HttpClient();
                _httpClient.Timeout = TimeSpan.FromMinutes(4);
                return _httpClient;
            }
            set
            {
                _httpClient = value;
            }
        }

        #endregion

        #region get请求
        /// <summary>
        /// Sends a GET request and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <returns>
        /// The response body (whatever the status code), or <c>null</c> when the request throws;
        /// the failure is appended to Error.txt in the application base directory.
        /// </returns>
        public static string GetRequestStr(string url)
        {
            try
            {
                // GetAwaiter().GetResult() rethrows the real exception instead of wrapping it in an
                // AggregateException, so the message written below names the actual cause.
                // The response is disposed so its connection is released promptly.
                using (var response = httpClient.GetAsync(new Uri(url)).GetAwaiter().GetResult())
                {
                    return response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
                }
            }
            catch (Exception ex)
            {
                FileHelper.SaveTxtFile(AppDomain.CurrentDomain.BaseDirectory + "\\Error.txt", $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  请求URL发生异常：" + url + Environment.NewLine + ex.Message + Environment.NewLine, false);
                return null;
            }
        }
        /// <summary>
        /// Returns the bytes behind an image reference: downloads absolute URLs and decodes
        /// base64 <c>data:image</c> URIs.
        /// </summary>
        /// <param name="url">An absolute URL (contains "://") or a <c>data:image</c> URI.</param>
        /// <returns>
        /// The bytes, or <c>null</c> when the reference is of neither kind, the server answers with
        /// a non-success status, or the request/decoding throws (the exception is appended to Error.txt).
        /// </returns>
        public static byte[] GetRequestByteArr(string url)
        {
            try
            {
                if (url.Contains("://"))
                {
                    using (var response = httpClient.GetAsync(new Uri(url)).GetAwaiter().GetResult())
                    {
                        // An error page must not be handed back as if it were image data;
                        // callers treat null as "download failed".
                        if (!response.IsSuccessStatusCode)
                            return null;

                        return response.Content.ReadAsByteArrayAsync().GetAwaiter().GetResult();
                    }
                }
                else if (url.StartsWith("data:image", StringComparison.Ordinal))
                {
                    // The payload is everything after the first comma; neither the base64 alphabet
                    // nor the media-type part contains a comma, so extra ";param" parts do not matter.
                    return Convert.FromBase64String(url.Substring(url.IndexOf(',') + 1));
                }
                else
                    return null;
            }
            catch (Exception ex)
            {
                FileHelper.SaveTxtFile(AppDomain.CurrentDomain.BaseDirectory + "\\Error.txt", $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  请求URL发生异常：" + url + Environment.NewLine + ex.Message + Environment.NewLine, false);
                return null;
            }
        }
        #endregion

        #region post请求
        /// <summary>
        /// Sends a POST request with an empty form-urlencoded body and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute URL to post to.</param>
        /// <returns>
        /// The response body, or <c>null</c> when the request throws; the failure is appended to
        /// Error.txt in the application base directory.
        /// </returns>
        public static string PostRequestStr(string url)
        {
            try
            {
                string contentStr = "";

                // Both the request content and the response own resources, so both are disposed.
                using (StringContent sc = new StringContent(contentStr))
                {
                    sc.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("application/x-www-form-urlencoded"); // TODO: carried over from the original code

                    // GetAwaiter().GetResult() surfaces the real exception rather than an AggregateException.
                    using (var response = httpClient.PostAsync(new Uri(url), sc).GetAwaiter().GetResult())
                    {
                        return response.Content.ReadAsStringAsync().GetAwaiter().GetResult();
                    }
                }
            }
            catch (Exception ex)
            {
                FileHelper.SaveTxtFile(AppDomain.CurrentDomain.BaseDirectory + "\\Error.txt", $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  请求URL发生异常：" + url + Environment.NewLine + ex.Message + Environment.NewLine, false);
                return null;
            }
        }
        #endregion

        #region MD5加密解密

        /// <summary>
        /// Returns the MD5 hex digest of a string in its 32-character form or its 16-character short form.
        /// </summary>
        /// <param name="strWord">Text to hash.</param>
        /// <param name="bit">Requested length: 32 for the full digest, 16 for characters 8 to 23 of it.</param>
        /// <returns>The digest in the requested form, or an empty string for any other <paramref name="bit"/> value.</returns>
        public static string MD5Encrypt(string strWord, int bit)
        {
            // The digest is computed first, whatever length was asked for.
            string fullDigest = MD5Encrypt(strWord);
            switch (bit)
            {
                case 16:
                    return fullDigest.Substring(8, 16);
                case 32:
                    return fullDigest;
                default:
                    return string.Empty;
            }
        }

        /// <summary>
        /// Returns the MD5 digest of a string as 32 lowercase hex characters.
        /// Despite the name this is a one-way hash, not an encryption.
        /// </summary>
        /// <param name="strWord">Text to hash; it is converted to bytes with the gb2312 encoding.</param>
        /// <returns>The 32-character lowercase hex digest.</returns>
        public static string MD5Encrypt(string strWord)
        {
            byte[] hashedDataBytes;

            // The hash object is disposable; MD5.Create() is the supported factory and avoids
            // naming the provider class directly.
            using (System.Security.Cryptography.MD5 md5Hasher = System.Security.Cryptography.MD5.Create())
            {
                hashedDataBytes = md5Hasher.ComputeHash(Encoding.GetEncoding("gb2312").GetBytes(strWord));
            }

            StringBuilder tmp = new StringBuilder(hashedDataBytes.Length * 2);
            foreach (byte b in hashedDataBytes)
            {
                tmp.Append(b.ToString("x2"));
            }
            return tmp.ToString();
        }
        #endregion
    }
}

View Code

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs.Common
{
    public static class FileHelper
    {
        #region 创建路径
        /// <summary>
        /// Makes sure a directory exists.
        /// </summary>
        /// <param name="path">Directory to create.</param>
        /// <returns><c>true</c> when the directory was created by this call, <c>false</c> when it already existed.</returns>
        public static bool CreatePath(string path)
        {
            bool alreadyThere = Directory.Exists(path);
            if (alreadyThere)
            {
                return false;
            }

            Directory.CreateDirectory(path);
            return true;
        }
        #endregion

        #region 保存图片
        /// <summary>
        /// Writes image bytes to a file, replacing any existing file.
        /// A failure is reported on the console and not rethrown.
        /// </summary>
        /// <param name="filePath">Destination file.</param>
        /// <param name="bt">Bytes to write.</param>
        public static void SaveImage(string filePath, byte[] bt)
        {
            try
            {
                File.WriteAllBytes(filePath, bt);
            }
            catch (Exception writeError)
            {
                string notice = "SaveImage 方法发生异常：" + writeError.Message;
                Console.WriteLine(notice);
            }
        }
        #endregion

        #region 保存文本文件
        /// <summary>
        /// Writes text to a file using <see cref="Encoding.Default"/>, creating the containing
        /// directory first. A failure is reported on the console and not rethrown.
        /// </summary>
        /// <param name="filePath">Destination file.</param>
        /// <param name="txtStr">Text to write.</param>
        /// <param name="isCover"><c>true</c> replaces the file content; <c>false</c> appends to it.</param>
        public static void SaveTxtFile(string filePath, string txtStr, bool isCover = true)
        {
            try
            {
                string folder = System.IO.Path.GetDirectoryName(filePath);
                CreatePath(folder);

                if (!isCover)
                {
                    File.AppendAllText(filePath, txtStr, Encoding.Default);
                    return;
                }

                File.WriteAllText(filePath, txtStr, Encoding.Default);
            }
            catch (Exception writeError)
            {
                Console.WriteLine("SaveTxtFile 方法发生异常：" + writeError.Message);
            }
        }
        #endregion

        #region 过滤文件名中特殊字符
        /// <summary>
        /// Replaces every character that is not allowed in a file name, and every space, with
        /// the given replacement text.
        /// </summary>
        /// <param name="fileName">Candidate file name.</param>
        /// <param name="replaceStr">Text substituted for each removed character.</param>
        /// <returns>The cleaned name.</returns>
        public static string FilterInvalidChar(string fileName, string replaceStr)
        {
            // Fold the platform's invalid characters over the name one by one, then handle spaces last.
            string cleaned = Path.GetInvalidFileNameChars()
                .Aggregate(fileName, (current, bad) => current.Replace(bad.ToString(), replaceStr));
            return cleaned.Replace(" ", replaceStr);
        }
        #endregion
    }
}

View Code

最后，再来个主要的类

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BackupBlogs
{
    using Common;
    using HtmlAgilityPack;
    using Models;
    using System.Diagnostics;

    class Program
    {
        // Root output folder of the current run; Main sets it to <base directory>cnblogFiles\<uname>_<yyyyMMdd>.
        // The log files written by ShowLog / ShowErrorLog are placed directly under it.
        private static string userPath = "";
        // Folder for the saved post pages (userPath + "\Blogs").
        private static string blogPath = "";
        // Folder for the downloaded images (userPath + "\Images").
        private static string imgPath = "";
        // Minimal HTML skeleton that is loaded before the index page and every post page are filled in.
        private static string htmlTemp = @"
<!DOCTYPE HTML>
<html>
<head>
<title>C#备份博客园文章列表</title>
</head>
<body>
</body>
</html>";

        /// <summary>
        /// Console entry point: asks for a blog name, downloads the list of posts, writes an
        /// index page, then downloads and saves every post together with its images.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Ask for the blog name until something non-empty is entered. One prompt and one read
            // per round, so an answer typed after an empty line is no longer thrown away.
            string uname;
            do
            {
                ShowLog("--请输入要下载的博客名称--");
                uname = Console.ReadLine();
                if (uname == null)
                {
                    // ReadLine returns null once standard input is closed; looping would never end.
                    return;
                }
            } while (uname.Length == 0);

            // Fetch the post list, asking for another name while the current one yields nothing.
            List<BlogInfo> blogList;
            while (true)
            {
                // userPath has to be set before anything is logged with isSaveLog = true,
                // because the log file is written underneath it.
                userPath = AppDomain.CurrentDomain.BaseDirectory + "cnblogFiles\\" + uname + "_" + DateTime.Now.ToString("yyyyMMdd");
                blogPath = userPath + "\\Blogs";
                FileHelper.CreatePath(blogPath);
                imgPath = userPath + "\\Images";
                FileHelper.CreatePath(imgPath);

                blogList = GettBlogSummaryList(uname);
                if (blogList != null && blogList.Count > 0)
                {
                    break;
                }

                // Only read the new name here; the next round recomputes the folders and performs
                // the single crawl for it (the original crawled the new name twice).
                ShowLog("--未获取到文章列表，请重新输入要下载的博客名称--", true);
                uname = Console.ReadLine();
                if (uname == null)
                {
                    return;
                }
            }

            // Write the index page that lists every post.
            SaveBlogsSummary(blogList);

            // Download the post bodies, then save them with their images.
            long crawlTime;
            long timeDownload;
            ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
            ShowLog($"--正在爬取博客文章，共计 {blogList.Count} 篇，请耐心等待...", true);
            var blogDetailCount = GetBlogDetail(blogList, out crawlTime);
            ShowLog($"--爬取完毕，成功下载了【{blogDetailCount}/{blogList.Count}】篇博客文章.", true);
            ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
            int saveCount = SaveBlogDetail(blogList, uname, out timeDownload);
            ShowLog(Environment.NewLine + "---------------------------------" + Environment.NewLine);
            ShowLog(saveCount == blogList.Count ? "保存全部文章成功！" : $"保存文章【{saveCount}】条成功");
            ShowLog($"--处理完毕，爬取用时{crawlTime}ms，下载用时{timeDownload}ms，\r\n保存路径：{userPath}", true);

            Console.ReadKey();
        }



        #region 获取博客标题
        /// <summary>
        /// Downloads the post summaries of a blog from the cnblogs open API, ten per page, until
        /// a page returns no entries or cannot be fetched.
        /// </summary>
        /// <param name="uname">Blog name used in the API URL.</param>
        /// <returns>The collected summaries (never <c>null</c>; empty when nothing was found).</returns>
        private static List<BackupBlogs.Models.BlogInfo> GettBlogSummaryList(string uname)
        {
            string msgTitle = "第1阶段：";
            const int pageSize = 10;
            List<BlogInfo> list = new List<BlogInfo>();
            ShowLog(msgTitle + $"获取{uname}的随笔如下：", true);
            HtmlDocument doc = new HtmlDocument();

            for (int pageNum = 1; ; pageNum++)
            {
                string url = "http://wcf.open.cnblogs.com/blog/u/" + uname + $"/posts/{pageNum}/{pageSize}";
                string htmlStr = CommonHelper.GetRequestStr(url);

                // GetRequestStr returns null when the request throws, and LoadHtml rejects null,
                // so a failed page ends the paging instead of crashing the program.
                if (string.IsNullOrEmpty(htmlStr))
                {
                    break;
                }

                doc.LoadHtml(htmlStr);
                var nodes = doc.DocumentNode.SelectNodes("//entry");
                if (nodes == null || nodes.Count <= 0)
                {
                    break; // past the last page
                }

                int currPageBlogCount = 0;
                foreach (var item in nodes)
                {
                    currPageBlogCount++;

                    // Entries that do not have exactly ten child nodes are skipped; the named
                    // lookups below would dereference null on an entry with a different layout.
                    if (item.ChildNodes.Count != 10)
                        continue;

                    BlogInfo blogSummary = new BlogInfo()
                    {
                        id = item.ChildNodes["id"].InnerText,
                        comments = item.ChildNodes["comments"].InnerText,
                        diggs = item.ChildNodes["diggs"].InnerText,
                        link = item.ChildNodes["link"].Attributes["href"].Value,
                        published = item.ChildNodes["published"].InnerText,
                        summary = item.ChildNodes["summary"].InnerText,
                        title = item.ChildNodes["title"].InnerText,
                        updated = item.ChildNodes["updated"].InnerText,
                        views = item.ChildNodes["views"].InnerText
                    };
                    list.Add(blogSummary);
                    ShowLog(msgTitle + $"【{currPageBlogCount + (pageNum - 1) * pageSize}】获取文章标题【{blogSummary.title}】", true);
                }
            }
            return list;
        }


        /// <summary>
        /// Writes index.html into the run's output folder: a table with one row per post showing
        /// publish time, view count, comment count and a link to the locally saved page.
        /// </summary>
        /// <param name="blogSummaries">Posts to list.</param>
        private static void SaveBlogsSummary(List<BackupBlogs.Models.BlogInfo> blogSummaries)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(htmlTemp);
            HtmlNode nList = doc.DocumentNode.SelectSingleNode("/html/body");
            nList.AppendChild(HtmlNode.CreateNode("<table border='1' cellpadding='0' cellspacing='0' width='98%'><tr>" +
                "<th width='150'>发布时间</th>" +
                "<th width='100'>阅读数</th>" +
                "<th width='100'>评论数</th>" +
                "<th width='*'>博文标题</th>" +
                "</tr></table>"));

            // The table node is the same for every row, so it is looked up once.
            HtmlNode table = nList.SelectSingleNode("table");

            ShowLog(Environment.NewLine + "开始保存博客文章标题");
            foreach (var item in blogSummaries)
            {
                // A publish time that cannot be parsed is shown as received instead of aborting the index.
                DateTime published;
                string publishedText = DateTime.TryParse(item.published, out published)
                    ? published.ToString("yyyy-MM-dd HH:mm")
                    : item.published;

                // The link target uses the same sanitized title that names the saved post page.
                string divTR = "<tr>";
                divTR += "<td>" + publishedText + "</td>";
                divTR += "<td align='right'>" + item.views + "</td>";
                divTR += "<td align='right'>" + item.comments + "</td>";
                divTR += "<td><a href='.\\Blogs\\" + System.Web.HttpUtility.UrlEncode(FileHelper.FilterInvalidChar(item.title, "_")) + ".html" + "' target='_blank'>" + item.title + "</a></td>";
                divTR += "</tr>";
                table.AppendChild(HtmlNode.CreateNode(divTR));
            }
            doc.Save(userPath + "\\index.html", Encoding.UTF8);
            ShowLog($"共保存【{blogSummaries.Count}】篇博客标题保存完成", true);
        }

        #endregion


        #region 获取博客详细信息
        /// <summary>
        /// Downloads the body of every post and stores it in <c>BlogInfo.body</c>.
        /// </summary>
        /// <param name="blogs">Posts whose bodies are to be fetched; updated in place.</param>
        /// <param name="useTime">Elapsed time in milliseconds.</param>
        /// <returns>The number of posts whose body was fetched successfully.</returns>
        /// <remarks>
        /// A post whose body cannot be fetched gets the placeholder text "内容获取失败" as its body
        /// and is listed in the log.
        /// </remarks>
        private static int GetBlogDetail(List<BlogInfo> blogs, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            string msgTitle = "第2阶段：";
            const string failedBody = "内容获取失败";
            HtmlDocument doc = new HtmlDocument();

            for (int k = 0; k < blogs.Count; k++)
            {
                string url = $"http://wcf.open.cnblogs.com/blog/post/body/{blogs[k].id}";
                ShowLog(msgTitle + string.Format("【{0}/{1}】正在获取文章【{2}】", k + 1, blogs.Count, blogs[k].title), true);
                string blogBody = CommonHelper.GetRequestStr(url);

                // GetRequestStr returns null when the request throws and LoadHtml rejects null;
                // such a post is marked as failed instead of aborting the whole run.
                HtmlNode bodyNode = null;
                if (!string.IsNullOrEmpty(blogBody))
                {
                    doc.LoadHtml(blogBody);
                    bodyNode = doc.DocumentNode.SelectSingleNode("//string");
                }
                blogs[k].body = bodyNode == null ? failedBody : System.Web.HttpUtility.HtmlDecode(bodyNode.InnerHtml);
            }

            ShowLog("下载失败的文章如下：", true);

            // Materialized once: the query is needed both for logging and for the count.
            var errBlogs = blogs.Where(x => x.body == failedBody).ToList();
            foreach (var item in errBlogs)
            {
                ShowLog(Newtonsoft.Json.JsonConvert.SerializeObject(item), true);
            }

            int getDetailCount = blogs.Count - errBlogs.Count;
            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return getDetailCount;
        }
        #endregion


        #region 保存博客
        /// <summary>
        /// Downloads every image referenced in a post body into the image folder and rewrites
        /// the img tags of successfully downloaded images to the local copy.
        /// </summary>
        /// <param name="blog">Post whose <c>body</c> is scanned and, when it contains images, replaced with the rewritten HTML.</param>
        /// <param name="filePath">Folder that receives the image files.</param>
        /// <param name="uname">Blog name (not used by this method).</param>
        /// <remarks>
        /// A failure on one image is logged and does not stop the remaining images.
        /// </remarks>
        private static void SaveImage(BlogInfo blog, string filePath, string uname)
        {
            string msgTitle = "第3阶段：";
            if (string.IsNullOrEmpty(blog.body))
            {
                return; // nothing to scan
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(blog.body);
            var imgNodes = doc.DocumentNode.SelectNodes("//img");
            if (imgNodes == null || imgNodes.Count == 0)
            {
                return;
            }

            foreach (var imgNode in imgNodes)
            {
                try
                {
                    var srcAttr = imgNode.Attributes["src"];
                    string src = srcAttr == null ? null : srcAttr.Value;
                    if (string.IsNullOrEmpty(src) || !src.Contains("/"))
                    {
                        continue; // no usable image reference
                    }

                    // Protocol-relative reference ("//host/path"): give it a scheme so it can be requested.
                    if (src.StartsWith("//", StringComparison.Ordinal))
                    {
                        src = "http:" + src;
                    }

                    string fileName;
                    if (src.StartsWith("data:image", StringComparison.Ordinal))
                    {
                        // Inline image: name it after the post id and a hash of the data;
                        // the extension is the subtype of the media type.
                        fileName = blog.id + "_" + CommonHelper.MD5Encrypt(src) + "." + src.Split(';')[0].Split('/')[1];
                    }
                    else
                    {
                        // Last URL segment without query string or fragment.
                        fileName = src.Substring(src.LastIndexOf("/") + 1);
                        int cut = fileName.IndexOfAny(new[] { '?', '#' });
                        if (cut >= 0)
                        {
                            fileName = fileName.Substring(0, cut);
                        }
                    }

                    // Remove characters the file system rejects; fall back to a hash-based name
                    // when nothing usable is left (for example a URL ending in '/').
                    fileName = FileHelper.FilterInvalidChar(fileName, "_");
                    if (fileName.Length == 0)
                    {
                        fileName = blog.id + "_" + CommonHelper.MD5Encrypt(src);
                    }

                    byte[] imgByte = CommonHelper.GetRequestByteArr(src);
                    if (imgByte != null)
                    {
                        FileHelper.SaveImage(filePath + "\\" + fileName, imgByte);

                        // Point the tag at the local copy (relative to the post page) only after
                        // the bytes arrived, so a failed image keeps its original URL.
                        srcAttr.Value = filePath.Replace(userPath, "..") + "\\" + fileName;
                    }
                    else
                    {
                        ShowLog(msgTitle + $"下载图片失败!   下载URL:{src}", true);
                        ShowLog(msgTitle + $"下载图片失败详细博客：【id:{blog.id};title:{blog.title};url:{blog.link};】");
                    }
                }
                catch (Exception ex)
                {
                    ShowErrorLog(msgTitle + " SaveImage 方法发生异常：" + ex.Message, true);
                }
            }

            blog.body = doc.DocumentNode.InnerHtml;
        }

        /// <summary>
        /// Saves every post as an .html page in the Blogs folder, downloading its images first.
        /// </summary>
        /// <param name="blogs">Posts with their bodies already fetched.</param>
        /// <param name="uname">Blog name; passed on to the image download step.</param>
        /// <param name="useTime">Elapsed time in milliseconds.</param>
        /// <returns>
        /// The number of posts that were saved successfully (a post that throws is logged and
        /// not counted), so the caller can tell a complete run from a partial one.
        /// </returns>
        public static int SaveBlogDetail(List<BlogInfo> blogs, string uname, out long useTime)
        {
            Stopwatch sw = new Stopwatch();
            sw.Start();
            string msgTitle = "第3阶段：";
            int index = 0;       // position shown in the progress messages
            int savedCount = 0;  // posts written without an exception

            foreach (var item in blogs)
            {
                index++;
                try
                {
                    ShowLog(string.Format(msgTitle + "【{0}/{1}】开始处理【{2}】", index, blogs.Count, item.title), true);
                    var filePath = blogPath + "\\" + FileHelper.FilterInvalidChar(item.title, "_") + ".html";

                    // Download the images and rewrite the body to reference the local copies.
                    SaveImage(item, imgPath, uname);

                    // A publish time that cannot be parsed is shown as received.
                    DateTime published;
                    string publishedText = DateTime.TryParse(item.published, out published)
                        ? published.ToString("yyyy-MM-dd HH:mm")
                        : item.published;

                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(htmlTemp);
                    doc.DocumentNode.SelectSingleNode("/html/head/title").InnerHtml = item.title;
                    var head = HtmlNode.CreateNode("<div id='div_head'><h1>" + item.title + "</h1><br />" +
                        "【发布：" + publishedText + "】" +
                        "【阅读数：" + item.views + "】" +
                        "【评论数：" + item.comments + "】" +
                        "<a href='" + item.link + "' target='_blank' >阅读原文</a>" +
                        "</div>");
                    var content = HtmlNode.CreateNode("<div id='div_body'>" + item.body + "</div>");
                    var pageBody = doc.DocumentNode.SelectSingleNode("/html/body");
                    pageBody.AppendChild(head);
                    pageBody.AppendChild(content);
                    doc.Save(filePath, Encoding.UTF8);

                    savedCount++;
                    ShowLog(msgTitle + string.Format("【{0}/{1}】处理文章【{2}】完毕", index, blogs.Count, item.title), true);
                }
                catch (Exception ex)
                {
                    string errorMsg = DateTime.Now.ToString("yyyyMMdd HH:mm:ss") + "\r\n" + "url=" + item.link + "\r\n" + "title=" + item.title + "\r\n" + "errorMsg=" + ex.Message + "\r\n" + "stackTrace=" + ex.StackTrace + "\r\n\r\n\r\n";
                    ShowErrorLog(msgTitle + $"error>>处理文章【{item.title}】出现错误:{ex.Message}" + Environment.NewLine + errorMsg, true);
                }
            }

            // Stop (not Start) so the reported time is a fixed measurement of the work above.
            sw.Stop();
            useTime = sw.ElapsedMilliseconds;
            return savedCount;
        }


        #endregion

        /// <summary>
        /// Prints a message to the console and, on request, appends it with a timestamp to
        /// Log.txt under the run's output folder.
        /// </summary>
        /// <param name="msg">Message text.</param>
        /// <param name="isSaveLog"><c>true</c> to also append the message to the log file.</param>
        private static void ShowLog(string msg, bool isSaveLog = false)
        {
            Console.WriteLine(msg);
            if (!isSaveLog)
            {
                return;
            }

            string logFile = userPath + "\\Log.txt";
            string entry = $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  " + msg + Environment.NewLine;
            FileHelper.SaveTxtFile(logFile, entry, false);
        }
        /// <summary>
        /// Prints an error message to the console and, on request, appends it with a timestamp
        /// to Error.txt under the run's output folder.
        /// </summary>
        /// <param name="msg">Message text.</param>
        /// <param name="isSaveLog"><c>true</c> to also append the message to the error file.</param>
        private static void ShowErrorLog(string msg, bool isSaveLog = false)
        {
            Console.WriteLine(msg);
            if (!isSaveLog)
            {
                return;
            }

            string errorFile = userPath + "\\Error.txt";
            string entry = $"[{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss.fff")}]  " + msg + Environment.NewLine;
            FileHelper.SaveTxtFile(errorFile, entry, false);
        }

    }
}

View Code

执行试试吧

这里只是给个代码轮廓和思想，代码中同样没有考虑多线程、内存资源的释放等问题，难免会出现异常和错误，后续继续优化。

posted on 2019-09-22 21:36 jack_Meng 阅读(341) 评论(0) 编辑收藏举报

刷新页面返回顶部

Jack_孟

C#备份博客园随笔文章和图片----使用HtmlAgilityPack解析html

导航

公告