利用HtmlAgilityPack抓取网站图片并下载~~~~~~邪恶完善版

 

今日看博客园发现一个不错的抓取贴(主要是那个url。。。你懂的),花几分钟改了下,代码增加了按年月日建立目录,按文章建立子目录,图片都保存于内,命令行方式运行,增加了全站的参数。。。

原始版本:

利用HtmlAgilityPack抓取XX网站图片并下载~~~~~~邪恶版。。。。

 

老版本代码(记住哦!先在E盘下新建一个DownLoadImg文件夹):

主要代码如下:
// WebClient instance held as a field; it is not referenced in the visible code.
// NOTE(review): presumably used by the DownLoadImg helper, which is not shown here — confirm.
WebClient wc = new WebClient();
    // Static counter initialised to zero; not referenced in the visible code.
    // NOTE(review): possibly used to number downloaded files in DownLoadImg — confirm.
    private static int i = 0;
    // Page load handler. Intentionally empty: all work is done in Button1_Click.
    protected void Page_Load(object sender, EventArgs e)
    {

    }
    /// <summary>
    /// Button click handler: loads each article page in the configured date / page-id range,
    /// locates the element with id "ks_xp" and passes the <c>src</c> of every image inside it
    /// to <c>DownLoadImg</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        HtmlWeb web = new HtmlWeb();

        // The target sites generally use URLs of the form .../section/yyyymmdd/<digits>.html.
        // A site is usually split into several sections, so the .html ids inside one section
        // are not consecutive.
        // Two nested loops are used: the outer one walks the date (yyyymmdd, e.g. 2012-02-15),
        // the inner one walks the page ids. Adjust both ranges as needed.
        for (int k = 20120215; k <= 20120215; k++)
        {
            for (int j = 124289; j <= 124306; j++)
            {
                string cnblogs = "";// Fill in the URL pattern of the target site here (see the source download).
                HtmlDocument doc = web.Load(cnblogs);
                HtmlNode node = doc.GetElementbyId("ks_xp");
                if (node == null)
                {
                    continue;
                }

                // ".//img" restricts the search to descendants of the container; a leading
                // "//" would be evaluated from the document root and match every image on the page.
                HtmlNodeCollection images = node.SelectNodes(".//img");
                if (images == null)
                {
                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    continue;
                }

                foreach (HtmlNode child in images)
                {
                    HtmlAttribute src = child.Attributes["src"];
                    if (src == null)
                    {
                        continue;
                    }

                    DownLoadImg(src.Value);
                }
            }
        }
    }

 

新版本代码:

 

复制代码
#region Using namespace

using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;

#endregion

namespace DownloadImages
{
    /// <summary>
    /// Console crawler. For every calendar day in a date range and every page id in an id range
    /// it loads the article page, finds the images inside the element with id "ks_xp" and saves
    /// them to <c>Images\yyyyMMdd\&lt;article title&gt;\</c> under the current directory.
    /// </summary>
    internal class Program
    {
        /// <summary>Name of the download root folder, created under the current directory.</summary>
        private const string ImagesFolderName = "Images";

        private static readonly WebClient Wc = new WebClient();

        /// <summary>
        /// Characters stripped from file and folder names: all control characters (0x00-0x1F)
        /// plus the punctuation Windows forbids in file names.
        /// </summary>
        private static readonly char[] InvalidFileNameChars = new[]
        {
            '"', '<', '>', '|',
            '\0', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\a',
            '\b', '\t', '\n', '\v', '\f', '\r', '\u000e', '\u000f',
            '\u0010', '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017',
            '\u0018', '\u0019', '\u001a', '\u001b', '\u001c', '\u001d', '\u001e', '\u001f',
            ':', '*', '?', '\\', '/'
        };

        /// <summary>
        /// Removes every character listed in <see cref="InvalidFileNameChars"/> from a name.
        /// </summary>
        /// <param name="fileName">Raw name; <c>null</c> is treated as an empty string.</param>
        /// <returns>
        /// The cleaned name. A result longer than one character that starts with '.' has its
        /// leading dots replaced by the literal prefix "dot".
        /// </returns>
        public static string CleanInvalidFileName(string fileName)
        {
            string source = fileName ?? string.Empty;

            // Single pass over the input instead of one Replace call per invalid character.
            string cleaned = new string(source.Where(ch => !InvalidFileNameChars.Contains(ch)).ToArray());

            if (cleaned.Length > 1 && cleaned[0] == '.')
            {
                cleaned = "dot" + cleaned.TrimStart('.');
            }

            return cleaned;
        }

        private static void Main(string[] args)
        {
            Start();
        }

        /// <summary>
        /// Runs the crawl over the hard-coded date and page-id ranges, then removes folders that
        /// ended up without any downloaded file.
        /// </summary>
        private static void Start()
        {
            var web = new HtmlWeb();
            var startDate = new DateTime(2010, 8, 18);
            var endDate = DateTime.Today;
            const int startPageId = 49430;
            const int endPageId = 124621;

            // Step through real calendar days. Counting plain integers from 20100818 upwards
            // would also request thousands of impossible dates such as 20100832 or 20101301.
            for (DateTime day = startDate; day <= endDate; day = day.AddDays(1))
            {
                string dateKey = day.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);

                for (int j = startPageId; j <= endPageId; j++)
                {
                    // Host is a placeholder; the real URL pattern is in the source download.
                    string cnblogs = "http://xxxxxxxx/" + dateKey + "/" + j + ".html";

                    HtmlDocument doc;
                    try
                    {
                        doc = web.Load(cnblogs);
                    }
                    catch (WebException ex)
                    {
                        // One unreachable page must not abort the whole crawl.
                        Console.WriteLine(ex.Message);
                        continue;
                    }

                    HtmlNode node = doc.GetElementbyId("ks_xp");
                    if (node == null)
                    {
                        continue;
                    }

                    // ".//img" limits the search to the article container; "//img" would be
                    // evaluated from the document root. SelectNodes returns null when nothing matches.
                    var images = node.SelectNodes(".//img");
                    if (images == null)
                    {
                        continue;
                    }

                    // Fall back to the page id when the page has no <title>.
                    var titles = doc.DocumentNode.SelectNodes("//title");
                    string titleName = (titles != null && titles.Count > 0)
                        ? titles[0].InnerText
                        : j.ToString();
                    string subFolder = CleanInvalidFileName(titleName);

                    foreach (HtmlNode child in images)
                    {
                        var src = child.Attributes["src"];
                        if (src == null)
                        {
                            continue;
                        }

                        string imgurl = src.Value;
                        Console.WriteLine("正在下载:" + titleName + " " + imgurl);
                        DownLoadImg(imgurl, dateKey, subFolder);
                    }
                }
            }

            // Clean up folders left empty by failed downloads.
            CleanEmptyFolders();
        }

        /// <summary>
        /// Deletes every folder below the Images root that contains no file at any depth.
        /// Does nothing when the Images root does not exist.
        /// </summary>
        private static void CleanEmptyFolders()
        {
            string rootFolder = Path.Combine(Environment.CurrentDirectory, ImagesFolderName);
            if (!Directory.Exists(rootFolder))
            {
                return;
            }

            var folders = Directory.GetDirectories(rootFolder, "*", SearchOption.AllDirectories);
            foreach (var f in folders)
            {
                // A child may already be gone because its parent was removed recursively.
                if (!Directory.Exists(f))
                {
                    continue;
                }

                if (Directory.GetFiles(f, "*", SearchOption.AllDirectories).Length == 0)
                {
                    // Recursive delete: the folder may still hold (empty) sub-folders, which a
                    // non-recursive delete rejects with an IOException.
                    Directory.Delete(f, true);
                }
            }
        }

        /// <summary>
        /// Downloads one image into <c>Images\folderName\subFolderName\</c>, creating the folder
        /// when needed. Failures are written to the console and otherwise ignored.
        /// </summary>
        /// <param name="url">Image URL; the text after the last '/' becomes the file name.</param>
        /// <param name="folderName">First-level folder (the date key).</param>
        /// <param name="subFolderName">Second-level folder (the cleaned article title).</param>
        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf('/') + 1));
            var fileFolder = Path.Combine(Environment.CurrentDirectory, ImagesFolderName, folderName, subFolderName);
            try
            {
                // CreateDirectory is a no-op when the folder already exists.
                Directory.CreateDirectory(fileFolder);
                Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and carry on with the next image.
                Console.WriteLine(ex.Message);
            }
        }
    }
}
 

测试程序和源码下载:

/Files/Chinasf/DownloadImages.rar

 posted on 2012-04-27 15:56  纳米程序员  阅读(626)  评论(0)  编辑  收藏  举报