利用HtmlAgilityPack抓取网站图片并下载~~~~~~邪恶完善版
今天在博客园看到一篇不错的抓取帖(主要是那个 URL……你懂的),花了几分钟改了一下:代码增加了按年月日建立目录、按文章建立子目录的功能,图片都保存在对应的子目录内;程序改为以命令行方式运行,并增加了抓取全站的参数。
原始版本:
利用HtmlAgilityPack抓取XX网站图片并下载~~~~~~邪恶版。。。。
老版本代码(注意:运行前请先在 E 盘下新建一个 DownLoadImg 文件夹):
主要代码如下:
// Shared by the DownLoadImg helper, which is not part of this excerpt.
WebClient wc = new WebClient();
private static int i = 0;

protected void Page_Load(object sender, EventArgs e)
{
}

/// <summary>
/// Crawls a fixed range of article pages and hands every image found inside the
/// element with id "ks_xp" to DownLoadImg.
/// </summary>
/// <param name="sender">The button that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    HtmlWeb web = new HtmlWeb();

    // Sites of this kind usually address pages as .../section/yyyymmdd/<digits>.html.
    // A site is split into several sections, so the page numbers inside one section
    // are not contiguous. Hence two nested loops: the outer one walks the date
    // (yyyymmdd, e.g. 2012-02-15), the inner one walks the .html page numbers.
    // Adjust both ranges as needed.
    for (int k = 20120215; k <= 20120215; k++)
    {
        for (int j = 124289; j <= 124306; j++)
        {
            // Fill in the site's URL pattern here (it is in the downloadable source).
            string cnblogs = "";
            HtmlDocument doc = web.Load(cnblogs);

            HtmlNode node = doc.GetElementbyId("ks_xp");
            if (node == null)
            {
                continue;
            }

            // ".//img" keeps the search inside the container; a bare "//img" is
            // evaluated from the document root and returns every image on the page.
            var images = node.SelectNodes(".//img");
            if (images == null)
            {
                // SelectNodes returns null, not an empty collection, when nothing matches.
                continue;
            }

            foreach (HtmlNode child in images)
            {
                if (child.Attributes["src"] == null)
                {
                    continue;
                }

                DownLoadImg(child.Attributes["src"].Value);
            }
        }
    }
}
新版本代码:
#region Using namespace
using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;
#endregion
namespace DownloadImages
{
    /// <summary>
    /// Console crawler. Requests article pages addressed as
    /// <c>{BaseUrl}{yyyyMMdd}/{pageId}.html</c> and saves every image found inside the
    /// page element with id "ks_xp" to <c>Images\{yyyyMMdd}\{article title}\</c> under
    /// the current directory.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Site root, including the trailing slash. The article redacts the real
        /// address; replace this placeholder before running.
        /// </summary>
        private const string BaseUrl = "http://xxxxxxxx/";

        /// <summary>Name of the download root folder, created under the current directory.</summary>
        private const string ImagesFolderName = "Images";

        private static readonly WebClient Wc = new WebClient();

        /// <summary>Characters that are stripped from file and folder names.</summary>
        private static readonly char[] InvalidFileNameChars = new[]
        {
            '"', '<', '>', '|', '\0',
            '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006',
            '\a', '\b', '\t', '\n', '\v', '\f', '\r',
            '\u000e', '\u000f', '\u0010', '\u0011', '\u0012', '\u0013',
            '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019',
            '\u001a', '\u001b', '\u001c', '\u001d', '\u001e', '\u001f',
            ':', '*', '?', '\\', '/'
        };

        /// <summary>
        /// Removes every character listed in <see cref="InvalidFileNameChars"/> from
        /// <paramref name="fileName"/>. A result longer than one character that starts
        /// with '.' has its leading dots replaced by the literal prefix "dot".
        /// </summary>
        /// <param name="fileName">Raw name; <c>null</c> is treated as an empty string.</param>
        /// <returns>The cleaned name (possibly empty).</returns>
        public static string CleanInvalidFileName(string fileName)
        {
            fileName = fileName ?? string.Empty;

            // Splitting on the forbidden characters and concatenating the pieces
            // removes all of them in a single pass.
            fileName = string.Concat(fileName.Split(InvalidFileNameChars));

            if (fileName.Length > 1 && fileName[0] == '.')
            {
                fileName = "dot" + fileName.TrimStart('.');
            }

            return fileName;
        }

        private static void Main(string[] args)
        {
            Start();
        }

        /// <summary>
        /// Walks every calendar day from 2010-08-18 up to today and, for each day, every
        /// page id in the configured range; downloads the images of each page that has a
        /// "ks_xp" element, then removes folders that ended up without files.
        /// </summary>
        private static void Start()
        {
            var web = new HtmlWeb();
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            var firstDay = new DateTime(2010, 8, 18);
            var lastDay = DateTime.Today;
            const int startPageId = 49430;
            const int endPageId = 124621;

            // Step through real calendar days. Counting an integer from 20100818 upward
            // would also produce values such as 20100832 that are not dates at all.
            for (var day = firstDay; day <= lastDay; day = day.AddDays(1))
            {
                string dayKey = day.ToString("yyyyMMdd", invariant);

                for (int pageId = startPageId; pageId <= endPageId; pageId++)
                {
                    string pageUrl = BaseUrl + dayKey + "/" + pageId + ".html";

                    HtmlDocument doc;
                    try
                    {
                        doc = web.Load(pageUrl);
                    }
                    catch (Exception ex)
                    {
                        // One unreachable page must not abort the whole crawl.
                        Console.WriteLine(ex.Message);
                        continue;
                    }

                    HtmlNode node = doc.GetElementbyId("ks_xp");
                    if (node == null)
                    {
                        continue;
                    }

                    // ".//img" restricts the search to the container; a bare "//img"
                    // is evaluated from the document root. SelectNodes returns null
                    // when nothing matches.
                    var images = node.SelectNodes(".//img");
                    if (images == null)
                    {
                        continue;
                    }

                    // The page title names the sub-folder; fall back to the page id
                    // when the title is missing or cleans down to nothing.
                    var titleNode = doc.DocumentNode.SelectSingleNode("//title");
                    string titleName = titleNode != null ? titleNode.InnerText : pageId.ToString();
                    string subFolder = CleanInvalidFileName(titleName.Trim());
                    if (subFolder.Length == 0)
                    {
                        subFolder = pageId.ToString();
                    }

                    foreach (HtmlNode child in images)
                    {
                        var srcAttribute = child.Attributes["src"];
                        if (srcAttribute == null || string.IsNullOrEmpty(srcAttribute.Value))
                        {
                            continue;
                        }

                        string imgurl = ToAbsoluteUrl(pageUrl, srcAttribute.Value);
                        Console.WriteLine("正在下载:" + titleName + " " + imgurl);
                        DownLoadImg(imgurl, dayKey, subFolder);
                    }
                }
            }

            // Clean up afterwards.
            CleanEmptyFolders();
        }

        /// <summary>
        /// Resolves an image <paramref name="src"/> against the page it was found on.
        /// Values that already are absolute non-file URLs are returned unchanged.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the page containing the image.</param>
        /// <param name="src">The raw <c>src</c> attribute value.</param>
        /// <returns>An absolute URL when resolution succeeds; otherwise <paramref name="src"/>.</returns>
        private static string ToAbsoluteUrl(string pageUrl, string src)
        {
            Uri parsed;
            if (Uri.TryCreate(src, UriKind.Absolute, out parsed) && !parsed.IsFile)
            {
                return src;
            }

            Uri pageUri;
            Uri resolved;
            if (Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri)
                && Uri.TryCreate(pageUri, src, out resolved))
            {
                return resolved.AbsoluteUri;
            }

            return src;
        }

        /// <summary>Returns the full path of the download root folder.</summary>
        private static string GetImagesRoot()
        {
            return Path.Combine(Environment.CurrentDirectory, ImagesFolderName);
        }

        /// <summary>
        /// Deletes every folder below the download root that contains no files at any depth.
        /// Does nothing when the download root does not exist.
        /// </summary>
        private static void CleanEmptyFolders()
        {
            var root = GetImagesRoot();
            if (!Directory.Exists(root))
            {
                return;
            }

            foreach (var folder in Directory.GetDirectories(root, "*", SearchOption.AllDirectories))
            {
                // The listing is a snapshot: this folder may already be gone because
                // one of its ancestors was removed earlier in the loop.
                if (!Directory.Exists(folder))
                {
                    continue;
                }

                if (Directory.GetFiles(folder, "*", SearchOption.AllDirectories).Length == 0)
                {
                    // Recursive delete: a folder without files may still hold empty
                    // sub-folders, which makes the non-recursive overload throw.
                    Directory.Delete(folder, true);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into
        /// <c>Images\{folderName}\{subFolderName}\</c>, using the cleaned last URL segment
        /// as the file name. Failures are written to the console and otherwise ignored.
        /// </summary>
        /// <param name="url">Address of the image; nothing happens when null or empty.</param>
        /// <param name="folderName">First-level folder name (the date key).</param>
        /// <param name="subFolderName">Second-level folder name (the cleaned article title).</param>
        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf('/') + 1));

            try
            {
                // Path building and folder creation sit inside the try as well, so an
                // over-long or otherwise illegal path is reported instead of killing the run.
                var fileFolder = Path.Combine(Path.Combine(GetImagesRoot(), folderName), subFolderName);
                Directory.CreateDirectory(fileFolder);
                Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
    }
}
using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;
#endregion
namespace DownloadImages
{
    /// <summary>
    /// Console crawler. Requests article pages addressed as
    /// <c>{BaseUrl}{yyyyMMdd}/{pageId}.html</c> and saves every image found inside the
    /// page element with id "ks_xp" to <c>Images\{yyyyMMdd}\{article title}\</c> under
    /// the current directory.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Site root, including the trailing slash. The article redacts the real
        /// address; replace this placeholder before running.
        /// </summary>
        private const string BaseUrl = "http://xxxxxxxx/";

        /// <summary>Name of the download root folder, created under the current directory.</summary>
        private const string ImagesFolderName = "Images";

        private static readonly WebClient Wc = new WebClient();

        /// <summary>Characters that are stripped from file and folder names.</summary>
        private static readonly char[] InvalidFileNameChars = new[]
        {
            '"', '<', '>', '|', '\0',
            '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006',
            '\a', '\b', '\t', '\n', '\v', '\f', '\r',
            '\u000e', '\u000f', '\u0010', '\u0011', '\u0012', '\u0013',
            '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019',
            '\u001a', '\u001b', '\u001c', '\u001d', '\u001e', '\u001f',
            ':', '*', '?', '\\', '/'
        };

        /// <summary>
        /// Removes every character listed in <see cref="InvalidFileNameChars"/> from
        /// <paramref name="fileName"/>. A result longer than one character that starts
        /// with '.' has its leading dots replaced by the literal prefix "dot".
        /// </summary>
        /// <param name="fileName">Raw name; <c>null</c> is treated as an empty string.</param>
        /// <returns>The cleaned name (possibly empty).</returns>
        public static string CleanInvalidFileName(string fileName)
        {
            fileName = fileName ?? string.Empty;

            // Splitting on the forbidden characters and concatenating the pieces
            // removes all of them in a single pass.
            fileName = string.Concat(fileName.Split(InvalidFileNameChars));

            if (fileName.Length > 1 && fileName[0] == '.')
            {
                fileName = "dot" + fileName.TrimStart('.');
            }

            return fileName;
        }

        private static void Main(string[] args)
        {
            Start();
        }

        /// <summary>
        /// Walks every calendar day from 2010-08-18 up to today and, for each day, every
        /// page id in the configured range; downloads the images of each page that has a
        /// "ks_xp" element, then removes folders that ended up without files.
        /// </summary>
        private static void Start()
        {
            var web = new HtmlWeb();
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            var firstDay = new DateTime(2010, 8, 18);
            var lastDay = DateTime.Today;
            const int startPageId = 49430;
            const int endPageId = 124621;

            // Step through real calendar days. Counting an integer from 20100818 upward
            // would also produce values such as 20100832 that are not dates at all.
            for (var day = firstDay; day <= lastDay; day = day.AddDays(1))
            {
                string dayKey = day.ToString("yyyyMMdd", invariant);

                for (int pageId = startPageId; pageId <= endPageId; pageId++)
                {
                    string pageUrl = BaseUrl + dayKey + "/" + pageId + ".html";

                    HtmlDocument doc;
                    try
                    {
                        doc = web.Load(pageUrl);
                    }
                    catch (Exception ex)
                    {
                        // One unreachable page must not abort the whole crawl.
                        Console.WriteLine(ex.Message);
                        continue;
                    }

                    HtmlNode node = doc.GetElementbyId("ks_xp");
                    if (node == null)
                    {
                        continue;
                    }

                    // ".//img" restricts the search to the container; a bare "//img"
                    // is evaluated from the document root. SelectNodes returns null
                    // when nothing matches.
                    var images = node.SelectNodes(".//img");
                    if (images == null)
                    {
                        continue;
                    }

                    // The page title names the sub-folder; fall back to the page id
                    // when the title is missing or cleans down to nothing.
                    var titleNode = doc.DocumentNode.SelectSingleNode("//title");
                    string titleName = titleNode != null ? titleNode.InnerText : pageId.ToString();
                    string subFolder = CleanInvalidFileName(titleName.Trim());
                    if (subFolder.Length == 0)
                    {
                        subFolder = pageId.ToString();
                    }

                    foreach (HtmlNode child in images)
                    {
                        var srcAttribute = child.Attributes["src"];
                        if (srcAttribute == null || string.IsNullOrEmpty(srcAttribute.Value))
                        {
                            continue;
                        }

                        string imgurl = ToAbsoluteUrl(pageUrl, srcAttribute.Value);
                        Console.WriteLine("正在下载:" + titleName + " " + imgurl);
                        DownLoadImg(imgurl, dayKey, subFolder);
                    }
                }
            }

            // Clean up afterwards.
            CleanEmptyFolders();
        }

        /// <summary>
        /// Resolves an image <paramref name="src"/> against the page it was found on.
        /// Values that already are absolute non-file URLs are returned unchanged.
        /// </summary>
        /// <param name="pageUrl">Absolute URL of the page containing the image.</param>
        /// <param name="src">The raw <c>src</c> attribute value.</param>
        /// <returns>An absolute URL when resolution succeeds; otherwise <paramref name="src"/>.</returns>
        private static string ToAbsoluteUrl(string pageUrl, string src)
        {
            Uri parsed;
            if (Uri.TryCreate(src, UriKind.Absolute, out parsed) && !parsed.IsFile)
            {
                return src;
            }

            Uri pageUri;
            Uri resolved;
            if (Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri)
                && Uri.TryCreate(pageUri, src, out resolved))
            {
                return resolved.AbsoluteUri;
            }

            return src;
        }

        /// <summary>Returns the full path of the download root folder.</summary>
        private static string GetImagesRoot()
        {
            return Path.Combine(Environment.CurrentDirectory, ImagesFolderName);
        }

        /// <summary>
        /// Deletes every folder below the download root that contains no files at any depth.
        /// Does nothing when the download root does not exist.
        /// </summary>
        private static void CleanEmptyFolders()
        {
            var root = GetImagesRoot();
            if (!Directory.Exists(root))
            {
                return;
            }

            foreach (var folder in Directory.GetDirectories(root, "*", SearchOption.AllDirectories))
            {
                // The listing is a snapshot: this folder may already be gone because
                // one of its ancestors was removed earlier in the loop.
                if (!Directory.Exists(folder))
                {
                    continue;
                }

                if (Directory.GetFiles(folder, "*", SearchOption.AllDirectories).Length == 0)
                {
                    // Recursive delete: a folder without files may still hold empty
                    // sub-folders, which makes the non-recursive overload throw.
                    Directory.Delete(folder, true);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into
        /// <c>Images\{folderName}\{subFolderName}\</c>, using the cleaned last URL segment
        /// as the file name. Failures are written to the console and otherwise ignored.
        /// </summary>
        /// <param name="url">Address of the image; nothing happens when null or empty.</param>
        /// <param name="folderName">First-level folder name (the date key).</param>
        /// <param name="subFolderName">Second-level folder name (the cleaned article title).</param>
        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf('/') + 1));

            try
            {
                // Path building and folder creation sit inside the try as well, so an
                // over-long or otherwise illegal path is reported instead of killing the run.
                var fileFolder = Path.Combine(Path.Combine(GetImagesRoot(), folderName), subFolderName);
                Directory.CreateDirectory(fileFolder);
                Wc.DownloadFile(url, Path.Combine(fileFolder, fileName));
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
    }
}
测试程序和源码下载: