C# 使用HtmlAgilityPack 抓取 网站链接
今天在找电视剧下载链接的时候,找了一个整部剧的下载地址,但是有40多集,链接地址较长且不好复制,于是就想到了HtmlAgilityPack抓取的方式。
先看实现效果:
使用到的 NuGet 包：HtmlAgilityPack、HttpCode.Core
请求目标网址，获取整个页面的 HTML 源码
/// <summary>
/// Program entry point: requests the video detail page through
/// <c>HttpHelpers.GetHtml</c> and passes the returned HTML to <c>JX</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    HttpHelpers client = new HttpHelpers();

    // Describe the request: the page to fetch and the HTTP verb (GET).
    HttpItems request = new HttpItems
    {
        Url = "https://www.123455.com/videodetails/2222.html",
        Method = "Get"
    };

    HttpResults response = client.GetHtml(request);

    // Extract the links from the page and write them to the output file.
    JX(response.Html);
}
解析获取到的网页 HTML
/// <summary>
/// Parses the given HTML with HtmlAgilityPack, selects the matching span nodes and
/// appends one extracted value per node to "抓取文件.txt" in the application's base directory.
/// </summary>
/// <param name="htmlCode">HTML source of the page; null or blank input is ignored.</param>
public static void JX(string htmlCode)
{
    // HtmlAgilityPack
    // Source: https://html-agility-pack.net/?z=codeplex
    // Download mirror: https://codeplexarchive.blob.core.windows.net/archive/projects/htmlagilitypack/htmlagilitypack.zip

    // Nothing to parse (for example, the request returned no body).
    if (string.IsNullOrWhiteSpace(htmlCode))
    {
        return;
    }

    string path = System.AppDomain.CurrentDomain.BaseDirectory;
    var filname = "抓取文件.txt";
    // Path.Combine yields a valid path whether or not the base directory ends with a separator.
    string outputFile = System.IO.Path.Combine(path, filname);

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(htmlCode);
    HtmlNode rootNode = document.DocumentNode;

    // XPath form: tag[@attribute='value'].
    // An absolute XPath also works, e.g. "/html[1]/head[1]/div[2]/div[6]/ul[1]";
    // HAPExplorer.exe (buildable from the source link above) can be used to discover such paths.
    HtmlNodeCollection categoryNodeList = rootNode.SelectNodes("//div[@id='content']//li[@id='li3_0']//span[@id='s3p0']");

    // SelectNodes returns null (not an empty collection) when no node matches.
    if (categoryNodeList == null)
    {
        return;
    }

    foreach (var item in categoryNodeList)
    {
        // The value of interest is the second double-quoted string in the span's inner HTML,
        // i.e. index 3 after splitting on '"'.
        // NOTE(review): presumably this is the href of the contained <a> — confirm against the page markup.
        string[] quotedParts = item.InnerHtml.Trim().Split('"');
        if (quotedParts.Length <= 3)
        {
            // Markup does not have the expected shape; skip this node instead of crashing.
            continue;
        }

        WriteMessage(outputFile, quotedParts[3]);
    }
}
输出到文本文件
/// <summary>
/// Appends a message to a text file, creating the file if it does not exist.
/// Each call writes the message followed by a '\n' and the writer's line terminator,
/// so entries are separated by a blank line.
/// </summary>
/// <param name="path">Path of the text file to append to.</param>
/// <param name="msg">Message to write.</param>
public static void WriteMessage(string path, string msg)
{
    // FileMode.Append opens (or creates) the file positioned at its end,
    // which replaces the previous OpenOrCreate + manual Seek-to-end.
    using (FileStream fs = new FileStream(path, FileMode.Append, FileAccess.Write))
    {
        using (StreamWriter sw = new StreamWriter(fs))
        {
            // Same bytes as the former WriteLine("{0}\n", msg, DateTime.Now):
            // the DateTime argument was never referenced by the format string.
            // Disposing the writer flushes it, so no explicit Flush is required.
            sw.WriteLine(msg + "\n");
        }
    }
}