以对象的方式访问html中的标签,比正则表达式更好用的方式获取html中的内容,linq方式直接获取所有的链接,更加先进的c#版本爬虫开源库
这是我自己写的一个开源库,现已发布到 NuGet,可以直接在 Visual Studio 的 NuGet 包管理器中搜索到,也可以到 NuGet 官网下载: https://www.nuget.org/packages/ZmjConvert/ ;还可以到我的个人网站上下载源码: https://www.zhaimaojun.cn/P/C%23%e6%a0%87%e7%ad%be%e7%b1%bb%e6%96%87%e6%9c%ac%e5%ba%8f%e5%88%97%e5%8c%96%e5%ba%93/
本包是 .NET Standard 类库包,可以在 .NET Core、.NET、.NET Framework 等多种项目中直接使用。我本人也在多种环境中使用这个包,没有遇到任何问题,主要用它来提取网页上的链接、图片、可显示文本等内容。
先安装这个包,然后就可以使用了,下面给出了一些使用方式的模板:
第一步 解析(反序列化)html
// Prerequisite: install the NuGet package "ZmjConvert" (by 钟国庆) and import its namespace.
using ZmjConvert;

// Parse a complete HTML document into an element tree.
// GetSource() stands for however the markup is obtained (downloaded, or read from a file);
// the value handed to the parser must be the HTML text as a string.
var html = GetSource();
var parsed = HtmlElement.TryParse(html, out var eles, out var err);
if (!parsed)
{
    // Parsing failed: report it and stop.
    OnDownloadErrorHappend($"无法序列化url:{surl}");
    return;
}

// URL of the document itself; used later as the base when relative paths found in the
// nodes are resolved into absolute URLs.
var srcurl = new Uri(surl);

// Flatten the tree: every descendant is collected into one list so it can be queried
// with LINQ or walked with foreach. The original tag structure is left unchanged.
eles = eles.ListElement();
// Parse an HTML fragment (for example the content of a rich-text editor) into elements.
try
{
    // Markup of the fragment. The variable that is parsed below must be the one
    // assigned here (the original sample assigned "labstr" but parsed "htmlstr").
    var htmlstr = Gethtmlstr();
    var eles = CHtmlElement<HtmlElement>.Parse(htmlstr);
    return eles; // parsing succeeded: hand back the element structure
}
catch (Exception)
{
    // Parsing failed. Add logging/handling here as needed; rethrow with "throw;"
    // (not "throw e;") so the original stack trace is preserved.
    throw;
}
第二步 获取html文档的标题
// Document title.
var title = eles.GetWebTitle();

// <meta name="keywords" content="...">: the tag is located by its name attribute,
// but the value itself is stored in the "content" attribute, so that is what is read.
var keywords = eles.Find("meta", "name", "keywords").FirstOrDefault()?["content"];

// <meta name="description" content="...">: same layout as the keywords tag.
var description = eles.Find("meta", "name", "description").FirstOrDefault()?["content"];
第三步 提取所有的img标签的src,也就是获取所有页面上的图片
// Collect the src attribute of every <img> tag, resolved against the document URL.
var imgs = new List<Uri>();
foreach (var e in eles.Find("img", "src"))
{
    imgs.Add(new Uri(srcurl, e["src"]));
}

// Collect the data-src attribute of every <img> tag. Some sites put a thumbnail in src
// to save bandwidth and keep the full-size picture in data-src.
// A separate list name is used so both samples can live in the same scope.
var originalImgs = new List<Uri>();
foreach (var e in eles.Find("img", "data-src"))
{
    originalImgs.Add(new Uri(srcurl, e["data-src"]));
}
第四步 提取所有的a标签的href,也就是获取所有页面上的链接
// Option 1: the library helper. It is given the URL of the current page (srcurl, built
// from the page address in step 1), the tag name and the attribute name.
var links = eles.GetAllLink(srcurl, "a", "href");

// Option 2: plain LINQ. This yields the href values exactly as written in the markup
// (they may be relative). A distinct variable name keeps both samples compilable together.
var rawHrefs = eles.Find("a", "href").Select(x => x["href"]).ToList();
第五步 提取所有可视文本,并保持换行,并且保持链接的有效
// Usage: the first argument lists the tag names that force a line break, the second
// lists the tag names whose content is ignored.
var txt = eles.GetText("br,div,li", "script,title");

// Reference implementation of the helper used above.
/// <summary>
/// Builds the visible text of a flattened element list. Text is accumulated until an
/// element whose tag is listed in <paramref name="splittags"/> is reached, which ends the
/// current line. Anchors carrying an href are emitted as <c>&lt;a href="..."&gt;text&lt;/a&gt;</c>
/// so links stay usable.
/// </summary>
/// <param name="element">Elements to read, in document order.</param>
/// <param name="splittags">Comma-separated tag names that terminate a line.</param>
/// <param name="excepttags">Comma-separated tag names whose elements are skipped.</param>
/// <returns>The collected lines joined with "\r\n".</returns>
/// <exception cref="ArgumentNullException"><paramref name="element"/> is null.</exception>
public static string GetText(this IEnumerable<HtmlElement> element, string splittags = "br,div,li", string excepttags = "script,title")
{
    if (element is null) throw new ArgumentNullException(nameof(element));

    // Tag lists are parsed once; entries are trimmed so "br, div" behaves like "br,div",
    // and a null list is treated as empty.
    var separators = new[] { ',' };
    var ignoredTags = new HashSet<string>((excepttags ?? string.Empty)
        .Split(separators, StringSplitOptions.RemoveEmptyEntries)
        .Select(t => t.Trim()));
    var lineBreakTags = new HashSet<string>((splittags ?? string.Empty)
        .Split(separators, StringSplitOptions.RemoveEmptyEntries)
        .Select(t => t.Trim()));

    var lines = new List<string>();
    var current = new StringBuilder();
    foreach (var item in element.Where(x => !ignoredTags.Contains(x.TagName)))
    {
        if (lineBreakTags.Contains(item.TagName))
        {
            // A line-break tag closes the line that has been accumulated so far.
            current.Append(HttpUtility.HtmlDecode(item.InnerText));
            lines.Add(current.ToString());
            current.Clear();
        }
        else if (item.TagName == "a" && item.Attributes.ContainsKey("href"))
        {
            current.Append($"<a href=\"{HttpUtility.HtmlDecode(item.Attributes["href"])}\">{HttpUtility.HtmlDecode(item.InnerText)}</a>");
        }
        else if (!item.InnerText.Contains('<'))
        {
            // Text that still contains markup is skipped.
            current.Append(HttpUtility.HtmlDecode(item.InnerText));
        }
    }

    // Text that follows the last line-break tag used to be dropped; keep it as a final line.
    if (current.Length > 0) lines.Add(current.ToString());

    return string.Join("\r\n", lines);
}
第六步 找到特定的标签
// Find the div whose class attribute contains "main-content".
// Each sample gets its own variable name so all four can sit in one scope.
// LINQ version (case-insensitive match without allocating a lower-cased copy):
var mainDivByLinq = eles.FirstOrDefault(x => x.TagName == "div"
    && x.Attributes.ContainsKey("class")
    && x.Attributes["class"].IndexOf("main-content", StringComparison.OrdinalIgnoreCase) >= 0);
// The library's Find method is shorter:
var mainDiv = eles.Find("div", "class", "main-content").FirstOrDefault();

// Find the p tag whose id is "mainp".
// LINQ version:
var mainParagraphByLinq = eles.FirstOrDefault(x => x.TagName == "p"
    && x.Attributes.ContainsKey("id")
    && string.Equals(x.Attributes["id"], "mainp", StringComparison.OrdinalIgnoreCase));
// The library's Find method is shorter:
var mainParagraph = eles.Find("p", "id", "mainp").FirstOrDefault();
第七步 一些网站的抓取方法(catcher),已随包开源并发布到 NuGet 网站,版本是:1.1.2,以下是源码:
/// <summary>
/// Catcher for picture pages under http://yskhd.com/archives.
/// Downloads the full-size version of every 285x285 thumbnail found on the page.
/// </summary>
internal class ArchivesPictureCatcher : HtmlCatcher, IHtmlCatcher
{
    /// <summary>
    /// Tells whether <paramref name="url"/> belongs to the site handled by this catcher.
    /// </summary>
    /// <param name="url">Page address to test; null yields false.</param>
    public override bool CanApplicableHost(string url)
    {
        return url != null && url.Contains("://yskhd.com/archives/");
    }

    /// <summary>
    /// Fetches the page, collects the thumbnails ending in "285x285.jpg", rewrites each
    /// address to its "scaled" variant and downloads it into the target folder.
    /// </summary>
    /// <param name="surl">Address of the page to catch; blank input is ignored.</param>
    /// <param name="savepath">Root folder for the downloaded files.</param>
    /// <param name="saveAsNewFolder">When true, a sub-folder derived from the page title is used.</param>
    public override void CatchHtml(string surl, string savepath, bool saveAsNewFolder = true)
    {
        if (string.IsNullOrWhiteSpace(surl)) return;
        OnProgChanged($"正在抓取url:{surl}", 0);
        var dts = GetSource.Invoke(surl);
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element
        string title = eles.GetWebTitle();

        var imgs = new List<Uri>();
        // Ordinal comparison: the suffix is a fixed file-name pattern, not linguistic text.
        foreach (var e in eles.Find("img", "src").Where(x => x.Attributes["src"].EndsWith("285x285.jpg", StringComparison.Ordinal)))
        {
            // The thumbnail address differs from the full-size one only in this marker.
            var furl = e["src"].Replace("285x285", "scaled");
            imgs.Add(new Uri(srcurl, furl));
        }

        // No forced GC.Collect() here: the runtime reclaims the parsed tree on its own.
        var ph = GetPathFrom(title, savepath, saveAsNewFolder);
        foreach (var img in imgs)
        {
            // The last URL segment is used as the local file name.
            var fm = img.Segments.Last().Trim('/');
            var fp = System.IO.Path.Combine(ph, fm);
            DownLoadToFile(img.AbsoluteUri, fp);
        }
        OnProgChanged($"抓取到{imgs.Count}图片从页面:{title}", 0);
    }

    /// <summary>
    /// Returns the folder files are saved to: <paramref name="savepath"/> itself, or a
    /// sub-folder named after the page title (created when missing).
    /// </summary>
    private string GetPathFrom(string title, string savepath, bool saveAsNewFolder)
    {
        if (!saveAsNewFolder) return savepath;
        // NOTE(review): GetLastLabel rewrites title through the ref parameter; presumably it
        // strips a trailing label from the title. Confirm against StringTool.
        StringTool.GetLastLabel(ref title);
        title = title.ReplacePathInvalidChars();
        var ph = System.IO.Path.Combine(savepath, title);
        // CreateDirectory does nothing when the folder already exists, so no Exists check is needed.
        System.IO.Directory.CreateDirectory(ph);
        return ph;
    }
}
/// <summary>
/// Catcher for movie pages on www.dy2018.com (电影天堂).
/// Saves the pictures, links, breadcrumb tags and visible text of the "zoom" content block.
/// </summary>
internal class Dy2018Catcher : HtmlCatcher, IHtmlCatcher
{
    /// <summary>
    /// Tells whether <paramref name="url"/> belongs to www.dy2018.com.
    /// </summary>
    /// <param name="url">Page address to test; null yields false.</param>
    public override bool CanApplicableHost(string url)
    {
        return url != null && url.Contains("://www.dy2018.com");
    }

    /// <summary>
    /// Fetches the page, extracts the content of <c>div#zoom</c>, writes a text file with the
    /// page URL, links, tags and text, then downloads the pictures.
    /// </summary>
    /// <param name="surl">Address of the page to catch; blank input is ignored.</param>
    /// <param name="savepath">Root folder for the output.</param>
    /// <param name="saveAsNewFolder">When true, a sub-folder derived from the page title is used.</param>
    public override void CatchHtml(string surl, string savepath, bool saveAsNewFolder = true)
    {
        if (string.IsNullOrWhiteSpace(surl)) return;
        OnProgChanged($"正在抓取url:{surl}", 0);
        var dts = GetSource.Invoke(surl);
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element
        string title = eles.GetWebTitle();

        IEnumerable<HtmlElement> deles = eles.Find("div", "id", "zoom");
        if (deles is null || !deles.Any())
        {
            OnDownloadErrorHappend($"没找到可抓取内容,页面:{title}");
            return;
        }
        deles = deles.ListElement();

        // Cover picture and any other pictures inside the content block.
        var imgs = new List<string>();
        foreach (var e in deles.Find("img", "src"))
        {
            var furi = new Uri(srcurl, e["src"]);
            imgs.Add(furi.AbsoluteUri);
        }

        // Download links and links to other sites.
        var links = new List<string>();
        foreach (var e in deles.Find("a", "href"))
        {
            // Fix: the link target of an anchor is its href (the original read "src").
            // A malformed href is skipped instead of aborting the whole page.
            if (!Uri.TryCreate(srcurl, e["href"], out var furi)) continue;
            var sit = e.InnerText;
            // Overlong link captions are replaced with a fixed label.
            sit = sit.Length < 50 ? sit : "获取方式";
            links.Add($"{sit} {furi.AbsoluteUri}");
        }

        var txt = deles.GetText();

        // Anchors of the "position" block are used as tags.
        var dir = eles.Find("div", "class", "position");
        IEnumerable<HtmlElement> dirs = (IEnumerable<HtmlElement>)dir?.ListElement() ?? Array.Empty<HtmlElement>();
        var tags = dirs.Find("a").Select(x => x.InnerText.Trim('/').Replace("类型:", string.Empty));

        // No forced GC.Collect() here: the runtime reclaims the parsed tree on its own.
        var ph = GetPathFrom(title, savepath, saveAsNewFolder);
        base.TextPath = System.IO.Path.Combine(ph, "新建文本文档.txt");
        System.IO.File.WriteAllLines(TextPath, new[] { surl, "", "", }.Concat(links).Concat(new[] { "", "" }).Concat(tags).Concat(new[] { txt }));

        foreach (var img in imgs)
        {
            var url = new Uri(img);
            var fm = url.Segments.Last().Trim('/');
            if (fm == "dico.jpg") continue; // this picture is never downloaded
            var fp = System.IO.Path.Combine(ph, fm);
            DownLoadToFile(img, fp);
        }
        OnProgChanged($"抓取到{imgs.Count}图片{links.Count}连接从页面:{title}", 0);
    }

    /// <summary>
    /// Returns the folder files are saved to: <paramref name="savepath"/> itself, or a
    /// sub-folder named after the upper-cased page title without the site suffix.
    /// </summary>
    private string GetPathFrom(string title, string savepath, bool saveAsNewFolder)
    {
        if (!saveAsNewFolder) return savepath;
        title = title.Replace("_电影天堂", string.Empty).ReplacePathInvalidChars();
        var ph = System.IO.Path.Combine(savepath, title.ToUpper());
        // CreateDirectory does nothing when the folder already exists.
        System.IO.Directory.CreateDirectory(ph);
        return ph;
    }
}
/// <summary>
/// Catcher for article pages on fap-nation.com.
/// Saves the pictures, download links, tags and visible text of the main article.
/// </summary>
internal class FapCatcher : HtmlCatcher, IHtmlCatcher
{
    /// <summary>Tags that end the current text line.</summary>
    private static readonly string[] LineBreakTags = { "br", "div", "li" };

    /// <summary>An href containing none of these characters is recorded as a plain link.</summary>
    private static readonly char[] NonPlainLinkChars = { '#', '?', '=', '-' };

    /// <summary>
    /// Tells whether <paramref name="url"/> belongs to fap-nation.com.
    /// </summary>
    /// <param name="url">Page address to test; null yields false.</param>
    public override bool CanApplicableHost(string url)
    {
        return url != null && url.Contains("://fap-nation.com");
    }

    /// <summary>
    /// Fetches the page, reads the articles inside the main column, writes a text file with
    /// the page URL, links, tags and text, then downloads the pictures.
    /// </summary>
    /// <param name="surl">Address of the page to catch; blank input is ignored.</param>
    /// <param name="savepath">Root folder for the output.</param>
    /// <param name="saveAsNewFolder">When true, a sub-folder derived from the page title is used.</param>
    public override void CatchHtml(string surl, string savepath, bool saveAsNewFolder = true)
    {
        if (string.IsNullOrWhiteSpace(surl)) return;
        OnProgChanged($"正在抓取url:{surl}", 0);
        var dts = GetSource.Invoke(surl);
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element

        // The content lives in <main class="... col-md-9 ... col-md-push-3 ...">; give up silently without it.
        if (!(eles.FirstOrDefault(x => x.TagName == "main" && x.Attributes.ContainsKey("class") && x.Attributes["class"].Contains("col-md-9") && x.Attributes["class"].Contains("col-md-push-3")) is HtmlElement me)) return;
        string title = eles.GetWebTitle();
        eles = me.Children.Find("article").ListElement(null, "script,style,!--");
        var ph = GetPathFrom(title, savepath, saveAsNewFolder, out var ver);

        // Cover picture and any other pictures; only absolute, non-placeholder addresses are kept.
        var imgs = new List<string>();
        foreach (var e in eles.Find("img", "src"))
        {
            var furl = e["src"];
            if (!furl.StartsWith("http", StringComparison.Ordinal) || furl.Contains("loading")) continue;
            var furi = new Uri(srcurl, furl);
            imgs.Add(furi.AbsoluteUri);
        }

        // Tags come from the anchors inside <div class="tags">.
        var tags = new List<string>();
        var tgs = eles.Find("div", "class", "tags").ListElement();
        foreach (var t in tgs.Find("a"))
        {
            var tag = t.InnerText.Trim().ToUpper();
            if (string.IsNullOrWhiteSpace(tag)) continue;
            tags.Add(tag);
        }

        // Walk the elements once, collecting download links and the visible text.
        var links = new List<string>();
        var tsts = new List<string>();
        var sb = new StringBuilder();
        var os = ""; // last word of the most recent "Download ..." caption seen
        foreach (var item in eles)
        {
            var nt = item.InnerText;
            if (LineBreakTags.Contains(item.TagName))
            {
                sb.Append(nt);
                // A div whose class starts with "vc_" does not end the line.
                if (item.TagName == "div" && item.Attributes.ContainsKey("class") && item.Attributes["class"].StartsWith("vc_", StringComparison.Ordinal)) continue;
                tsts.Add(sb.ToString());
                sb.Clear();
            }
            else if (nt.StartsWith("Download", StringComparison.Ordinal))
            {
                var itxt = nt.Trim();
                os = itxt.Substring(itxt.LastIndexOf(' ') + 1);
            }
            else if (item.TagName == "a" && item.Attributes.ContainsKey("href"))
            {
                var href = item.Attributes["href"];
                if (item["href"].IndexOfAny(NonPlainLinkChars) < 0) links.Add($"{nt} {item["href"]}");
                else if (href.StartsWith("https://fap-nation.com/fndlnwd/", StringComparison.Ordinal)) links.Add($"{os}({ver}) {item["href"]}");
                else if (!string.IsNullOrWhiteSpace(nt) && !href.Contains("tab-") && !href.Contains("#copy")) sb.Append($"<a href=\"{item["href"]}\" target=\"_blank\">{nt}</a>");
                else sb.Append(nt);
            }
            else if (!nt.Contains('<')) sb.Append(nt); // text that still contains markup is skipped
        }
        var txt = string.Join("\r\n", tsts);

        // No forced GC.Collect() here: the runtime reclaims the parsed tree on its own.
        TextPath = System.IO.Path.Combine(ph, "新建文本文档.txt");
        System.IO.File.WriteAllLines(TextPath, new[] { surl, "", }.Concat(links).Concat(new[] { "", }).Concat(tags).Concat(new[] { "", txt }));
        foreach (var img in imgs)
        {
            var url = new Uri(img);
            var fm = url.Segments.Last().Trim('/');
            var fp = System.IO.Path.Combine(ph, fm);
            DownLoadToFile(img, fp);
        }
        OnProgChanged($"抓取到{imgs.Count}图片从页面:{title}", 0);
    }

    /// <summary>
    /// Returns the folder files are saved to and reports the version label read from the title.
    /// </summary>
    /// <param name="title">Page title the folder name is built from.</param>
    /// <param name="savepath">Root folder for the output.</param>
    /// <param name="saveAsNewFolder">When false, <paramref name="savepath"/> is returned as is and <paramref name="ver"/> stays empty.</param>
    /// <param name="ver">First label extracted from the title.</param>
    protected string GetPathFrom(string title, string savepath, bool saveAsNewFolder, out string ver)
    {
        ver = string.Empty;
        if (!saveAsNewFolder) return savepath;
        StringTool.GetLastLabel(ref title, " ");
        // NOTE(review): at least two labels are assumed here; fewer would make the indexing
        // below fail. Confirm what GetLabels1 returns for titles without labels.
        var lbs = StringTool.GetLabels1(ref title);
        ver = lbs[0];
        // NOTE(review): lbs[1] is used for two slots of the folder name; confirm this is intended.
        title = $"[][{lbs[1]}][{lbs[1]}][][][]{title}".ReplacePathInvalidChars();
        var ph = System.IO.Path.Combine(savepath, title.ToUpper());
        // CreateDirectory does nothing when the folder already exists.
        System.IO.Directory.CreateDirectory(ph);
        return ph;
    }
}
/// <summary>
/// Catcher for game pages on https://pcgamestorrents.com/.
/// Saves the pictures, genre tags and visible text of the page's content block.
/// </summary>
internal class PcgamestorrentsCatcher : HtmlCatcher, IHtmlCatcher
{
    /// <summary>Tags that end the current text line.</summary>
    private static readonly string[] LineBreakTags = { "br", "button", "div", "li", "p" };

    /// <summary>
    /// Tells whether <paramref name="url"/> belongs to pcgamestorrents.com.
    /// </summary>
    /// <param name="url">Page address to test; null yields false.</param>
    public override bool CanApplicableHost(string url)
    {
        return url != null && url.Contains("//pcgamestorrents.com/");
    }

    /// <summary>
    /// Fetches the page, reads the content block, writes a text file with the page URL,
    /// version, tags and text, then downloads the pictures.
    /// </summary>
    /// <param name="surl">Address of the page to catch; blank input is ignored.</param>
    /// <param name="savepath">Root folder for the output.</param>
    /// <param name="saveAsNewFolder">When true, a sub-folder derived from the page title is used.</param>
    public override void CatchHtml(string surl, string savepath, bool saveAsNewFolder = true)
    {
        if (string.IsNullOrWhiteSpace(surl)) return;
        OnProgChanged($"正在抓取url:{surl}", 0);
        var dts = GetSource.Invoke(surl);
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element
        string title = eles.GetWebTitle();
        var contente = eles.Find("div", "class", "uk-margin-medium-top");
        eles = contente.ListElement();

        // Cover picture and any other content pictures.
        var imgs = new List<string>();
        foreach (var e in eles.Find("img", "class", "igg-image-content"))
        {
            var furl = e["src"];
            // These images are selected by class, so a src is not guaranteed.
            if (string.IsNullOrWhiteSpace(furl)) continue;
            var furi = new Uri(srcurl, furl);
            imgs.Add(furi.AbsoluteUri);
        }

        // Collect the visible text, one entry per line-break tag.
        var tsts = new List<string>();
        var sb = new StringBuilder();
        foreach (var item in eles)
        {
            sb.Append(item.InnerText);
            if (LineBreakTags.Contains(item.TagName))
            {
                tsts.Add(sb.ToString());
                sb.Clear();
            }
        }

        // Tags are the comma-separated values of the "GENRE" line. A page without that line
        // yields no tags (the original threw from First()).
        var tags = new List<string>();
        var genre = ReadField(tsts, "GENRE");
        var tgstr = genre.Length == 0 ? new string[0] : genre.Split(',');
        foreach (var t in tgstr.Where(x => x.Length < 25))
        {
            var tag = HttpUtility.HtmlDecode(t);
            // Keep only spaces, slashes and word characters.
            tag = Regex.Replace(tag, @"[^ \/\w]", "").Trim().ToUpper();
            tags.Add(tag);
        }
        var dev = ReadField(tsts, "DEVELOPER");
        var pub = ReadField(tsts, "PUBLISHER");

        // No forced GC.Collect() here: the runtime reclaims the parsed tree on its own.
        var ph = GetPathFrom(title, dev, pub, savepath, saveAsNewFolder, out var ver);
        TextPath = System.IO.Path.Combine(ph, "新建文本文档.txt");
        System.IO.File.WriteAllLines(TextPath, new[] { surl, "", $"Win({ver.Replace("« PCGamesTorrents", "").Replace("-", "").Trim()})(磁力) ", "", "破解版", }.Concat(tags).Concat(new[] { "", }).Concat(tsts));
        foreach (var img in imgs)
        {
            var url = new Uri(img);
            var fm = url.Segments.Last().Trim('/');
            var fp = System.IO.Path.Combine(ph, fm);
            DownLoadToFile(img, fp);
        }
        OnProgChanged($"抓取到{imgs.Count}图片从页面:{title}", 0);
    }

    /// <summary>
    /// Finds the first line that starts with <paramref name="label"/> (ignoring case and
    /// leading whitespace) and returns it upper-cased with the label and colons removed.
    /// Returns an empty string when no such line exists.
    /// </summary>
    private static string ReadField(IEnumerable<string> lines, string label)
    {
        // Invariant upper-casing: the labels are fixed ASCII keywords, so the match must not
        // depend on the current culture's casing rules.
        var line = lines.FirstOrDefault(x => x.Trim().ToUpperInvariant().StartsWith(label, StringComparison.Ordinal));
        return line is null
            ? string.Empty
            : line.ToUpperInvariant().Replace(label, "").Replace(":", "").Trim();
    }

    /// <summary>
    /// Splits the version suffix (" - ..." up to the end) off the title and returns the folder
    /// files are saved to: <paramref name="savepath"/> itself, or a sub-folder named from
    /// publisher, developer and title.
    /// </summary>
    /// <param name="ver">The " - ..." suffix found in the title, or an empty string.</param>
    private string GetPathFrom(string title, string dev, string pub, string savepath, bool saveAsNewFolder, out string ver)
    {
        ver = Regex.Match(title, " - .*?$").Value;
        if (!string.IsNullOrWhiteSpace(ver)) title = title.Replace(ver, "");
        if (!saveAsNewFolder) return savepath;
        // Drop everything from the " « " separator onwards.
        var ed = title.IndexOf(HttpUtility.HtmlDecode(" « "));
        title = ed <= 0 ? title : title.Substring(0, ed);
        title = $"[][{pub}][{dev}][][][完结]{title}".ReplacePathInvalidChars();
        var ph = System.IO.Path.Combine(savepath, title);
        // CreateDirectory does nothing when the folder already exists.
        System.IO.Directory.CreateDirectory(ph);
        return ph;
    }
}
/// <summary>
/// Catcher for picture galleries on http://www.mfsft.com/.
/// Collects the pictures of the given page and of its sibling pages, then downloads them.
/// </summary>
internal class XiuRenPictureCatcher : HtmlCatcher, IHtmlCatcher
{
    /// <summary>
    /// Tells whether <paramref name="url"/> belongs to www.mfsft.com.
    /// </summary>
    /// <param name="url">Page address to test; null yields false.</param>
    public override bool CanApplicableHost(string url)
    {
        return url != null && url.Contains("://www.mfsft.com/");
    }

    /// <summary>
    /// Fetches the page, gathers every img that has an alt attribute from it and from the
    /// linked pages of the same gallery, and downloads the pictures.
    /// </summary>
    /// <remarks>
    /// Runs synchronously like the other catchers. The original was declared
    /// <c>async void</c>, so it returned to the caller before the work finished and any
    /// exception thrown after the first await could not be observed by the caller.
    /// </remarks>
    /// <param name="surl">Address of the page to catch; blank input is ignored.</param>
    /// <param name="savepath">Root folder for the downloaded files.</param>
    /// <param name="saveAsNewFolder">When true, a sub-folder derived from the page title is used.</param>
    public override void CatchHtml(string surl, string savepath, bool saveAsNewFolder = true)
    {
        if (string.IsNullOrWhiteSpace(surl)) return;
        OnProgChanged($"正在抓取url:{surl}", 0);
        var dts = GetSource.Invoke(surl);
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element
        string title = eles.GetWebTitle();

        var imgs = new List<Uri>();
        foreach (var e in eles.Find("img", "src").Where(x => x.Attributes.ContainsKey("alt")))
        {
            imgs.Add(new Uri(srcurl, e["src"]));
        }

        // Gallery prefix: the part of the path between the last '/' and the last '.' or '_'.
        // Links whose href starts with it are treated as the other pages of this gallery.
        var path = srcurl.PathAndQuery;
        var lsted = path.LastIndexOfAny(new[] { '.', '_' });
        var lstst = path.LastIndexOf('/');
        // Without a '.' or '_' after the last '/', there is no prefix (the original threw here).
        var ststr = lstst >= 0 && lsted > lstst
            ? path.Substring(lstst, lsted - lstst).Trim('/')
            : string.Empty;

        // An empty prefix would match every link on the page, so sibling pages are only
        // followed when a prefix was found.
        if (ststr.Length > 0)
        {
            var others = eles.Find("a", "href")
                .Where(x => x.Attributes["href"].StartsWith(ststr, StringComparison.Ordinal))
                .Select(x => x.Attributes["href"])
                .Distinct()
                .ToList();
            foreach (var e in others)
            {
                var furi = new Uri(srcurl, e);
                Catch(furi.AbsoluteUri, imgs);
            }
        }

        // No forced GC.Collect() here: the runtime reclaims the parsed trees on its own.
        var ph = GetPathFrom(title, savepath, saveAsNewFolder);
        foreach (var img in imgs)
        {
            var fm = img.Segments.Last().Trim('/').Replace("_gzip.aspx", "");
            var fp = System.IO.Path.Combine(ph, fm);
            DownLoadToFile(img.AbsoluteUri, fp);
        }
        OnProgChanged($"抓取到{imgs.Count}图片从页面:{title}", 0);
    }

    /// <summary>
    /// Downloads one additional gallery page and appends the pictures found on it.
    /// </summary>
    /// <param name="surl">Absolute address of the page.</param>
    /// <param name="array">List the picture addresses are appended to.</param>
    private void Catch(string surl, List<Uri> array)
    {
        var dts = Encoding.UTF8.GetString(DownloadDatas(surl));
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element
        foreach (var e in eles.Find("img", "src").Where(x => x.Attributes.ContainsKey("alt")))
        {
            array.Add(new Uri(srcurl, e["src"]));
        }
    }

    /// <summary>
    /// Returns the folder files are saved to: <paramref name="savepath"/> itself, or a
    /// sub-folder named after the title cut at its last space, comma or hyphen.
    /// </summary>
    private string GetPathFrom(string title, string savepath, bool saveAsNewFolder)
    {
        if (!saveAsNewFolder) return savepath;
        title = title ?? string.Empty;
        // Cut only when a separator exists; a title without one is used whole
        // (the original threw on Substring(0, -1)).
        var cut = title.LastIndexOfAny(new[] { ' ', ',', '-' });
        if (cut > 0) title = title.Substring(0, cut);
        title = title.ReplacePathInvalidChars();
        var ph = System.IO.Path.Combine(savepath, title);
        // CreateDirectory does nothing when the folder already exists.
        System.IO.Directory.CreateDirectory(ph);
        return ph;
    }
}
/// <summary>
/// Catcher for movie pages on https://www.xpiaohua.com (新飘花电影网).
/// Saves the pictures, links, category tags and visible text of the "showinfo" block.
/// </summary>
internal class XpiaohuaCatcher : HtmlCatcher, IHtmlCatcher
{
    /// <summary>
    /// Tells whether <paramref name="url"/> belongs to www.xpiaohua.com.
    /// </summary>
    /// <param name="url">Page address to test; null yields false.</param>
    public override bool CanApplicableHost(string url)
    {
        return url != null && url.Contains("://www.xpiaohua.com");
    }

    /// <summary>
    /// Fetches the page, extracts the content of <c>div#showinfo</c>, writes a text file with
    /// the page URL, links, tags and text, then downloads the pictures.
    /// </summary>
    /// <param name="surl">Address of the page to catch; blank input is ignored.</param>
    /// <param name="savepath">Root folder for the output.</param>
    /// <param name="saveAsNewFolder">When true, a sub-folder derived from the page title is used.</param>
    public override void CatchHtml(string surl, string savepath, bool saveAsNewFolder = true)
    {
        if (string.IsNullOrWhiteSpace(surl)) return;
        OnProgChanged($"正在抓取url:{surl}", 0);
        var dts = GetSource.Invoke(surl);
        if (!HtmlElement.TryParse(dts, out var eles, out var err))
        {
            OnDownloadErrorHappend($"无法序列化url:{surl}");
            return;
        }
        var srcurl = new Uri(surl);
        eles = eles.ListElement(); // flatten: list every descendant element
        string title = eles.GetWebTitle();

        IEnumerable<HtmlElement> deles = eles.Find("div", "id", "showinfo");
        if (deles is null || !deles.Any())
        {
            OnDownloadErrorHappend($"没找到可抓取内容,页面:{title}");
            return;
        }
        deles = deles.ListElement();

        // Cover picture and any other pictures inside the content block.
        var imgs = new List<string>();
        foreach (var e in deles.Find("img", "src"))
        {
            var furi = new Uri(srcurl, e["src"]);
            imgs.Add(furi.AbsoluteUri);
        }

        // Download links and links to other sites; an href that cannot be resolved is skipped.
        var links = new List<string>();
        foreach (var e in deles.Find("a", "href"))
        {
            if (!Uri.TryCreate(srcurl, e["href"], out var furi)) continue;
            links.Add($"{e.InnerText}: {furi.AbsoluteUri}");
        }

        // Tags are the '/'-separated values between the category marker and the language marker.
        var txts = deles.GetText();
        var ststr = "◎类 别 ";
        var st = txts.IndexOf(ststr, StringComparison.Ordinal);
        var ed = txts.IndexOf("◎语 言 ", StringComparison.Ordinal);
        var tagStart = st + ststr.Length;
        // When a marker is missing or they appear out of order, no tags are produced
        // (the original passed the unchecked indexes to Substring, which throws).
        var tags = st >= 0 && ed >= tagStart
            ? txts.Substring(tagStart, ed - tagStart).Split("/".ToCharArray(), StringSplitOptions.RemoveEmptyEntries)
            : new string[0];

        // No forced GC.Collect() here: the runtime reclaims the parsed tree on its own.
        var ph = GetPathFrom(title, savepath, saveAsNewFolder);
        TextPath = System.IO.Path.Combine(ph, "新建文本文档.txt");
        System.IO.File.WriteAllLines(TextPath, new[] { surl, "", }.Concat(links).Concat(new[] { "", }).Concat(tags).Concat(new[] { "", txts }));

        foreach (var img in imgs)
        {
            var url = new Uri(img);
            var fm = url.Segments.Last().Trim('/');
            if (fm == "dico.jpg") continue; // this picture is never downloaded
            var fp = System.IO.Path.Combine(ph, fm);
            DownLoadToFile(img, fp);
        }
        OnProgChanged($"抓取到{imgs.Count}图片{links.Count}连接从页面:{title}", 0);
    }

    /// <summary>
    /// Returns the folder files are saved to: <paramref name="savepath"/> itself, or a
    /// sub-folder named after the upper-cased page title without the site suffix.
    /// </summary>
    private string GetPathFrom(string title, string savepath, bool saveAsNewFolder)
    {
        if (!saveAsNewFolder) return savepath;
        title = title.Replace("_新飘花电影网", string.Empty).ReplacePathInvalidChars();
        var ph = System.IO.Path.Combine(savepath, title.ToUpper());
        // CreateDirectory does nothing when the folder already exists.
        System.IO.Directory.CreateDirectory(ph);
        return ph;
    }
}
还有的htmlCatcher在我的包里~
有需求的也可以找我代写catcher