网页抓取小工具
最近在线看一本电子书,由于篇幅太长,又找不到下载地址,于是写了个小工具,把全书内容抓取下来保存到本地。
整体思路:
1、抓取出目录中各章节的名称及URL
2、遍历章节URL,获取具体内容
3、将章节URL进行分包,交给多线程处理
4、将处理完的内容重新整理,按章节名称排序
5、将内容写入TXT文件
首先抓取导航页面的内容,通过WebRequest对象获取网页内容
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The response body decoded with the class-level <c>encoding</c>, or an empty
/// string when the request or the read fails for any reason.
/// </returns>
private static string GetHtml(string url)
{
    try
    {
        WebRequest request = WebRequest.Create(url);
        request.Credentials = CredentialCache.DefaultCredentials;

        // using blocks guarantee the response, stream and reader are released
        // even when ReadToEnd throws part-way through.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort fetch: callers treat "" as "failed", so keep that contract,
        // but report the cause instead of hiding it.
        Console.Error.WriteLine("GetHtml failed for {0}: {1}", url, ex.Message);
        return "";
    }
}
通过正则获取章节地址及名称
/// <summary>
/// Extracts chapter links from a table-of-contents page.
/// </summary>
/// <param name="html">HTML of the page to scan for anchor tags.</param>
/// <returns>
/// A dictionary keyed by chapter URL whose value is the normalized chapter title
/// in the form "第NNN节" (chapter number left-padded with zeros to three digits).
/// Anchors whose text does not contain "第&lt;digits&gt;节" are skipped, and only the
/// first title seen for a given URL is kept.
/// </returns>
private static Dictionary<string, string> GetAllUrl(string html)
{
    string titlePattern = @"第(?<index>\d+)节";
    Dictionary<string, string> dictRet = new Dictionary<string, string>();
    string pattern = @"<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>";
    Regex reg = new Regex(pattern, RegexOptions.IgnoreCase);

    foreach (Match m in reg.Matches(html))
    {
        // Normalize the anchor text to "第XXX节". Strip CR and LF individually so
        // that both "\r\n" and bare "\n" line breaks are removed.
        // NoHTML is defined elsewhere; presumably it strips markup from the text.
        string title = NoHTML(m.Groups["text"].Value)
            .Replace("\r", "")
            .Replace("\n", "")
            .Replace(" ", "");

        Match mTitle = Regex.Match(title, titlePattern);
        if (!mTitle.Success)
        {
            continue;
        }

        title = string.Format("第{0}节", mTitle.Groups["index"].Value.PadLeft(3, '0'));

        string url = m.Groups["url"].Value;

        // Treat both http and https links as absolute; anything else is joined
        // onto the site root held in webUrl.
        bool isAbsolute =
            url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!isAbsolute)
        {
            url = string.Format("{0}/{1}", webUrl.TrimEnd('/'), url);
        }

        if (!dictRet.ContainsKey(url))
        {
            dictRet.Add(url, title);
        }
    }

    return dictRet;
}
按线程数对地址进行分包处理
/// <summary>
/// Splits the URL dictionary into consecutive packages, one per worker thread.
/// </summary>
/// <param name="dictUrls">Chapter URL to chapter title map to distribute.</param>
/// <returns>
/// A dictionary keyed by package index (starting at 0). Each package holds
/// <c>dictUrls.Count / threadCount</c> entries (at least one); the last package
/// (index <c>threadCount - 1</c>) also receives any remainder. When there are
/// fewer URLs than threads, fewer than <c>threadCount</c> packages are returned,
/// so callers must not assume every index is present.
/// </returns>
private static Dictionary<int, Dictionary<string, string>> SplitUrl(Dictionary<string, string> dictUrls)
{
    Dictionary<int, Dictionary<string, string>> dictRet = new Dictionary<int, Dictionary<string, string>>();

    // Size of each package. Clamp to 1: with fewer URLs than threads the integer
    // division yields 0, which previously skipped package 0 entirely.
    int splitCount = dictUrls.Count / threadCount;
    if (splitCount < 1)
    {
        splitCount = 1;
    }

    int keyIndex = 0;
    int calCount = 0;
    foreach (KeyValuePair<string, string> item in dictUrls)
    {
        // Move on to the next package once the current one is full, except for
        // the last package, which absorbs the remainder.
        if (calCount == splitCount && keyIndex < threadCount - 1)
        {
            keyIndex++;
            calCount = 0;
        }

        Dictionary<string, string> package;
        if (!dictRet.TryGetValue(keyIndex, out package))
        {
            package = new Dictionary<string, string>();
            dictRet.Add(keyIndex, package);
        }

        package.Add(item.Key, item.Value);
        calCount++;
    }

    return dictRet;
}
开启线程,获取内容,存入字典
/// <summary>
/// Splits the chapter URLs into packages and starts one worker thread per package.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Dictionary<int, Dictionary<string, string>> dictSplitUrls = SplitUrl(dictUrls);
    for (int i = 0; i < threadCount; i++)
    {
        // There may be fewer packages than threads (e.g. fewer URLs than threads);
        // skip the missing ones instead of throwing KeyNotFoundException.
        Dictionary<string, string> dictRun;
        if (!dictSplitUrls.TryGetValue(i, out dictRun))
        {
            continue;
        }

        // Copy the loop variable so each delegate captures its own value.
        int index = i;
        ThreadStart ts = delegate { Run(dictRun, index); };
        new Thread(ts).Start();
    }
}

/// <summary>
/// Worker body: downloads and parses every chapter in one package, stores each
/// result in the shared content dictionary and writes it to its own text file.
/// The thread that finishes the last chapter overall generates the combined file.
/// </summary>
/// <param name="dictUrls">Chapter URL to chapter title map for this package.</param>
/// <param name="threadId">Index of the worker, used only in console output.</param>
private static void Run(Dictionary<string, string> dictUrls, int threadId)
{
    bool finishedLast = false;

    foreach (KeyValuePair<string, string> item in dictUrls)
    {
        string keyUrl = item.Key;
        string title = item.Value;

        // dictContent is shared by all workers and Dictionary is not safe for
        // concurrent writes, so every access goes through lockObj.
        lock (lockObj)
        {
            if (!dictContent.ContainsKey(title))
            {
                dictContent.Add(title, "");
            }
        }

        Console.WriteLine("Thread {0},deal with {1}:{2}...", threadId, title, keyUrl);

        string content = GetHtml(keyUrl);
        if (content.Length > 0)
        {
            content = AnalyseContent(content);
        }

        if (content.Length > 0)
        {
            lock (lockObj)
            {
                dictContent[title] = content;
            }
            Write(content, string.Format("{0}_{1}.txt", systemName, title));
            Console.WriteLine("success");
        }
        else
        {
            Console.WriteLine("failed");
        }

        lock (lockObj)
        {
            dealedCount++;
            // Decide under the lock so exactly one worker, the one making the
            // final increment, produces the combined output.
            if (dealedCount == totalCount)
            {
                finishedLast = true;
            }
        }
    }

    if (finishedLast)
    {
        WriteToTxt();
    }
}
最后将所有内容重新整理,输出到文本
/// <summary>
/// Concatenates all collected chapters, ordered by chapter title, and writes
/// them to a single text file named "&lt;systemName&gt;.txt".
/// </summary>
private static void WriteToTxt()
{
    Console.WriteLine("获取内容完成,生成TXT");

    // Titles are zero-padded ("第NNN节"), so ordering by key yields chapter order.
    StringBuilder sbContent = new StringBuilder();
    foreach (KeyValuePair<string, string> item in dictContent.OrderBy(t => t.Key))
    {
        Console.WriteLine("生成{0}", item.Key);
        sbContent.AppendFormat("{0} {1}", item.Key, item.Value);
    }

    Write(sbContent.ToString(), string.Format("{0}.txt", systemName));
    Console.WriteLine("生成TXT成功");
}