/// <summary>
/// Minimum length (in characters) that extracted chapter content must reach to be accepted.
/// Defaults to 1000; GetBookChapter reassigns it (300 or 1000) before each chapter is fetched,
/// and the content-extraction methods read it as their threshold.
/// </summary>
private int m_chapter_content_length = 1000;
/// <summary>
/// Collects chapter content for a book by searching Baidu for the book and scanning
/// each search hit for links to the requested chapters.
/// </summary>
/// <remarks>
/// Each hit is downloaded and handed to GetBookChapterList together with the chapters that
/// have not been collected yet. The loop stops early once fewer than 10 of the requested
/// chapters are still missing. Any exception is logged and whatever has been collected up to
/// that point is returned.
/// </remarks>
/// <param name="i_book_name">Book title; ASCII letters, digits and parentheses are removed before it is used as a search keyword.</param>
/// <param name="i_chapter_list">Chapters (VIP chapters) whose content should be collected.</param>
/// <returns>
/// The collected chapters (possibly empty), or <c>null</c> when <paramref name="i_book_name"/> is null
/// or <paramref name="i_chapter_list"/> is null or empty.
/// </returns>
public List<BookChapterInfo> CollectBookChapterList(string i_book_name,List<BookChapterInfo> i_chapter_list)
{
    // Validate before touching the arguments: Regex.Replace throws on a null input.
    if (i_book_name == null || i_chapter_list == null || i_chapter_list.Count <= 0)
        return null;
    i_book_name = Regex.Replace(i_book_name, "[a-zA-Z0-9()()]", "", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    Encoding t_encoding = Encoding.GetEncoding("gb2312");
    string t_key_word = string.Format("小说{0}最新章节txt", i_book_name);
    string t_baidu_url = string.Format("http://www.baidu.com/s?wd={0}", HttpUtility.UrlEncode(t_key_word, t_encoding));
    // Group 1: target URL of a search hit, group 2: its title.
    // "href\s*=\s*" tolerates whitespace around '=' (the previous "hrefs*=s*" did not, and it
    // swallowed a leading 's' of unquoted URLs).
    string t_list_reg = "<h3\\s*?class=[\'\"]?t[\'\"]?><a[^<>]*?href\\s*=\\s*[\'\"]*([^\"\']*)[\'\"]*[^<>]*?>(.*?)</a>\\s*?</h3>";
    List<BookChapterInfo> t_vip_chapter_list = new List<BookChapterInfo>();
    try
    {
        string t_search_html = NetSiteCatchManager.ReadUrl(t_baidu_url, t_encoding);
        if (string.IsNullOrEmpty(t_search_html))
            return t_vip_chapter_list;
        MatchCollection t_ma = Regex.Matches(t_search_html, t_list_reg, RegexOptions.IgnoreCase | RegexOptions.Compiled);
        foreach (Match t_hit in t_ma)
        {
            string t_book_url = t_hit.Groups[1].Value;
            // A hit without a URL cannot be fetched; move on to the next one.
            if (string.IsNullOrEmpty(t_book_url))
                continue;
            string t_book_html = NetSiteCatchManager.ReadUrl(t_book_url, Encoding.Default);
            // Only ask this site for chapters that earlier sites did not deliver.
            List<BookChapterInfo> t_need_collect_list = GetNeedCollectChapter(i_chapter_list, t_vip_chapter_list);
            List<BookChapterInfo> t_collect_chapter_list = GetBookChapterList(t_book_url, t_book_html, t_need_collect_list, i_book_name);
            if (t_collect_chapter_list != null && t_collect_chapter_list.Count > 0)
                t_vip_chapter_list.AddRange(t_collect_chapter_list);
            // Stop once fewer than 10 chapters are still missing.
            if (t_vip_chapter_list.Count > 0 && i_chapter_list.Count - t_vip_chapter_list.Count < 10)
                break;
        }
    }
    catch (Exception ex)
    {
        // NOTE: the log text mentions google although the search goes to Baidu; the text is
        // left unchanged to keep the log output stable.
        LogHelper.Error("从google收录章节列表失败" + ex.ToString());
    }
    return t_vip_chapter_list;
}
/// <summary>
/// Returns the chapters that have not been collected yet.
/// </summary>
/// <param name="i_vip_list">All chapters that should be collected.</param>
/// <param name="i_have_collect_list">Chapters collected so far; may be null or empty.</param>
/// <returns>
/// <paramref name="i_vip_list"/> itself when nothing has been collected yet; otherwise a new list
/// with the chapters whose ChapterName does not occur in <paramref name="i_have_collect_list"/>.
/// </returns>
private List<BookChapterInfo> GetNeedCollectChapter(List<BookChapterInfo> i_vip_list, List<BookChapterInfo> i_have_collect_list)
{
    bool nothingCollectedYet = i_have_collect_list == null || i_have_collect_list.Count <= 0;
    if (nothingCollectedYet)
        return i_vip_list;

    List<BookChapterInfo> pending = new List<BookChapterInfo>();
    foreach (BookChapterInfo wanted in i_vip_list)
    {
        // Chapters are identified by name only.
        bool alreadyCollected = i_have_collect_list.Exists(delegate(BookChapterInfo done)
        {
            return wanted.ChapterName == done.ChapterName;
        });
        if (!alreadyCollected)
            pending.Add(wanted);
    }
    return pending;
}
/// <summary>
/// Looks for links to the requested chapters in a page and downloads each chapter found.
/// </summary>
/// <remarks>
/// For every requested chapter the first anchor whose text matches the chapter name
/// (see CompareChapterName) is used. Processing stops at the first chapter whose URL cannot be
/// resolved or whose content cannot be retrieved; the chapters gathered up to that point are
/// returned.
/// </remarks>
/// <param name="i_url">URL of the page; must be accepted by NetSiteCatchManager.IsPiraticSite.</param>
/// <param name="i_html">HTML of the page.</param>
/// <param name="i_chapter_list">Chapters to look for.</param>
/// <param name="i_book_name">Book title, passed on to GetBookChapter.</param>
/// <returns>
/// The chapters collected from this page (possibly empty), or <c>null</c> when the URL is rejected,
/// the HTML is empty, <paramref name="i_chapter_list"/> is null, or an exception occurred.
/// </returns>
private List<BookChapterInfo> GetBookChapterList(string i_url,string i_html, List<BookChapterInfo> i_chapter_list,string i_book_name)
{
    if (!NetSiteCatchManager.IsPiraticSite(i_url))
        return null;
    if (string.IsNullOrEmpty(i_html))
        return null;
    // Without this guard a null list only surfaced as a caught NullReferenceException.
    if (i_chapter_list == null)
        return null;
    // Group 1: link target, group 2: anchor text.
    // "href\s*=\s*" tolerates whitespace around '=' (the previous "hrefs*=s*" did not, and it
    // swallowed a leading 's' of unquoted URLs).
    string t_chapter_name_reg = "<a[^<>]*?href\\s*=\\s*[\'\"]*([^\"\']*)[\'\"]*[^<>]*?>(.*?)</a>";
    List<BookChapterInfo> t_chapter_list = new List<BookChapterInfo>();
    try
    {
        MatchCollection t_ma = Regex.Matches(i_html, t_chapter_name_reg, RegexOptions.IgnoreCase | RegexOptions.Compiled);
        foreach (BookChapterInfo t_ch in i_chapter_list)
        {
            foreach (Match t_mc in t_ma)
            {
                if (!CompareChapterName(t_mc.Groups[2].Value.Trim(), t_ch.ChapterName))
                    continue;
                string t_chapter_url = NetSiteCatchManager.GetFullUrl(i_url, t_mc.Groups[1].Value.Trim());
                // An unresolvable link ends the scan of this page.
                if (string.IsNullOrEmpty(t_chapter_url))
                    return t_chapter_list;
                BookChapterInfo t_chapter = GetBookChapter(t_chapter_url, t_ch, i_book_name);
                // So does a chapter whose content could not be retrieved.
                if (t_chapter == null)
                    return t_chapter_list;
                t_chapter_list.Add(t_chapter);
                // Only the first matching anchor is used for a chapter.
                break;
            }
        }
        return t_chapter_list;
    }
    catch (Exception ex)
    {
        LogHelper.Error("从百度分离章节名称失败" + ex.ToString());
        return null;
    }
}
/// <summary>
/// Downloads the content of one chapter and wraps it in a new BookChapterInfo.
/// </summary>
/// <remarks>
/// Sets m_chapter_content_length as a side effect: 300 when the chapter name contains
/// "完", "终" or "结", otherwise 1000. If the page at <paramref name="i_url"/> does not yield
/// enough content, the chapter is searched for by name via GetChapterContentByChapterName.
/// </remarks>
/// <param name="i_url">URL of the chapter page.</param>
/// <param name="i_chapter">Chapter to fetch; its metadata is copied to the result.</param>
/// <param name="i_book_name">Book title, used for the fallback search.</param>
/// <returns>The populated chapter, or <c>null</c> when no sufficiently long content was found.</returns>
private BookChapterInfo GetBookChapter(string i_url, BookChapterInfo i_chapter, string i_book_name)
{
    // A closing chapter does not necessarily reach 1000 characters.
    bool looksLikeFinalChapter = i_chapter.ChapterName.IndexOf("完") > -1
        || i_chapter.ChapterName.IndexOf("终") > -1
        || i_chapter.ChapterName.IndexOf("结") > -1;
    m_chapter_content_length = looksLikeFinalChapter ? 300 : 1000;

    string content = NetSiteCatchManager.ReplaceContent(GetContent(i_url, i_chapter.ChapterName));
    if (string.IsNullOrEmpty(content) || content.Length < m_chapter_content_length)
    {
        // Fall back to searching for the chapter by its name.
        content = NetSiteCatchManager.ReplaceContent(GetChapterContentByChapterName(i_book_name, i_chapter.ChapterName));
        if (string.IsNullOrEmpty(content) || content.Length < m_chapter_content_length)
            return null;
    }

    // The content is stored as a document.write(...) statement.
    string script = string.Format("document.write('{0}');", content);

    BookChapterInfo result = new BookChapterInfo();
    result.ChapterName = i_chapter.ChapterName;
    result.ChapterContent = script;
    // The count is taken from the wrapped string, i.e. it includes the document.write wrapper.
    result.WordsCount = script.Length;
    result.Comfrom = i_url;
    result.IsVip = i_chapter.IsVip;
    result.UpdateTime = i_chapter.UpdateTime;
    result.VolumeName = i_chapter.VolumeName;
    result.BookId = i_chapter.BookId;
    result.SiteId = i_chapter.SiteId;
    return result;
}
/// <summary>
/// Downloads a page and extracts the chapter content from it.
/// </summary>
/// <param name="i_url">URL of the chapter page.</param>
/// <param name="i_chapter_name">Chapter name; not used by this method.</param>
/// <returns>The extracted content, or an empty string when an exception occurred.</returns>
private string GetContent(string i_url, string i_chapter_name)
{
    Encoding pageEncoding = Encoding.Default;
    try
    {
        string pageHtml = NetSiteCatchManager.ReadUrl(i_url, pageEncoding);
        // An empty response is retried exactly once.
        if (string.IsNullOrEmpty(pageHtml))
            pageHtml = NetSiteCatchManager.ReadUrl(i_url, pageEncoding);
        return GetChapterContent(pageHtml);
    }
    catch (Exception ex)
    {
        LogHelper.Error("获取页面内容失败" + ex.ToString());
        return string.Empty;
    }
}
/// <summary>
/// Extracts the chapter content from the HTML of a page.
/// </summary>
/// <remarks>
/// Tries, in this order: the last td element with enough text, the last div element with
/// enough text, and finally GetContentByReg. "Enough" means longer than
/// m_chapter_content_length after the noise characters have been removed.
/// </remarks>
/// <param name="i_html">HTML of the page.</param>
/// <returns>The HTML of the content, or an empty string when nothing long enough was found.</returns>
private string GetChapterContent(string i_html)
{
    HtmlDocument document = HtmlDocument.Create(i_html);

    // A td hit is returned as is, without a further length check.
    string found = FindLastLongElementHtml(document, "td", "[a-zA-Z0-9]");
    if (!string.IsNullOrEmpty(found))
        return found;

    found = FindLastLongElementHtml(document, "div", "[a-zA-Z0-9,\\/;_()]");
    if (string.IsNullOrEmpty(found) || found.Length < m_chapter_content_length)
        found = GetContentByReg(i_html);

    return found.Length < m_chapter_content_length ? string.Empty : found;
}

/// <summary>
/// Returns the HTML of the last element with the given tag whose inner text, after removing
/// inline tag pairs and the characters matched by <paramref name="i_noise_pattern"/>, is longer
/// than m_chapter_content_length; an empty string when there is no such element.
/// </summary>
private string FindLastLongElementHtml(HtmlDocument i_document, string i_tag_name, string i_noise_pattern)
{
    string lastHit = string.Empty;
    foreach (HtmlElement element in i_document.GetElementsByTagName(i_tag_name))
    {
        string text = element.InnerText;
        text = Regex.Replace(text, "<.*?>.*?</.*?>", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        text = Regex.Replace(text, i_noise_pattern, "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        // No break: a later qualifying element replaces an earlier one.
        if (text.Length > m_chapter_content_length)
            lastHit = element.HTML;
    }
    return lastHit;
}
/// <summary>
/// Extracts chapter content with a regular expression.
/// </summary>
/// <remarks>
/// Finds every run of text that starts and ends with a character in U+4E00..U+9FA5 and contains
/// no angle brackets. A run is appended to the result, followed by "&lt;p&gt; ", when the markup
/// between it and the next run shrinks to fewer than 10 characters after spaces, p tags, br tags
/// and punctuation have been removed. The last run is never appended because it has no
/// following run to compare against.
/// </remarks>
/// <param name="i_html">HTML of the page.</param>
/// <returns>The concatenated text runs; an empty string when fewer than two runs exist.</returns>
private string GetContentByReg(string i_html)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;
    StringBuilder collected = new StringBuilder();
    MatchCollection runs = Regex.Matches(i_html, "([\u4E00-\u9FA5][^<>]*[\u4E00-\u9FA5])", options);

    for (int i = 0; i + 1 < runs.Count; i++)
    {
        string runText = runs[i].Groups[1].Value;

        // The gap is everything between the end of this run and the start of the next one.
        int gapStart = runs[i].Index + runText.Length;
        int gapLength = runs[i + 1].Index - gapStart;
        string gap = i_html.Substring(gapStart, gapLength);

        gap = Regex.Replace(gap, " ", "", options);
        gap = Regex.Replace(gap, "<[/]*p[^<>]*>", "", options);
        gap = Regex.Replace(gap, "<[/]*br>", "", options);
        gap = Regex.Replace(gap, "[【】(),!?(),!?;;、……]", "", options);

        // A long gap means other markup sits between the two runs; skip this run.
        if (gap.Length >= 10)
            continue;

        collected.Append(runText);
        collected.Append("<p> ");
    }
    return collected.ToString();
}
/// <summary>
/// Decides whether two chapter names refer to the same chapter.
/// </summary>
/// <remarks>
/// Names match when they are equal, or when, after whitespace and punctuation have been
/// removed, one of them contains the other. A name that is null, or that is empty after the
/// removal, never matches: every string contains the empty string, so without this rule an
/// empty link text would match every chapter.
/// </remarks>
/// <param name="i_chapter_source">Chapter name found on the page (link text).</param>
/// <param name="i_chapter_target">Chapter name that is being looked for.</param>
/// <returns><c>true</c> when the names are considered the same chapter; otherwise <c>false</c>.</returns>
private bool CompareChapterName(string i_chapter_source, string i_chapter_target)
{
    if (i_chapter_source == null || i_chapter_target == null)
        return false;
    if (i_chapter_source.Equals(i_chapter_target))
        return true;
    // Remove whitespace and punctuation before comparing.
    i_chapter_source = Regex.Replace(i_chapter_source, "[\\s【】(),!?(),!?;\\.;、/……]", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    i_chapter_target = Regex.Replace(i_chapter_target, "[\\s【】(),!?(),!?;;\\.、/……]", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Nothing left to compare: treat as "no match" instead of matching everything.
    if (i_chapter_source.Length == 0 || i_chapter_target.Length == 0)
        return false;
    // string.Contains compares ordinally, so the result does not depend on the current culture.
    return i_chapter_source.Contains(i_chapter_target) || i_chapter_target.Contains(i_chapter_source);
}
/// <summary>
/// Searches Baidu for a chapter by its name and extracts the chapter content from the first
/// search hit that yields enough text.
/// </summary>
/// <remarks>
/// The chapter name is the search keyword; names shorter than 5 characters are prefixed with
/// the book name. Each hit is downloaded (an empty response is retried once), checked with
/// NetSiteCatchManager.IsContainChapterName and passed to GetChapterContent. The search ends
/// at the first hit whose content has at least m_chapter_content_length characters.
/// </remarks>
/// <param name="i_book_name">Book title.</param>
/// <param name="i_chapter_name">Chapter name to search for.</param>
/// <returns>
/// The extracted content; when no hit was long enough, the result of the last extraction attempt
/// (possibly empty); an empty string when an exception occurred.
/// </returns>
private string GetChapterContentByChapterName(string i_book_name, string i_chapter_name)
{
    string t_key_word=i_chapter_name;
    // A chapter name shorter than 5 characters is too unspecific; add the book name.
    if (i_chapter_name.Length < 5)
    {
        t_key_word = string.Format("{0} {1}", i_book_name, i_chapter_name);
    }
    Encoding t_encoding = Encoding.GetEncoding("gb2312");
    string t_baidu_url = string.Format("http://www.baidu.com/s?wd={0}", HttpUtility.UrlEncode(t_key_word, t_encoding));
    // Group 1: target URL of a search hit, group 2: its title.
    // "href\s*=\s*" tolerates whitespace around '=' (the previous "hrefs*=s*" did not, and it
    // swallowed a leading 's' of unquoted URLs).
    string t_list_reg = "<h3\\s*?class=[\'\"]?t[\'\"]?><a[^<>]*?href\\s*=\\s*[\'\"]*([^\"\']*)[\'\"]*[^<>]*?>(.*?)</a>\\s*?</h3>";
    string t_chapter_content = string.Empty;
    try
    {
        string t_search_html = NetSiteCatchManager.ReadUrl(t_baidu_url, t_encoding);
        if (string.IsNullOrEmpty(t_search_html))
            return t_chapter_content;
        MatchCollection t_ma = Regex.Matches(t_search_html, t_list_reg, RegexOptions.IgnoreCase | RegexOptions.Compiled);
        foreach (Match t_mc in t_ma)
        {
            string t_chapter_url = t_mc.Groups[1].Value;
            string t_page_html = NetSiteCatchManager.ReadUrl(t_chapter_url, Encoding.Default);
            if (string.IsNullOrEmpty(t_page_html))
            {
                // Retry once.
                t_page_html = NetSiteCatchManager.ReadUrl(t_chapter_url, Encoding.Default);
            }
            // Still nothing: try the next hit.
            if (string.IsNullOrEmpty(t_page_html))
                continue;
            // The extraction below used to sit inside the retry block, so pages that loaded on
            // the first attempt were never examined.
            if (NetSiteCatchManager.IsContainChapterName(i_book_name, i_chapter_name, t_page_html) == false)
                continue;
            t_chapter_content = GetChapterContent(t_page_html);
            // ">=" matches the acceptance rule of GetBookChapter and GetChapterContent.
            if (!string.IsNullOrEmpty(t_chapter_content) && t_chapter_content.Length >= m_chapter_content_length)
                break;
        }
        return t_chapter_content;
    }
    catch (Exception ex)
    {
        LogHelper.Error("根据章节名称收录章节失败" + ex.ToString());
        return string.Empty;
    }
}