天涯社区离线阅读器(实现只看楼主功能)
长期逛天涯社区,许多经典的帖子都有许多页,而且里面还夹杂了大量的回复或者广告,不能像MOP一样只看楼主,这样阅读起来非常不便,故此写了一个小程序实现离线阅读并只看楼主功能。实现的思路是通过HttpWebRequest取得要查看的网页内容,然后用正则表达式将其中感兴趣的部分取出来,我主要取了分页信息,同时分离出了各帖以及作者,这样可以实现查看任意一个人帖子的功能,如果需要的话可以将取出的数据保存在本地硬盘中以方便查看,在这个程序中没有做本地保存。程序中的主要代码部分有:
1、从指定网址下载网页
完整源代码点击下载
1、从指定网址下载网页
/// <summary>
/// Downloads the HTML source of the page at <paramref name="a_strUrl"/>.
/// </summary>
/// <param name="a_strUrl">Absolute URL of the page to fetch.</param>
/// <returns>
/// The page text decoded with the system default encoding (matches the
/// GB2312 forum pages this reader targets), or "错误:" followed by the
/// exception message when the download fails.
/// </returns>
public string Get_SourceHtml(string a_strUrl)
{
string strResult;
try
{
HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
myReq.Timeout = 60000; // 60 s — forum pages can be large

// using-blocks guarantee the response, stream and reader are released
// even when ReadToEnd throws; the original leaked the response and
// only closed the stream on the success path.
using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
using (Stream myStream = HttpWResp.GetResponseStream())
using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
{
strResult = sr.ReadToEnd();
}
}
catch (Exception exp)
{
// Keep the original contract: callers receive the failure as text.
strResult = "错误:" + exp.Message;
}
return strResult;
}
2、分离出帖子正文
{
string strResult;
try
{
HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
myReq.Timeout = 60000;
HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();
Stream myStream = HttpWResp.GetResponseStream();
StreamReader sr = new StreamReader(myStream, Encoding.Default);
strResult = sr.ReadToEnd();
myStream.Close();
}
catch (Exception exp)
{
strResult = "错误:" + exp.Message;
}
return strResult;
}
/// <summary>
/// Trims a downloaded Tianya page down to the post body: keeps everything
/// from the "AddMyDigest" marker up to (not including) the trailing
/// "&lt;/div&gt;&lt;center&gt;" block, with advertising IFRAMEs removed.
/// </summary>
/// <param name="htmlContent">Raw page HTML as returned by Get_SourceHtml.</param>
/// <returns>The trimmed HTML fragment; unchanged where a marker is absent.</returns>
public string SplitHtml(string htmlContent)
{
string result = htmlContent;

// Cut off everything after the post body.
Regex regexObj = new Regex("</div><center>");
Match m = regexObj.Match(result);
if (m.Success)
{
// Match.Index is the position of this (first) match; the original
// re-scanned the whole string with IndexOf(m.Value) to find it again.
result = result.Substring(0, m.Index);
}

// Strip the advertising IFRAME blocks.
regexObj = new Regex(@"<center><IFRAME(.|\s)+?</IFRAME></center>");
result = regexObj.Replace(result, "");

// Cut off everything before the post body.
regexObj = new Regex(@"<span id='AddMyDigest'></span>");
m = regexObj.Match(result);
if (m.Success)
{
result = result.Substring(m.Index);
}
return result;
}
3、获取指定作者的发帖,在本地生成新的Html文件,以便在程序内嵌浏览器中查看
{
string result =htmlContent;
//剪切掉正文后面的内容
Regex regexObj = new Regex("</div><center>");
Match m = regexObj.Match(result);
if (m.Value != string.Empty)
{
result = result.Substring(0, result.IndexOf(m.Value));
}
//清除掉广告
regexObj =new Regex(@"<center><IFRAME(.|\s)+?</IFRAME></center>");
result=regexObj.Replace(result,"");
//剪切掉正文前面的内容
//regexObj = new Regex("<TABLE align=center border=0 cellSpacing=0 width='100%'>");
regexObj = new Regex(@"<span id='AddMyDigest'></span>");
m = regexObj.Match(result);
if (m.Value != string.Empty)
{
result = result.Substring(result.IndexOf(m.Value));
}
return result;
}
/// <summary>
/// Builds a minimal HTML page containing only the posts written by
/// <paramref name="AuthorName"/> ("只看楼主" — show one author only).
/// </summary>
/// <param name="SourceHtml">Raw page HTML; trimmed via SplitHtml first.</param>
/// <param name="AuthorName">Exact author name to keep.</param>
/// <returns>An HTML document with the matching posts, each followed by &lt;br&gt;.</returns>
public string GetAuthorsContent(string SourceHtml, string AuthorName)
{
StringBuilder html = new StringBuilder();
html.Append("<Html><body>");
SourceHtml = SplitHtml(SourceHtml);

// Each post starts with a <TABLE> header block; the text between one
// header and the next is the post body. The final two tables are page
// chrome, hence the Count - 2 bound (preserved from the original).
Regex r = new Regex(@"<TABLE(.|\s)+?</table>");
MatchCollection ms = r.Matches(SourceHtml);
for (int i = 0; i < ms.Count - 2; i++)
{
Match m = ms[i];
// Use Match.Index, not SourceHtml.IndexOf(m.Value): IndexOf always
// finds the FIRST occurrence, which mis-slices the page when two
// posts share identical header markup (and re-scans in O(n)).
int start = m.Index + m.Length;
string title = m.Value;

// The author name sits between "k>" (end of the profile link) and
// the next '<' inside the header table.
Regex re = new Regex(@"k>(.|\s)+?<");
Match m1 = re.Match(title);
string author = m1.Value;
if (author.Length > 0)
author = author.Substring(2, author.IndexOf("<") - 2);

if (author == AuthorName)
{
html.Append(title);
// ms[i + 1] always exists because the loop stops at Count - 2,
// so the body runs up to the start of the next header table.
// (The original's null check on ms[i + 1] was unreachable.)
Match NextMatch = ms[i + 1];
string content = SourceHtml.Substring(start, NextMatch.Index - start);
html.Append(content);
html.Append("<br>");
}
}
html.Append("</body></html>");
return html.ToString();
}
4、获取分页数据
{
StringBuilder html = new StringBuilder();
html.Append("<Html><body>");
SourceHtml = SplitHtml(SourceHtml);
Regex r = new Regex(@"<TABLE(.|\s)+?</table>");
MatchCollection ms = r.Matches(SourceHtml);
for (int i = 0; i < ms.Count - 2; i++)
{
Match m = ms[i];
int start = SourceHtml.IndexOf(m.Value) + m.Value.Length;
string title = m.Value;
Regex re = new Regex(@"k>(.|\s)+?<");
Match m1 = re.Match(title);
string author = m1.Value;
if (author.Length > 0)
author = author.Substring(2, author.IndexOf("<") - 2);
if (author == AuthorName)
{
html.Append(title);
Match NextMatch = ms[i + 1];
string content;
if (NextMatch != null)
content = SourceHtml.Substring(start, SourceHtml.IndexOf(NextMatch.Value) - start);
else
content = SourceHtml.Substring(start);
html.Append(content);
html.Append("<br>");
}
}
html.Append("</body></html>");
return html.ToString();
}
/// <summary>
/// Extracts the pager links — "首页" (first page) and "[n]" numbered pages —
/// from a Tianya page and returns them as PageData (url, label) pairs.
/// </summary>
/// <param name="SourceHtml">Raw page HTML; trimmed via SplitHtml first.</param>
/// <returns>One PageData per pager link found, in page order.</returns>
public List<PageData> GetPages(string SourceHtml)
{
SourceHtml = SplitHtml(SourceHtml);

// A pager link is a URL followed by either "[n]" or the literal "首页".
Regex linkRegex = new Regex(@"http:.+?((\[\d+\])|首页)");
// Hoisted out of the loop: the tag-stripping pattern never changes
// (the original rebuilt it, and reused the variable r, every iteration).
Regex tagRegex = new Regex("><.+?>");

MatchCollection ms = linkRegex.Matches(SourceHtml);
List<PageData> pages = new List<PageData>(ms.Count);
foreach (Match m in ms)
{
// Strip the markup between the URL and its label, then stray '>'s.
string s = tagRegex.Replace(m.Value, "");
s = s.Replace(">", "");
if (s.IndexOf("首页") >= 0)
pages.Add(new PageData(s.Substring(0, s.IndexOf("首页")), "首页"));
else
pages.Add(new PageData(s.Substring(0, s.IndexOf("[")), s.Substring(s.IndexOf("["))));
}
return pages;
}
{
SourceHtml=SplitHtml(SourceHtml);
Regex r = new Regex(@"http:.+?((\[\d+\])|首页)");
MatchCollection ms = r.Matches(SourceHtml);
List<PageData> pages = new List<PageData>();
foreach (Match m in ms)
{
r = new Regex("><.+?>");
string s = r.Replace(m.Value, "");
s = s.Replace(">", "");
if (s.IndexOf("首页") >= 0)
pages.Add(new PageData(s.Substring(0, s.IndexOf("首页")),"首页"));
else
pages.Add(new PageData(s.Substring(0,s.IndexOf("[")),s.Substring(s.IndexOf("["))));
}
return pages;
}
完整源代码点击下载