天涯社区离线阅读器（实现只看楼主功能）

长期逛天涯社区，许多经典的帖子都有很多页，而且里面还夹杂了大量的回复或者广告，不能像MOP一样只看楼主，这样阅读起来非常不便，故此写了一个小程序实现离线阅读并只看楼主功能。实现的思路是通过HttpWebRequest取得要查看的网页内容，然后用正则表达式将其中感兴趣的部分取出来。我主要取了分页信息，同时分离出了各帖以及作者，这样可以实现查看任意一个人帖子的功能。如果需要的话可以将取出的数据保存在本地硬盘中以方便查看，不过这个程序中没有做本地保存。程序中的主要代码部分有：
1、从指定网址下载网页

/// <summary>
/// Downloads the resource at <paramref name="a_strUrl"/> and returns its body as text.
/// </summary>
/// <param name="a_strUrl">URL of the page to download.</param>
/// <returns>
/// The response body decoded with <see cref="Encoding.Default"/>. If anything fails
/// (bad URL, timeout, network error, ...), no exception propagates; instead the
/// returned string is "错误：" followed by the exception message.
/// </returns>
public string Get_SourceHtml(string a_strUrl)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
        myReq.Timeout = 60000; // milliseconds

        // The using blocks release the connection, the stream and the reader even
        // when reading throws; previously only the stream was closed, and only on
        // the success path.
        // NOTE(review): Encoding.Default depends on the machine/runtime; presumably
        // the target pages use the local ANSI code page - confirm.
        using (HttpWebResponse httpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = httpWResp.GetResponseStream())
        using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
        {
            strResult = sr.ReadToEnd();
        }
    }
    catch (Exception exp)
    {
        // Callers receive the failure as text rather than as an exception.
        strResult = "错误：" + exp.Message;
    }
    return strResult;
}

2、分离出帖子正文

/// <summary>
/// Trims a downloaded thread page down to the region between the post-area start
/// marker and the post-area end marker, removing advertisement iframes on the way.
/// </summary>
/// <param name="htmlContent">Full HTML of the page.</param>
/// <returns>
/// The trimmed HTML. A marker that is not present is simply not applied, so input
/// without any marker is returned unchanged (apart from ad removal).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is null.</exception>
public string SplitHtml(string htmlContent)
{
    if (htmlContent == null)
    {
        throw new ArgumentNullException("htmlContent");
    }

    string result = htmlContent;

    // Cut off everything that follows the post area. The marker is a literal,
    // so an ordinal search is enough; no regex is needed.
    int tailStart = result.IndexOf("</div><center>", StringComparison.Ordinal);
    if (tailStart >= 0)
    {
        result = result.Substring(0, tailStart);
    }

    // Remove the advertisement iframes. [\s\S] matches any character (including
    // newlines) without the ambiguity of (.|\s), which can backtrack exponentially
    // when the closing tag is missing.
    Regex adRegex = new Regex(@"<center><IFRAME[\s\S]+?</IFRAME></center>");
    result = adRegex.Replace(result, "");

    // Cut off everything that precedes the post area; the marker itself is kept.
    int bodyStart = result.IndexOf("<span id='AddMyDigest'></span>", StringComparison.Ordinal);
    if (bodyStart >= 0)
    {
        result = result.Substring(bodyStart);
    }

    return result;
}

3、获取指定作者的发帖，在本地生成新的Html文件，以便在程序内嵌浏览器中查看

/// <summary>
/// Builds an HTML document that contains only the posts written by
/// <paramref name="AuthorName"/>.
/// </summary>
/// <param name="SourceHtml">Full HTML of the thread page; it is trimmed with SplitHtml first.</param>
/// <param name="AuthorName">Author name to keep; compared for exact equality.</param>
/// <returns>
/// An HTML string wrapped in "&lt;Html&gt;&lt;body&gt;" ... "&lt;/body&gt;&lt;/html&gt;"; for every
/// matching post it holds the post's header table, the text up to the next table,
/// and a line break.
/// </returns>
public string GetAuthorsContent(string SourceHtml, string AuthorName)
{
    StringBuilder html = new StringBuilder();
    html.Append("<Html><body>");

    SourceHtml = SplitHtml(SourceHtml);

    // Every table is treated as the header of a post. [\s\S] is used instead of
    // (.|\s) to avoid exponential backtracking on unterminated input.
    Regex headerRegex = new Regex(@"<TABLE[\s\S]+?</table>");
    // The author name is the text between "k>" and the next "<" inside the header.
    Regex authorRegex = new Regex(@"k>[\s\S]+?<");

    MatchCollection ms = headerRegex.Matches(SourceHtml);

    // The loop bound skips the last two tables (presumably trailing non-post
    // tables - TODO confirm), which also guarantees that ms[i + 1] exists.
    for (int i = 0; i < ms.Count - 2; i++)
    {
        Match m = ms[i];
        string title = m.Value;

        string author = authorRegex.Match(title).Value;
        if (author.Length > 0)
        {
            // Strip the leading "k>" and everything from the first "<" onwards.
            author = author.Substring(2, author.IndexOf('<') - 2);
        }

        if (author == AuthorName)
        {
            html.Append(title);

            // The post body runs from the end of this header to the start of the
            // next one. Match positions are used rather than searching for the
            // header text again, which would find the wrong place if the same
            // header text occurs more than once.
            int start = m.Index + m.Length;
            html.Append(SourceHtml.Substring(start, ms[i + 1].Index - start));
            html.Append("<br>");
        }
    }

    html.Append("</body></html>");
    return html.ToString();
}

4、获取分页数据

/// <summary>
/// Extracts the paging links from a thread page.
/// </summary>
/// <param name="SourceHtml">Full HTML of the thread page; it is trimmed with SplitHtml first.</param>
/// <returns>
/// One PageData per paging link found. Each is constructed from the text that
/// precedes the label (the link address once markup is stripped) and the label
/// itself: either "首页" or a bracketed page number such as "[2]".
/// </returns>
public List<PageData> GetPages(string SourceHtml)
{
    SourceHtml = SplitHtml(SourceHtml);

    // A paging link: "http:" followed lazily by anything up to "[n]" or "首页".
    Regex linkRegex = new Regex(@"http:.+?((\[\d+\])|首页)");
    // Markup sitting between the address and the label; built once, not per match.
    Regex tagRegex = new Regex("><.+?>");

    List<PageData> pages = new List<PageData>();

    foreach (Match m in linkRegex.Matches(SourceHtml))
    {
        // Strip the markup, then any remaining '>' characters.
        string s = tagRegex.Replace(m.Value, "");
        s = s.Replace(">", "");

        int homeIndex = s.IndexOf("首页", StringComparison.Ordinal);
        if (homeIndex >= 0)
        {
            pages.Add(new PageData(s.Substring(0, homeIndex), "首页"));
        }
        else
        {
            // Everything from the first '[' onwards is the page label.
            int bracketIndex = s.IndexOf('[');
            pages.Add(new PageData(s.Substring(0, bracketIndex), s.Substring(bracketIndex)));
        }
    }

    return pages;
}

完整源代码点击下载

posted @ 2007-10-08 11:46 秦巴故人阅读(4718) 评论(0) 编辑收藏举报

刷新页面返回顶部

挑战自我,超越无限

天涯社区离线阅读器（实现只看楼主功能）