C#程序抓取网页实例

2011-03-14 14:03 smat 阅读(385) 评论(0) 编辑收藏举报

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;

namespace CopyHtml
{
    /// <summary>
    /// Form that downloads the page whose address is typed into textBox1 and shows
    /// the raw source (textBox2), the source with tags removed (textBox3) and the
    /// page title (textBox4).
    /// </summary>
    public partial class Form1 : Form
    {
        // Built once instead of on every call; the patterns are unchanged.
        private static readonly Regex TagPattern = new Regex("<(.|\n)+?>");

        private static readonly Regex TitlePattern = new Regex(
            "<title>([^<]*)</title>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        private static readonly Regex HrefPattern = new Regex(
            @"(href)[ ]*=[ ]*[""'][^""'#>]+[""']",
            RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page named in textBox1 and fills the three output boxes.
        /// A malformed address, an unsupported scheme or a network/read failure is
        /// reported in a message box instead of escaping from the event handler.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string html;
            try
            {
                html = DownloadPage(textBox1.Text.Trim());
            }
            catch (UriFormatException ex)
            {
                MessageBox.Show(this, ex.Message);
                return;
            }
            catch (NotSupportedException ex)
            {
                MessageBox.Show(this, ex.Message);
                return;
            }
            catch (WebException ex)
            {
                MessageBox.Show(this, ex.Message);
                return;
            }
            catch (IOException ex)
            {
                MessageBox.Show(this, ex.Message);
                return;
            }

            // Raw page source as downloaded.
            textBox2.Text = html;

            // Page source with the HTML tags removed.
            textBox3.Text = stripHtml(html);

            // Page title; group 1 is empty when no <title> element matched.
            Match titleMatch = TitlePattern.Match(html);
            string title = titleMatch.Groups[1].Value;
            textBox4.Text = ("网页的标题是：" + title );
        }

        /// <summary>
        /// Fetches <paramref name="url"/> and returns the response body as text.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>
        /// The body with its lines concatenated; ReadLine strips the line
        /// terminators and none are re-inserted.
        /// </returns>
        private static string DownloadPage(string url)
        {
            WebRequest request = WebRequest.Create(url);

            // The response, its stream and the reader all hold resources and
            // must be released even when reading fails part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, ResolveEncoding(response)))
            {
                StringBuilder page = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    page.Append(line);
                }
                return page.ToString();
            }
        }

        /// <summary>
        /// Picks the text encoding for a response: the charset named in an HTTP
        /// Content-Type header when there is one and it is recognised, otherwise
        /// <see cref="Encoding.Default"/>.
        /// </summary>
        /// <param name="response">The response about to be read.</param>
        /// <returns>The encoding to decode the response body with.</returns>
        private static Encoding ResolveEncoding(WebResponse response)
        {
            const string CharsetKey = "charset=";

            // Only HTTP responses are inspected; other response types keep the default.
            HttpWebResponse http = response as HttpWebResponse;
            string contentType = http != null ? http.ContentType : null;
            if (string.IsNullOrEmpty(contentType))
            {
                return Encoding.Default;
            }

            int keyAt = contentType.IndexOf(CharsetKey, StringComparison.OrdinalIgnoreCase);
            if (keyAt < 0)
            {
                return Encoding.Default;
            }

            string name = contentType.Substring(keyAt + CharsetKey.Length);
            int paramEnd = name.IndexOf(';');
            if (paramEnd >= 0)
            {
                name = name.Substring(0, paramEnd);
            }
            name = name.Trim().Trim('"', '\'');
            if (name.Length == 0)
            {
                return Encoding.Default;
            }

            try
            {
                return Encoding.GetEncoding(name);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: fall back to the default.
                return Encoding.Default;
            }
        }

        /// <summary>
        /// Removes the HTML tags from a page and decodes the escaped angle-bracket
        /// entities that remain in the text.
        /// </summary>
        /// <param name="strHtml">The HTML text to convert.</param>
        /// <returns>The text with tags removed.</returns>
        private string stripHtml(string strHtml)
        {
            string strOutput = TagPattern.Replace(strHtml, "");

            // Decode the entity forms of '<' and '>'. The previous replacements
            // mapped each bracket to itself and therefore did nothing.
            strOutput = strOutput.Replace("&lt;", "<");
            strOutput = strOutput.Replace("&gt;", ">");
            return strOutput;
        }

        /// <summary>
        /// Collects the distinct href attributes found in a piece of HTML.
        /// </summary>
        /// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
        /// <returns>
        /// A sorted list of strings. Each entry is the whole matched attribute text
        /// (the "href" name, the equals sign and the quoted value), not the bare URL.
        /// </returns>
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();
            if (string.IsNullOrEmpty(htmlCode))
            {
                return al;
            }

            foreach (Match match in HrefPattern.Matches(htmlCode))
            {
                string link = match.Value;

                // Skip attributes that were already collected.
                if (!al.Contains(link))
                {
                    al.Add(link);
                }
            }

            al.Sort();
            return al;
        }
    }
}

刷新页面返回顶部

超越夢想笨蛋程序员每天都要学习，直到有一天，不笨了，也就不做程序员了。