获取页面数据(C#.net)

通过地址获取网页中的数据,读取网页的代码如下:

View Code
/// <summary>
/// Downloads the body of <paramref name="url"/> and returns it as text.
/// </summary>
/// <remarks>
/// Failed attempts are counted in the <c>num</c> field (declared outside this method).
/// Each failure shows the exception in a message box, increments <c>num</c> and retries;
/// once <c>num</c> reaches 10 the method gives up, resets the counter and returns an
/// empty string. The counter is also reset after any attempt that completes without
/// throwing.
/// </remarks>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="EncodingName">
/// Name of the text encoding used to decode the response. A null, empty or
/// unrecognised name falls back to "GB2312".
/// </param>
/// <returns>
/// The decoded response body, or an empty string when the status is not 200 OK, the
/// declared content length is too large, or every attempt failed.
/// </returns>
public string GetPageData(string url, string EncodingName)
        {
            // Upper bound on the declared Content-Length that will be read (same value as before).
            const long maxContentLength = 10240 * 10240;

            string str = "";
            if (num < 10)
            {
                try
                {
                    HttpWebRequest.DefaultCachePolicy = new RequestCachePolicy(RequestCacheLevel.NoCacheNoStore);
                    HttpWebRequest q = (HttpWebRequest)HttpWebRequest.Create(url);
                    // NOTE(review): POST is sent without a request body; GET may have been intended - confirm.
                    q.Method = "POST";
                    q.ReadWriteTimeout = 300000;
                    q.Timeout = 300000;

                    // Dispose the response on every path (non-OK status, oversized body, read
                    // failure) so the underlying connection is always released.
                    using (HttpWebResponse p = (HttpWebResponse)q.GetResponse())
                    {
                        if (p.StatusCode == HttpStatusCode.OK && p.ContentLength < maxContentLength)
                        {
                            if (string.IsNullOrEmpty(EncodingName))
                            {
                                EncodingName = "GB2312";
                            }

                            // Resolve the encoding before touching the stream so that a bad name
                            // does not require opening the response stream a second time.
                            Encoding encoding;
                            try
                            {
                                encoding = Encoding.GetEncoding(EncodingName);
                            }
                            catch (ArgumentException)
                            {
                                encoding = Encoding.GetEncoding("GB2312");
                            }

                            using (StreamReader sr = new StreamReader(p.GetResponseStream(), encoding))
                            {
                                str = sr.ReadToEnd();
                            }
                        }
                    }
                }
                catch (Exception htl)
                {
                    MessageBox.Show(htl.ToString());
                    num++;
                    return GetPageData(url, EncodingName);
                }
            }
            num = 0;
            return str;
        }

要获取网页中特定的数据,就要依情况而定了,下面是我获取网页中表格中的数据的例子:

View Code
/// <summary>
/// Extracts the text of a table region from <paramref name="html"/> and splits it
/// into space-separated tokens.
/// </summary>
/// <remarks>
/// The region starts after the last occurrence of the header marker that follows the
/// first occurrence (so the marker must appear at least twice) and ends just before
/// the end marker comment. Tags are removed and whitespace is collapsed by
/// <c>StripHtml</c> before splitting.
/// </remarks>
/// <param name="html">Page source to search.</param>
/// <param name="goon">
/// Always set to <c>true</c>, including when a marker is missing.
/// NOTE(review): a failure path may have been meant to report <c>false</c> - confirm
/// against the callers before changing.
/// </param>
/// <returns>
/// The tokens of the extracted region, or a single-element array containing an empty
/// string when any marker is not found.
/// </returns>
public string[] GetTableString(string html, out bool goon)
        {
            goon = true;

            const string headerMarker = "科目</div></td>";
            const string endMarker = "<!-- 右边列表开始-->";
            string[] notFound = { "" };

            // Skip past the first header marker.
            int index = html.IndexOf(headerMarker);
            if (index < 0)
            {
                return notFound;
            }
            html = html.Substring(index + headerMarker.Length);

            // Then skip past the last remaining header marker.
            index = html.LastIndexOf(headerMarker);
            if (index < 0)
            {
                return notFound;
            }
            html = html.Substring(index + headerMarker.Length);

            // Keep only what precedes the end marker.
            index = html.IndexOf(endMarker);
            if (index < 0)
            {
                return notFound;
            }
            html = html.Substring(0, index);

            // StripHtml leaves single spaces between values, so a plain split is enough.
            return StripHtml(html).Split(' ');
        }

上面的代码调用了一个去除 HTML 标签、并把连续的空白字符合并为一个空格的函数,代码如下:

View Code
 /// <summary>
 /// Removes HTML tags from <paramref name="strHtml"/>, entity-escapes any angle
 /// brackets that remain, and collapses each run of whitespace into one space.
 /// </summary>
 /// <param name="strHtml">Markup to clean.</param>
 /// <returns>The cleaned text with leading and trailing whitespace trimmed.</returns>
 public string StripHtml(string strHtml)
        {
            // Drop everything that looks like a tag; the pattern also spans line breaks.
            string withoutTags = Regex.Replace(strHtml, "<(.|\n)+?>", "");

            // Angle brackets that were not part of a tag are escaped as entities.
            string escaped = withoutTags.Replace("<", "&lt;").Replace(">", "&gt;");

            // Collapse every run of whitespace into a single space.
            return Regex.Replace(escaped, @"\s+", " ").Trim();
        }

最后得到的数据就是想要的数据了,可以将其放在字符串数组中,方便使用。

 

posted @ 2012-05-25 11:48  wenwen35  阅读(199)  评论(0)  编辑  收藏  举报