ASP.NET 网页抓取内容

网页抓取代码
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
//
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;

namespace WSYL.Web.Common
{
    /// <summary>
    /// Downloads a web page and extracts the text of specific table cells
    /// (<c>&lt;td align="left" bgcolor="#EEEEEE"&gt;</c>) from its HTML.
    /// </summary>
    public static class GetSteamShipInfo
    {
        /// <summary>
        /// Upper bound, in milliseconds, applied both to obtaining the response
        /// and to reading its body, so an unresponsive page cannot stall the caller
        /// for the much longer framework defaults.
        /// </summary>
        private const int RequestTimeoutMilliseconds = 30000;

        /// <summary>Matches one target cell; group 1 captures the cell text.</summary>
        private static readonly Regex CellRegex = new Regex(
            "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        /// <summary>Matches a single HTML tag, including tags that span lines.</summary>
        private static readonly Regex TagRegex = new Regex("<(.|\n)+?>");

        /// <summary>Matches a run of one or more whitespace characters.</summary>
        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        /// <summary>
        /// Fetches the configured page and returns the requested cell content.
        /// </summary>
        /// <param name="steamshipname">
        /// Name to look up. Only validated here: a null, empty or whitespace-only
        /// value short-circuits to <c>null</c> without any network access.
        /// NOTE(review): the value is not used to build the request; confirm
        /// whether the URL is meant to include it.
        /// </param>
        /// <param name="itype">
        /// Selects the result:
        /// 0 = text of the first matching cell;
        /// 1 = tag-stripped text of the second matching cell;
        /// 2 = both, joined with "/".
        /// Any other value returns the downloaded HTML unchanged.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="steamshipname"/> is blank; an empty string
        /// when the page has no matching cell or the first cell is empty; otherwise
        /// the content selected by <paramref name="itype"/>.
        /// </returns>
        /// <exception cref="WebException">
        /// The request failed or exceeded <see cref="RequestTimeoutMilliseconds"/>.
        /// </exception>
        public static string GetWebSite(string steamshipname,int itype)
        {
            if (string.IsNullOrWhiteSpace(steamshipname))
            {
                return null;
            }

            // Placeholder from the original listing; it must be replaced with the
            // real page address before WebRequest.Create can parse it.
            string urlToCrawl = @"网址";

            string pageHtml = DownloadHtml(urlToCrawl);
            return ExtractInfo(pageHtml, itype);
        }

        /// <summary>
        /// Issues a GET request and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The complete response body.</returns>
        private static string DownloadHtml(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";
            // Explicit limits guard against a page that never answers or that
            // stops sending data part-way through the body.
            req.Timeout = RequestTimeoutMilliseconds;
            req.ReadWriteTimeout = RequestTimeoutMilliseconds;

            // The page is decoded as UTF-8; a different charset would need a
            // different encoding here or the text comes out garbled.
            Encoding htmlEncoding = Encoding.GetEncoding("utf-8");

            // Dispose response, stream and reader so the underlying connection is
            // released even when reading throws.
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream body = resp.GetResponseStream())
            using (StreamReader sr = new StreamReader(body, htmlEncoding))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Picks the requested cell content out of already-downloaded HTML.
        /// </summary>
        /// <param name="html">Full HTML of the page.</param>
        /// <param name="itype">Result selector; see <see cref="GetWebSite"/>.</param>
        /// <returns>The selected content, or an empty string when the first cell is missing or empty.</returns>
        private static string ExtractInfo(string html, int itype)
        {
            Match firstCell = CellRegex.Match(html);

            // A failed match yields an empty group value, so this single check
            // covers both "no such cell" and "cell present but empty".
            string firstText = firstCell.Groups[1].Value;
            if (firstText.Length == 0)
            {
                return "";
            }

            switch (itype)
            {
                case 0:
                    return firstText;
                case 1:
                    // The whole second match (tags included) is stripped down to text.
                    return StripHtml(firstCell.NextMatch().Value);
                case 2:
                    return firstText + "/" + StripHtml(firstCell.NextMatch().Value);
                default:
                    // Unknown selector: hand back the raw page, as the original code did.
                    return html;
            }
        }

        /// <summary>
        /// Removes HTML tags from a fragment, escapes any leftover angle brackets
        /// and collapses every whitespace run into a single space.
        /// </summary>
        /// <remarks>
        /// Some inputs are not fully cleaned in one pass, so callers that need a
        /// thorough result may apply the method twice.
        /// </remarks>
        /// <param name="strHtml">HTML fragment to clean.</param>
        /// <returns>The trimmed plain-text result.</returns>
        private static string StripHtml(string strHtml)
        {
            string strOutput = TagRegex.Replace(strHtml, "");

            // Stray brackets that were not part of a complete tag are escaped
            // rather than dropped.
            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");

            // Collapse consecutive whitespace so later string handling is not
            // thrown off by long runs of blanks.
            strOutput = WhitespaceRegex.Replace(strOutput, " ");
            return strOutput.Trim();
        }
    }
}
posted on 2016-03-28 16:10 DavidZang 阅读(1940) 评论(3) 编辑收藏举报
ASP.NET 网页抓取内容

搜索

常用链接

我的标签

随笔分类

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论