使用C#抓取网页内容并分析获取数据
/// <summary>
/// Click handler: downloads the first demand-list page, extracts the links to the
/// individual demand pages from the list section, then downloads each linked page
/// and shows its URL and the content of its "product_card" block in message boxes.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button5_Click(object sender, EventArgs e)
{
    WebHeaderCollection header = new WebHeaderCollection();
    header.Set("Pragma", "no-cache");

    string listPage = getHtml("http://www.biomart.cn/info/infoDemand.htm?pge=1", header);

    // getHtml strips tabs and line breaks, so '.' can span the whole list section.
    // If the markers are not found, Groups[1].Value is empty and no links are processed.
    Regex listRegex = new Regex("<!-- 列表 -->(?<1>.*)<!-- /列表 -->");
    string listHtml = listRegex.Match(listPage).Groups[1].Value;

    Regex linkRegex = new Regex("href=\"(?<1>http://www\\.biomart\\.cn/infodemand/\\w+\\.htm)\"");
    MatchCollection links = linkRegex.Matches(listHtml);

    // NOTE(review): hard-coded cookie (includes a JSESSIONID) sent with the detail-page
    // requests; it presumably came from a browser session and may no longer be valid.
    header.Set(HttpRequestHeader.Cookie, "__utma=124945049.1686326021.1305093063.1305164868.1305187067.3; __utmz=124945049.1305093063.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); JSESSIONID=9D7F7F4B5D73F453DA54B40A53D5E7C8; __utmc=124945049; __utmb=124945049.2.10.1305187067");

    // The pattern is the same for every page, so build it once outside the loop.
    Regex cardRegex = new Regex("<div class=\"product_card\">(?<1>.*)\\s+</p>\\s+</div>");

    foreach (Match link in links)
    {
        string detailUrl = link.Groups[1].Value;
        MessageBox.Show(detailUrl);

        String content = getHtml(detailUrl, header);
        MessageBox.Show(cardRegex.Match(content).Groups[1].Value);
    }
}

/// <summary>
/// Performs an HTTP request for <paramref name="url"/> with the supplied headers and
/// returns the response body with all tab, carriage-return and line-feed characters removed.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="header">Request headers to send; when null, no custom headers are set.</param>
/// <returns>The response body decoded as UTF-8, with tabs and line breaks stripped.</returns>
/// <exception cref="WebException">The request failed or timed out (30 second timeout).</exception>
private String getHtml(String url, WebHeaderCollection header)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Timeout = 30000;
    if (header != null)
    {
        request.Headers = header;
    }

    // Dispose the response, stream and reader so the underlying connection is released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
    {
        String content = reader.ReadToEnd();
        return Regex.Replace(content, "\\t|\\r|\\n", "");
    }
}