批量抓取 title、keywords、description【SEO 工具】

前两天帮朋友写了个小工具,思路很简单,实现起来也不太难。

实现功能:通过搜索引擎自动翻页,抓取并记录搜索结果页面中的 title、keywords、description 属性值。代码如下:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections;
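// Author: CppCoding. Last edited: 2011-05-06.
// Result marker: "yschttl spt". This marker is the one used by Yahoo; other search engines
// need a different marker. Tested with the queries "nike" and "puma", which both use this marker.
// The tool may currently run slowly.
// Cause of errors: network speed and page response time. The result pages are foreign sites,
// so reading their source may fail, which raises an error.
// Ver0.0.2
// Implemented: export, automatic paging, and a configurable number of search pages.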
namespace 获取html_控制截取
{
/// <summary>
/// Main window of the SEO helper. Starting from a search-results URL it walks the
/// result pages, downloads every linked page and records that page's title,
/// keywords and description in the <c>txtEnd</c> text box; a second button
/// appends the collected text to a file on the desktop.
/// </summary>
/// <remarks>
/// All downloads run synchronously inside the click handler, so the window does
/// not repaint until a scan has finished.
/// </remarks>
public partial class Form1 : Form
{
    // Text that precedes each result link in the search-results markup
    // (the Yahoo marker named in the file header comment).
    private const string ResultLinkMarker = "\"yschttl spt\"";

    // Text that follows the href of the "Next" paging link.
    private const string NextLinkMarker = "\">Next &";

    // Tail of the attribute that directly precedes the "Next" link's URL.
    private const string NextHrefMarker = "xt\" href=\"";

    // Markers that precede the three values extracted from every result page.
    private const string TitleMarker = "<title>";
    private const string KeywordsMarker = "\"Keywords\" content=\"";
    private const string DescriptionMarker = "\"Description\" content=\"";

    // Name of the export file created on the current user's desktop.
    private const string ExportFileName = "获取字符串.doc";

    /// <summary>
    /// Creates the form and its designer-generated controls.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the resource at <paramref name="strUrl"/> and returns its body
    /// decoded as UTF-8.
    /// </summary>
    /// <param name="strUrl">Absolute URL of the resource to download.</param>
    /// <returns>The complete response body as text.</returns>
    /// <exception cref="WebException">The request failed or timed out.</exception>
    /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
    private static string GetStringByUrl(string strUrl)
    {
        WebRequest request = WebRequest.Create(strUrl);

        // Dispose the response, stream and reader even when reading throws;
        // otherwise every fetch leaks a connection.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> like <see cref="GetStringByUrl"/>, but reports
    /// a bad address or a network failure through the return value instead of throwing.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <param name="content">The page source on success; otherwise <c>null</c>.</param>
    /// <returns><c>true</c> when the page was downloaded; otherwise <c>false</c>.</returns>
    private static bool TryGetStringByUrl(string url, out string content)
    {
        content = null;
        try
        {
            content = GetStringByUrl(url);
            return true;
        }
        catch (WebException)
        {
            return false;
        }
        catch (IOException)
        {
            return false;
        }
        catch (UriFormatException)
        {
            return false;
        }
        catch (NotSupportedException)
        {
            // WebRequest.Create rejects URI schemes it has no handler for.
            return false;
        }
    }

    /// <summary>
    /// Returns the text between the first occurrence of <paramref name="startMarker"/>
    /// (matched case-insensitively) and the next <paramref name="endMarker"/>.
    /// </summary>
    /// <param name="source">Text to search.</param>
    /// <param name="startMarker">Marker that directly precedes the wanted value.</param>
    /// <param name="endMarker">Marker that terminates the wanted value.</param>
    /// <returns>
    /// The extracted value (possibly empty); the rest of <paramref name="source"/> when
    /// no end marker follows; <c>null</c> when the start marker is absent.
    /// </returns>
    private static string ExtractAfter(string source, string startMarker, string endMarker)
    {
        int start = source.IndexOf(startMarker, StringComparison.OrdinalIgnoreCase);
        if (start < 0)
        {
            return null;
        }

        start += startMarker.Length;
        int end = source.IndexOf(endMarker, start, StringComparison.Ordinal);
        if (end < 0)
        {
            return source.Substring(start);
        }

        return source.Substring(start, end - start);
    }

    /// <summary>
    /// Finds the address of the following results page in a search-results page.
    /// </summary>
    /// <param name="searchPage">Source of the current search-results page.</param>
    /// <returns>The href of the "Next" link, or <c>null</c> when there is none.</returns>
    private static string FindNextPageUrl(string searchPage)
    {
        int nextLabel = searchPage.IndexOf(NextLinkMarker, StringComparison.Ordinal);
        if (nextLabel < 0)
        {
            return null;
        }

        // The wanted href is the one closest to the "Next" label, i.e. the last one before it.
        string beforeNext = searchPage.Substring(0, nextLabel);
        int hrefStart = beforeNext.LastIndexOf(NextHrefMarker, StringComparison.Ordinal);
        if (hrefStart < 0)
        {
            return null;
        }

        string nextUrl = beforeNext.Substring(hrefStart + NextHrefMarker.Length);
        return nextUrl.Length == 0 ? null : nextUrl;
    }

    /// <summary>
    /// Scans up to the number of search-result pages entered in <c>txtWant</c>, starting at
    /// the URL in <c>txt_Url</c>, and appends one "title|keywords|description" line per
    /// result page to <c>txtEnd</c>.
    /// </summary>
    /// <param name="sender">The clicked button.</param>
    /// <param name="e">Unused event data.</param>
    private void button1_Click(object sender, EventArgs e)
    {
        int want;
        if (!int.TryParse(txtWant.Text, out want))
        {
            MessageBox.Show("请输入有效的搜索页面数量(整数)。");
            return;
        }

        string url = txt_Url.Text;
        int j = 0;
        while (j < want && !string.IsNullOrEmpty(url))
        {
            string herf = GetStringByUrl(url);

            // Element 0 is the markup before the first result; every later element
            // starts right after one result-link marker.
            string[] splitHerf = herf.Split(new string[] { ResultLinkMarker }, StringSplitOptions.None);
            for (int i = 1; i < splitHerf.Length; i++)
            {
                // The first quoted value after the marker is taken as the result's address.
                string[] splitWount = splitHerf[i].Split('"');
                if (splitWount.Length < 2)
                {
                    continue;
                }

                string nextUrl = splitWount[1];
                string code;
                if (!TryGetStringByUrl(nextUrl, out code))
                {
                    // One unreachable or malformed result must not abort the whole scan.
                    continue;
                }

                string title = ExtractAfter(code, TitleMarker, "<");
                string key = ExtractAfter(code, KeywordsMarker, "\"");
                string des = ExtractAfter(code, DescriptionMarker, "\"");
                if (title == null || key == null || des == null)
                {
                    // Only pages that expose all three values are recorded.
                    continue;
                }

                txtEnd.Text
                    += j + "..目前扫描到的网页为: " + url + "\n"
                    + title + "|" + key + "|" + des + "\n";
            }

            j++;

            // null (no "Next" link) ends the loop.
            url = FindNextPageUrl(herf);
        }
    }

    /// <summary>
    /// Appends the collected text in <c>txtEnd</c> to the export file on the current
    /// user's desktop, creating the file when it does not exist.
    /// </summary>
    /// <param name="sender">The clicked button.</param>
    /// <param name="e">Unused event data.</param>
    private void button2_Click(object sender, EventArgs e)
    {
        // Resolve the desktop at run time instead of hard-coding one user's profile path.
        string desktop = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
        string exportPath = Path.Combine(desktop, ExportFileName);

        using (StreamWriter sw = new StreamWriter(exportPath, true, Encoding.UTF8))
        {
            sw.WriteLine(txtEnd.Text);
        }
    }
}




}

由于本人是菜鸟,代码基本没有优化,大家就当看着娱乐吧。欢迎各位批评指正。

附截图

posted @ 2011-05-07 14:13  Starf  阅读(794)  评论(0)  编辑  收藏  举报