批量抓取 title、keywords、description【SEO 工具】
前两天帮朋友写了个小工具,思路很简单,实现起来也不算太难。
实现功能:通过搜索引擎自动翻页,抓取并记录搜索结果页面中的 title、keywords、description 属性值。代码如下:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections;
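// Author: CppCoding. Last edited: 2011-05-06.
// Result marker: "yschttl spt". This is the marker used by Yahoo; other search
// engines use different markers, so it must be changed when switching engines.
// Tested with the queries "nike" and "puma"; both result pages use this marker.
// May currently run slowly.
// Cause of errors: network speed and page response time. The target sites are
// hosted overseas, so reading a page's source may fail and raise an error.
// Ver0.0.2
// Implements export, automatic paging, and a configurable number of search pages.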
namespace 获取html_控制截取
{
/// <summary>
/// Main window of the scraping tool. Starting from a search-results URL, it
/// follows every result link, extracts the page's title, keywords and
/// description, and appends them to <c>txtEnd</c>; it then follows the
/// "Next" link until the requested number of result pages has been processed.
/// </summary>
public partial class Form1 : Form
{
    // Text that immediately precedes each result link in the search-results HTML.
    private const string ResultMarker = "\"yschttl spt\"";

    // Text that follows the href of the "Next" pager link.
    private const string NextLabelMarker = "\">Next &";

    // Text that precedes the href value of the "Next" pager link.
    private const string NextHrefMarker = "xt\" href=\"";

    private const string TitleMarker = "<title>";
    private const string KeywordsMarker = "\"Keywords\" content=\"";
    private const string DescriptionMarker = "\"Description\" content=\"";

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the resource at <paramref name="strUrl"/> and returns its body
    /// decoded as UTF-8.
    /// </summary>
    /// <param name="strUrl">Absolute URL to fetch.</param>
    /// <returns>The full response body as a string.</returns>
    /// <exception cref="WebException">The request failed.</exception>
    /// <exception cref="UriFormatException"><paramref name="strUrl"/> is not a valid URI.</exception>
    private static string GetStringByUrl(string strUrl)
    {
        WebRequest request = WebRequest.Create(strUrl);

        // Dispose the response, stream and reader so the underlying connection
        // is released even when reading fails part-way.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Fetches a page without throwing for the failures expected while crawling
    /// arbitrary result links (network errors, malformed or unsupported URLs).
    /// </summary>
    /// <param name="pageUrl">URL to fetch.</param>
    /// <param name="content">Receives the page body, or null on failure.</param>
    /// <returns>True when the page was downloaded.</returns>
    private static bool TryGetStringByUrl(string pageUrl, out string content)
    {
        content = null;
        try
        {
            content = GetStringByUrl(pageUrl);
            return true;
        }
        catch (WebException)
        {
            return false;
        }
        catch (UriFormatException)
        {
            return false;
        }
        catch (NotSupportedException)
        {
            // WebRequest.Create rejects schemes it has no handler for.
            return false;
        }
    }

    /// <summary>
    /// Returns the text between the first occurrence of
    /// <paramref name="startMarker"/> and the next occurrence of
    /// <paramref name="endMarker"/> after it.
    /// </summary>
    /// <returns>
    /// The enclosed text (possibly empty); everything after the start marker when
    /// the end marker is absent; null when the start marker is absent.
    /// </returns>
    private static string ExtractBetween(string text, string startMarker, string endMarker)
    {
        int start = text.IndexOf(startMarker, StringComparison.Ordinal);
        if (start < 0)
        {
            return null;
        }

        start += startMarker.Length;
        int end = text.IndexOf(endMarker, start, StringComparison.Ordinal);
        return end < 0 ? text.Substring(start) : text.Substring(start, end - start);
    }

    /// <summary>
    /// Locates the URL of the "Next" pager link in a search-results page.
    /// </summary>
    /// <returns>The href of the "Next" link, or null when there is no next page.</returns>
    private static string FindNextPageUrl(string searchPageHtml)
    {
        int labelIndex = searchPageHtml.IndexOf(NextLabelMarker, StringComparison.Ordinal);
        if (labelIndex < 0)
        {
            return null;
        }

        // The href we want is the one closest to (immediately before) the label.
        string beforeLabel = searchPageHtml.Substring(0, labelIndex);
        int hrefIndex = beforeLabel.LastIndexOf(NextHrefMarker, StringComparison.Ordinal);
        if (hrefIndex < 0)
        {
            return null;
        }

        return beforeLabel.Substring(hrefIndex + NextHrefMarker.Length);
    }

    /// <summary>
    /// Runs the scan: reads the start URL from <c>txt_Url</c> and the number of
    /// result pages from <c>txtWant</c>, then appends one
    /// "title|keywords|description" line per result to <c>txtEnd</c>.
    /// Results whose page cannot be fetched, or which lack any of the three
    /// values, are skipped.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        int want;
        if (!int.TryParse(txtWant.Text, out want))
        {
            MessageBox.Show("请输入有效的搜索页数。");
            return;
        }

        string url = txt_Url.Text;
        int j = 0;
        while (j < want && !string.IsNullOrEmpty(url))
        {
            string herf;
            try
            {
                herf = GetStringByUrl(url);
            }
            catch (WebException ex)
            {
                // Without the search page there is nothing to follow; keep what
                // has been collected so far and stop.
                MessageBox.Show("读取搜索结果页失败: " + ex.Message);
                break;
            }

            // Element 0 is the markup before the first result, so start at 1.
            string[] segments = herf.Split(new string[] { ResultMarker }, StringSplitOptions.None);

            // Collect this page's output and append it once, instead of
            // re-copying the whole text box content for every result.
            StringBuilder pageOutput = new StringBuilder();
            for (int i = 1; i < segments.Length; i++)
            {
                // The first quoted value after the marker is the result's href.
                string[] quoted = segments[i].Split('"');
                if (quoted.Length < 2)
                {
                    continue;
                }

                string code;
                if (!TryGetStringByUrl(quoted[1], out code))
                {
                    // One unreachable result must not abort the whole scan.
                    continue;
                }

                string title = ExtractBetween(code, TitleMarker, "<");
                string key = ExtractBetween(code, KeywordsMarker, "\"");
                string des = ExtractBetween(code, DescriptionMarker, "\"");
                if (title == null || key == null || des == null)
                {
                    continue;
                }

                pageOutput.Append(j).Append("..目前扫描到的网页为: ").Append(url).Append("\n");
                pageOutput.Append(title).Append("|").Append(key).Append("|").Append(des).Append("\n");
            }

            if (pageOutput.Length > 0)
            {
                txtEnd.Text += pageOutput.ToString();
            }

            j++;
            // null ends the loop when the page has no "Next" link.
            url = FindNextPageUrl(herf);
        }
    }

    /// <summary>
    /// Appends the current contents of <c>txtEnd</c> to the file
    /// "获取字符串.doc" on the current user's desktop, encoded as UTF-8.
    /// </summary>
    private void button2_Click(object sender, EventArgs e)
    {
        // Resolve the desktop at run time so the export works for any user
        // account and Windows version, not only for one fixed profile path.
        string desktop = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
        string path = Path.Combine(desktop, "获取字符串.doc");

        using (StreamWriter sw = new StreamWriter(path, true, Encoding.UTF8))
        {
            sw.WriteLine(txtEnd.Text);
        }
    }
}
}
由于本人是菜鸟,代码基本没有优化,大家就当看个乐子吧。欢迎各位批评指正。
附截图