公司搜索--关于搜索引擎结果的析出

名字有点大,其实就是我女朋友现在所在的这家公司的工作,要求她必须拥有一个很大的南京公司名单,而现状是,她知道的公司屈指可数,更别说是她同事们不知道而她知道的公司了。但是她被要求每天至少找到一个他们公司数据里面没有的公司,所以她只能借助搜索引擎。

上面是做这个东西的原因,起初我是想用网络爬虫来写的,后来操作了个开头,发现网络爬虫是个大项目,而且涉及到大数据的操作和数据是否最新等问题,觉得没必要。加上时间紧急,她生怕完成不了任务被辞,哎,就快速地用业余时间写了一个小程序。

 

 

程序的思路很简单,就是通过搜索引擎(以百度为例)构造get型的url,然后在C#里面生成一个request请求,去取得Response,然后从Response里面取得dom,再用正则表达式从dom里面取得自己想要的数据:1、与公司名称匹配度高的字符串;2、跳转到搜索结果下一页的地址。将取得的所有公司名称放进list中,然后遍历去重,再与自己预定义的列表匹配,将不合法的公司名去掉。最终显示出来,加上另存为txt、双击复制等功能。当然,为了可扩展性,我将取公司名称的正则表达式开放给用户,让用户(我女朋友?肯定不是啊!是我自己)可以尽可能地自定义。

 

好了,二话不说,上代码:

 
截图1
 
 
   1:  using System;
   2:  using System.Collections.Generic;
   3:  using System.ComponentModel;
   4:  using System.Data;
   5:  using System.Drawing;
   6:  using System.IO;
   7:  using System.Linq;
   8:  using System.Net;
   9:  using System.Text;
  10:  using System.Text.RegularExpressions;
  11:  using System.Windows.Forms;
  12:  using System.Threading;
  13:  using System.Collections;
  14:   
  15:   
  16:  namespace SearchCompany
  17:  {
  18:      public partial class Form1 : Form
  19:      {
  20:          Regex rx;
  21:          Thread getCompany;
  22:          public delegate void MyInvoke(string str,int type);
  23:          List<String> lstCom = new List<String>();
  24:          public List<String> lstFackName =new List<string>(){ "公司", "有限公司", "广告公司", "保险公司", "猎头公司", "新公司", "旧公司"};
  25:          public string googleNextPageRx;
  26:          public string baiduNextPageRx;
  27:          public long pagenum;
  28:          public Form1()
  29:          {
  30:              InitializeComponent();
  31:              baiduNextPageRx = "(?<=</a><a href=\")/s[^\"]*(?=\"\\sclass=\"n\">下一页&gt;</a><span class=\"nums\" style=\"margin-left:120px\">)";
  32:              googleNextPageRx = "";
  33:              textBox1.Text = @"\b((\w)|(\(\w+?\)))+(?<=公司)";
  34:          }
  35:   
  36:          private void Search_Click(object sender, EventArgs e)
  37:          {
  38:              pagenum = 1;
  39:              rx = new Regex(textBox1.Text.Trim());//regex传进来的时候被转义,但是给了rx后又自动转义回去了,所以说,不影响,可以直接写任意正确的正则式;
  40:              lblShow.Text = "";
  41:              lblBug.Text = "";
  42:              lstCompany.Items.Clear();
  43:              getCompany = new Thread(new ThreadStart(MakeCompany));
  44:              getCompany.Name = "getPageAndComputeData";
  45:              getCompany.IsBackground = true;
  46:              getCompany.Start();
  47:          }
  48:          /// <summary>
  49:          /// 子线程对主线程的操作
  50:          /// </summary>
  51:          /// <param name="lblstr">要在主界面上显示的内容</param>
  52:          /// <param name="type">操作类型:1、开始处理网站,显示已经处理多少个正在处理第几个,2、正在获取的数据,3、处理完成,搜索按钮可以点击,4、填充数据,5、显示bug</param>
  53:          public void setFromThread(string lblstr, int type)
  54:          {
  55:              if (lblShow.InvokeRequired)
  56:              {
  57:                  MyInvoke _myInvoke = new MyInvoke(setFromThread);
  58:                  this.Invoke(_myInvoke, new object[] { lblstr,type });
  59:              }
  60:              else
  61:              {
  62:                  if (type == 1)
  63:                  {
  64:                      this.Search.Enabled = false;
  65:                      this.AddCompany.Enabled = false;
  66:                      this.lblShow.Text = lblstr;
  67:                  }
  68:                  else if (type == 2)
  69:                  {
  70:                      this.lblShow.Text = lblstr;
  71:                  }
  72:                  else if (type == 3)
  73:                  {
  74:                      this.Search.Enabled = true;
  75:                      this.AddCompany.Enabled = true;
  76:                  }
  77:                  else if (type == 4)
  78:                  {
  79:                      //lstCom中已有数据,现在将它处理优化,再写入lstCompany中
  80:                      lblShow.Text = "正在处理数据,请稍后";
  81:                      List<String> newlst = new List<string>();
  82:                      if (lstCom.Count > 0)
  83:                      {
  84:                          foreach (String str in lstCom.Distinct<String>())
  85:                          {
  86:                              lstCompany.Items.Add(str);
  87:                          }
  88:                      }
  89:                      lblShow.Text=String.Format("数据已经处理完成\n共处理{0}个页面\n找到名称不重复的公司:{1}家",pagenum.ToString(), lstCompany.Items.Count.ToString());
  90:                      this.Search.Enabled = true;
  91:                      this.AddCompany.Enabled = true;
  92:                  }
  93:                  else if (type == 5)
  94:                  {
  95:                      lblBug.Text += lblstr;
  96:                  }
  97:              }
  98:          }
  99:        
 100:          public void MakeCompany()
 101:          {
 102:              try
 103:              {
 104:                  setFromThread("开始获取页面", 1);
 105:                  HttpWebRequest request;
 106:                  StringBuilder sbPageString = new StringBuilder();
 107:                  string oldNextUrl;
 108:                  string newNextUrl;
 109:                  foreach (string url in lstSite.Items)
 110:                  {
 111:                          
 112:                      request = (HttpWebRequest)WebRequest.Create(url);
 113:                      request.MaximumAutomaticRedirections = 500;
 114:                      request.CookieContainer = new CookieContainer();
 115:                      oldNextUrl = url;
 116:                      request.Timeout = 3000;
 117:                      request.Headers.Set("Pragma", "no-cache");
 118:                      HttpWebResponse response = (HttpWebResponse)request.GetResponse();
 119:                      Stream sm = response.GetResponseStream();
 120:                      Encoding ecode = Encoding.GetEncoding("utf-8");
 121:                      StreamReader sr = new StreamReader(sm, ecode);
 122:                      string pages = sr.ReadToEnd();
 123:                      //从pages里面取下一页的地址/s?.*(?="\sclass="n">下一页&gt;</a><span class="nums" style="margin-left:120px">)
 124:                      Regex rxNextUrl = new Regex(baiduNextPageRx);
 125:                      Match mcNextUrl = rxNextUrl.Match(pages);
 126:   
 127:                      //把pages放到sbPageString里面
 128:                      sbPageString.Append(pages.Replace("<em>", "").Replace("</em>", ""));
 129:                      //用新的地址取数据放进pages里面
 130:                      setFromThread("已获取1个页面,正在尝试获取下一页", 2);
 131:                      while (mcNextUrl.Success)
 132:                      {
 133:                          newNextUrl = "http://www.baidu.com" + mcNextUrl.Value;
 134:                          if (oldNextUrl.Equals(newNextUrl)) break;
 135:                          oldNextUrl = newNextUrl;
 136:                          request = (HttpWebRequest)WebRequest.Create("http://www.baidu.com" + mcNextUrl.Value);
 137:                          request.Timeout = 3000;
 138:                          request.Headers.Set("Pragma", "no-cache");
 139:                          response = (HttpWebResponse)request.GetResponse();
 140:                          sm = response.GetResponseStream();
 141:                          ecode = Encoding.GetEncoding("utf-8");
 142:                          sr = new StreamReader(sm, ecode);
 143:                          pages = sr.ReadToEnd();
 144:                          rxNextUrl = new Regex(baiduNextPageRx);
 145:                          mcNextUrl = rxNextUrl.Match(pages);
 146:                          sbPageString.Append(pages.Replace("<em>", "").Replace("</em>", ""));
 147:                          setFromThread(String.Format("已获取{0}个页面,正在尝试获取下一个页面", (++pagenum).ToString()), 2);
 148:                      }
 149:                  }
 150:   
 151:                  setFromThread(String.Format("共获取{0}个页面的数据,正在对数据进行处理",pagenum.ToString()), 2);
 152:                  string strPage = sbPageString.ToString().ToLower(); MatchCollection mc = rx.Matches(strPage);
 153:                  string str="";
 154:                  foreach (Match tmc in mc)
 155:                  {
 156:                      str=tmc.Value.Trim();
 157:                      if(!lstFackName.Contains<String>(str))
 158:                      lstCom.Add(str);
 159:                  }
 160:                  setFromThread("", 4);
 161:              }catch (Exception ex)
 162:              {
 163:                  setFromThread(ex.Message.ToString(), 5);
 164:              }
 165:          }
 166:   
 167:          private void AddCompany_Click(object sender, EventArgs e)
 168:          {
 169:              String strURL=newURL.Text.Trim();
 170:              if (strURL != String.Empty)
 171:              {
 172:                  lstSite.Items.Add(strURL);
 173:                  newURL.Text = "";
 174:                  AddCompany.Enabled = false;
 175:              }
 176:          }
 177:   
 178:          private void newURL_TextChanged(object sender, EventArgs e)
 179:          {
 180:              if (newURL.Text.Trim() == String.Empty)
 181:              {
 182:                  AddCompany.Enabled = false;
 183:              }
 184:              else
 185:              {
 186:                  AddCompany.Enabled = true;
 187:              }
 188:          }
 189:   
 190:          private void button1_Click(object sender, EventArgs e)
 191:          {
 192:              if (lstSite.SelectedItems.Count > 0)
 193:              {
 194:                  for (int i = 0; i < lstSite.SelectedItems.Count; i++)
 195:                  {
 196:                      lstSite.Items.Remove(lstSite.SelectedItems[i]);
 197:                  }
 198:              }
 199:          }
 200:   
 201:          private void button2_Click(object sender, EventArgs e)
 202:          {
 203:              if (saveFileDialog1.ShowDialog() == DialogResult.OK)
 204:              {
 205:                  try
 206:                  {
 207:                      System.IO.FileStream fs = (System.IO.FileStream)saveFileDialog1.OpenFile();
 208:                      StreamWriter sw = new StreamWriter(fs);
 209:                      for (int i = 0; i < lstCompany.Items.Count; i++)
 210:                      {
 211:                          sw.WriteLine(lstCompany.Items[i].ToString());
 212:                      }
 213:                      sw.Flush();
 214:                      sw.Close();
 215:                      fs.Close();
 216:                      MessageBox.Show("文件保存成功");
 217:                  }
 218:                  catch (Exception ex)
 219:                  {
 220:                      MessageBox.Show("异常:\n{0}", ex.Message.ToString());
 221:                  }
 222:              }
 223:          }
 224:   
 225:          private void lstCompany_DoubleClick(object sender, EventArgs e)
 226:          {
 227:              Clipboard.SetText(lstCompany.SelectedItems[0].ToString());
 228:              lblCopy.Text = "复制成功...";
 229:              timer1.Tick += lblCopyClear;
 230:              timer1.Interval = 3000;
 231:              timer1.Start();
 232:          }
 233:          public void lblCopyClear(object sender,EventArgs e)
 234:          {
 235:              lblCopy.Text = "";
 236:              timer1.Tick -= lblCopyClear;
 237:              timer1.ToString();
 238:          }
 239:   
 240:      }
 241:  }
posted @ 2013-09-24 17:15  ensleep  阅读(1953)  评论(18)  编辑  收藏  举报