C#小程序——从百度摘取搜索结果。

百度不使用xhtml，这样使得.NET原有的XML功能就不是那么好用了。

（而且，谁会真正喜欢DOM呢？用起来多累人啊！）

不过百度的页面很不规则，所以迫不得已使用了大量的硬编码。

因此，这个程序对百度的页面设计做了相当多的假设，无法很好地适应百度页面结构在未来的改变。

还好这种小程序写起来轻松，所以没事改一改也没事。

另外这个程序使用了大量的正则表达式，这可能会使得它在效率上不适合于用来整合各个搜索引擎的结果。

如果需要在一个页面同时展示几个搜索引擎的结果，我建议使用iframe标签，或者呢，就是让后台把网页通过ajax发给前台，然后在前台用js产生页面。

特别注意，程序中使用了FCL中好用的url编码的功能，因此必须额外添加对System.Web这个程序集的引用。

代码——百度机器人

  1 using System;
  2  using System.Collections.Generic;
  3  using System.Text;
  4  using System.Text.RegularExpressions;
  5  using System.Web;
  6  using System.Net;
  7 using System.IO;
  8 namespace baiduRobotStrim
  9 {
 10     struct BaiduEntry
 11     {
 12         public string title, brief, link;
 13     }
 14     class Program
 15     {
 16         static string GetHtml(string keyword)
 17         {
 18             string url = @"http://www.baidu.com/";
 19             string encodedKeyword = HttpUtility.UrlEncode(keyword, Encoding.GetEncoding(936));
 20             //百度使用codepage 936字符编码来作为查询串，果然专注于中文搜索……
 21             //更不用说，还很喜欢微软
 22             //谷歌能正确识别UTF-8编码和codepage这两种情况，不过本身网页在HTTP头里标明是UTF-8的
 23             //估计谷歌也不讨厌微软（以及微软的专有规范）
 24             string query = "s?wd=" + encodedKeyword;
 25 
 26             HttpWebRequest req;
 27             HttpWebResponse response;
 28             Stream stream;
 29             req = (HttpWebRequest)WebRequest.Create(url + query);
 30             response = (HttpWebResponse)req.GetResponse();
 31             stream = response.GetResponseStream();
 32             int count = 0;
 33             byte[] buf = new byte[8192];
 34             string decodedString = null;
 35             StringBuilder sb = new StringBuilder();
 36             try
 37             {
 38                 Console.WriteLine("正在读取网页{0}的内容……", url + query);
 39                 do
 40                 {
 41                     count = stream.Read(buf, 0, buf.Length);
 42                     if (count > 0)
 43                     {
 44                         decodedString = Encoding.GetEncoding(936).GetString(buf, 0, count);
 45                         sb.Append(decodedString);
 46                     }
 47                 } while (count > 0);
 48             }
 49             catch
 50             {
 51                 Console.WriteLine("网络连接失败，请检查网络设置。");
 52             }
 53             return sb.ToString();
 54         }
 55         static void PrintResult(List<BaiduEntry> entries)
 56         {
 57             int count = 0;
 58             entries.ForEach(delegate(BaiduEntry entry)
 59             {
 60                 Console.WriteLine("找到了百度的第{0}条搜索结果：", count += 1);
 61                 if (entry.link != null)
 62                 {
 63                     Console.WriteLine("找到了一条链接：");
 64                     Console.WriteLine(entry.link);
 65                 }
 66                 if (entry.title != null)
 67                 {
 68                     Console.WriteLine("标题为：");
 69                     Console.WriteLine(entry.title);
 70                 }
 71                 if (entry.brief != null)
 72                 {
 73                     Console.WriteLine("下面是摘要：");
 74                     Console.WriteLine(entry.brief);
 75                 }
 76                 Program.Cut();
 77             });
 78         }
 79         static void simpleOutput()
 80         {
 81             string html = "<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>";
 82             Console.WriteLine(RemoveSomeTags(html));
 83         }
 84         static string RemoveVoidTag(string html)
 85         {
 86             string[] filter = { "<br>" };
 87             foreach (string tag in filter)
 88             {
 89                 html = html.Replace(tag, "");
 90             }
 91             return html;
 92         }
 93         static string ReleaseXmlTags(string html)
 94         {
 95             string[] filter = { "<a.*?>", "</a>", "<em>", "</em>", "<b>", "</b>", "<font.*?>", "</font>" };
 96             foreach (string tag in filter)
 97             {
 98                 html = Regex.Replace(html, tag, "");
 99             }
100             return html;
101         }
102 
103         static string RemoveSomeTags(string html)
104         {
105             html = RemoveVoidTag(html);
106             html = ReleaseXmlTags(html);
107             return html;
108         }
109         static void Cut()
110         {
111             Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
112         }
113         static void MainProc(string input)
114         {
115             MainProc(input, false);
116         }
117         static void MainProc(string input, bool tagsForBrief)
118         {
119             Regex r = new Regex("<table.*?</table>", RegexOptions.IgnoreCase);
120             //提取出(<table>,</table>)对，并等待进一步处理。
121             Match m = r.Match(input);
122             List<string> collection = new List<string>();
123             while (m.Success)
124             {
125                 collection.Add(m.Value);
126                 //找出tagname为table的节点并存储到collection变量中
127                 m = m.NextMatch();
128             }
129             List<BaiduEntry> entries = new List<BaiduEntry>();
130             collection.ForEach(delegate(string entry)
131             {
132                 r = new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);
133                 if(r.IsMatch(entry))
134                 {//从entry字符串里捕获到的就是百度里存储在每个table标签里的td标签了。
135                     //现阶段中，百度页面里有几个table标签是兄弟节点的关系，
136                     //第一个table标签是一个广告，剩下的table标签刚好都是搜索结果。
137                     //理想状态下input字符串里只有几个由table标签组织的搜索结果项。
138                     //理应使用预处理过的字符串来调用本函数
139                     m = r.Match(entry);
140                     string html = m.Groups[1].Value;//直接使用捕获分组1的值。
141                     //html变量里存储着td节点的innerHTML，那里有真正的搜索结果
142                     BaiduEntry baidu = new BaiduEntry();
143                     r = new Regex("<a.*?href=\"(.*?)\".*?>", RegexOptions.IgnoreCase);
144                     if (r.IsMatch(html))
145                     {
146                         string linkString = r.Match(html).Groups[1].Captures[0].Value;
147                         baidu.link = linkString;
148                     }
149                     r = new Regex("<font.*</font>");
150                     //td节点下有一些嵌套了2层的font标签，把这个大的font标签拿下来。
151                     html = r.Match(html).Value;//现在html变量里存储着比较浓缩的信息了。
152 
153                     r = new Regex("<font.*?>(.*?)</font>");
154                     Match contentMatch = r.Match(html);
155                     if (contentMatch.Success)
156                     {
157                         string title = contentMatch.Groups[1].Captures[0].Value;
158                         title = RemoveSomeTags(title);
159                         baidu.title = title;
160                         contentMatch = contentMatch.NextMatch();
161                         if (contentMatch.Success)
162                         {
163                             string brief = contentMatch.Groups[1].Captures[0].Value;
164                             int splitIndex = brief.IndexOf("<font");
165                             if (splitIndex > -1)
166                                 brief = brief.Substring(0, splitIndex);
167                             if (!tagsForBrief)
168                                 brief = RemoveSomeTags(brief);
169                             //如果不需要带有HTML格式的摘要，那么就处理掉HTML标签
170                             baidu.brief = brief;
171                         }
172                     }
173                     else
174                     {
175                         if (html == "") return;
176                         Console.WriteLine("怪了，这里没有找到任何结果。");
177                         Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
178                         Console.WriteLine("Mark:");
179                         Console.WriteLine(html);
180                         Cut();
181                         Cut();
182                         Cut();
183                     }
184                     entries.Add(baidu);
185                 }
186             });
187 
188             PrintResult(entries);
189         }
190         public static void Main(string[] args)
191         {
192             Console.WriteLine("请输入一个关键字。");
193             string keyword;
194             keyword = Console.ReadLine();
195             Console.WriteLine("正在从百度上获取结果，请稍等……");
196             string input;
197             input = GetHtml(keyword);
198             Regex r = new Regex("<table.*class=\"result\"[\\s\\S]*</table>", RegexOptions.IgnoreCase);
199             input = r.Match(input).Value;
200             MainProc(input);
201             Console.ReadKey(true);
202         }
203     }
204 }
205

posted @ 2010-11-02 18:29 bombless 阅读(3884) 评论(6) 收藏举报

刷新页面返回顶部