C#制作网盘搜索工具(简单的爬虫)
最近学习C#编程,在网上发现一篇winform下制作百度网盘搜索器的文章,故而下载源码学习一二。无奈原博所用的网址失效,故而自己改写了网址和相关源代码,也进行了实现。因为初学,接触的知识较多,为免忘记,进行整理复习。
1.知识点:
思路:主要是利用HttpWebRequest,HttpWebResponse进行http模拟请求,然后利用HtmlAgilityPack+XPath语法对html dom进行元素获取,将截取到的相关内容在datagridview中展示,最后利用process.start()方法进行点击访问。
2.具体实现:
2.1关于请求头的获取:
本例子使用网址为:http://www.pansoso.com/
分析上述网址的请求头进行模拟:
查看具体请求头信息:
根据获取的 Request URL 可以分析出请求地址的规律:搜索关键字(例如 hello)通过 GET 请求直接拼接在 URL 的末尾(即 http://www.pansoso.com/zh/hello);翻页时在关键字后追加页码,即 hello_1、hello_2……(每页十条记录)。
2.2关于结果的获取:
结果的获取,直接利用对response网页的分析截取关键信息即可。
3.代码实现:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
namespace 百度网盘资源搜索
{
/// <summary>
/// Static helpers for fetching a search-result page from www.pansoso.com and for
/// turning a JSONP-style payload into JSON that deserializes into <c>SearchResult</c>.
/// </summary>
class HttpHelper
{
    // {0} is the (escaped) search key; callers append "_<page>" to the key for paging.
    static readonly string urlTemplate = "http://www.pansoso.com/zh/{0}";

    /// <summary>
    /// Downloads the page at <c>http://www.pansoso.com/zh/&lt;key&gt;</c> with
    /// browser-like request headers and returns the response body as text.
    /// (The method name keeps its original spelling because callers depend on it.)
    /// </summary>
    /// <param name="key">Search key appended to the URL; null is treated as empty.</param>
    /// <returns>The response body, or null when the request or the read failed.</returns>
    public static string Requset(string key)
    {
        // Escape the key before it is placed in the URL path so that characters such as
        // '#', '?' or '/' cannot change the request target. The same value is reused for
        // the Referer header.
        string escapedKey = Uri.EscapeDataString(key ?? string.Empty);
        string url = string.Format(urlTemplate, escapedKey);

        HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
        httpRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
        httpRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
        httpRequest.Host = "www.pansoso.com";
        httpRequest.Referer = url;

        try
        {
            // Dispose response, stream and reader so the underlying connection is released
            // after every page instead of waiting for finalization.
            using (HttpWebResponse httpResponse = (HttpWebResponse)httpRequest.GetResponse())
            using (Stream responseStream = httpResponse.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream))
            {
                return reader.ReadToEnd();
            }
        }
        catch (WebException ex)
        {
            // Network failure, timeout or HTTP error status: callers treat null as "no page".
            System.Diagnostics.Debug.WriteLine(ex);
            return null;
        }
        catch (IOException ex)
        {
            // The connection broke while the body was being read.
            System.Diagnostics.Debug.WriteLine(ex);
            return null;
        }
    }

    /// <summary>
    /// Deserializes a JSON string into a <c>SearchResult</c> via
    /// <c>UtilityClass.GetObject</c>.
    /// </summary>
    /// <param name="str">JSON text to deserialize.</param>
    /// <returns>The deserialized object.</returns>
    public static SearchResult dodata(string str)
    {
        SearchResult searchResult = UtilityClass.GetObject<SearchResult>(str);
        return searchResult;
    }

    /// <summary>
    /// Takes the text after the first '(' of <paramref name="jsonString"/>, drops its
    /// last three characters and wraps the remainder as <c>{"resources":...}</c>.
    /// NOTE(review): the three dropped characters are presumably the closing suffix of a
    /// JSONP callback - confirm against a real payload.
    /// </summary>
    /// <param name="jsonString">Raw payload; may be null.</param>
    /// <returns>
    /// The wrapped JSON, or null when the input is null, contains no '(' after its first
    /// character, or has fewer than three characters after the '('.
    /// </returns>
    public static string JsonPreProcessing(string jsonString)
    {
        if (jsonString == null)
        {
            return null;
        }

        int startIndex = jsonString.IndexOf('(');
        if (startIndex <= 0)
        {
            return null;
        }

        string json = jsonString.Substring(startIndex + 1);
        if (json.Length < 3)
        {
            // Too short to strip the three-character suffix; Remove would throw.
            return null;
        }

        return "{\"resources\":" + json.Remove(json.Length - 3) + "}";
    }
}
}
UtilityClass.cs
using System;
using System.Collections.Generic;
using System.IO;
//using System.Linq;
using System.Runtime.Serialization.Json;
using System.Text;
namespace 百度网盘资源搜索
{
/// <summary>
/// JSON deserialization helper built on <see cref="DataContractJsonSerializer"/>.
/// </summary>
class UtilityClass
{
    /// <summary>
    /// Deserializes UTF-8 encoded JSON text into an instance of <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Target type handed to the serializer.</typeparam>
    /// <param name="json">JSON text; must not be null.</param>
    /// <returns>The object read by the serializer, cast to <typeparamref name="T"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="json"/> is null.</exception>
    public static T GetObject<T>(string json)
    {
        if (json == null)
        {
            // Fail with the caller's parameter name instead of one from inside Encoding.GetBytes.
            throw new ArgumentNullException("json");
        }

        DataContractJsonSerializer serializer = new DataContractJsonSerializer(typeof(T));
        using (MemoryStream ms = new MemoryStream(Encoding.UTF8.GetBytes(json)))
        {
            return (T)serializer.ReadObject(ms);
        }
    }
}
}
JSontoObject.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace 百度网盘资源搜索
{
/// <summary>
/// Deserialization target used by HttpHelper.dodata: a wrapper around the array of
/// found resources.
/// </summary>
public class SearchResult
{
/// <summary>
/// The found resources. The lower-case name matches the "resources" key that
/// HttpHelper.JsonPreProcessing writes, so it must not be renamed.
/// </summary>
public BDWPResource[] resources { get; set; }
}
/// <summary>
/// One search hit. FrmMain.BindResource shows title, content and unescapedUrl as the
/// three columns of a result row.
/// </summary>
public class BDWPResource
{
/// <summary>Title text; FrmMain.BindResource strips "&lt;b&gt;" / "&lt;/b&gt;" from it before display.</summary>
public string title { get; set; }
/// <summary>Description text; FrmMain.BindResource strips "&lt;b&gt;" / "&lt;/b&gt;" from it before display.</summary>
public string content { get; set; }
/// <summary>Link of the hit, shown unchanged in the third column.</summary>
public string unescapedUrl { get; set; }
}
}
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Threading;
using System.Diagnostics;
namespace 百度网盘资源搜索
{//主窗体
/// <summary>
/// Main window. A search downloads up to ten result pages for the entered keyword on a
/// background thread, extracts title, description and link of every hit with
/// HtmlAgilityPack and lists them in <c>dataGridView1</c> (and as text in
/// <c>richTextBox1</c>). Double-clicking a row or clicking a detected link opens it.
/// </summary>
public partial class FrmMain : Form
{
    // Number of pages requested per search; page n is requested with the key "<key>_<n>".
    private const int PageCount = 10;

    // Cleared by the Stop button on the UI thread and polled by the worker thread between
    // pages; volatile so the worker observes the change.
    private volatile bool isSearch = true;

    // Site root that is prefixed to every href taken from a result page.
    private string url = "http://www.pansoso.com";

    public FrmMain()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Starts a search for the text in <c>txtKey</c>; does nothing when it is empty.
    /// Resets the result views, locks the Search button and runs the page loop on a
    /// background thread.
    /// </summary>
    private void btnSearch_Click(object sender, EventArgs e)
    {
        string key = this.txtKey.Text;
        if (string.IsNullOrEmpty(key))
        {
            return;
        }

        this.dataGridView1.Rows.Clear();
        this.lblResult.Text = "0";
        this.pgsBar.Value = 0;
        this.btnSearch.Text = "正在搜索";
        this.btnSearch.Enabled = false;
        this.btnStop.Enabled = true;
        // Re-arm the flag in case Stop was clicked while no search was running.
        this.isSearch = true;

        Thread thread = new Thread(() =>
        {
            // Stop between pages as soon as the user asked to.
            for (int page = 1; page <= PageCount && isSearch; page++)
            {
                gethtml(HttpHelper.Requset(key + "_" + page));
            }
            // Search finished (or was stopped): restore the buttons.
            SearchOver();
        });
        thread.IsBackground = true;
        thread.Start();
    }

    /// <summary>
    /// Parses one result page and appends every hit to the grid and the text box.
    /// Intended to be called from the worker thread; UI updates are marshalled with Invoke.
    /// </summary>
    /// <param name="docs">
    /// HTML of the page, or null when the download failed (HttpHelper.Requset returns
    /// null in that case); null shows the "no resources" message.
    /// </param>
    public void gethtml(string docs)
    {
        if (docs == null)
        {
            // Previously this case only surfaced through an exception thrown by LoadHtml.
            MessageBox.Show("该关键字没有收录资源!!!");
            return;
        }

        try
        {
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(docs);

            // Drop script elements before reading text. A page without any script must
            // still be parsed, so the extraction below is not nested in this check.
            HtmlAgilityPack.HtmlNodeCollection scripts = doc.DocumentNode.SelectNodes("//script");
            if (scripts != null)
            {
                foreach (HtmlAgilityPack.HtmlNode script in scripts)
                {
                    script.Remove();
                }
            }

            // The title anchors carry both the title text and the href of a hit.
            HtmlAgilityPack.HtmlNodeCollection links = doc.DocumentNode.SelectNodes(".//h2/a[@href]");
            HtmlAgilityPack.HtmlNodeCollection descriptions = doc.DocumentNode.SelectNodes(".//div[@class='des']");
            if (links == null || descriptions == null)
            {
                return;
            }

            // Pair anchors and descriptions by position; stop at the shorter list so a
            // mismatch cannot index past the end.
            int count = Math.Min(links.Count, descriptions.Count);
            for (int i = 0; i < count; i++)
            {
                string title = links[i].InnerText;
                string content = descriptions[i].InnerText;
                string link = url + links[i].Attributes["href"].Value;

                SearchOver1("title:" + title + "content:" + content + "unescapedUrl:" + "【" + link + "】");
                this.Invoke(new Action(() => AddResultRow(title, content, link)));
            }
        }
        catch (Exception ex)
        {
            // Keep the original user-facing message, but leave a trace of the real cause.
            Debug.WriteLine(ex);
            MessageBox.Show("该关键字没有收录资源!!!");
        }
    }

    /// <summary>
    /// Adds a deserialized resource to the grid after removing "&lt;b&gt;" and
    /// "&lt;/b&gt;" from its title and content.
    /// </summary>
    private void BindResource(BDWPResource resource)
    {
        string title = resource.title.Replace("</b>", "").Replace("<b>", "");
        string content = resource.content.Replace("</b>", "").Replace("<b>", "");
        string link = resource.unescapedUrl;
        this.Invoke(new Action(() => AddResultRow(title, content, link)));
    }

    // Appends one row and advances the hit counter and the progress bar.
    // Touches controls directly, so it must run on the UI thread.
    private void AddResultRow(string title, string content, string link)
    {
        this.dataGridView1.Rows.Add(title, content, link);
        this.lblResult.Text = (Int32.Parse(this.lblResult.Text) + 1).ToString();
        if (this.pgsBar.Value < this.pgsBar.Maximum)
        {
            this.pgsBar.Value++;
        }
    }

    /// <summary>
    /// Restores the buttons to their idle state and re-arms the search flag; called by
    /// the worker thread when its page loop ends.
    /// </summary>
    private void SearchOver()
    {
        this.Invoke(new Action(() =>
        {
            this.btnSearch.Text = "开始搜索";
            this.btnSearch.Enabled = true;
            this.btnStop.Enabled = false;
            this.isSearch = true;
        }));
    }

    /// <summary>Appends <paramref name="str"/> plus a line break to <c>richTextBox1</c> on the UI thread.</summary>
    public void SearchOver1(string str)
    {
        this.Invoke(new Action(() =>
        {
            this.richTextBox1.Text += str + System.Environment.NewLine;
        }));
    }

    /// <summary>
    /// Draws the 1-based row number into the row header and paints a white square over
    /// the header's default icon.
    /// </summary>
    private void dataGridView1_RowPostPaint(object sender, DataGridViewRowPostPaintEventArgs e)
    {
        // The brush is a GDI object created on every paint; dispose it right away.
        using (SolidBrush brush = new SolidBrush(this.dataGridView1.RowHeadersDefaultCellStyle.ForeColor))
        {
            e.Graphics.DrawString((e.RowIndex + 1).ToString(System.Globalization.CultureInfo.CurrentUICulture), this.dataGridView1.DefaultCellStyle.Font, brush, e.RowBounds.Location.X + 20, e.RowBounds.Location.Y + 6);
        }
        // Hide the icon in front of each row.
        e.Graphics.FillRectangle(Brushes.White, new Rectangle(new Point(e.RowBounds.Location.X + 2, e.RowBounds.Location.Y + 2), new Size(20, 20)));
    }

    /// <summary>Opens the link stored in the third column of the double-clicked row.</summary>
    private void dataGridView1_CellDoubleClick(object sender, DataGridViewCellEventArgs e)
    {
        if (e.RowIndex > -1)
        {
            // The cell can be empty (for example a placeholder row), so do not call
            // ToString() on it unconditionally.
            object cellValue = this.dataGridView1.Rows[e.RowIndex].Cells[2].Value;
            if (cellValue != null)
            {
                OpenInBrowser(cellValue.ToString());
            }
        }
    }

    /// <summary>
    /// Requests the running search to stop after the current page. The Search button
    /// stays disabled until the worker calls SearchOver, so a second search cannot
    /// overlap a worker that is still waiting for a response.
    /// </summary>
    private void btnStop_Click(object sender, EventArgs e)
    {
        isSearch = false;
        this.btnStop.Enabled = false;
    }

    /// <summary>Opens a link that was clicked inside <c>richTextBox1</c>.</summary>
    private void richTextBox1_LinkClicked(object sender, LinkClickedEventArgs e)
    {
        OpenInBrowser(e.LinkText);
    }

    // Hands the link to the shell only when it is an absolute http/https URL. The text
    // originates from downloaded pages, so anything else (file paths, other schemes) is
    // ignored instead of being passed to Process.Start.
    private static void OpenInBrowser(string link)
    {
        Uri parsed;
        if (string.IsNullOrEmpty(link) || !Uri.TryCreate(link, UriKind.Absolute, out parsed))
        {
            return;
        }
        if (parsed.Scheme != Uri.UriSchemeHttp && parsed.Scheme != Uri.UriSchemeHttps)
        {
            return;
        }
        Process.Start(link);
    }
}
}
4.效果实现:
本文来自博客园,作者:cache.yuan,转载请注明原文链接:https://www.cnblogs.com/cache-yuan/p/10104249.html