爬虫-HtmlAgilityPack
用 HtmlAgilityPack 写了一个简单的小爬虫,用来爬取婴儿配方奶粉的产品信息和配方数据。
HtmlAgilityPack:https://html-agility-pack.net/
参考
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Web;
using System.Web.Mvc;
using CrawlerForBaby.Models;
using CrawlerForBaby.Untity;
using HtmlAgilityPack;
namespace CrawlerForBaby.Controllers
{
public class HomeController : Controller
{
/// <summary>
/// Serves the home page by returning the default view for this action.
/// </summary>
/// <returns>The result produced by <c>View()</c>.</returns>
public ActionResult Index()
{
    ActionResult homeView = this.View();
    return homeView;
}
/// <summary>
/// Returns one page of recipe rows, each joined with the product it belongs to,
/// as a JSON <c>ResultModel</c> (GET allowed).
/// </summary>
/// <param name="page">1-based page number. Values below 1 are treated as page 1.</param>
/// <param name="limit">Maximum number of rows per page. Negative values are treated as 0.</param>
/// <returns>
/// JSON with <c>count</c> set to the total number of joined rows and <c>data</c>
/// set to the requested page, ordered by recipe <c>Id</c>.
/// </returns>
public JsonResult GetTables(int page, int limit)
{
    // Clamp the paging arguments: a negative Skip/Take makes the query fail,
    // and (page - 1) * limit can overflow int for very large inputs.
    int pageNumber = Math.Max(page, 1);
    int pageSize = Math.Max(limit, 0);
    long offset = (long)(pageNumber - 1) * pageSize;
    int skip = (int)Math.Min(offset, int.MaxValue);

    using (CrawlerForBabyEntities1 db = new CrawlerForBabyEntities1())
    {
        var query = from s in db.BabyRecipe
                    join p in db.Product on s.ProductId equals p.ProductId
                    select new DTOList
                    {
                        Id = s.Id,
                        SerialNum = s.SerialNum,
                        Project = s.Project,
                        Unit = s.Unit,
                        EveryHundredKJ = s.EveryHundredKJ,
                        EveryHundredG = s.EveryHundredG,
                        ProductId = p.ProductId,
                        RegistrationID = p.RegistrationID,
                        CommonName = p.CommonName,
                        ProductName = p.ProductName,
                        EngLishName = p.EngLishName,
                        Process = p.Process,
                        ProcessName = p.ProcessName,
                        IsRawMilkSkim = p.IsRawMilkSkim,
                        Type = p.Type
                    };

        // Total row count across all pages, reported alongside the page itself.
        int total = query.Count();
        var tables = query.OrderBy(s => s.Id).Skip(skip).Take(pageSize).ToList();

        return Json(new ResultModel<DTOList>() { success = true, code = 0, count = total, data = tables, msg = "" }, JsonRequestBehavior.AllowGet);
    }
}
/// <summary>
/// Scrapes a product detail page and its recipe pages, then stores the product
/// and its recipe rows unless a product with the same common name and product
/// name already exists.
/// </summary>
/// <param name="json">
/// URL of the product detail page. Despite the name it is a plain URL; it must
/// start with the known detail page address.
/// </param>
/// <returns>
/// JSON <c>ResultModel</c>: <c>success = false</c> with a message when the URL is
/// rejected, the page cannot be parsed, or the product already exists;
/// <c>success = true</c> once the product and its recipe rows are saved.
/// </returns>
[HttpPost]
public JsonResult AddTables(string json)
{
    // Only the known detail page may be fetched. A prefix match (rather than a
    // substring match) prevents URLs on other hosts that merely contain the
    // expected address from being requested by the server.
    string url = (json ?? string.Empty).Trim();
    if (!url.StartsWith(DetailPageUrlPrefix, StringComparison.Ordinal))
    {
        return Json(new ResultModel<string>() { success = false, msg = "链接不对" });
    }

    var doc = new HtmlWeb().Load(url);

    string registrationText = FindCellText(doc, "注册号");
    string commonNameText = FindCellText(doc, "通用名称(产品)");
    string productNameText = FindCellText(doc, "商品名称(产品)");
    string englishNameText = FindCellText(doc, "英文名称(产品)");
    string processText = FindCellText(doc, "生产工艺");

    // The registration number and both names identify the product; without
    // them the page is not a usable detail page.
    if (registrationText == null
        || registrationText.Length <= RegistrationPrefixLength
        || commonNameText == null
        || productNameText == null)
    {
        return Json(new ResultModel<string>() { success = false, msg = "页面解析失败" });
    }

    // Drop the leading characters of the registration text; the recipe URL
    // already carries the (double URL-encoded) prefix "国食注字".
    string registNumber = registrationText.Substring(RegistrationPrefixLength);

    using (CrawlerForBabyEntities1 db = new CrawlerForBabyEntities1())
    {
        // Check for duplicates first so no recipe pages are fetched needlessly.
        bool exists = db.Product.Any(s => s.CommonName == commonNameText && s.ProductName == productNameText);
        if (exists)
        {
            return Json(new ResultModel<string>() { success = false, msg = "已经存在" });
        }

        // The recipe pages are requested with the cookie issued for the recipe URL.
        string recipeUrl = RecipePageUrlPrefix + registNumber;
        var headers = HTTPHeader.GetHTTPResponseHeaders(recipeUrl);
        string cookie = headers["Set-Cookie"];
        var pages = GetPageData.GetData(cookie);

        // Parse every row before writing anything, so a malformed page cannot
        // leave a product behind with only part of its recipe.
        var recipes = new List<BabyRecipe>();
        foreach (var page in pages)
        {
            var tds = page.DocumentNode.Descendants("td").ToList();

            // Cells come in groups of five: serial number, item, unit,
            // per-100kJ value, per-100g value. A trailing incomplete group is ignored.
            for (int i = 0; i + CellsPerRecipeRow <= tds.Count; i += CellsPerRecipeRow)
            {
                recipes.Add(new BabyRecipe()
                {
                    SerialNum = Convert.ToInt32(tds[i].InnerText),
                    Project = tds[i + 1].InnerText,
                    Unit = tds[i + 2].InnerText,
                    EveryHundredKJ = ParseTwoDecimals(tds[i + 3].InnerText),
                    EveryHundredG = ParseTwoDecimals(tds[i + 4].InnerText)
                });
            }
        }

        Product product = new Product()
        {
            RegistrationID = registrationText,
            CommonName = commonNameText,
            ProductName = productNameText,
            EngLishName = englishNameText,
            Process = processText,
        };
        db.Product.Add(product);
        // Saved first so product.ProductId is populated for the recipe rows below.
        db.SaveChanges();

        foreach (BabyRecipe recipe in recipes)
        {
            recipe.ProductId = product.ProductId;
            db.BabyRecipe.Add(recipe);
        }
        // One save for all recipe rows instead of one round trip per row.
        db.SaveChanges();
    }

    return Json(new ResultModel<string>() { success = true, code = 0, msg = "" }, JsonRequestBehavior.AllowGet);
}

/// <summary>Address every accepted product detail URL must start with.</summary>
private const string DetailPageUrlPrefix = "http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml";

/// <summary>Recipe page address; the registration number is appended to it.</summary>
private const string RecipePageUrlPrefix = "http://tsspxx.gsxt.gov.cn:80//tyyp/detailPf.xhtml?COLUMN1667=%25E5%259B%25BD%25E9%25A3%259F%25E6%25B3%25A8%25E5%25AD%2597";

/// <summary>Number of leading characters removed from the registration text.</summary>
private const int RegistrationPrefixLength = 4;

/// <summary>Number of table cells that make up one recipe row.</summary>
private const int CellsPerRecipeRow = 5;

/// <summary>
/// Returns the text of the first td child in the first table row whose first
/// th child has exactly the given text, or null when no such row or cell exists.
/// </summary>
private static string FindCellText(HtmlDocument doc, string header)
{
    foreach (HtmlNode row in doc.DocumentNode.Descendants("tr"))
    {
        // Rows without a header cell yield null here and are skipped.
        HtmlNode th = row.ChildNodes["th"];
        if (th == null || th.InnerText != header)
        {
            continue;
        }

        HtmlNode td = row.ChildNodes["td"];
        return td == null ? null : td.InnerText;
    }

    return null;
}

/// <summary>
/// Parses scraped numeric text and rounds it to two decimals; text that is not
/// a number yields 0.
/// </summary>
private static double ParseTwoDecimals(string text)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var styles = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

    // Scraped text is machine data, so parsing must not depend on the
    // server's current culture.
    double value;
    if (!double.TryParse(text, styles, invariant, out value))
    {
        return 0;
    }

    // Rounding goes through the "0.00" format on purpose: it keeps the stored
    // values identical to what earlier imports produced.
    return double.Parse(value.ToString("0.00", invariant), invariant);
}
}
}
前端
@{
ViewBag.Title = "Home Page";
}
<!-- layui stylesheet and script, served from ~/Content/layui. -->
<link href="~/Content/layui/css/layui.css" rel="stylesheet" />
<script src="~/Content/layui/layui.js"></script>
<div class="jumbotron">
<h1>ASP.NET</h1>
</div>
<!-- URL entry form. Its submit button carries lay-filter="formDemo", the filter name the script on this page listens for with form.on('submit(formDemo)'). -->
<form class="layui-form" action="">
<div class="layui-form-item layui-form-text">
<label class="layui-form-label">URL</label>
<div class="layui-input-block">
<!-- Example of a product detail URL. -->
示例:<a href="http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml?id=A4FA632E8E15B4D8E055620810C6201A" target="_blank">http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml?id=A4FA632E8E15B4D8E055620810C6201A</a>
</div>
<div class="layui-input-block">
<!-- The field name "json" matches the parameter name of the AddTables action. -->
<textarea name="json" placeholder="请输入URL" class="layui-textarea" id="textjson" style="height: 50px;min-height:50px"></textarea>
</div>
</div>
<div class="layui-form-item">
<div class="layui-input-block">
<button class="layui-btn" lay-submit lay-filter="formDemo">立即提交</button>
<button type="reset" class="layui-btn layui-btn-primary">重置</button>
</div>
</div>
</form>
<!-- Placeholder element that the script's table.render call targets via elem: '#demo'. -->
<div class="row">
<table id="demo" lay-filter="test"></table>
</div>
<style>
/* Give elements with the layui-table-cell class a fixed 44px height, with matching line-height. */
.layui-table-cell {
height: 44px;
line-height: 44px;
}
/* Empty rule: it declares nothing for the :active state. */
.optherName:active {
}
/* White text on a green background while hovering; no element in this view uses the optherName class. */
.optherName:hover {
color: #ffffff;
background-color: #379736;
}
</style>
<script>
// Wires up the page: renders the paged recipe table and posts the URL form
// to the server, refreshing the table after a successful import.
layui.use(['table', 'form'], function () {
    var table = layui.table;
    var form = layui.form;

    // Recipe table, fed page by page from the GetTables action.
    var tableObj = table.render({
        elem: '#demo'
        , height: 312
        , url: '/home/GetTables/' // data endpoint
        , page: true // enable paging
        , cols: [[ // column headers
            { field: 'Id', title: 'ID', width: 80, sort: true, fixed: 'left' }
            , { field: 'SerialNum', title: '序号', width: 60 }
            , { field: 'CommonName', title: '通用名', width: 240 }
            , { field: 'ProductName', title: '产品名', width: 140 }
            , { field: 'EngLishName', title: '英文', width: 120 }
            , { field: 'Process', title: '工艺', width: 150 }
            , { field: 'Project', title: '项目', width: 250, sort: true }
            // The unit column is listed once, next to the values it qualifies.
            , { field: 'Unit', title: '单位', width: 80 }
            , { field: 'EveryHundredKJ', title: '每100kJ', width: 120 }
            , { field: 'EveryHundredG', title: '每100g', width: 120, sort: true }
        ]]
    });

    // Listen for the form submit and send the fields with AJAX instead.
    form.on('submit(formDemo)', function (data) {
        $.ajax({
            // Absolute path, like the table endpoint, so the request also
            // resolves correctly when the page is served from /Home/Index.
            url: '/Home/AddTables',
            contentType: "application/x-www-form-urlencoded",
            data: data.field, // form fields sent as request parameters
            method: 'POST',
            success: function (res) {
                if (res.success) {
                    layer.msg("新增成功!");
                    // Clear the input only once the import succeeded, so a
                    // rejected URL stays available for correction.
                    $('#textjson').val('');
                    tableObj.reload(); // reload the table
                } else {
                    layer.msg(res.msg);
                }
            },
            error: function () {
                // Tell the user about transport or server errors.
                layer.msg("请求失败");
            }
        });
        // Returning false stops the browser's normal form submission.
        return false;
    });
});
</script>