怎么写爬虫,怎么找网站练手:抓取链家、中原、安居客、我爱我家。今年5月份开始写论文啦!
// Download the page at `url` and return its body decoded with `encode`.
// Result: the page text on HTTP 200 OK; an empty string when the response carries
// any other status (the failure is logged); null when any exception is raised
// while sending the request or reading the body (the exception is logged).
// NOTE(review): `url`, `encode` and `log` are not declared in this fragment —
// presumably parameters/fields of the enclosing helper; confirm.
string html = string.Empty;
try
{
    // Direct casts instead of `as`: a non-HTTP address now fails with a descriptive
    // InvalidCastException (caught below) rather than a bare NullReferenceException.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); // build the simulated browser request
    request.Timeout = 30 * 1000; // 30-second timeout, in milliseconds
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
    request.ContentType = "text/html; charset=utf-8";
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse()) // send the request
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            log.Error("抓取{0}地址返回失败,响应状态为{1}", url, response.StatusCode);
        }
        else
        {
            try
            {
                // `using` guarantees the reader is disposed even when ReadToEnd throws.
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), encode))
                {
                    html = reader.ReadToEnd(); // read the whole response body
                }
            }
            catch (Exception ex)
            {
                log.Error("抓取{0}失败", url, ex);
                html = null;
            }
        }
    }
}
catch (Exception ex)
{
    log.Error("抓取{0}出现异常", url, ex);
    html = null;
}
return html;
// Scrape listings from Lianjia, Zhongyuan, Anjuke and Woaiwojia
// (only the Lianjia selectors are active in the code below).
// Downloads the page at pageurl, selects every community list item and appends one
// TrojanHorse (title / price / remark) per item to houseList.
// NOTE(review): pageurl and houseList are not declared in this fragment, and the final '}'
// has no matching '{' here — presumably the enclosing method header and try block are
// missing from this paste; confirm.
string html = HttpHelper.DownloadUrl(pageurl);
if (html == null)
{
// No HTML was returned: hand back houseList as it currently stands.
return houseList;
}
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
// Lianjia: XPath selecting each community <li> in the result list.
string psht = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(psht);
if (noneNodeList == null)
{
// Nothing matched the list XPath: log it and return without adding anything.
log.ErrorAsync("数据为空!");
return houseList;
}
foreach (var item in noneNodeList)
{
TrojanHorse house = new TrojanHorse();
// Each item's markup is loaded into its own document; the XPaths below start with "//",
// so this presumably keeps their matches scoped to the current item.
HtmlDocument docChild = new HtmlDocument();
docChild.LoadHtml(item.OuterHtml);
// Lianjia: title anchor of the community.
string urlPath = @"//*[@class='info']/div[@class='title']/a";
HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
// NOTE(review): urlNode is dereferenced without a null check, unlike the two nodes below.
string tsct = urlNode.InnerText;// community name
// Lianjia [@class='xiaoquListItemRight']/div
string strs = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
HtmlNode urlNodes = docChild.DocumentNode.SelectSingleNode(strs);
// Text of the totalPrice span, or the literal string "null" when the span is absent.
string s = "";
if (urlNodes == null)
{
s = "null";
}
else
{
s = urlNodes.InnerText;
}
// Lianjia: span inside the totalSellCount link.
string strst = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";
HtmlNode urlNodest = docChild.DocumentNode.SelectSingleNode(strst);
// Text of the totalSellCount span, or the literal string "null" when the span is absent.
string st = "";
if (urlNodest == null)
{
st = "null";
}
else
{
st = urlNodest.InnerText;
}
#region
// Disabled experiment: slicing the price text into sub-strings.
//string tscts = s.Replace(" ", "");
//string tsctst = tscts.Substring(0, 8);
//string tsctsb = tscts.Substring(tscts.Length - 7, 7);
// string rsf = s;
//string zf = tsctsb.Substring(0, 5);// rental
#endregion
// Store the three extracted strings on the record and collect it.
house.title = tsct;
house.price = s;
house.remark = st;
houseList.Add(house);
#region
// Disabled code: extraction of agent details, de-duplicated by phone number.
//house.StaffName = urlNode.Attributes["title"].Value; // agent name
//string companyPath = "//*[@class='jjr-info']/p[@class='jjr-desc mg-top']/a[position()<2]";
//HtmlNode companyNode = docChild.DocumentNode.SelectSingleNode(companyPath);
//if (companyNode == null)
//{
// continue;
//}
//house.Company = companyNode.InnerText; // agency company
//string telPath = "//*[@class='jjr-side']";
//HtmlNode telNode = docChild.DocumentNode.SelectSingleNode(telPath);
//if (telNode == null)
//{
// continue;
//}
//string telstr = telNode.InnerText.Trim();
//house.Mobile = telstr; // agent phone number
//house.CityCode = citycode; // city code
//house.CreateTime = DateTime.Now;
//var flag = houseList.Where(x => x.Mobile == telstr).FirstOrDefault(); // do not add an entry whose phone number is already present
//if (flag == null)
//{
// houseList.Add(house);
//}
#endregion
}
}
/// <summary>
/// Downloads one listing page and builds a <c>TrojanHorse</c> record (title, price, remark)
/// for every community list item found on it, using the Lianjia page layout.
/// </summary>
/// <param name="pageurl">Address of the listing page to download and parse.</param>
/// <returns>
/// The list of parsed records. It is returned empty when the download yields <c>null</c>
/// or when no list item matches the list XPath. Any exception raised while downloading or
/// parsing is logged and not rethrown.
/// </returns>
private static List<TrojanHorse> GetTrojanHorseList(string pageurl)
{
    List<TrojanHorse> houseList = new List<TrojanHorse>();
    try
    {
        string html = HttpHelper.DownloadUrl(pageurl);
        if (html == null)
        {
            return houseList;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Lianjia: one <li> per community in the result list.
        string listPath = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
        HtmlNodeCollection itemNodes = doc.DocumentNode.SelectNodes(listPath);
        if (itemNodes == null)
        {
            log.ErrorAsync("数据为空!");
            return houseList;
        }

        // Lianjia selectors evaluated inside each list item (loop-invariant, so built once).
        string titlePath = @"//*[@class='info']/div[@class='title']/a";
        string pricePath = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
        string sellCountPath = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";

        foreach (var item in itemNodes)
        {
            // Each item's markup is loaded into its own document; the XPaths above start
            // with "//", so this presumably keeps their matches scoped to the current item.
            HtmlDocument docChild = new HtmlDocument();
            docChild.LoadHtml(item.OuterHtml);

            HtmlNode titleNode = docChild.DocumentNode.SelectSingleNode(titlePath);
            HtmlNode priceNode = docChild.DocumentNode.SelectSingleNode(pricePath);
            HtmlNode sellCountNode = docChild.DocumentNode.SelectSingleNode(sellCountPath);

            // A missing node yields the literal placeholder "null". The title used to be
            // dereferenced unchecked, so a single item without a title anchor threw into
            // the outer catch and discarded every remaining item on the page.
            TrojanHorse house = new TrojanHorse();
            house.title = titleNode == null ? "null" : titleNode.InnerText;        // community name
            house.price = priceNode == null ? "null" : priceNode.InnerText;        // totalPrice span text
            house.remark = sellCountNode == null ? "null" : sellCountNode.InnerText; // totalSellCount span text
            houseList.Add(house);
        }
    }
    // Alternative selectors for the Zhongyuan page layout (previously kept here as disabled code):
    //   list items : //*[@class='section-wrap section-houselists']/div[@class='section']/div[@class='house-item clearfix']
    //   title      : //*[@class='item-info fl']/h4/a
    //   price      : //*[@class='item-pricearea fr']/p[@class='tc f666 f12 mt_10']/a
    //   remark     : //*[@class='item-pricearea fr']/p[@class='price-nub cRed tc']/span
    catch (Exception ex)
    {
        // Best-effort scraping: log the failure; records collected so far stay in houseList.
        log.ErrorAsync("服务器异常,异常信息:" + ex.Message);
    }