python抓取链家房源信息
闲着没事,就抓取了一下链家网的房源信息,抓取的是北京二手房的信息。通过分析网址可以发现,一共有 100 页,并且每页的 url 都是类似的:
url = 'https://bj.lianjia.com/ershoufang/pg' + 页数。请求方式是 GET,页面是静态页面,所以可以逐页请求并依次进行分析,然后把结果存储在 MongoDB 中;每次插入之前,要先把字符串转换成 JSON 格式,再进行插入。页面的解析用的是 BeautifulSoup(bs),解析很方便。代码用的是单进程,耗时大致是 66s(代码打印的是扣除 100 秒 sleep 之后的时间);因为怕 IP 被封,所以在每次页面请求之后都要 sleep 1 秒。
#-*-coding:utf-8-*-
import json
import re
import time
import urllib
import urllib2

import lxml
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient

# MongoDB target: collection ``House`` of database ``test`` on the local server.
client = MongoClient('localhost', 27017)
db = client.test
House = db.House

# Headers sent with every page request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '......',
    'Host': 'bj.lianjia.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

# A listing page URL is this prefix followed by the page number.
URL = 'https://bj.lianjia.com/ershoufang/pg'

# Placeholder stored for any field a listing does not provide.
MISSING = "None"

# Record keys for the '|'-separated houseInfo fields, in positional order.
# NOTE: "Elevete" is misspelled, but it is the key already stored in MongoDB,
# so it is kept unchanged.
_INFO_KEYS = ("Address", "House_type", "Area", "Toward", "Decorate", "Elevete")


def download(url, num_try=2, timeout=10):
    """Fetch *url* with the module headers and return the body as text.

    Up to *num_try* attempts are made and each request is abandoned after
    *timeout* seconds. Returns None when every attempt fails.
    """
    for _ in range(num_try):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # requests raises its own exception hierarchy (not
            # urllib2.URLError), so this is the error that must be caught
            # for the retry to happen at all.
            print('Download error: %s' % e)
        else:
            return response.text
    return None


def _parse_house_info(text):
    """Split one houseInfo text on '|' into exactly len(_INFO_KEYS) fields."""
    fields = [part.strip() for part in text.split('|')]
    # Short rows are padded with the placeholder instead of raising
    # IndexError; a five-field row therefore gets "None" as its last field.
    fields.extend([MISSING] * (len(_INFO_KEYS) - len(fields)))
    return fields[:len(_INFO_KEYS)]


def _parse_listings(html):
    """Return one dict per listing found in the page *html*."""
    soup = BeautifulSoup(html, 'html.parser')

    prices = []
    for box in soup.find_all('div', 'priceInfo'):
        span = box.span
        # Keep one entry per priceInfo div so that prices stay positionally
        # aligned with the houseInfo divs they are zipped with below.
        prices.append(span.get_text() if span is not None else MISSING)

    infos = [
        _parse_house_info(div.get_text())
        for div in soup.find_all('div', attrs={'class': 'houseInfo'})
    ]

    records = []
    for price, info in zip(prices, infos):
        # Build the document directly; formatting a JSON string by hand
        # breaks as soon as a scraped value contains a quote or backslash.
        record = dict(zip(_INFO_KEYS, info))
        record["Price"] = price
        records.append(record)
    return records


def get_message(url):
    """Download the listing page at *url*, print and store its records.

    Each record is printed as JSON and the page's records are inserted into
    the ``House`` collection. A page that cannot be downloaded is reported
    and skipped.
    """
    html = download(url)
    if html is None:
        print('Skipping %s: download failed' % url)
        return

    records = _parse_listings(html)
    for record in records:
        print(json.dumps(record, ensure_ascii=False))
    # insert_many rejects an empty list, so only call it when there is data.
    if records:
        House.insert_many(records)


def main(pages=100, delay=1):
    """Crawl listing pages 1..*pages*, sleeping *delay* seconds after each."""
    start = time.time()
    print(start)
    for num in range(1, pages + 1):
        url = URL + str(num)
        print(url)
        get_message(url)
        # Pause between requests to reduce the chance of being blocked.
        time.sleep(delay)
    elapsed = time.time() - start
    print('Total time:')
    # Report the time spent working, i.e. without the deliberate sleeps.
    print(elapsed - pages * delay)


if __name__ == '__main__':
    main()