广州楼盘抓取分析-分析问题
上文其实还是有不少问题的。
1. 顺序执行,效率比较慢;2. 不能断点执行(程序中断后无法从上次抓取的位置继续)。
那么,解决办法是什么呢?
对于问题1,可以采用生产者消费者模式来改写:生产者线程负责翻页并把详情页 url 放入队列,消费者线程从队列取出 url 抓取入库,代码如下:
# -*- coding: utf-8 -*- ####################################################################### # Copyright (C) 2005-2016 UC Mobile Limited. All Rights Reserved # File : first_sale_spider.py # # Creation : 2016/2/23 19:41 # Author : shufeng.lsf@ucweb.com ####################################################################### import random from threading import Thread import requests import re import time from pyquery import PyQuery as pq from Queue import Queue import MySQLdb import uniout import sys reload(sys) sys.setdefaultencoding("utf-8") community_list = [] HOST = "127.0.0.1" USER = "root" PASSWD = "" DB = "house_analysis" PORT = 3306 queue = Queue(10) class DBOperate(object): def __init__(self, host, user, passwd, db, port, charset="utf8"): self.host = host self.user = user self.passwd = passwd self.db = db self.port = port self.conn = MySQLdb.connect(self.host, self.user, self.passwd, self.db, self.port, charset="utf8") self.cur = self.conn.cursor() def insertSql(self,sql): self.cur.execute(sql) self.conn.commit() def __del__(self): self.cur.close() self.conn.close() def requestByGet(url): r = requests.get(url) return r.content def getNextPage(content): m = re.findall(r'<a href="(.+?)" class="next-page next-link">下一页</a>',content) if len(m)>0: next_url = m[0] else: next_url = '' return next_url def getCommunityList(content): community_urls = re.findall(r'data-link="(http://gz.fang.anjuke.com/loupan/\d+?.html)"',content) print "正在采集...",community_urls if len(community_urls)>0: return community_urls def getHouseInfo(url): p = pq(url) name = p('h1').text().strip() style = p('.house-item').text().split(",")[0].strip() price = p('.sp-price').text().strip() l = p('.lpAddr-text').text() location = re.split('\[ | \]',l) area = location[-2].split('-')[0].strip() zone = location[-2].split('-')[1].strip() address = location[-1].strip() detail_location = location[-1].strip() result = { "name": name, "area": area, "location": zone, "detail_location": detail_location, 
"house_style": style, "price": price } return result def detailPageHandler(cur, detail_url): result = getHouseInfo(detail_url) print "result:",result cur.insertSql("insert into first_sale (name,area,location,detail_location,house_style,price) VALUES('%s','%s','%s','%s','%s','%s')" % ( result['name'], result['area'], result['location'], result['detail_location'], result['house_style'], result['price'] )) class UrlProducer(Thread): def __init__(self, start_url): Thread.__init__(self) self.start_url = start_url def run(self): global queue while True: content = requestByGet(self.start_url) next_url = getNextPage(content) community_urls = getCommunityList(content) for url in community_urls: queue.put(url) time.sleep(random.random()) print "进入队列的url:",url if next_url != '': self.start_url = next_url continue else: break class GetHouseInfo(Thread): def __init__(self, cur): Thread.__init__(self) self.cur = cur def run(self): global queue while True: url = queue.get() detailPageHandler(self.cur, url) queue.task_done() time.sleep(random.random()) print "处理完毕的url:", url def main(): cur = DBOperate(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT) UrlProducer("http://gz.fang.anjuke.com/loupan/?from=navigation").start() GetHouseInfo(cur).start() if __name__ == '__main__': main()
2. 对于不能断点执行的问题,可以用异常捕获的方式处理:程序中断时把当前正在执行的 url 保存到文件,下次启动时先从该文件读取 url,从上次中断的位置继续执行即可。