写入文档
import gevent
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import time
import os
import requests
import re
# Start the wall-clock timer; the elapsed time is printed at the end of the script.
start = time.perf_counter()

# Queue of listing-page URLs consumed by the spider greenlets.
work = Queue()
url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
# Listing pages 1 through 100.
url_list = (url.format(i) for i in range(1, 101))
# Enqueueing is a side effect, so use a plain loop rather than a list
# comprehension that would build a throwaway list of None values.  The loop
# variable has its own name so the `url` template above is not rebound.
for page_url in url_list:
    work.put_nowait(page_url)

# Parsed records (tuples of regex groups); a set, so identical records are
# stored only once.
info_set = set()
# Compiled once instead of being rebuilt from string pieces on every page.
# Each match yields a 5-tuple of captured groups, in order: the text of the
# link's "...itle" attribute, the first two <span> texts after the "resb..."
# div, the "number" span and the "desc" span.
LISTING_PATTERN = re.compile(
    '<a href="/loup.*?itle="(.*?)"'
    '.*?<div class="resb.*?<span>(.*?)</span>'
    '.*?<span>(.*?)</span>'
    '.*?<span class="number">(.*?)</span>'
    '.*?<span class="desc"> (.*?)</span>',
    re.S,
)


def spider():
    """Drain the shared ``work`` queue, scraping each URL into ``info_set``.

    For every URL taken from the queue the page is downloaded and every
    match of ``LISTING_PATTERN`` is added to the module-level ``info_set``.
    A page that cannot be fetched is reported and skipped so that a single
    network error does not terminate this worker.
    """
    while not work.empty():
        url = work.get_nowait()
        try:
            # The timeout keeps a stalled connection from blocking this
            # greenlet indefinitely.
            res = requests.get(url, timeout=5).text
        except requests.RequestException as exc:
            print(f'request failed: {url}: {exc}')
            continue
        info_set.update(LISTING_PATTERN.findall(res))
# Launch 200 spider greenlets; each one keeps taking URLs from the queue
# until it finds the queue empty.
tasks = [gevent.spawn(spider) for _ in range(200)]
# Wait for the greenlets to finish, but for no more than 6 seconds overall.
gevent.joinall(tasks, timeout=6)
# Print every scraped record and collect the lines to append to the output file.
records = []
for i, n in enumerate(info_set):
    title = f'标题: {n[0]}'
    addr = f'地区: {n[1]}{n[2]}'
    price = f'价格: {n[3]}{n[4]}'
    print(f"\n{i}\n{title}\n{addr}\n{price}\n")
    # The three fields are written back to back with no separator, followed
    # by a newline -- the same bytes the per-record writelines() call produced.
    # NOTE(review): no delimiter between fields and a '.cvs' extension (maybe
    # '.csv' was intended) -- confirm the desired file format.
    records.append(f'{title}{addr}{price}\n')

# Open the file once for the whole batch instead of reopening it for every
# record; skip the open entirely when there is nothing to write so that no
# empty file is created.
if records:
    with open('./lianjie.cvs', 'a', encoding='utf-8') as f:
        f.writelines(records)
# NOTE(review): the original indentation was lost; this message is assumed
# to be printed once, after all records have been written.
print('写入完成')
print(time.perf_counter() - start)
MongoDB 批量插入
当爬虫获取的数据量较大时,一条一条地写入 MongoDB 会过于消耗资源。这时候就需要用到 `insert_many()` 方法,把数据放入列表中批量插入。但是,如果你为了避免数据重复而建立了 MongoDB 的唯一索引,就可能会导致部分数据无法写入:因为 `insert_many()` 默认按序写入,一条数据写入失败,后面的数据就不会再写入了。所以需要修改默认参数 `ordered`。
当 `ordered=False` 时,数据会以任意顺序(可能并行)插入,每个字典的插入操作互不影响;写入失败的数据(例如违反唯一索引的重复数据)会在全部插入尝试完成后以 `BulkWriteError` 异常的形式报告,因此仍需要用 `try`/`except` 捕获。
import gevent
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import time
import os
import requests
import re
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
# NOTE: despite its name, `db` is bound to the `lj` collection of the `ljw`
# database (the script later calls `db.insert_many`).
db = client['ljw'].lj

# Start the wall-clock timer; the elapsed time is printed at the end of the script.
start = time.perf_counter()

# Queue of listing-page URLs consumed by the spider greenlets.
work = Queue()
url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
# Listing pages 1 through 100.
url_list = (url.format(i) for i in range(1, 101))
# Enqueueing is a side effect, so use a plain loop rather than a list
# comprehension that would build a throwaway list of None values.  The loop
# variable has its own name so the `url` template above is not rebound.
for page_url in url_list:
    work.put_nowait(page_url)

# Parsed records (tuples of regex groups); a set, so identical records are
# stored only once.
info_set = set()
# Compiled once instead of being rebuilt from string pieces on every page.
# Each match yields a 5-tuple of captured groups, in order: the text of the
# link's "...itle" attribute, the first two <span> texts after the "resb..."
# div, the "number" span and the "desc" span.
LISTING_PATTERN = re.compile(
    '<a href="/loup.*?itle="(.*?)"'
    '.*?<div class="resb.*?<span>(.*?)</span>'
    '.*?<span>(.*?)</span>'
    '.*?<span class="number">(.*?)</span>'
    '.*?<span class="desc"> (.*?)</span>',
    re.S,
)


def spider():
    """Drain the shared ``work`` queue, scraping each URL into ``info_set``.

    For every URL taken from the queue the page is downloaded and every
    match of ``LISTING_PATTERN`` is added to the module-level ``info_set``.
    A page that cannot be fetched is reported and skipped so that a single
    network error does not terminate this worker.
    """
    while not work.empty():
        url = work.get_nowait()
        try:
            # The timeout keeps a stalled connection from blocking this
            # greenlet indefinitely.
            res = requests.get(url, timeout=5).text
        except requests.RequestException as exc:
            print(f'request failed: {url}: {exc}')
            continue
        info_set.update(LISTING_PATTERN.findall(res))
# Launch 200 spider greenlets; each one keeps taking URLs from the queue
# until it finds the queue empty.
tasks = [gevent.spawn(spider) for _ in range(200)]
# Wait for the greenlets to finish, but for no more than 6 seconds overall.
gevent.joinall(tasks, timeout=6)
# Imported here so this step carries its own dependency; pymongo is already
# imported at the top of the script.
from pymongo.errors import BulkWriteError, PyMongoError

# One document per scraped record, shaped as {title: [address, price]}.
info_list = [
    {f'{n[0]}': [f'{n[1]}{n[2]}', f'{n[3]}{n[4]}']}
    for n in info_set
]

# insert_many() rejects an empty list, so skip the call when nothing was scraped.
if info_list:
    try:
        # ordered=False: every document is attempted even if some of them
        # fail, instead of stopping at the first failure.
        db.insert_many(info_list, ordered=False)
    except BulkWriteError as exc:
        # Raised after all inserts were attempted when at least one document
        # was rejected (for example by a unique index); the rest were stored.
        failed = len(exc.details.get('writeErrors', []))
        print(f'{failed} document(s) were not inserted')
    except PyMongoError as exc:
        # Report other database errors instead of silently discarding them,
        # while still letting the script finish and print its timing.
        print(f'insert_many failed: {exc}')
print(time.perf_counter() - start)