Writing the Results to a File

import gevent
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import time
import os
import requests
import re

start = time.perf_counter()
work = Queue()
# for i in range(1,101):
#     url = f'https://sh.fang.lianjia.com/loupan/pg{i}/'
#     print(url)
#     work.put_nowait(url)
url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
url_list = (url.format(i) for i in range(1,101))
for u in url_list:
    work.put_nowait(u)

info_set = set()

def spider():
    # Keep pulling URLs until the work queue is drained
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url).text
        # Each match is a tuple: (title, region part 1, region part 2, price number, price unit)
        title = re.findall('<a href="/loup.*?itle="(.*?)"'
                           '.*?<div class="resb.*?<span>(.*?)</span>'
                           '.*?<span>(.*?)</span>'
                           '.*?<span class="number">(.*?)</span>'
                           '.*?<span class="desc">&nbsp;(.*?)</span>', res, re.S)

        # Adding the tuples to a set de-duplicates listings across pages
        for i in title:
            info_set.add(i)

tasks = []

for x in range(200):
    task = gevent.spawn(spider)
    tasks.append(task)
gevent.joinall(tasks,timeout=6)


for i,n in enumerate(info_set):

    title = f'标题:  {n[0]}'
    addr = f'地区:  {n[1]}{n[2]}'
    price = f'价格:  {n[3]}{n[4]}'
    print(f"""
    {i}
    {title}
    {addr}
    {price}
    """)

    # Append this record to the CSV file
    with open('./lianjie.csv', 'a', encoding='utf-8') as f:
        f.writelines([title, addr, price, '\n'])
        print('写入完成')

print(time.perf_counter()-start)

Bulk Insert into MongoDB

When the crawler collects a large amount of data, writing it to MongoDB one record at a time wastes resources.

In that case use the insert_many() method: collect the documents in a list and insert them in one batch. However, if you have created a unique index in MongoDB to keep records from being duplicated, some documents may fail to be written.

That is because insert_many() writes in order by default, so as soon as one document fails to insert, none of the documents after it are written. The fix is to change the default ordered parameter.

With ordered=False, the documents are inserted out of order and in parallel, so the insertion of each dictionary does not affect the others.
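
A minimal sketch of that difference, assuming a throwaway collection named demo with a unique index on the name field (both invented here for illustration, not part of the crawler below): with ordered=False the duplicate document is rejected, the rest of the batch is still written, and pymongo reports the failures through a BulkWriteError that can be caught.

from pymongo import MongoClient
from pymongo.errors import BulkWriteError

client = MongoClient('localhost', 27017)
demo = client['ljw']['demo']            # hypothetical collection, used only for this sketch
demo.create_index('name', unique=True)  # unique index that rejects duplicate names

docs = [{'name': 'a'}, {'name': 'a'}, {'name': 'b'}]  # the second document violates the index

try:
    # ordered=False: 'b' is still inserted even though the duplicate 'a' fails
    demo.insert_many(docs, ordered=False)
except BulkWriteError as e:
    print(e.details['nInserted'])       # number of documents actually written: 2

With ordered=True (the default), the same call would stop at the duplicate, so only the first document would end up in the collection.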

import gevent
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import time
import os
import requests
import re
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

client = MongoClient('localhost', 27017)
db = client['ljw']
collection = db.lj

start = time.perf_counter()
work = Queue()

url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
url_list = (url.format(i) for i in range(1,101))
for u in url_list:
    work.put_nowait(u)

info_set = set()

def spider():
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url).text
        title = re.findall('<a href="/loup.*?itle="(.*?)"'
                           '.*?<div class="resb.*?<span>(.*?)</span>'
                           '.*?<span>(.*?)</span>'
                           '.*?<span class="number">(.*?)</span>'
                           '.*?<span class="desc">&nbsp;(.*?)</span>',res,re.S)

        for i in title:
            info_set.add(i)

tasks = []

for x in range(200):
    task = gevent.spawn(spider)
    tasks.append(task)
gevent.joinall(tasks,timeout=6)

info_list = []

for n in info_set:

    title = f'{n[0]}'
    addr = f'{n[1]}{n[2]}'
    price = f'{n[3]}{n[4]}'

    # One document per listing: the title is the key, the address and price form the value
    items = {title: [addr, price]}
    info_list.append(items)


try:
    # Documents that violate the unique index are skipped; the rest are still inserted
    collection.insert_many(info_list, ordered=False)
except BulkWriteError:
    pass
print(time.perf_counter()-start)


