A crawler is, at its core, a program that fetches data from a website or an API, filters it, and saves whatever parts you need.

Below is a simple crawler implementation, for reference only.
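To make that fetch → filter → save loop concrete, here is a minimal single-page sketch; the URL, the CSS class, and the output filename are placeholders, not taken from the crawler below:

import requests
import bs4

# Placeholders: swap in the page and the selector you actually care about.
url = 'https://example.com/list'
response = requests.get(url, timeout=10)                  # fetch
soup = bs4.BeautifulSoup(response.text, 'html.parser')
items = soup.find_all('div', 'item')                      # filter
with open('items.html', 'ab') as file:                    # save
    for item in items:
        file.write(item.encode('utf-8'))

The threaded version below extends the same idea to several Baidu result pages and several worker threads.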

import requests
import bs4
import threading
import queue
import os


class GetWebAndHandle(threading.Thread):
    # Shared worker state; 'path' and the User-Agent below are placeholders
    # that you should fill in yourself.
    status = {
        'work': True,
        'path': 'your-path',
        'suffix': '.html'
    }

    headers = {
        'User-Agent': 'your-agent',
        'Host': 'www.baidu.com'
    }

    def __init__(self, kw, page_que):
        super(GetWebAndHandle, self).__init__()
        self.page_que = page_que
        self.url = 'https://www.baidu.com/s?wd=' + kw + '&pn='
        # Each worker writes to its own output file, named after the thread.
        self.file_path_name = self.status['path'] + self.name + self.status['suffix']

    def run(self):
        # Remove any stale output file left over from a previous run.
        if os.path.isfile(self.file_path_name):
            os.remove(self.file_path_name)

        # Keep pulling page offsets until the queue is drained.
        while self.status['work']:
            try:
                url = self.url + self.page_que.get(False)
                response = requests.get(url, headers=self.headers)
                self.handle(response.text)
            except queue.Empty:
                # The queue is empty, so flip the shared flag and let every worker stop.
                self.status['work'] = False

    def handle(self, response):
        # Parse the page and keep only the organic result blocks.
        soup = bs4.BeautifulSoup(response, 'html.parser')
        sifted = soup.find_all('div', 'result c-container ')
        # Append the filtered blocks to this thread's output file.
        with open(self.file_path_name, mode='ab') as file:
            for sift in sifted:
                file.write(sift.encode('utf-8'))
            file.write(('-=-' * 30).encode('utf-8'))


if __name__ == '__main__':

    # Uncomment (and add `import ssl`) if certificate verification fails:
    # ssl._create_default_https_context = ssl._create_unverified_context

    kw = input('Enter a keyword: ')
    t_num = min(max(int(input('Number of threads: ')), 1), 4)   # clamp to 1-4
    p_num = min(max(int(input('Number of pages: ')), 1), 20)    # clamp to 1-20

    # Baidu paginates with &pn=0, 10, 20, ..., so queue the offsets as strings.
    page_que = queue.Queue(p_num)
    for p in range(p_num):
        page_que.put(str(p) + '0')

    thread_list = []
    for i in range(t_num):
        spider_thread = GetWebAndHandle(kw, page_que)
        spider_thread.start()
        thread_list.append(spider_thread)

    # Wait for every worker to finish before exiting.
    for spider_thread in thread_list:
        spider_thread.join()
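Note that `status` is a class attribute, so the dict is shared by every worker: the first thread that finds the queue empty flips 'work' to False, and the remaining threads stop on their next loop check. If you prefer an explicit shutdown signal, the same coordination can be expressed with a threading.Event; the sketch below only illustrates that alternative and is not part of the original script:

import queue
import threading

stop = threading.Event()

def worker(page_que):
    # Pull page offsets until the queue is empty, then signal everyone to stop.
    while not stop.is_set():
        try:
            page = page_que.get_nowait()
            # ... fetch and handle the page here ...
        except queue.Empty:
            stop.set()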

 
