Python crawler example: scrape user information from https://www.qiushibaike.com and save it to a MySQL database

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/3/5 23:32
# @Author  : hyang
# @Site    : 
# @File    : scrapy_qsbk.py
# @Software: PyCharm

import requests
from bs4 import BeautifulSoup
from requests.exceptions import *
import pymysql
import time
import re
'''
Scrape user information from the hot-topic pages of www.qiushibaike.com
and save it to a MySQL database.
'''

start_url = 'https://www.qiushibaike.com'

class qsbk(object):

    def __init__(self):
        self.session = requests.Session()  # the session keeps cookies across requests
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        }
        mysql_connect_dict = {
            'host': '127.0.0.1',
            'port': 3333,
            'user': 'jianeng',
            'password': 'qwe123',
            'db': 'info',
            'charset': 'utf8'
        }
        # connect to the MySQL database
        self.conn = pymysql.connect(**mysql_connect_dict)
        self.cursor = self.conn.cursor()

    def get_response(self, url):
        try:
            # send the request with the User-Agent header defined in __init__
            response = self.session.get(url, headers=self.headers, timeout=1)
            if response.status_code == 200:
                return response.text
            else:
                # non-200 status: wait a second and try again
                time.sleep(1)
                return self.get_response(url)
        except ReadTimeout:
            print('ReadTimeout')
        except ConnectionError:  # network is unreachable
            print('ConnectionError')
        except RequestException:
            print('Error')

    # parse user profile URLs from the hot-topic page, e.g. /users/24057284/
    def parse_userurl(self, text):
        soup = BeautifulSoup(text, 'lxml')
        # print(soup.prettify())
        author_li = soup.find_all('div', class_="author clearfix")
        url_li = []
        for item in author_li:
            if item.find('a') is not None:
                # name = item.find('h2').text
                url = item.find('a').attrs['href']
                url_li.append(url)
        return url_li

    # parse the user's profile page
    def parse_userdata(self, text):
        soup = BeautifulSoup(text, 'lxml')

        # the profile page contains this notice when the user has disabled it
        if '当前用户已关闭糗百个人动态' in text:
            print('This user has disabled their profile')
            return None
        else:
            username = soup.find('h2').text
            result = soup.find_all('div', class_='user-statis')

            number = result[0].find_all('li')[0].text
            attentions = result[0].find_all('li')[1].text
            comments = result[0].find_all('li')[3].text

            constellation = result[1].find_all('li')[1].text
            occupation = result[1].find_all('li')[2].text
            address = result[1].find_all('li')[3].text

            return username, number, attentions, comments, constellation, occupation, address

    # save one user's record to the database
    def save_mydata(self, data):
        # print (data)
        if data is not None:
            sql = 'insert into qsbk_user (username,num,attentions,comments,constellation,occupation,address) VALUES (%s,%s,%s,%s,%s,%s,%s)'
            # strip the 'label:' prefix from each field, keeping only the value
            li = [item.split(":")[-1] for item in data]
            # print('data=',li) # data= ['绠纱猫猫', '16', '3', '297', '天蝎座', '家里蹲', '湖南 · 长沙']
            try:
                self.cursor.execute(sql, tuple(li))
                self.conn.commit()
            except Exception as e:
                print(e)

    def main(self, url):
        response = self.get_response(url)
        try:
            url_li = self.parse_userurl(response)
            for item in url_li:
                user_detail_url = url+item
                data = self.parse_userdata(self.get_response(user_detail_url))
                self.save_mydata(data)
        except IndexError as e:
            print(e)
        except Exception as e:
            print(e)

if __name__ == '__main__':
    qsbk().main(start_url)
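
The INSERT in save_mydata assumes a qsbk_user table already exists in the info database. Below is a minimal one-off sketch for creating it: the connection parameters and column names come from the script above, but the id column, the types, and the lengths are my own assumptions, so adjust them to your needs.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# One-off helper: create the qsbk_user table the crawler writes to.
# Column names match the INSERT in save_mydata; types and lengths are assumptions.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3333, user='jianeng',
                       password='qwe123', db='info', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS qsbk_user (
                id INT AUTO_INCREMENT PRIMARY KEY,
                username VARCHAR(100),
                num VARCHAR(20),
                attentions VARCHAR(20),
                comments VARCHAR(20),
                constellation VARCHAR(20),
                occupation VARCHAR(50),
                address VARCHAR(100)
            ) DEFAULT CHARSET=utf8
        ''')
    conn.commit()
finally:
    conn.close()

Run this once before starting the crawler; after that, qsbk().main(start_url) can insert rows directly.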

 
