A Python crawler example: scrape user info from https://www.qiushibaike.com and save it to a MySQL database
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/3/5 23:32
# @Author  : hyang
# @File    : scrapy_qsbk.py
# @Software: PyCharm

import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
import pymysql

'''
Crawl the user info behind the hot posts on https://www.qiushibaike.com
and save it into a MySQL database.
'''

start_url = 'https://www.qiushibaike.com'


class qsbk(object):
    def __init__(self):
        self.session = requests.session()  # a session keeps cookies between requests
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
        }
        self.session.headers.update(self.headers)  # attach the UA to every request
        mysql_connect_dict = {
            'host': '127.0.0.1',
            'port': 3333,
            'user': 'jianeng',
            'password': 'qwe123',
            'db': 'info',
            'charset': 'utf8'
        }
        # connect to MySQL
        self.conn = pymysql.connect(**mysql_connect_dict)
        self.cursor = self.conn.cursor()

    def get_response(self, url, retries=3):
        try:
            response = self.session.get(url, timeout=1)
            if response.status_code == 200:
                return response.text
            elif retries > 0:
                # retry a few times instead of recursing without bound
                time.sleep(1)
                return self.get_response(url, retries - 1)
        except ReadTimeout:
            print('ReadTimeout')
        except ConnectionError:  # network unreachable
            print('ConnectionError')
        except RequestException:
            print('Error')
        return None

    # parse user profile URLs such as /users/24057284/ from the list page
    def parse_userurl(self, text):
        soup = BeautifulSoup(text, 'lxml')
        author_li = soup.find_all('div', class_="author clearfix")
        url_li = []
        for item in author_li:
            if item.find('a') is not None:
                url = item.find('a').attrs['href']
                url_li.append(url)
        return url_li

    # parse a user's profile page
    def parse_userdata(self, text):
        soup = BeautifulSoup(text, 'lxml')
        # the page shows this notice when the user has disabled their profile
        if '当前用户已关闭糗百个人动态' in text:
            print('this user has closed their profile')
            return None
        else:
            username = soup.find('h2').text
            result = soup.find_all('div', class_='user-statis')
            number = result[0].find_all('li')[0].text
            attentions = result[0].find_all('li')[1].text
            comments = result[0].find_all('li')[3].text
            constellation = result[1].find_all('li')[1].text
            occupation = result[1].find_all('li')[2].text
            address = result[1].find_all('li')[3].text
            return username, number, attentions, comments, constellation, occupation, address

    # save one user record to the database
    def save_mydata(self, data):
        if data is not None:
            sql = ('insert into qsbk_user '
                   '(username,num,attentions,comments,constellation,occupation,address) '
                   'VALUES (%s,%s,%s,%s,%s,%s,%s)')
            # each field looks like "label:value"; keep only the value after the colon
            li = [item.split(":")[-1] for item in data]
            # e.g. li = ['绠纱猫猫', '16', '3', '297', '天蝎座', '家里蹲', '湖南 · 长沙']
            try:
                self.cursor.execute(sql, tuple(li))
                self.conn.commit()
            except Exception as e:
                print(e)

    def main(self, url):
        response = self.get_response(url)
        if response is None:  # the list page could not be fetched
            return
        try:
            url_li = self.parse_userurl(response)
            for item in url_li:
                user_detail_url = url + item
                detail = self.get_response(user_detail_url)
                if detail is None:
                    continue
                data = self.parse_userdata(detail)
                self.save_mydata(data)
        except IndexError as e:
            print(e)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    qsbk().main(start_url)
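
The script inserts into a qsbk_user table that is never created in the post. Below is a minimal one-off setup sketch for a matching schema, using the same pymysql connection settings as the crawler; the column types and lengths are assumptions inferred from the INSERT statement above (the scraped values are short label:value strings), not something given in the original.

# One-off setup sketch: create the qsbk_user table the crawler writes to.
# The column types/lengths are assumptions inferred from the INSERT above;
# adjust them to your own data.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3333, user='jianeng',
                       password='qwe123', db='info', charset='utf8')
create_sql = '''
CREATE TABLE IF NOT EXISTS qsbk_user (
    id            INT AUTO_INCREMENT PRIMARY KEY,
    username      VARCHAR(64),
    num           VARCHAR(16),
    attentions    VARCHAR(16),
    comments      VARCHAR(16),
    constellation VARCHAR(16),
    occupation    VARCHAR(32),
    address       VARCHAR(64)
)
'''
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()

Run it once against the same info database before starting the crawler; after a crawl you can sanity-check the result with a quick select count(*) from qsbk_user.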