<爬虫>利用BeautifulSoup爬取百度百科虚拟人物资料存入Mysql数据库

网页情况:

代码:

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup as bs
import re
import time
import pymysql

def get_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The response body decoded as UTF-8 when the status code is 200;
        ``None`` for any other status code or when the request raises
        ``RequestException`` (timeouts included).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    # Force UTF-8 decoding instead of relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    return response.text

def parse_one_page(html):
    """Parse one listing page with BeautifulSoup and yield its entries.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing instead of
            raising.

    Yields:
        A dict with the keys ``'name'`` (the anchor's ``title``), ``'href'``
        (the anchor's ``href`` prefixed with the site root) and ``'img'``
        (the image ``src``) for every ``<div class="photo">`` found.
        Entries lacking the anchor, the image, or any of those attributes
        are skipped.
    """
    if not html:
        return
    soup = bs(html, 'lxml')
    for data in soup.find_all('div', class_="photo"):
        anchor = data.a
        image = data.img
        if anchor is None or image is None:
            continue
        name = anchor.get('title')
        href = anchor.get('href')
        img = image.get('src')
        if name is None or href is None or img is None:
            continue
        # Some characters have no picture; their image link is a
        # site-relative "/static..." path that must be joined to the host.
        if img.startswith('/static'):
            img = "https://baike.baidu.com" + img
        yield {
            'name': name,
            'href': "https://baike.baidu.com" + href,
            'img': img
        }

def write_mysql(item):
    """Insert one scraped entry into the ``baidu_baike`` MySQL table.

    Args:
        item: Mapping with the keys ``'name'``, ``'href'`` and ``'img'``.

    A new connection is opened for every call and is always closed again,
    even when the insert or the commit raises; the exception then
    propagates to the caller.
    """
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='',
        database='baidu',
        charset='utf8'  # must not be spelled 'utf-8'
    )
    try:
        sql = "insert into baidu_baike(name,href,img) values(%s,%s,%s)"
        # The cursor is closed automatically when the with-block exits.
        with conn.cursor() as cursor:
            # Values are passed separately so the driver escapes them.
            cursor.execute(sql, (item['name'], item['href'], item['img']))
        conn.commit()  # writes are only persisted after an explicit commit
    finally:
        conn.close()



def main(url):
    """Fetch one listing page, parse it and store every entry in MySQL.

    Args:
        url: Address of the listing page to process.

    When the download fails (``get_one_page`` returns ``None``) the page is
    reported and skipped instead of handing ``None`` to the parser.
    """
    html = get_one_page(url)
    if html is None:
        print('页面获取失败,已跳过: %s' % url)
        return
    for item in parse_one_page(html):
        write_mysql(item)



if __name__ == '__main__':
    # The listing is paginated through the query string: `index` is the
    # 1-based page number and `offset` advances by `limit` per page.
    page_size = 30
    for i in range(1, 7):
        url = "http://baike.baidu.com/fenlei/虚拟人物?limit=%d&index=%d&offset=%d# gotoList" % (
            page_size, i, page_size * (i - 1))
        # Announce the page before working on it so the message matches
        # what the script is doing at that moment.
        print('正在爬取第%s页' % i)
        main(url)
        time.sleep(1)  # pause between requests
    print("全部写入成功!")

  运行结果:

MySQL数据库结果:

posted @ 2019-01-30 21:36  水墨黑  阅读(533)  评论(0)  编辑  收藏  举报