Web scraper - demo

# -*- coding: utf-8 -*-
# @Time: 20:55
# @Author: zzc
# @File: spider.py
# @Software: PyCharm
import urllib.request
import urllib.error  # URLError lives here; import it explicitly for the except clause below
from bs4 import BeautifulSoup
import openpyxl
import re
import sqlite3

link_pattern = '<a href="(.*?)">'
src_pattern = 'src="(.*?)"'
cname_pattern = '<span class="title">(.*?)</span>'
rating_pattern = '<span class="rating_num" property="v:average">(.*?)</span>'
judge_pattern = '<span>(.*?)人评价</span>'
inq_pattern = re.compile('<span class="inq">(.*?)</span>')
content_pattern = re.compile(r'<p class="">(.*?)</p>', re.S)  # re.S lets . match newlines too
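# Quick sanity check of the re.S flag (sample string invented for illustration):
# with re.S the lazy '.*?' can cross newlines, so the whole blurb is captured;
# without it, findall would return [] because </p> sits on a later line.
#
#     sample = '<p class="">\n  line one<br/>\n  line two\n</p>'
#     re.findall(content_pattern, sample)  # -> ['\n  line one<br/>\n  line two\n']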



def main():
    baseurl = 'https://movie.douban.com/top250?start='
    # 1. scrape the data
    data_list = get_data(baseurl)
    # 2. save the data to xlsx ...
    # save_path = './豆瓣电影Top250.xlsx'
    # save_data2(data_list, save_path)
    # ... or to SQLite
    db_file = 'douban.db'
    save_data2db(data_list, db_file)


def init_db(db_file):
    ''' Create the table that will hold the scraped data '''
    create_table_sql = '''
        create table if not exists douban
        (
            id integer primary key autoincrement,
            address_link text,
            img_link text,
            cname varchar,
            ename varchar,
            rate numeric,
            judge numeric,
            inq text,
            content text
        )
    '''
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    cursor.execute(create_table_sql)

    conn.commit()
    conn.close()
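# Note: with "if not exists", running main() twice appends a second batch of
# 250 rows rather than failing; delete douban.db first for a clean run.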


def save_data2db(data_list, db_file):
    ''' Save the scraped data to the SQLite database '''
    init_db(db_file)   # create the table
    # print(data_list)
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    # SQL approach 1: build the VALUES clause by string concatenation
    # (fragile: it breaks as soon as a field contains a double quote)
    # for rows in data_list:
    #     fmt = ''
    #     for index in range(len(rows)):
    #         if index == len(rows) - 1:
    #             fmt += '"' + rows[index] + '"'
    #         else:
    #             fmt += '"' + rows[index] + '"' + ','
    #     # assemble the sql
    #     sql = f'insert into douban values(null,{fmt})'

    # SQL approach 2: parameterized placeholders
    for row in data_list:
        tup = tuple(row)

        sql = 'insert into douban values(null,?,?,?,?,?,?,?,?)'

        cursor.execute(sql, tup)  # the second argument may be a list or a tuple
        conn.commit()
    conn.close()
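# Alternative sketch: a single executemany call with one commit avoids
# committing 250 times inside the loop:
#
#     cursor.executemany('insert into douban values(null,?,?,?,?,?,?,?,?)',
#                        [tuple(row) for row in data_list])
#     conn.commit()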

def ask_url(url):
    '''Request a URL and return the page HTML (empty string on failure)'''
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }  # pose as a regular browser so the site does not reject the request
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        resp = urllib.request.urlopen(request)
        html = resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html
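# Illustrative usage:
#     html = ask_url('https://movie.douban.com/top250?start=0')
#     # -> the decoded page source, or '' when the request fails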


def get_data(baseurl):
    ''' Scrape all 10 pages and return a list of 8-field rows '''
    data_list = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)  # each page shows 25 movies: start=0, 25, ..., 225
        html = ask_url(url)
        # parse the page, one movie at a time
        soup = BeautifulSoup(html, 'html.parser')
        item_list = soup.find_all('div', class_="item")  # every div whose class is "item"
        for item in item_list:
            data = []
            # print(item)
            str_item = str(item)
            # pull the fields out with the regexes defined above
            link = re.findall(link_pattern, str_item)
            data.append(link[0])  # detail-page link

            src = re.findall(src_pattern, str_item)
            data.append(src[0])  # poster image link

            cname = re.findall(cname_pattern, str_item)  # titles
            # [data.append(x.replace('\xa0/\xa0', '')) for x in cname]
            if len(cname) == 2:
                data.append(cname[0])
                ename = cname[1].replace('\xa0/\xa0', '')  # foreign title
                data.append(ename)
            else:
                # only a Chinese title
                data.append(cname[0])
                data.append(' ')

            rating = re.findall(rating_pattern, str_item)
            data.append(rating[0])  # rating

            judge = re.findall(judge_pattern, str_item)
            data.append(judge[0])  # number of ratings

            inq = re.findall(inq_pattern, str_item)
            if inq:
                data.append(inq[0])
            else:
                data.append(" ")

            content = re.findall(content_pattern, str_item)

            content = re.sub(r'<br/>', '', content[0])
            content = content.strip()  # trim surrounding whitespace
            data.append(content)
            data_list.append(data)

    return data_list


def save_data2(data_list, save_path):
    ''' Save the data to an xlsx workbook '''
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '豆瓣电影Top250'
    first_row = ('地址', '图片', '中文名称', '外文名称', '评分', '评论人数', '简介', '内容')
    # ws.append(first_row)
    for i in range(1, 9):
        ws.cell(row=1, column=i).value = first_row[i - 1]

    # one spreadsheet row per movie, below the header row
    for row in range(len(data_list)):
        data = data_list[row]
        for col in range(0, 8):
            ws.cell(row=row + 2, column=col + 1).value = data[col]

    wb.save(save_path)
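# Illustrative call, mirroring the commented-out lines in main():
#     save_data2(data_list, './豆瓣电影Top250.xlsx')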


if __name__ == '__main__':
    main()
    # init_db('demo.db')

The scraped data was successfully stored in the database.
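A minimal sanity check (a sketch; it assumes douban.db sits in the working directory after running main()):

import sqlite3

conn = sqlite3.connect('douban.db')
cursor = conn.cursor()
print(cursor.execute('select count(*) from douban').fetchone()[0])  # expect 250
for cname, rate in cursor.execute('select cname, rate from douban order by rate desc limit 3'):
    print(cname, rate)
conn.close()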
