Web Scraping: Douban Movies Top 250

A scraper, as I understand it, is just a program that automatically collects information from the web.
Scraping the Douban Top 250 boils down to writing a program that mimics what a person would do: open the site, locate the information, and extract the data.
The first version below uses the lxml library to parse the HTML and pulls the fields out with XPath selectors; a minimal sketch of that idea comes first.
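Here is the lxml + XPath idea on a tiny made-up snippet (the HTML string and variable names are illustrative only, not Douban's real markup):

from lxml import etree

snippet = '<ol><li><span class="title">肖申克的救赎</span></li></ol>'  # made-up stand-in for the real page
tree = etree.HTML(snippet)  # parse the text into an element tree
titles = tree.xpath('//span[@class="title"]/text()')  # XPath returns a list of matched strings
print(titles)  # ['肖申克的救赎']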

import requests  # issues the HTTP requests
from lxml import etree  # parses the HTML document (using the lxml library)
import csv  # writes the output CSV file
import time  # adds a delay between requests
import codecs  # opens the output file with an explicit encoding

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}  # impersonate a browser so Douban does not flag the request as a bot and block it
def get_first_text(lst):
    # return the first matched string, stripped; "" when the XPath matched nothing
    try:
        return lst[0].strip()
    except IndexError:
        return ""
  
def scrape_douban_movie(url, headers):  
    try:  
        response = requests.get(url, headers=headers)  
        response.raise_for_status()  
        return response.text  
    except requests.RequestException as e:  
        print(f"Error fetching URL: {url} - {e}")  
        return None  
  
def main():  
    urls = [f"https://movie.douban.com/top250?start={i * 25}&filter=" for i in range(10)]  
    count = 1  # running counter for the printed rows
    movie_data = []  
    for url in urls:  
        html = scrape_douban_movie(url, headers)  
        if html is None:  
            continue  
  
        parsed_html = etree.HTML(html)  # parse the response text into an element tree
        movie_list = parsed_html.xpath('//*[@id="content"]/div/div[1]/ol/li')  
  
        for movie in movie_list:  
            title = get_first_text(movie.xpath('./div/div[2]/div[1]/a/span[1]/text()'))        # movie title
            director_actor = get_first_text(movie.xpath('./div/div[2]/div[2]/p[1]/text()'))    # director and cast
            score = get_first_text(movie.xpath('./div/div[2]/div[2]/div/span[2]/text()'))      # rating
            introduction = get_first_text(movie.xpath('./div/div[2]/div[2]/p[2]/span/text()')) # one-line quote
  
            print(count, title, director_actor, score, introduction)  
            movie_data.append([title, director_actor, score, introduction])  
  
            count += 1

        time.sleep(1)  # pause between page requests (not between movies) to be polite to the server
    with codecs.open('movies.csv', 'w', 'utf-8-sig') as file:  
        writer = csv.writer(file)  
        writer.writerow(["Title", "Director/Actor", "Score", "Introduction"])  
        writer.writerows(movie_data)  
  
if __name__ == '__main__':  
    main()
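A side note on the codecs.open call above: in Python 3 the built-in open handles the encoding directly, and newline='' stops csv.writer from inserting blank rows on Windows. An equivalent sketch:

import csv

# 'utf-8-sig' writes a BOM so Excel auto-detects the encoding of the Chinese text;
# newline='' prevents the extra blank line csv.writer otherwise adds on Windows
with open('movies.csv', 'w', encoding='utf-8-sig', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Title", "Director/Actor", "Score", "Introduction"])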

The next version does the same scrape but parses the page with bs4 (BeautifulSoup); a minimal sketch of the core calls comes first, then the full script.
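The snippet here is made up; find and get_text are the same bs4 calls the full script uses:

from bs4 import BeautifulSoup

snippet = '<li><span class="title">霸王别姬</span><span class="rating_num">9.6</span></li>'  # illustrative only
soup = BeautifulSoup(snippet, 'html.parser')
print(soup.find('span', class_='title').get_text())       # 霸王别姬
print(soup.find('span', class_='rating_num').get_text())  # 9.6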

import requests
from bs4 import BeautifulSoup
import time  # delay between page requests
  
headers = {  
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}  
  
def get_movie_info(soup):
    movies = []
    movie_list = soup.find('ol', class_='grid_view').find_all('li')  # each <li> under the grid <ol> is one movie

    for movie_li in movie_list:  
        title = movie_li.find('span', class_='title').get_text()  
        info = movie_li.find('p', class_='').get_text().strip()
        director_actor = info.split('\xa0\xa0\xa0')[0]  # fields are separated by runs of \xa0 (non-breaking spaces)
        rating_info = movie_li.find('span', class_='rating_num').get_text()  
        introduction = movie_li.find('span', class_='inq').get_text() if movie_li.find('span', class_='inq') else '' 
        # append this movie's fields to the movies list as a dict
        movies.append({  
            'title': title,  
            'director_actor': director_actor,  
            'rating': rating_info,  
            'introduction': introduction  
        })  
    return movies  

def main():
    urls = ["https://movie.douban.com/top250?start={}".format(str(i * 25)) for i in range(10)]
    
    with open('movies.txt', 'w', encoding='utf-8') as file:
        for url in urls:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            movies = get_movie_info(soup)
            time.sleep(1)  # polite delay between page requests, as in the lxml version
            
            for movie in movies:
                file.write(f"Title: {movie['title']}\n")
                file.write(f"Director/Actor: {movie['director_actor']}\n")
                file.write(f"Rating: {movie['rating']}\n")
                file.write(f"Introduction: {movie['introduction']}\n")
                file.write('\n')

if __name__ == '__main__':
    main()

The last version parses the HTML with regular expressions. Two sites helped here:
https://curlconverter.com/ (turns a browser "Copy as cURL" request into Python requests code; the session-specific cookies and headers below came from it)
https://regex101.com/ (for building and testing the regular expressions)
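A minimal example of the non-greedy (.*?) capture that all the patterns below rely on (the HTML string is made up):

import re

html = '<span class="rating_num" property="v:average">9.7</span>'
# (.*?) matches as little as possible, so the capture stops at the first </span>
print(re.findall('<span class="rating_num" property="v:average">(.*?)</span>', html))  # ['9.7']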

import requests
import re
import pandas as pd

records = []  # collects one dict per movie
for i in range(0, 250, 25):
    cookies = {
        'bid': 'T4QMlOS21eo',
        '_pk_id.100001.4cf6': '5e25c0c864dca561.1706688370.',
        '__yadk_uid': '8F9Q5P1dXq6TkE7QP5FEJIEhvxsTIfd4',
        '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1707383866%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D',
        '_pk_ses.100001.4cf6': '1',
        'ap_v': '0,6.0',
        # curlconverter emitted each __utm* cookie twice; a Python dict literal
        # keeps only the last duplicate key, so just that set is retained here
        '__utma': '223695111.96802592.1706688370.1707269654.1707383866.6',
        '__utmb': '223695111.0.10.1707383866',
        '__utmc': '223695111',
        '__utmz': '223695111.1707383866.6.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
    }

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Referer': 'https://movie.douban.com/top250?start=25&filter=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
        'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    params = {
        'start': i,
        'filter': '',
    }

    response = requests.get('https://movie.douban.com/top250', params=params, cookies=cookies, headers=headers).text
    titles = re.findall('alt="(.*?)"', response)[:25]  # movie titles (first 25 alt attributes on the page)
    directors = re.findall('<p class="pl">(.*?)&nbsp;', response)  # directors
    actors = re.findall('<p class="pl">(.*?)&nbsp;/&nbsp;', response)  # cast
    scores = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', response)  # ratings
    quotes = re.findall('<span class="inq">(.*?)</span>', response)  # one-line quotes

    for j in range(len(titles)):
        data = {
            'title': titles[j],
            'director': directors[j].strip(),
            'actor': actors[j].strip(),
            'score': scores[j],
            # not every movie has an inq quote, so guard against the shorter list;
            # a missing quote still shifts the later ones up, a limitation of
            # matching the whole page at once (the lxml/bs4 versions above
            # extract the quote per movie and avoid this)
            'quote': quotes[j] if j < len(quotes) else ''
        }
        records.append(data)
        print(data)
df = pd.DataFrame(records)
print(df)
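To persist the result like the other two versions do, the DataFrame could be written out as well (a sketch; the file name is arbitrary):

# utf-8-sig keeps the Chinese titles readable when the CSV is opened in Excel
df.to_csv('movies_regex.csv', index=False, encoding='utf-8-sig')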