豆瓣top250爬取

一、电影名字爬取

import requests
import re

# Script 1: download the Douban Top 250 page and print every movie title
# that the regular expression below finds in the returned HTML.

# Send a browser-like User-Agent instead of the default one from requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
}

# NOTE(review): only this single URL is requested; if the list is paginated,
# the remaining pages are not fetched -- confirm whether that is intended.
url = "https://movie.douban.com/top250"

# Fetch the HTML page.
# - The `with` block closes the response even if an exception is raised.
# - requests applies no timeout by default; without one the call can block
#   indefinitely on an unresponsive server. 10 seconds is an arbitrary bound.
# - raise_for_status() turns an HTTP error response into an exception instead
#   of silently parsing an error page and printing nothing.
with requests.get(url, headers=headers, timeout=10) as resp:
    resp.raise_for_status()
    page_content = resp.text

# Parse the data: starting at an "<li>", lazily skip ahead to the first
# <span class="title"> and capture its text as "name". re.S lets "." match
# newlines so the pattern can span several lines of HTML.
obj = re.compile(r'<li>.*?<span class="title">(?P<name>.*?)</span>', re.S)

# Run the pattern over the page and print each captured title.
result = obj.finditer(page_content)
for match in result:
    print(match.group("name"))

二、爬取年份

import requests
import re

# Script 2: download the Douban Top 250 page and print, for every match of
# the regular expression below, the movie title followed by its year text.

# Send a browser-like User-Agent instead of the default one from requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
}

# NOTE(review): only this single URL is requested; if the list is paginated,
# the remaining pages are not fetched -- confirm whether that is intended.
url = "https://movie.douban.com/top250"

# Fetch the HTML page.
# - The `with` block closes the response even if an exception is raised.
# - requests applies no timeout by default; 10 seconds is an arbitrary bound
#   that keeps the script from hanging on an unresponsive server.
# - raise_for_status() fails loudly on an HTTP error response instead of
#   parsing an error page and printing nothing.
with requests.get(url, headers=headers, timeout=10) as resp:
    resp.raise_for_status()
    page_content = resp.text

obj = re.compile(
    # Adjacent string literals are concatenated, so a pattern that does not
    # fit on one line can simply be continued on the next.
    # "name": text of the first <span class="title"> after an "<li>".
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    # "year": text between the first "<br>" inside <p class=""> and "&nbsp".
    r'<p class="">.*?<br>(?P<year>.*?)&nbsp',
    re.S,  # let "." match newlines so the pattern can span several lines
)

result = obj.finditer(page_content)
for match in result:
    print(match.group("name"))
    # strip() removes the whitespace surrounding the captured year text.
    print(match.group("year").strip())

三、将数据存入csv中

import requests
import re
import csv

# Script 3: download the Douban Top 250 page, extract title, year, score and
# number of ratings for every match of the regular expression below, and
# write one CSV row per match to "data.csv".

# Send a browser-like User-Agent instead of the default one from requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
}

# NOTE(review): only this single URL is requested; if the list is paginated,
# the remaining pages are not fetched -- confirm whether that is intended.
url = "https://movie.douban.com/top250"

# Fetch the HTML page.
# - The `with` block closes the response even if an exception is raised.
# - requests applies no timeout by default; 10 seconds is an arbitrary bound
#   that keeps the script from hanging on an unresponsive server.
# - raise_for_status() fails loudly on an HTTP error response instead of
#   producing an empty CSV file.
with requests.get(url, headers=headers, timeout=10) as resp:
    resp.raise_for_status()
    page_content = resp.text

# Named groups, in order: name, year, score, num. re.S lets "." match
# newlines so the pattern can span several lines of HTML.
obj = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">.*?<br>(?P<year>.*?)&nbsp.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<num>.*?)人评价</span>',
    re.S,
)

result = obj.finditer(page_content)

# Open the output file for writing.
# - `with` guarantees the file is flushed and closed; the file was previously
#   never closed, so buffered rows could be lost.
# - newline="" is what the csv module requires of its file objects; without
#   it, extra blank lines appear between rows on Windows.
# - encoding="utf-8" makes the output independent of the platform's default
#   codec, which may be unable to encode the Chinese titles.
with open("data.csv", mode="w", encoding="utf-8", newline="") as f:
    # CSV is a comma-separated, row-oriented text format; csv.writer takes
    # care of quoting and delimiters.
    csvwriter = csv.writer(f)

    for match in result:
        # Turn the match into a dict keyed by group name.
        dic = match.groupdict()

        # The "year" capture is preceded by a lot of whitespace (per the
        # original author's note); store the stripped text instead.
        dic['year'] = dic['year'].strip()

        # Write one row per match; the values follow the order of the named
        # groups in the pattern (name, year, score, num).
        csvwriter.writerow(dic.values())

posted @ 2021-11-02 18:14  lnterpreter  阅读(73)  评论(0)  编辑  收藏  举报