# 猫眼 TOP100 爬虫 (Maoyan Top 100 board crawler)
# 正则表达式爬虫 (regular-expression based scraper)
# 完整代码 (complete code)
import requests # 获取网页数据
import re # 正则表达式
from bs4 import BeautifulSoup # 网页解析,获取数据
import xlwt # 保存为excel
# Pre-compiled patterns, each applied to the HTML text of one <dd> board entry.

# Rank number: the digits right after the tag that carries "board-index".
findIndex = re.compile(r'board-index.*?>(\d+)')
# Poster address: the first src="..." text after class="board-img" on the same line.
findImage = re.compile(r'class="board-img".*?src="(.*?)"')
# Film title: the text between title=" and the next ">.
findTitle = re.compile(r'title="(.*?)">')
# Cast line: group 1 is the single character (possibly a newline) right after
# the opening tag, group 2 is the rest of the line that follows it, so
# findall() returns 2-tuples for this pattern.
findActor = re.compile(r'class="star">(.|\n)(.*)')
# Release-time text. The closing tag is matched without requiring a trailing
# space, so the match does not depend on whitespace after </p>.
findTime = re.compile(r'class="releasetime">(.*?)</p>')
# Integer part of the score (text of the "integer" element).
findScore1 = re.compile(r'class="integer">(.*?)</i>')
# Fractional part of the score (text of the "fraction" element).
findScore2 = re.compile(r'class="fraction">(.*?)</i>')
# 爬取网页
# 解析数据
# 保存数据
def main():
    """Scrape the Maoyan board pages and store the result in an .xls file.

    The board URL is passed without an offset value; getData appends the
    per-page offset itself.
    """
    output_file = "猫眼TOP100.xls"
    board_url = "https://maoyan.com/board/4?offset="
    saveData(getData(board_url), output_file)
def getData(baseUrl):
    """Download the ten board pages and extract one row per <dd> entry.

    Args:
        baseUrl: URL prefix ending in ``offset=``; the page offsets
            0, 10, ..., 90 are appended to it.

    Returns:
        A list of rows. Each row is a list of six strings:
        rank, image address, title, cast, release time, score.
        A field whose pattern does not match is stored as "" instead of
        aborting the whole crawl with an IndexError.
    """
    datalist = []
    for page in range(0, 10):
        html = askUrl(baseUrl + str(page * 10))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("dd"):
            # The patterns work on the serialized HTML of the entry.
            datalist.append(_parseItem(str(item)))
    return datalist


def _firstMatch(pattern, text):
    """Return the first findall() result of ``pattern`` in ``text``, or ""."""
    found = pattern.findall(text)
    return found[0] if found else ""


def _parseItem(item):
    """Build the six-field row for the HTML text of one board entry."""
    # findActor has two groups, so its match is a tuple; the cast text is
    # the last group. Surrounding whitespace is stripped.
    actor = _firstMatch(findActor, item)
    if isinstance(actor, tuple):
        actor = actor[-1]
    actor = actor.strip()

    # The score is rendered as two elements (integer part and fraction
    # part); the two texts are concatenated into one string.
    score = _firstMatch(findScore1, item) + _firstMatch(findScore2, item)

    return [
        _firstMatch(findIndex, item),  # rank
        _firstMatch(findImage, item),  # image address
        _firstMatch(findTitle, item),  # title
        actor,                         # cast
        _firstMatch(findTime, item),   # release time
        score,                         # score
    ]
# 爬取网页
def askUrl(url):
    """Fetch ``url`` with a browser User-Agent and return the body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8, or "" when the request fails
        with a requests exception (the error is printed, not raised).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4209.2 Safari/537.36"}
    html = ""
    try:
        # Without a timeout requests may wait indefinitely for the server.
        response = requests.get(url, headers=headers, timeout=10)
        html = response.content.decode("utf-8")
    except requests.exceptions.RequestException as e:
        # ``requests.exceptions`` is a module; the catchable base class is
        # RequestException. It may carry the failed response, if any.
        failed = getattr(e, "response", None)
        if failed is not None:
            print(failed.status_code)
            print(failed.reason)
        print(e)
    return html
def saveData(datalist, savepath):
    """Write the scraped rows into an .xls workbook and save it.

    Args:
        datalist: Sequence of rows; each row is a sequence of cell values
            in the order of the header columns. Only the first 100 rows
            are written, and values beyond the header width are ignored.
        savepath: Path of the workbook file to create.
    """
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("猫眼TOP100", cell_overwrite_ok=True)
    col = ("电影排名", "图片地址", "电影名称", "演出人员", "上映时间", "电影评分")
    max_rows = 100

    # Row 0 holds the headers.
    for j, header in enumerate(col):
        sheet.write(0, j, header)

    # Iterate only over rows that exist, so no exception handling is needed
    # to skip missing indexes and progress is reported per written row.
    for i, data in enumerate(datalist[:max_rows]):
        print("第%d条" % (i + 1))
        # Data rows start at sheet row 1; a short row fills only the
        # leading columns instead of raising IndexError.
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)

    book.save(savepath)
if __name__ == '__main__':  # run the crawl only when executed as a script
    main()
    print('爬取完成')