电影天堂爬虫实战
#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2021/8/28 22:38
# @author: Mrwhite
# @File:电影天堂爬虫.py
# @DESC:
import re
import urllib.request,urllib.error #制定URL,获取网页数据
import xlwt
from bs4 import BeautifulSoup
def main():
    """Run the scraper end to end.

    Passes the site's index URL to ``getData`` and hands the returned
    columns to ``saveData`` together with the output workbook name.
    """
    # Index page of the site being scraped.
    index_url = "https://dy.dytt8.net/index.htm"
    # Step 1 (getData): collect the per-movie columns starting from the index page.
    # Step 2 (saveData): write those columns to an .xls workbook.
    saveData(getData(index_url), "电影天堂电影.xls")
# Precompiled regular expressions (string patterns) used to pull individual
# fields out of the raw HTML text.

# Link to a movie's detail page (index page).
findLink = re.compile(r'・\[<a href="/html/gndy/.*<a href="(.*?)">')
# Movie title (index page).
findMovieName = re.compile(r'・\[<a href="/html/gndy/.*">(.*?)</a><br/>')
# Update date (index page).
findUpDateTime = re.compile(r'<td class="inddline" width="15%"><font color="#FF0000">(.*?)</font></td>')
# Director (detail page).
findDirect = re.compile(r'<br />◎导 演 (.*?)<br />')
# Cast (detail page).
findActor = re.compile(r'<br />◎主 演 (.*?)<br /><br />◎标 签')
# Douban score (detail page).
findScore = re.compile(r'<br />◎豆瓣评分 (.*?) from')
# Download link (detail page).
findDownloadLink = re.compile(r'<a target="_blank" href="(.*?)">')
# Synopsis (detail page).
findInfo = re.compile(r'◎简 介<br /><br /> (.*?)<br />')
def _parseDetailPage(detailHtml):
    """Extract the per-movie fields from the raw HTML of one detail page.

    Args:
        detailHtml: HTML text of a movie detail page.

    Returns:
        A tuple ``(direct, actor, score, downloadLink, info)`` where
        ``downloadLink`` is the list of every ``findDownloadLink`` match and
        the other items are strings. Missing fields fall back to ``""``
        (``"无评分"`` for the score).
    """
    # NOTE(review): as written, replace("·","·") maps a character to itself;
    # presumably the target was an HTML entity (e.g. &middot;) that was decoded
    # when the script was pasted -- confirm against the live page. The same
    # question applies to the replace(" ","") calls (possibly &nbsp;).

    # Director: first match only, spaces removed.
    directMatches = re.findall(findDirect, detailHtml)
    direct = directMatches[0].replace(" ","").replace("·","·") if directMatches else ""

    # Cast: keep at most the first three names, joined with "/".
    actorMatches = re.findall(findActor, detailHtml)
    if actorMatches:
        cleaned = actorMatches[0].replace("·","·").replace(" ","").replace("\u3000","")
        actor = "/".join(cleaned.split("<br />")[0:3])
    else:
        actor = ""

    # Douban score: keep only the part before the first "/".
    scoreMatches = re.findall(findScore, detailHtml)
    score = scoreMatches[0].split("/")[0] if scoreMatches else "无评分"

    # Download links: every match is kept, as a list.
    downloadLink = re.findall(findDownloadLink, detailHtml)

    # Synopsis: first match only.
    infoMatches = re.findall(findInfo, detailHtml)
    info = infoMatches[0].replace("·","·").replace(" ","").replace("“","") if infoMatches else ""

    return direct, actor, score, downloadLink, info


def getData(baseurl):
    """Scrape the index page at *baseurl* and every movie page it links to.

    Args:
        baseurl: URL of the index page.

    Returns:
        A list of seven lists, in this order: titles, update dates,
        directors, casts, scores, download links, synopses.
    """
    # 1. Fetch the index page.
    indexHtml = askURl(baseurl)

    # 2. Narrow the page down to the content panel. select() returns a list of
    #    tags, which is stringified so the regexes can run over it.
    #    (nth-child has to be written as nth-of-type in this selector.)
    soup = BeautifulSoup(indexHtml, "html.parser")
    item = str(soup.select("div:nth-of-type(2) > div:nth-of-type(1) > div > div > div.co_content8"))

    titles = re.findall(findMovieName, item)
    # The matched hrefs are site-relative; prefix them with the host.
    links = [f'https://dy.dytt8.net{link}' for link in re.findall(findLink, item)]
    updateTimes = re.findall(findUpDateTime, item)
    # NOTE(review): titles, links and updateTimes come from three independent
    # regex passes; nothing here guarantees they end up the same length.

    # 3. Visit each movie page for director / cast / score / links / synopsis.
    directs, actors, scores, downloadLinks, infos = [], [], [], [], []
    for link in links:
        direct, actor, score, downloadLink, info = _parseDetailPage(askURl(link))
        directs.append(direct)
        actors.append(actor)
        scores.append(score)
        downloadLinks.append(downloadLink)
        infos.append(info)

    return [titles, updateTimes, directs, actors, scores, downloadLinks, infos]
# Fetch the content of a single URL.
def askURl(url):
    """Download *url* and return its body decoded as GB2312 text.

    Args:
        url: Address to fetch.

    Returns:
        The decoded page text (bytes that cannot be decoded are dropped), or
        an empty string when the request fails with ``URLError``.
    """
    # No custom headers are sent; pass headers={"User-Agent": ...} to Request
    # if the site rejects urllib's default client.
    request = urllib.request.Request(url)
    # Start from an empty page so a failed request still returns a string
    # instead of hitting an unbound local at the return statement.
    html = ""
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("gb2312", errors="ignore")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Save the scraped data.
def saveData(datalist,savepath):
    """Write the scraped columns to an .xls workbook at *savepath*.

    Row 0 holds the column headers; each column's values follow from row 1
    onward, so the first record is written to row 1 rather than skipped.

    Args:
        datalist: Sequence of columns (one list per header, in header order).
        savepath: Destination path of the workbook.

    Any exception raised while writing or saving is reported on stdout and
    swallowed (best effort), as before.
    """
    print("save......")
    book = xlwt.Workbook(encoding="utf8",style_compression=0)
    sheet = book.add_sheet("from电影天堂",cell_overwrite_ok=True)
    col = ('标题',"更新时间","导演","主演","豆瓣评分","磁力链接","简介")
    # Track the cell being written so the failure message never has to
    # re-index datalist (which could itself raise).
    row, j, value = 0, 0, None
    try:
        for j, header in enumerate(col):
            sheet.write(0, j, header)  # header row
            # A missing column is treated as empty instead of raising.
            column = datalist[j] if j < len(datalist) else []
            # Each column is walked by its own length; the sheet row is the
            # data index plus one because of the header row.
            for row, value in enumerate(column, start=1):
                sheet.write(row, j, value)
                print("datalist的",row,"行",j,"列的数据为:",value,"成功写入")
        book.save(savepath)  # persist the workbook
    except Exception as e:
        print("datalist的",row,"行",j,"列的数据为:",value,"写入失败")
        print(e)
if __name__ == "__main__":
    # Only run the scraper when this file is executed as a script,
    # then report completion.
    main()
    print("爬取完毕")
展示效果如下。后续可以继续优化爬虫的效率。