到豆瓣爬取电影信息
初学 Python 爬虫,于是自己爬了豆瓣的电影信息,直接上源码
import re
import requests
from bs4 import BeautifulSoup
import urllib
import os


class movie:
    """Scrape and print basic information for one Douban movie page.

    The target URL and the request headers are fixed in ``__init__``;
    ``start`` downloads the page once and hands the HTML to each of the
    printing helpers.
    """

    def __init__(self):
        self.url = "https://movie.douban.com/subject/25933890/?tag=%E7%83%AD%E9%97%A8&from=gaia_video"
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }

    def getpag(self):
        """Download the movie page and return its HTML decoded as UTF-8."""
        # The header dict must go through ``headers=``: the second positional
        # argument of requests.get() is ``params``, which would append the
        # User-Agent to the query string instead of sending it as a header.
        req = requests.get(self.url, headers=self.head)
        return req.content.decode('utf-8')

    def gettit(self, page):
        """Print the movie title(s) and the Douban rating found in *page*.

        Both values are printed as the lists returned by ``re.findall``.
        """
        title = r'<span property="v:itemreviewed">(.+?)</span>'
        power = r'<strong class="ll rating_num" property="v:average">(.+?)</strong>'
        tit = re.findall(title, page)
        powe = re.findall(power, page)
        print(tit, '\n')
        print("豆瓣评分:", powe, '\n')

    def getinfo(self, page):
        """Print the text of every ``<div class="info">`` element in *page*."""
        soup = BeautifulSoup(page, "lxml")
        for info in soup.find_all('div', 'info'):
            print(info.get_text())

    def getping(self, page):
        """Print commenter link text and comment text for each comment in *page*.

        For every ``<div class="comment">`` this prints the text of each
        ``<a>`` inside its ``<span class="comment-info">`` elements, followed
        by the text of each ``<p>`` inside the comment.
        """
        soup = BeautifulSoup(page, "lxml")
        for pin in soup.find_all('div', 'comment'):
            # NOTE(review): the original statement here was garbled; this
            # nesting (comment-info spans -> <a> tags) is the reconstruction
            # implied by the surviving fragments - confirm against the page.
            for pnam in pin.find_all('span', class_='comment-info'):
                for p in pnam.find_all('a'):
                    print(p.get_text())
            for ar in pin.find_all('p'):
                print(ar.get_text())

    def start(self):
        """Fetch the page once and print title, info and comments."""
        page = self.getpag()
        self.gettit(page)
        self.getinfo(page)
        self.getping(page)


movie().start()
爬取成功
我利用的是 BeautifulSoup 这个库,这个库可以将 HTML 代码按标签进行分类整理,还可以读取标签属性,详情可以自己搜索,对于爬虫来说非常强大
我的代码理念是利用 BeautifulSoup,用 for 循环一层一层地往下搜索,找到自己想要的数据