Scraping Douban Movie Reviews (2): The Complete Code
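The full script below pulls a batch of movie IDs from Douban's search_subjects JSON endpoint, then walks each film's short-comment pages in three rating bands (high / middle / low, selected by the percent_type query parameter) and appends the comment texts to three labelled files under ./douban/. Note that the Cookie in the headers is a logged-in Douban session; substitute your own before running.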
# -*- coding: utf-8 -*-
# @Time    : 2021/11/20 13:58
# @Author  : shuaichao
# @File    : .py
# @Software: PyCharm
import json
import os
import time
import urllib.error
import urllib.request  # build requests and fetch page data
from urllib.request import Request, urlopen

import requests
from bs4 import BeautifulSoup  # HTML parsing / data extraction
from lxml import etree

# Shared request headers: a logged-in session cookie plus a desktop
# User-Agent, so Douban serves the same pages a browser would get.
HEADERS = {
    "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
}


def askUrl(url):
    # Fetch a URL and return the decoded HTML (empty string on failure).
    request = urllib.request.Request(url, headers=HEADERS)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


# Fetch a page and wrap it in BeautifulSoup.
def get_info(baseurl):
    html = askUrl(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    return bs


# Pull out all elements with the given class; return the elements and
# their string form.
def transport(bs, info):
    ex_info = bs.find_all(class_=info)
    info = str(ex_info)
    return ex_info, info


# Download an image to ./imgs/<imgName>.jpg.
def getImg(url, imgName):
    try:
        req_timeout = 5
        req = Request(url=url, headers=HEADERS)
        f = urlopen(req, None, req_timeout)
        pic = f.read()
        os.makedirs('./imgs', exist_ok=True)  # the target folder must exist
        imgPath = './imgs/%s.jpg' % imgName
        with open(imgPath, 'wb') as fp:
            fp.write(pic)
    except urllib.error.URLError:
        print('connection failed')
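An aside before the main block: of the helpers above, only askUrl is actually called below; transport and getImg go unused here (presumably carried over from an earlier part of this series). For reference, a minimal sketch of how they would be invoked; the movie page URL, image URL, and file name are made-up placeholders, not values from the script:

# Hypothetical usage of the unused helpers (placeholder URLs):
# soup = get_info('https://movie.douban.com/subject/1292052/')  # any movie page
# elements, as_string = transport(soup, 'comment')              # nodes with class="comment"
# getImg('https://img.example.com/poster.jpg', 'poster_demo')   # saves ./imgs/poster_demo.jpg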
## Another scraper, for the Douban login page, could be written here, calling the methods above.
'''
TODO: fetch Douban movie IDs
'''
if __name__ == '__main__':
    print("start")

    # Candidate ID sources: the search_subjects JSON endpoint, one URL per tag.
    # Hot titles
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=1300&page_start=0'
    # Domestic (Chinese) series
    url_guochan = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&page_limit=150&page_start=0'
    # Douban high-score titles
    url_douban = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&page_limit=300&page_start=0'
    # American TV series
    url_meiju = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BE%8E%E5%89%A7&page_limit=300&page_start=0'

    # The endpoint returns JSON, so parse the response body directly.
    response_data = json.loads(askUrl(url_meiju))

    # Comment buckets; also defined here so the except branch at the end
    # can flush whatever was collected before a crash.
    comment_high = []
    comment_middle = []
    comment_low = []
    try:
        for index, k in enumerate(response_data['subjects']):
            # Resume helper: uncomment to skip movies already scraped.
            # if index <= 1000:
            #     print(index)
            #     continue
            # Reset the buckets for each movie.
            comment_high = []
            comment_middle = []
            comment_low = []
            print(index)
            if index % 2 == 0:
                time.sleep(5)  # throttle: pause on every other movie
            movie_id = k['id']
            # percent_type selects the rating band: h(igh), m(iddle), l(ow).
            highUrl = "https://movie.douban.com/subject/%s/comments?percent_type=h&limit=20&status=P&sort=new_score" % movie_id
            middleUrl = "https://movie.douban.com/subject/%s/comments?percent_type=m&limit=20&status=P&sort=new_score" % movie_id
            lowUrl = "https://movie.douban.com/subject/%s/comments?percent_type=l&limit=20&status=P&sort=new_score" % movie_id
            print(highUrl)

            # High-rating comments: walk 10 pages of 20 comments each.
            for i in range(0, 10):
                time.sleep(2)
                urlTmp = highUrl + "&start=" + str(i * 20)
                page = requests.get(url=urlTmp, headers=HEADERS).text
                # Build an XPath-capable tree; lxml repairs broken HTML as it parses.
                html = etree.HTML(page)
                comment = html.xpath('//div[@class="comment"]')
                print("high-rating comments")
                for content in comment:
                    names = content.xpath('.//a[@class=""]')
                    grades = content.xpath('.//span[contains(@class,"rating")]')
                    texts = content.xpath('.//span[@class="short"]')
                    name = names[0].xpath('./text()')[0]
                    if len(grades) > 0:
                        # class is e.g. "allstar40 rating"; character 7 is the star count
                        grade = grades[0].xpath('./@class')[0][7:8] + ' stars'
                    else:
                        grade = 'no rating'
                    text = texts[0].xpath('./text()')[0]
                    comment_high.append(text)
                    print(text)
            print(len(comment_high))
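            # The two blocks below repeat the same page walk for the middle
            # and low rating bands; only percent_type and the target list
            # change. A deduplicated helper is sketched after the listing.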
            # Middle-rating comments: same walk with percent_type=m.
            for i in range(0, 10):
                time.sleep(2)
                urlTmp = middleUrl + "&start=" + str(i * 20)
                page = requests.get(url=urlTmp, headers=HEADERS).text
                html = etree.HTML(page)
                print("middle-rating comments")
                comment = html.xpath('//div[@class="comment"]')
                for content in comment:
                    names = content.xpath('.//a[@class=""]')
                    grades = content.xpath('.//span[contains(@class,"rating")]')
                    texts = content.xpath('.//span[@class="short"]')
                    name = names[0].xpath('./text()')[0]
                    if len(grades) > 0:
                        grade = grades[0].xpath('./@class')[0][7:8] + ' stars'
                    else:
                        grade = 'no rating'
                    text = texts[0].xpath('./text()')[0]
                    print(text)
                    comment_middle.append(text)
            print(len(comment_middle))

            # Low-rating comments: same walk with percent_type=l.
            for i in range(0, 10):
                time.sleep(2)
                urlTmp = lowUrl + "&start=" + str(i * 20)
                page = requests.get(url=urlTmp, headers=HEADERS).text
                html = etree.HTML(page)
                comment = html.xpath('//div[@class="comment"]')
                print("low-rating comments")
                for content in comment:
                    names = content.xpath('.//a[@class=""]')
                    grades = content.xpath('.//span[contains(@class,"rating")]')
                    texts = content.xpath('.//span[@class="short"]')
                    name = names[0].xpath('./text()')[0]
                    if len(grades) > 0:
                        grade = grades[0].xpath('./@class')[0][7:8] + ' stars'
                    else:
                        grade = 'no rating'
                    text = texts[0].xpath('./text()')[0]
                    comment_low.append(text)
                    print(text)
            print(len(comment_low))

            # Create the output folder if it does not exist yet.
            save_path = './douban'
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            print("writing files")
            with open('./douban/comments_high.txt', 'a+', encoding='utf-8') as f:
                for v in comment_high:
                    print(v)
                    f.write('%s high\n' % v)
            with open('./douban/comments_middle.txt', 'a+', encoding='utf-8') as f:
                for v in comment_middle:
                    print(v)
                    f.write('%s middle\n' % v)
            with open('./douban/comments_low.txt', 'a+', encoding='utf-8') as f:
                for v in comment_low:
                    print(v)
                    f.write('%s low\n' % v)
    except Exception:
        # On any failure (network error, layout change, ban), flush whatever
        # was collected for the current movie before exiting.
        os.makedirs('./douban', exist_ok=True)
        with open('./douban/comments_high.txt', 'a+', encoding='utf-8') as f:
            for v in comment_high:
                print(v)
                f.write('%s high\n' % v)
        with open('./douban/comments_middle.txt', 'a+', encoding='utf-8') as f:
            for v in comment_middle:
                print("writing file")
                f.write('%s middle\n' % v)
        with open('./douban/comments_low.txt', 'a+', encoding='utf-8') as f:
            for v in comment_low:
                print("writing file")
                f.write('%s low\n' % v)
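The three rating-band loops in the main block are identical except for percent_type and the list they append to. Below is one possible consolidation, not the author's code: a sketch that reuses the HEADERS constant, imports, and XPath layout assumptions from the listing above, and drops the unused name/grade extraction to keep only the comment text:

def scrape_comments(movie_id, percent_type, pages=10):
    # Collect short-comment texts for one rating band: 'h', 'm', or 'l'.
    base = ("https://movie.douban.com/subject/%s/comments"
            "?percent_type=%s&limit=20&status=P&sort=new_score") % (movie_id, percent_type)
    collected = []
    for i in range(pages):
        time.sleep(2)  # keep the same throttling as the original loops
        resp = requests.get(base + "&start=" + str(i * 20), headers=HEADERS)
        tree = etree.HTML(resp.text) if resp.text else None
        if tree is None:
            break  # empty or unparseable page: stop this band early
        for content in tree.xpath('//div[@class="comment"]'):
            texts = content.xpath('.//span[@class="short"]/text()')
            if texts:
                collected.append(texts[0])
    return collected

Inside the movie loop, the three bands would then collapse to:

comment_high = scrape_comments(movie_id, 'h')
comment_middle = scrape_comments(movie_id, 'm')
comment_low = scrape_comments(movie_id, 'l')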