爬取猫眼电影榜单TOP100榜-以命令行输出
一、使用正则表达式匹配
from urllib import request
import re
import time
import random

from useragents import ua_list


class MaoyanSpider(object):
    """Fetch Maoyan board pages and print each film found by a regular expression.

    Every matched film is printed to stdout as a dict with the keys
    ``name``, ``star`` and ``time``; ``self.num`` counts the printed films.
    """

    # Compiled once at class creation instead of on every parsed page.
    # re.S lets ``.`` cross newlines, because one film entry spans many lines.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        # URL template; ``{}`` is replaced by the paging offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Number of films printed so far.
        self.num = 0

    def get_html(self, url):
        """Download *url* with a random User-Agent and hand the HTML to the parser.

        Network errors raised by ``urlopen`` are not handled here and
        propagate to the caller.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if decoding fails.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (title, star, release time) tuples from *html* and print them."""
        # film_list looks like [(title, star text, release-time text), ...]
        film_list = self._FILM_PATTERN.findall(html)
        self.write_html(film_list)

    def write_html(self, film_list):
        """Print one dict per film tuple in *film_list* and update the counter."""
        for film in film_list:
            # A fresh dict per film, so no record shares state with the next.
            film_dict = {
                'name': film[0].strip(),
                'star': film[1].strip(),
                # Characters 5..14 of the stripped release-time text
                # (presumably skips a label prefix and keeps the date).
                'time': film[2].strip()[5:15],
            }
            print(film_dict)
            self.num += 1

    def main(self):
        """Crawl the board pages in sequence and print the total count."""
        # Offsets 0, 10, 20 and 30, i.e. four requests.
        for offset in range(0, 31, 10):
            self.get_html(self.url.format(offset))
            # Random 1-2 second pause between requests.
            time.sleep(random.randint(1, 2))
        print('共抓取数据:', self.num)


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
二、使用xpath匹配
一、xpath表达式
1、基准xpath: 匹配所有电影信息的节点对象列表
//dl[@class="board-wrapper"]/dd
2、遍历对象列表,依次获取每个电影信息
for dd in dd_list:
    电影名称 :.//p[@class="name"]/a/@title
    电影主演 :.//p[@class="star"]/text()
    上映时间 :.//p[@class="releasetime"]/text()
二、代码实现
import random
import time

import requests
from lxml import etree

from useragents import ua_list


class MaoyanSpider(object):
    """Fetch Maoyan board pages and print each film extracted with XPath.

    Every film is printed to stdout as a dict with the keys ``name``,
    ``star`` and ``time``; ``self.num`` counts the printed films.
    """

    def __init__(self):
        # URL template; ``{}`` is replaced by the paging offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Number of films printed so far.
        self.num = 0
        # Attempt counter for the URL currently being fetched (1-based).
        self.blag = 1

    def get_html(self, url):
        """Download *url* (at most three attempts in total) and parse the page.

        Only request failures trigger a retry. Parsing runs outside the
        ``try`` so a parsing error is not mistaken for a network error and
        cannot cause already-printed films to be fetched and printed again.
        When ``self.blag`` exceeds 3 the URL is skipped silently.
        """
        while self.blag <= 3:
            # A new random User-Agent is chosen for every attempt.
            headers = {'User-Agent': random.choice(ua_list)}
            try:
                res = requests.get(url=url, headers=headers, timeout=3)
            except requests.RequestException:
                print('Retry')
                self.blag += 1
                continue
            res.encoding = 'utf-8'
            self.parse_html(res.text)
            return

    @staticmethod
    def _first_text(node, expr):
        """Return the stripped first result of XPath *expr* on *node*, or None."""
        values = node.xpath(expr)
        return values[0].strip() if values else None

    def parse_html(self, html):
        """Print one dict per ``<dd>`` film node in *html* and update the counter.

        Prints ``No dd_list`` when the page holds no film nodes.
        """
        # Skip the parser for blank input; a blank page is reported as
        # "no film nodes" like any other page without them.
        tree = etree.HTML(html) if html.strip() else None
        # Base XPath first: one <dd> node per film.
        dd_list = []
        if tree is not None:
            dd_list = tree.xpath('//dl[@class="board-wrapper"]/dd')
        if not dd_list:
            print('No dd_list')
            return

        for dd in dd_list:
            # A fresh dict per film; a missing field becomes None.
            item = {
                'name': self._first_text(dd, './/p[@class="name"]/a/@title'),
                'star': self._first_text(dd, './/p[@class="star"]/text()'),
                'time': self._first_text(dd, './/p[@class="releasetime"]/text()'),
            }
            print(item)
            # Count the film so the final total printed by main() is correct.
            self.num += 1

    def main(self):
        """Crawl the board pages in sequence and print the total count."""
        # Offsets 0, 10, 20 and 30, i.e. four requests.
        for offset in range(0, 31, 10):
            # Every URL starts with a fresh attempt counter.
            self.blag = 1
            self.get_html(self.url.format(offset))
            # Random 1-2 second pause between requests.
            time.sleep(random.randint(1, 2))
        print('共抓取数据:', self.num)


if __name__ == '__main__':
    start = time.time()
    spider = MaoyanSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))