爬虫例子:抓取电影信息
"""Crawler example: scrape film information from a Top-250 listing page.

The script downloads one listing page, extracts a record per film (detail
URL, title, poster URL, tagline), fetches each poster image into memory and
hands the record to a background thread that uploads it to a local HTTP
service.

Three interchangeable parsers are provided (lxml/XPath, BeautifulSoup and
PyQuery); they all return the same list-of-dicts shape.
"""

import re
from copy import copy, deepcopy
from os import remove
from queue import Queue
from random import randint
from threading import Thread
from time import sleep

import requests
from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
from lxml import etree
from pyquery import PyQuery

# Seconds to wait for any single HTTP request before giving up; without a
# timeout a stalled server would block the crawler forever.
REQUEST_TIMEOUT = 10

# Host that poster images are re-requested from.
IMAGE_MIRROR_HOST = 'img1.abcdbio.com'

# Endpoint that receives the scraped film records.
UPLOAD_URL = 'http://localhost:8069/abcdb/film'

# Queue item that tells the PostToServer thread to finish.
_STOP = None

session = requests.session()
headers = {
    'User-Agent': UserAgent().random
}


def get_html(url):
    """Fetch *url* and return the response body as text.

    A random 1-3 second pause precedes every request to throttle the crawl.

    Returns:
        The page text when the server answers 200, otherwise ``None``.
    """
    sleep(randint(1, 3))
    response = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    return None


def save_image(filename, url, data):
    """Download the image at *url* into ``data['file_content']``.

    The image is kept in memory only; nothing is written to disk. The name
    the image would be stored under is printed for progress tracking.

    Args:
        filename: Fallback name used when none can be derived from *url*.
        url: Image URL. When it carries no query string after the file
            name, a thumbnail query (160x220) is appended.
        data: Film record (dict) that receives the raw image bytes under
            the ``'file_content'`` key when the download succeeds.
    """
    # Prefer the last path component in front of a query string, then the
    # last path component of a query-less URL.
    match = re.search(r'/([^/]+\.[^/]+)\?', url)
    if match is None:
        match = re.search(r'/([^/]+\.[^/]+)$', url)
        url = url + '?imageView2/1/w/160/h/220'
    f1 = match.group(1) if match else filename

    # Point https://<host>.com at the image mirror. Only the host at the
    # start of the URL is rewritten; URLs of any other shape are left alone
    # instead of raising IndexError.
    url = re.sub(
        r'^https://[^/]+\.com',
        'https://' + IMAGE_MIRROR_HOST,
        url,
        count=1,
    )

    r = session.get(url, stream=True, headers=headers,
                    timeout=REQUEST_TIMEOUT)
    try:
        if r.status_code == 200:
            # bytes are immutable, so no defensive copy is needed.
            data['file_content'] = r.content
    finally:
        # Always release the streamed connection, even if reading failed.
        r.close()

    print(f1)


def _film_record(detail_url, film_name, film_theme, film_info):
    """Build the record dict shared by all three parsers."""
    return {
        'detail_url': detail_url,
        'film_name': film_name,
        'film_theme': film_theme,
        'film_info': film_info,
        'file_content': None,
    }


def parse_film_bs4(html):
    """Parse the listing page with BeautifulSoup.

    Returns:
        A list of film records (see ``_film_record``); empty when the page
        has no ``ol.grid_view`` list.
    """
    bsoup = BeautifulSoup(html, 'lxml')
    # select() takes a CSS selector; the class must be part of the selector
    # rather than a ``class_`` keyword (that keyword belongs to find_all).
    grid = bsoup.select_one('ol.grid_view')
    if grid is None:
        return []

    result_list = []
    for film in grid.find_all('li', recursive=False):
        detail_url = film.select('div > a')[0].attrs['href']
        image = film.select('div > a > img')[0]
        # The tagline is the <span> inside the second <p> of div.bd; some
        # entries do not have one.
        paragraphs = film.select('div.bd p')
        span = paragraphs[1].span if len(paragraphs) > 1 else None
        film_info = span.get_text() if span is not None else ''
        result_list.append(_film_record(
            detail_url, image.attrs['alt'], image.attrs['src'], film_info))
    return result_list


def parse_film_pyquery(html):
    """Parse the listing page with PyQuery.

    Returns:
        A list of film records (see ``_film_record``).
    """
    doc = PyQuery(html)
    result_list = []
    # .items() yields each child as a PyQuery object.
    for film in doc('ol')('.grid_view').children().items():
        link = film('div').eq(0)('a').eq(0)
        image = link('img').eq(0)
        film_info = film.find('div.bd p').eq(1).text() or ''
        result_list.append(_film_record(
            link.attr('href'), image.attr('alt'), image.attr('src'),
            film_info))
    return result_list


def parse_film_xpath(html):
    """Parse the listing page with lxml and XPath.

    Returns:
        A list of film records (see ``_film_record``).
    """
    e = etree.HTML(html)
    result_list = []
    for film in e.xpath(r"//ol[@class='grid_view']/li"):
        detail_url = film.xpath('div//a')[0].attrib['href']
        image = film.xpath('div//a/img')[0]
        # Entries without a tagline yield no <span>; fall back to ''.
        spans = film.xpath("div//div[@class='bd']/p[2]/span")
        film_info = spans[0].xpath('string(.)') if spans else ''
        result_list.append(_film_record(
            detail_url, image.attrib['alt'], image.attrib['src'], film_info))
    return result_list


def save_film(data):
    """POST one film record, with its poster as a file part, to UPLOAD_URL.

    Args:
        data: Film record as produced by the ``parse_film_*`` functions,
            with ``'file_content'`` holding the image bytes or ``None``.
    """
    data1 = {
        'name': data['film_name'],
        'detail_url': data['detail_url'],
        'info': data['film_info'],
    }
    files = {
        'theme': data['file_content']
    }

    r = requests.post(UPLOAD_URL, data=data1, files=files,
                      timeout=REQUEST_TIMEOUT)
    r.close()


class PostToServer(Thread):
    """Worker thread that uploads film records taken from a queue.

    The thread runs until it receives ``None`` from the queue, so the
    producer must enqueue ``None`` once it has no more records; otherwise
    ``join()`` never returns.
    """

    def __init__(self, data_queue):
        super().__init__()
        self.data_queue = data_queue

    def run(self):
        while True:
            # Blocking get() replaces the empty()/sleep polling loop.
            s = self.data_queue.get()
            if s is _STOP:
                break
            print(s)
            try:
                save_film(s)
            except requests.RequestException as exc:
                # One failed upload must not stop the remaining ones.
                print('upload failed:', exc)


def main():
    """Scrape the first listing page and upload its first five films."""
    data_queue = Queue()
    th1 = PostToServer(data_queue)
    th1.start()

    try:
        url = 'https://movie.abcdb.com/top250?start=0&filter='
        html = get_html(url)
        if html is None:
            print('failed to fetch', url)
            return
        # parse_film_xpath and parse_film_bs4 are drop-in alternatives.
        data = parse_film_pyquery(html)
        for film in data[:5]:
            save_image('theme.jpg', film['film_theme'], film)
            data_queue.put(film)
    finally:
        # Always release the worker so join() cannot hang, even on errors.
        data_queue.put(_STOP)
        th1.join()


if __name__ == '__main__':
    main()