import requests
from lxml import etree
import csv
import queue
import threading
from fake_useragent import UserAgent
ua = UserAgent().random
headers = {
'User-Agent': ua
}
q_html = queue.Queue()
q_big = queue.Queue()
s = requests.session()
s.get("https://maoyan.com/", headers=headers)
# 发送请求获取HTML
def request_get(url):
# s = requests.session()
# s.get("https://maoyan.com/", headers=headers)
# s.get("https://maoyan.com/films", headers=headers)
res = s.get(url, headers=headers)
# print(res.text)
html = etree.HTML(text=res.text)
return html
# 将HTML添加到队列
def get_video_list():
while 1:
if q_big.empty() != True:
url = q_big.get()
html = request_get(url)
video_ulr_list = html.xpath('//div[@class="movies-list"]/dl//dd')
print("video_url_list-->{}".format(video_ulr_list))
for i in video_ulr_list:
print("0.0")
url_less = i.xpath('./div[2]/a/@href')[0]
url = "https://maoyan.com" + url_less
q_html.put(url)
print("put的url:{}".format(url))
def get_video_detail():
print("==============")
while 1:
if q_html.empty() != True:
video_ulr = q_html.get()
print("get到的url:{}".format(video_ulr))
info = request_get(video_ulr)
try:
title = info.xpath('//div[@class="celeInfo-right clearfix"]/div/h1/text()')[0]
pub_time = info.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')[0]
star = info.xpath('//*[@id="app"]/div/div[1]/div/div[3]/div[1]/div[2]/div[2]/div/div[1]/ul/li/div/a/text()')[0]
star = star.strip()
star = star.replace(",", "")
print(title, pub_time, star)
except IndexError:
pass
fp = open('maoyan.csv', 'a', encoding='utf-8') # newline 换行符
writer = csv.writer(fp)
writer.writerow([title, pub_time, star])
fp.close()
def main():
fp = open('maoyan.csv', 'a', encoding='utf-8') # newline 换行符
writer = csv.writer(fp)
writer.writerow(['title', 'pub_time', 'star'])
fp.close()
# 构造url请求
urls = ['https://maoyan.com/films?showType=3&offset={}'.format(str(i)) for i in range(0, 500, 30)]
print(urls)
t_list = []
for url in urls:
q_big.put(url)
t1 = threading.Thread(target=get_video_list) # 获取电影的url列表
t_list.append(t1)
t2 = threading.Thread(target=get_video_detail) # 获取url详情页
t_list.append(t2)
for i in t_list:
i.start()
for i in t_list:
i.join()
if __name__ == '__main__':
main()