。。

抓取猫眼电影排行

 

提取猫眼电影TOP100的电影名称、时间、评分、图片等信息,提取的站点URL为 http://maoyan.com/board/4 ,提取的结果会以文件形式保存下来。

正则:

 1 from multiprocessing import Pool
 2 import json
 3 import requests
 4 from requests.exceptions import RequestException
 5 import re
 6 
 7 def get_one_page(url):
 8     headers = {
 9         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
10     }
11     try:
12         response = requests.get(url, headers=headers)
13         if response.status_code == 200:
14             return response.text
15         return None
16     except RequestException:
17         return None
18 
def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (e.g.
            after a failed download) yields nothing instead of raising.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; every value is a string.
    """
    if not html:
        return
    # Raw strings keep ``\d`` a regex escape rather than an (invalid)
    # string escape; re.S lets ``.`` span the line breaks inside a <dd>.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drop the 3-character label in front of the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label in front of the release date.
            'time': release.strip()[5:],
            # The score is split into integer and fraction elements.
            'score': integer + fraction,
        }
33 
def write_to_file(content):
    """Append *content* to ``result.txt`` as a single JSON line.

    Args:
        content: JSON-serialisable object (the scraper passes one movie dict).

    The file is opened in append mode with UTF-8 encoding, and
    ``ensure_ascii=False`` keeps non-ASCII text readable instead of
    ``\\uXXXX`` escapes.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
38 
def main(offset):
    """Scrape one board page and persist every movie found on it.

    Args:
        offset: Value of the ``offset`` query parameter selecting the page.

    Each parsed record is printed and appended to the result file. When
    the download fails the page is skipped instead of crashing the parser.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on any failure; nothing to parse.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
45 
if __name__ == '__main__':
    # Scrape the ten pages (offsets 0, 10, ..., 90) exactly once.  The
    # original ran a serial loop AND the pool, fetching every page twice
    # and appending duplicate records to the result file.
    # Worker processes finish in arbitrary order, so records in the file
    # are not guaranteed to be sorted by rank.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])

 

 1 #from multiprocessing import Pool
 2 import json
 3 import requests
 4 from requests.exceptions import RequestException
 5 #import re
 6 from lxml import etree
 7 from urllib import parse
 8 
 9 def get_one_page(url):
10     headers = {
11         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
12     }
13     try:
14         response = requests.get(url, headers=headers)
15         if response.status_code == 200:
16             return response.text
17         return None
18     except RequestException:
19         return None
20 
def _first_text(node, path):
    """Return the first result of XPath *path* under *node*, stripped, or ''."""
    found = node.xpath(path)
    return found[0].strip() if found else ''


def parse_one_page(html):
    """Extract every movie entry from a board page and append it to the file.

    Args:
        html: Page source as a string. ``None`` or an empty string (e.g.
            after a failed download) is ignored.

    Each ``<dd>`` under the ``board-wrapper`` list becomes one dict with the
    keys ``index``, ``image``, ``title``, ``actor``, ``time`` and ``score``,
    which is handed to ``write_to_file``. Fields missing from the markup
    are stored as empty strings.
    """
    if not html:
        return
    tree = etree.HTML(html)
    # One <dd> per movie; all lookups below are relative to that entry
    # (the original queried the document root and never matched).
    for node in tree.xpath('//dl[@class="board-wrapper"]/dd'):
        record = {
            # The rank lives in an <i> whose class starts with "board-index".
            'index': _first_text(node, './/i[contains(@class, "board-index")]/text()'),
            # The poster URL is carried by data-src, not src.
            'image': _first_text(node, './/img[@data-src]/@data-src'),
            'title': _first_text(node, './/p[@class="name"]/a/text()'),
            # Drop the 3-character label in front of the cast list.
            'actor': _first_text(node, './/p[@class="star"]/text()')[3:],
            # Drop the 5-character label in front of the release date.
            'time': _first_text(node, './/p[@class="releasetime"]/text()')[5:],
            # The score is split into integer and fraction <i> elements.
            'score': _first_text(node, './/i[@class="integer"]/text()')
                     + _first_text(node, './/i[@class="fraction"]/text()'),
        }
        write_to_file(record)
41 '''
42 def parse_one_page(html):
43     pattern = re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
44                          + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
45                          + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
46     items = re.findall(pattern,html)
47     for item in items:
48         yield {
49             'index':item[0],
50             'image':item[1],
51             'title':item[2],
52             'actor':item[3].strip()[3:],
53             'time': item[4].strip()[5:],
54             'score': item[5] + item[6]
55         }
56 '''
def write_to_file(content):
    """Append *content* to ``result.txt`` as a single JSON line.

    Args:
        content: JSON-serialisable object (the scraper passes one movie dict).

    The file is opened in append mode with UTF-8 encoding, and
    ``ensure_ascii=False`` keeps non-ASCII text readable instead of
    ``\\uXXXX`` escapes.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
61 
def main(offset):
    """Scrape one board page and hand it to the parser.

    Args:
        offset: Value of the ``offset`` query parameter selecting the page.

    ``parse_one_page`` is called for its side effect only; its return
    value is not used. When the download fails the page is skipped
    instead of passing ``None`` to the parser.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on any failure; nothing to parse.
        return
    parse_one_page(html)
69 
if __name__ == '__main__':
    # Walk the ten board pages one after another: offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)

 

posted @ 2019-03-08 20:32  王琳杰  阅读(189)  评论(0)  编辑  收藏  举报