1 import requests
2 import re
3 import json
4 from requests.exceptions import RequestException
5 from multiprocessing import Pool
6
def get_one_page(url, timeout=10):
    """Download one listing page and return its body as text.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the server (new, default 10) so a
            stalled connection cannot hang a pool worker forever.

    Returns:
        The page text decoded as UTF-8 on HTTP 200, otherwise None
        (non-200 status or any requests-level network error).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Force UTF-8 decoding rather than trusting the server's charset.
        r.encoding = 'utf-8'
        if r.status_code == 200:
            return r.text
        return None
    except RequestException:
        # DNS failure, connection reset, timeout, ... -> treat as a miss.
        return None
18
def parse_one_page(html):
    """Yield one {'title', 'content'} dict per article found in *html*.

    Args:
        html: Page source as a string, or None/empty when the download
            failed -- such input simply yields nothing instead of letting
            re.findall raise TypeError on None.

    Yields:
        dict with 'title' (anchor text) and 'content' (teaser text with
        ideographic spaces U+3000 removed).
    """
    if not html:
        return
    # Raw string so the pattern is passed to re untouched; re.S lets the
    # non-greedy .*? spans cross newlines in the markup.
    pattern = re.compile(r'<h3>.*?">(.*?)</a>.*?">(.*?)</div>', re.S)
    for title, content in pattern.findall(html):
        yield {
            'title': title,
            'content': content.replace(u'\u3000', u''),
        }
28
def write(content, path='1.txt'):
    """Append *content* as one JSON line to *path*.

    Args:
        content: Any JSON-serializable object (here a title/content dict).
        path: Output file (new parameter; default '1.txt' preserves the
            previous hard-coded behavior).
    """
    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(p):
    """Scrape listing page *p*: download it, then print and persist each item.

    Args:
        p: Page index interpolated into the article index URL.
    """
    url = "http://www.neihan8.com/article/index_" + str(p) + ".html"
    html = get_one_page(url)
    # get_one_page returns None on any HTTP/network failure; skip the page
    # instead of crashing this pool worker with a TypeError in re.findall.
    if html is None:
        return
    for item in parse_one_page(html):
        print(item)
        write(item)
38
if __name__ == '__main__':
    # Scrape pages 2..49 in parallel, one page per worker task.
    # The context manager terminates the pool on exit (the original never
    # called close()/join(), leaking child processes); pool.map accepts
    # any iterable, so the throwaway list comprehension is unnecessary.
    with Pool() as pool:
        pool.map(main, range(2, 50))