爬虫抓取,保存为 JSON。

1. 爬虫抓取,保存为 JSON。

 1 # -*- coding:utf-8 -*-
 2 
 3 import json
 4 import requests
 5 
 6 from lxml import etree
 7 import codecs
 8 
 9 
10 siteurl = "http://news.163.com/special/shijiuda_roll/"
11 user_agent="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
12 headers = {"User-Agent":user_agent}
13 
14 url = siteurl 
15 html = requests.get(url,headers = headers)
16 
17 selector= etree.HTML(html.text)
18         
19 items_time = selector.xpath('//body//div//ul[@class="list_f14d"]//li/span/text()')
20 items_url = selector.xpath('//body//div//ul[@class="list_f14d"]//li/a/@href')
21 items_title =selector.xpath('//body//div//ul[@class="list_f14d"]//li/a/text()')
22 items = [[a,b,c] for a,b,c in zip(items_time,items_url,items_title)]
23 
24 item_dict= {}
25 item_list = []
26 items_url_old =[]
27 
28 
29 for item in items:
30     if item[1] is None :
31           pass
32     if item[1] is not None and item[1] not in items_url_old:
33         new_html= requests.get(item[1],headers = headers)
34         selector1=etree.HTML(new_html.text)
35         text = selector1.xpath('//body//div[@class ="post_text"]//p/text()')
36         item_dict['time']=item[0]
37         item_dict['url']=item[1]
38         item_dict['title']=item[2]
39         item_dict['text']=text
40         items_url_old.append(item[1])
41         item_list.append(item_dict)
42         item_list = {}
43 file = codecs.open('pc2.json','w',encoding = 'utf-8')
44 
45 for item in item_list:
46     line = json.dumps(dict(item),ensure_ascii = False) +"\n"
47     file.write(line)
48 
49 file.close()

 

posted @ 2017-10-21 21:43  dang幸福来敲门  阅读(164)  评论(0编辑  收藏  举报