爬虫抓取,保存为json。
# -*- coding:utf-8 -*-
"""Scrape the NetEase rolling-news list page and save each article as one
JSON object per line in pc2.json (time, url, title, paragraph texts)."""

import json
import codecs

import requests
from lxml import etree

# Index page listing the articles; desktop UA so the site serves full HTML.
siteurl = "http://news.163.com/special/shijiuda_roll/"
user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
headers = {"User-Agent": user_agent}

html = requests.get(siteurl, headers=headers)
selector = etree.HTML(html.text)

# Each <li> in the list holds a timestamp <span> plus the article link/title <a>.
items_time = selector.xpath('//body//div//ul[@class="list_f14d"]//li/span/text()')
items_url = selector.xpath('//body//div//ul[@class="list_f14d"]//li/a/@href')
items_title = selector.xpath('//body//div//ul[@class="list_f14d"]//li/a/text()')
items = list(zip(items_time, items_url, items_title))

item_list = []
seen_urls = set()  # set for O(1) duplicate-URL checks (original used a list)

for time_str, url, title in items:
    # Skip missing/empty hrefs and URLs already fetched.
    if not url or url in seen_urls:
        continue
    article_html = requests.get(url, headers=headers)
    article_selector = etree.HTML(article_html.text)
    # Article body paragraphs.
    text = article_selector.xpath('//body//div[@class ="post_text"]//p/text()')
    # BUG FIX: the original mutated one shared dict and appended it every
    # iteration (all entries aliased the last article), and also reset
    # item_list to {} inside the loop, wiping the accumulated results.
    # Build a fresh dict per article instead.
    item_list.append({
        'time': time_str,
        'url': url,
        'title': title,
        'text': text,
    })
    seen_urls.add(url)

# One JSON object per line; ensure_ascii=False keeps Chinese text readable.
with codecs.open('pc2.json', 'w', encoding='utf-8') as file:
    for item in item_list:
        file.write(json.dumps(item, ensure_ascii=False) + "\n")