巴比特网站爬取
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape thread titles and links from the 8btc.com forum and save them as JSON."""
import json

import requests
from lxml import etree


class BtcSpider(object):
    """Crawl numbered list pages of 8btc.com forum 61 and collect thread links.

    Each collected entry is a dict with the keys ``'name'`` (thread title)
    and ``'url'`` (the anchor's ``href``). Entries accumulate in
    ``self.data_list`` across pages and are written out by ``save_data``.
    """

    def __init__(self, timeout=10):
        """Set up the URL prefix, request headers and the result list.

        Args:
            timeout: Seconds to wait for the server before ``requests``
                gives up. Without a timeout a stalled connection would
                block the crawl indefinitely.
        """
        # A page URL is built as base_url + <page number> + '.html'.
        self.base_url = 'https://8btc.com/forum-61-'
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6823.400 QQBrowser/10.3.3117.400'}
        self.timeout = timeout
        self.data_list = []

    def get_response(self, url):
        """Download ``url`` and return the page body as decoded text.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            The response body decoded as GBK.

        Raises:
            requests.exceptions.RequestException: On connection problems or
                when the timeout expires.
            UnicodeDecodeError: If the body is not valid GBK.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Per the original author's note, the page's <head> meta charset
        # declares GBK, so the raw bytes are decoded explicitly.
        return response.content.decode('gbk')

    def parse_data(self, data):
        """Extract thread titles and links from one page of HTML.

        Title and link are read from the same ``<a class="s xst">`` element,
        so they can never be paired with the wrong neighbour. Anchors that
        have no direct text or no ``href`` are skipped.

        For partial attribute matches XPath also offers ``contains()``,
        e.g. ``//div[contains(@id, "...")]``.

        Args:
            data: HTML source of a forum list page.

        Returns:
            None. Results are appended to ``self.data_list``.
        """
        # Convert the HTML text into an element tree that supports XPath.
        tree = etree.HTML(data)
        for anchor in tree.xpath('//a[@class="s xst"]'):
            # Join the anchor's direct text nodes into one title string.
            title = ''.join(anchor.xpath('text()'))
            href = anchor.get('href')
            if not title or href is None:
                continue
            self.data_list.append({'name': title, 'url': href})

    def save_data(self):
        """Write everything collected so far to ``05btc.json``.

        The file is overwritten on every call. ``json.dump`` escapes
        non-ASCII characters by default, so the output is plain ASCII and
        does not depend on the platform's default file encoding.
        """
        with open('05btc.json', 'w') as f:
            json.dump(self.data_list, f)

    def run(self, first_page=1, last_page=9):
        """Fetch and parse a range of list pages, then save the results.

        Args:
            first_page: Number of the first page to crawl.
            last_page: Number of the last page to crawl (inclusive).
        """
        for page in range(first_page, last_page + 1):
            # Build the full page URL, e.g. .../forum-61-3.html
            url = self.base_url + str(page) + '.html'
            data = self.get_response(url)
            self.parse_data(data)
        self.save_data()


if __name__ == '__main__':
    BtcSpider().run()