Trying to crawl the Weibo hot search list with Scrapy (first draft)
First, the item fields I want:
import scrapy


class WeiboItem(scrapy.Item):
    rank = scrapy.Field()           # position on the hot search list
    title = scrapy.Field()          # hot search title
    hot_totle = scrapy.Field()      # heat value
    tag_pic = scrapy.Field()        # tag shown in the third column, if any
    watch = scrapy.Field()          # read count on the search page
    talk = scrapy.Field()           # discussion count on the search page
    weibo_detail = scrapy.Field()   # post texts from the search page
    bozhu = scrapy.Field()          # poster names
    biaoqian = scrapy.Field()       # label (pinned / involved party / hot)
    time = scrapy.Field()           # time the record was scraped
The main spider. Remember the Referer!!! The search detail page checks where the request came from (a settings-level alternative for the headers is sketched after the spider code).
import scrapy
import logging
import datetime

from weibo.items import WeiboItem
# from weibo.settings import MYSQL_HOST

logger = logging.getLogger(__name__)

"""
class=icon-top marks a pinned entry; to tell whether it is a paid slot,
look at the icon-txt that follows -- icon-txt-recommend means "recommended".
"""


class WeiboResouSpider(scrapy.Spider):
    name = 'weibo-resou'                                  # spider name
    allowed_domains = ['weibo.com']                       # allowed crawl scope
    start_urls = ['http://s.weibo.com/top/summary/']      # hot search ranking page
    detail_url = 'https://s.weibo.com/weibo?q=%23{}%23&Refer=top'          # search page for one topic
    tag_url = 'https://s.weibo.com/top/summary/summary?cate=socialevent'   # tag page

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
        "Referer": "https://s.weibo.com/"
    }

    Cookie = {
        'SCF': 'XXXX',
        'SUB': 'XXXX',
        'SUBP': 'XXXX',
        'SUHB': 'XXXX',
        'ALF': 'XXXX',
        'SSOLoginState': 'XXXXX'
    }

    def parse(self, response):
        # .extract() returns a list of strings, .extract_first() returns the first one.
        # yield may return a Request, an Item, a dict or None; yielded items can be handled in the pipeline.
        # The log call is mainly there to help locate errors.
        # If image links come back without the site domain, join them after extracting (list comprehension).
        # For recursive crawling, yield scrapy.Request with an updated url and this parse as the callback.
        # Settings are available via self.settings['MYSQL_HOST'] or self.settings.get('MYSQL_HOST').
        logging.warning('this is spider warning')
        tr_list = response.xpath('//*[@id="pl_top_realtimehot"]/table//tr')
        for tr in tr_list[1:]:
            item = WeiboItem()
            # rank
            if tr.xpath('./td[1]/text()').extract_first() is None:
                item['rank'] = '置顶向'
            else:
                item['rank'] = tr.xpath('./td[1]/text()').extract_first()
            # hot search title
            if tr.xpath('./td[2]/a//text()').extract_first() is None:
                item['title'] = '找不到热搜哦'
            else:
                item['title'] = tr.xpath('./td[2]/a//text()').extract_first()
            # heat value, if the entry has one
            if tr.xpath('./td[2]/span/text()').extract_first() is None:
                item['hot_totle'] = '找不到热度哟'
            else:
                item['hot_totle'] = tr.xpath('./td[2]/span/text()').extract_first()
            # hot search tag
            if not tr.xpath('./td[3]//i/text()').extract_first():
                item['tag_pic'] = '没有热度标签'
            else:
                item['tag_pic'] = tr.xpath('./td[3]//i/text()').extract_first()
            title_search = item['title']
            yield scrapy.Request(
                url=self.detail_url.format(title_search),
                meta={'item': item},
                headers=self.headers,   # crucial: the Referer must show the visit comes from weibo!
                cookies=self.Cookie,
                callback=self.search_detail
            )

    def search_detail(self, response):
        item = response.meta['item']

        if response.xpath('//div[@class="total"]/span[1]/text()').extract_first() is None:
            item['watch'] = '阅读量爆炸'
        else:
            item['watch'] = response.xpath('//div[@class="total"]/span[1]/text()').extract_first()
        if response.xpath('//div[@class="total"]/span[2]/text()').extract_first() is None:
            item['talk'] = '讨论量爆炸'
        else:
            item['talk'] = response.xpath('//div[@class="total"]/span[2]/text()').extract_first()

        # Read the actual posts; the list mixes pinned, involved-party and hot entries.
        page_row = response.xpath('//div[@class="content"]')
        for detail in page_row:
            item['weibo_detail'] = detail.xpath('//p[@class="txt"]/text()').extract()
            item['bozhu'] = detail.xpath('//div[@class="content"]//a[@class="name"]/text()').extract()
            # pinned / hot label
            if detail.xpath('//div[@class="card-wrap"]//h4[@class="title"]/a/text()').extract_first() is None:
                item['biaoqian'] = '普通微博哟'
            else:
                item['biaoqian'] = detail.xpath('//div[@class="card-wrap"]//h4[@class="title"]/a/text()').extract_first()

        item['time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

        yield item
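Side note on the Referer: instead of passing headers=self.headers on every Request, the same headers could be set project-wide. This is only a sketch of that alternative, using the values from the spider above; DEFAULT_REQUEST_HEADERS and USER_AGENT are standard Scrapy settings:

# settings.py -- sketch of a project-wide alternative to per-request headers
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
DEFAULT_REQUEST_HEADERS = {
    'Referer': 'https://s.weibo.com/',   # every request then carries the Referer automatically
}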
The pipeline:
This part is still rough: the post text comes back decorated by JS, so I can't split it cleanly post by post; to be improved (a possible per-card split is sketched right after the pipeline code).
For now the data is just saved to CSV for analysis; it could also be written to a database (a database pipeline sketch follows as well).
import csv


class WeiboPipeline:
    # The spider argument tells you which spider the item came from; an if check on it
    # lets you route items from different spiders to different stores (MongoDB, MySQL, ...).
    # The method name process_item must not be changed.
    # Content can be cleaned once it arrives here, for example:
    #   content = [i.replace('替换', "") for i in content]
    #   content = [i.strip() for i in content]           # strip whitespace
    #   content = [i for i in content if len(i) > 0]     # drop empty strings
    # or, equivalently:
    #   content = [re.sub(r'\xa0|\s|\r\n', "", i) for i in content]
    #   content = [i for i in content if len(i) > 0]
    def __init__(self):
        self.file = open('resou.csv', 'w', encoding='utf-8-sig', newline='')
        self.writer = csv.writer(self.file)
        self.head = ['热搜名称', '热度', '讨论', '阅读量', '记录时间', '热搜标签', '排名']
        self.writer.writerow(self.head)

    def process_item(self, item, spider):
        # spider.settings.get('MYSQL_HOST')
        item['weibo_detail'] = self.parse_content(item['weibo_detail'])
        print(item)
        self.writer.writerow([item['title'], item['hot_totle'], item['talk'], item['watch'],
                              item['time'], item['tag_pic'], item['rank']])
        return item

    def parse_content(self, content):
        content = [i.replace('\u200b', "") for i in content]
        content = [i.replace('\n', "") for i in content]
        content = [i.replace('【', "") for i in content]
        content = [i.replace('】', "") for i in content]
        content = [i.strip() for i in content]
        content = [i for i in content if len(i) > 0]
        return content

    def close_spider(self, spider):
        self.file.close()
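About the "can't split post by post" problem mentioned above: one option would be to group the text per card already in search_detail, so each post keeps its own fragments. This is just a sketch and assumes each post on the search page sits in its own div with class card-wrap (the class name is taken from the spider's label XPath, not re-verified against the live markup):

# Sketch for search_detail(): one joined text per post instead of one flat fragment list.
cards = response.xpath('//div[@class="card-wrap"]')
posts = []
for card in cards:
    fragments = card.xpath('.//p[@class="txt"]//text()').extract()   # relative XPath: only this card's text
    text = ''.join(f.strip() for f in fragments)
    if text:
        posts.append(text)
# item['weibo_detail'] = posts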
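As for writing to a database instead of (or in addition to) CSV, a second pipeline along these lines could be registered in ITEM_PIPELINES. Purely a sketch: it uses pymysql, and the connection details, the table name resou_history and its columns are made up here, not part of the project:

import pymysql


class WeiboMysqlPipeline:
    # Sketch only: connection details, table and column names are assumptions.
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=spider.settings.get('MYSQL_HOST', 'localhost'),
            user='root', password='xxxx', database='weibo', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ('INSERT INTO resou_history '
               '(title, hot, talk, watch, record_time, tag, rank_no) '
               'VALUES (%s, %s, %s, %s, %s, %s, %s)')
        self.cursor.execute(sql, (item['title'], item['hot_totle'], item['talk'],
                                  item['watch'], item['time'], item['tag_pic'], item['rank']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()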
PS!! You could use a process to call the crawl in a loop! Hmm, one problem I noticed: the CSV header should be written outside the main call, otherwise every run writes the header row again!!
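A minimal sketch of one fix: open the CSV in append mode and only write the header when the file is new or empty, so repeated runs don't duplicate it (only __init__ changes, the rest of the pipeline stays as above):

import csv
import os


class WeiboPipeline:
    def __init__(self):
        # Write the header only once, no matter how often the crawl is re-run.
        need_header = not os.path.exists('resou.csv') or os.path.getsize('resou.csv') == 0
        self.file = open('resou.csv', 'a', encoding='utf-8-sig', newline='')
        self.writer = csv.writer(self.file)
        if need_header:
            self.writer.writerow(['热搜名称', '热度', '讨论', '阅读量', '记录时间', '热搜标签', '排名'])
    # process_item / parse_content / close_spider unchanged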
This script handles both the timed shutdown and the run-forever loop; if the crawler is to run on a server, it is best to run this file. Here the loop is capped at 300 s.
from multiprocessing import Process
from scrapy import cmdline
import time
import logging
import datetime

# Just configure these parameters: spider name and run frequency.
confs = [
    {
        "spider_name": "weibo-resou",
        "frequency": 120,
    },
]


def start_spider(spider_name, frequency):
    args = ["scrapy", "crawl", spider_name]
    start_time = time.time()

    while True:
        start = time.time()
        # Run each crawl in its own child process so the Twisted reactor starts fresh every time.
        p = Process(target=cmdline.execute, args=(args,))
        p.start()
        p.join()
        logging.debug("### use time: %s" % (time.time() - start))
        during_time = time.time() - start_time
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "正在爬取微博热搜哟")
        if during_time > 300:
            print("爬取结束哟")
            break
        else:
            time.sleep(frequency)
            print(during_time)


if __name__ == '__main__':
    for conf in confs:
        process = Process(target=start_spider, args=(conf["spider_name"], conf["frequency"]))
        process.start()

        start_time = time.time()
        during_time = time.time() - start_time
        # Note: measured right after start, so this check never fires;
        # the real 300 s cap is the one inside start_spider.
        if during_time > 300:
            process.terminate()
            break
        # time.sleep(86400)
Stop time: set it in settings:
CLOSESPIDER_TIMEOUT = <number of seconds>
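For example (sketch of the relevant settings.py line; CLOSESPIDER_TIMEOUT is a built-in setting handled by Scrapy's CloseSpider extension):

# settings.py
CLOSESPIDER_TIMEOUT = 300   # close the spider automatically after 300 seconds; 0 means no timeout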
Haha, next up is processing the data.