Using scrapy_redis to automatically and incrementally update all news on the Eastday (东方头条) site in real time
Storage is MySQL. The crawler incrementally updates, for every news item on the Eastday site: the title, the news brief, the publish time, the content of every page of the article, and all images inside the article. Project file structure (screenshot not reproduced here).
Here is the content of run.py:
```python
#coding=utf-8
from scrapy import cmdline
import redis, time
from multiprocessing import Process

# Seed URLs: the homepage plus the JSONP refresh feed for each channel.
start_urls = [
    'http://mini.eastday.com/',
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0010&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603095010319,170603093955594-2,170603093955594&jsonpcallback=jQuery18303164258797187358_1496455837718&_=1496455838146',  # domestic
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0011&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603142336718-2,170603142336718,170603122752716&jsonpcallback=jQuery18307262756496202201_1496477922458&_=1496477923254',  # international
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0005&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18302500620267819613_1496483754044&_=1496483755277',  # military
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0003&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183026658024708740413_1496480575988&_=1496480576634',  # society
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0002&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery1830691694314358756_1496480816841&_=1496480817500',  # entertainment
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0019&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18303703077440150082_1496480892188&_=1496480892581',  # health
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0015&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183023222095426172018_1496480961781&_=1496480962307',  # fashion
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0008&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183017557532875798643_1496481013410&_=1496481013824',  # technology
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0012&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18308183211348950863_1496481106550&_=1496481106993',  # auto
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0018&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18309359942991286516_1496481227742&_=1496481228242',  # culture
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0007&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183019699203735217452_1496481313637&_=1496481314077',  # games
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0020&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18307782149398699403_1496481413006&_=1496481413401',  # horoscope
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0021&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18306590236281044781_1496481467020&_=1496481467496',  # home
]

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


def check_redis_requests():
    while True:
        # Seed the start URLs into a redis set (settings.py sets
        # REDIS_START_URLS_AS_SET = True, so sadd rather than rpush).
        for url in start_urls:
            r.sadd('eastdayspider:start_urls', url)
        print 'seeded start_urls:', r.smembers('eastdayspider:start_urls')

        # Re-seed only after the scheduler queue has stayed empty for
        # 30 consecutive checks (~5 minutes). While requests remain,
        # keep sleeping and reset the counter.
        count = 0
        while count < 30:
            if r.exists('eastdayspider:requests'):
                time.sleep(60)
                count = 0
            else:
                count += 1
                time.sleep(10)


def run_spider():
    cmdline.execute("scrapy crawl eastdayspider".split())


if __name__ == '__main__':
    p1 = Process(target=check_redis_requests)
    p2 = Process(target=run_spider)

    p1.start()
    time.sleep(5)  # let the seeder run first
    p2.start()

    p1.join()
    p2.join()
```
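To sanity-check the seeding and idle-detection logic, you can inspect the same redis keys run.py uses. A quick interactive sketch with redis-py; `eastdayspider:dupefilter` is the scrapy_redis default fingerprint key, not something defined in this project:

```python
#coding=utf-8
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
print 'pending seeds:', r.smembers('eastdayspider:start_urls')  # set seeded by run.py
print 'queue present:', r.exists('eastdayspider:requests')      # scheduler queue that run.py polls
print 'seen requests:', r.scard('eastdayspider:dupefilter')     # scrapy_redis default dupefilter key
```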
Here is settings.py:
```python
# -*- coding: utf-8 -*-

# Scrapy settings for the eastday project.
# For more settings see:
#     http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'eastday'

SPIDER_MODULES = ['eastday.spiders']
NEWSPIDER_MODULE = 'eastday.spiders'

# scrapy_redis: redis-backed dupefilter and scheduler.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
REDIS_START_URLS_AS_SET = True  # store the start_urls key as a set (default False = list)
SCHEDULER_PERSIST = True        # keep queue and dupefilter across runs (incremental crawling)

DEPTH_PRIORITY = 0
RETRY_TIMES = 20

IMAGES_STORE = 'd:/'   # root directory for downloaded images
IMAGES_EXPIRES = 90

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10
DOWNLOAD_DELAY = 0

DOWNLOADER_MIDDLEWARES = {
    "eastday.middlewares.UserAgentMiddleware": 401,
    #"eastday.middlewares.CookiesMiddleware": 402,
}

ITEM_PIPELINES = {
    #'eastday.pipelines.EastdayPipeline': 300,
    'eastday.pipelines.DownloadImagesPipeline': 200,
    'eastday.pipelines.MysqlDBPipeline': 400,
    #'scrapy_redis.pipelines.RedisPipeline': 400,
}

# AutoThrottle and HTTP caching are left at their (disabled) defaults.
```
Here is pipelines.py, which also contains the table-creation SQL. The MySQL query that checks whether a url already exists is actually redundant, because URLs are already deduplicated in redis (see the leaner alternative sketched after the code).
```python
# -*- coding: utf-8 -*-

# Item pipelines: MySQL storage and image downloading.
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import time, json, pymysql, re
from items import EastdayItem
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

'''
CREATE TABLE `eastday` (
  `id` INT(10) NOT NULL AUTO_INCREMENT,
  `title` VARCHAR(255) DEFAULT NULL,
  `url` VARCHAR(80) DEFAULT NULL,
  `tag` VARCHAR(30) DEFAULT NULL,
  `brief` VARCHAR(300) DEFAULT NULL,
  `pubdate` DATETIME,
  `origin` VARCHAR(50),
  `crawled_time` DATETIME,
  `miniimg` VARCHAR(500) DEFAULT NULL,
  `img_urls` TEXT,
  `article` TEXT,
  PRIMARY KEY (`id`)
) ENGINE=INNODB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8
'''


class EastdayPipeline(object):

    def process_item(self, item, spider):
        print '----------------------------%s' % json.dumps(dict(item), ensure_ascii=False)
        item["crawled_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return item


class MysqlDBPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',
            db='test',
            charset='utf8',
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        if isinstance(item, EastdayItem):
            item["crawled_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            try:
                # Escape quotes so the values can be embedded in the SQL string.
                for key in dict(item):
                    item[key] = str(item[key]).replace("'", "\\'").replace('"', '\\"')

                sql = """insert into eastday values(NULL,"{title}","{url}","{tag}","{brief}","{pubdate}","{origin}","{crawled_time}","{miniimg}","{img_urls}","{article}")""".format(
                    title=item['title'], url=item['url'], tag=item['tag'], brief=item['brief'],
                    pubdate=item['pubdate'], origin=item['origin'], crawled_time=item['crawled_time'],
                    miniimg=item['miniimg'], img_urls=item['img_urls'], article=item['article'])
                # Redundant existence check: redis already deduplicates URLs.
                sql2 = 'select 1 from eastday where url="%s"' % item['url']

                self.cur.execute(sql2)
                if self.cur.fetchone() == (1,):
                    print 'already stored: %s' % item['url']
                else:
                    self.cur.execute(sql)
                    self.conn.commit()
                    print 'inserted'
            except Exception as e:
                print u'database error:', e
        else:
            print 'not an EastdayItem'
        return item


class DownloadImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        if item['img_urls']:
            for img_url in item['img_urls']:
                yield Request(img_url, meta={'name': img_url})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        m = request.meta
        # Keep the source file name from the image URL. (The original regex put the
        # extensions inside a character class by mistake; this alternation matches them.)
        img_name = re.findall(r'/([a-z_0-9]+)\.(?:jpe?g|png|bmp|gif)', m['name'], re.IGNORECASE)[-1]
        filename = 'full3/%s.jpg' % img_name
        return filename
```
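As noted above, the url existence check can be dropped entirely by letting MySQL enforce uniqueness. A minimal sketch, assuming you add a unique index on `url` once; `insert_news` is a hypothetical helper, and parameterized queries also replace the hand-rolled quote escaping:

```python
#coding=utf-8
import pymysql

# One-time schema change (run once in MySQL):
#   ALTER TABLE eastday ADD UNIQUE KEY uniq_url (url);

def insert_news(conn, item):
    # INSERT IGNORE silently skips rows whose url already exists,
    # so no SELECT-before-INSERT and no manual escaping are needed.
    sql = ("INSERT IGNORE INTO eastday VALUES "
           "(NULL, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cur = conn.cursor()
    cur.execute(sql, (item['title'], item['url'], item['tag'], item['brief'],
                      item['pubdate'], item['origin'], item['crawled_time'],
                      str(item['miniimg']), str(item['img_urls']), item['article']))
    conn.commit()
```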
Here is items.py:
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items.
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EastdayItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    tag = scrapy.Field()
    article = scrapy.Field()
    img_urls = scrapy.Field()
    crawled_time = scrapy.Field()
    pubdate = scrapy.Field()
    origin = scrapy.Field()

    brief = scrapy.Field()
    miniimg = scrapy.Field()
```
There are too many files to paste them all (the spider itself is sketched below). The source has been packaged and uploaded to cnblogs (博客园), but I couldn't find where to share a file link; if you want the source, leave a message in the comments.
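Since the spider file isn't pasted, here is a minimal skeleton of what it presumably looks like: a scrapy_redis RedisSpider wired to the same redis key that run.py seeds. Only the name and redis_key are implied by the code above; the class name and parse logic are illustrative placeholders:

```python
#coding=utf-8
# spiders/eastdayspider.py -- sketch only; the real parsing code is in the packaged source.
from scrapy_redis.spiders import RedisSpider


class EastdaySpider(RedisSpider):
    name = 'eastdayspider'                  # matches "scrapy crawl eastdayspider" in run.py
    redis_key = 'eastdayspider:start_urls'  # the set run.py seeds (REDIS_START_URLS_AS_SET)

    def parse(self, response):
        # ...extract title / brief / pubdate / article / img_urls into an EastdayItem...
        pass
```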
Here is what the stored results look like in MySQL (screenshot not reproduced here).
Eastday itself aggregates articles from other sites and newspapers, so the content is quite rich; with the whole site crawled, you almost have enough material for a news-content app.
Article images are saved under the source file name taken from the image link in the news page, which makes it easy for frontend developers to display the correct image on the page.
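For example, with the regex from file_path above (the URL below is made up for illustration):

```python
#coding=utf-8
import re

url = 'http://example.eastday.com/img/20170603/abc_123.jpg'  # hypothetical image URL
name = re.findall(r'/([a-z_0-9]+)\.(?:jpe?g|png|bmp|gif)', url, re.IGNORECASE)[-1]
print 'full3/%s.jpg' % name  # -> full3/abc_123.jpg
```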