Scrapy in practice, part 1: distributed crawling of Youyuan (youyuan.com) with scrapy-redis (note: the site's interface has been down since 6.22):
Straight to the code:
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YouyuanwangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # profile picture URL
    header_url = scrapy.Field()
    # username
    username = scrapy.Field()
    # personal monologue
    monologue = scrapy.Field()
    # album picture URLs
    pic_urls = scrapy.Field()
    # place of origin
    place_from = scrapy.Field()
    # education
    education = scrapy.Field()
    # age
    age = scrapy.Field()
    # height
    height = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # hobbies
    hobby = scrapy.Field()
    # site of origin: youyuan
    source = scrapy.Field()
    # source URL of the profile page
    source_url = scrapy.Field()
    # spider name
    spider = scrapy.Field()
spiders/youyuan.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from youyuanwang.items import YouyuanwangItem


# class YouyuanSpider(CrawlSpider):
class youyuan(RedisCrawlSpider):
    name = 'youyuan'
    # allowed_domains = ['www.youyuan.com']
    # Youyuan search result (list) page
    # start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    redis_key = 'youyuan:start_urls'

    # Build allowed_domains dynamically from the -a domain=... argument
    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        # list() is needed on Python 3, where filter() returns an iterator
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(youyuan, self).__init__(*args, **kwargs)

    # Match the whole country
    # list_page = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # Rule for the search result pages: Beijing, female, 18-25 only; links are extracted from the response
    page_links = LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Rule for individual profile pages; links are extracted from the response
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        # Follow list-page links; they are only stepping stones
        Rule(page_links),
        # Profile-page links become requests stored in Redis for scheduling; once a response
        # arrives it is handled by parse_profile_page(), without further following
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )

    # Parse a profile page and extract the fields we want
    def parse_profile_page(self, response):
        item = YouyuanwangItem()
        # profile picture URL
        item['header_url'] = self.get_header_url(response)
        # username
        item['username'] = self.get_username(response)
        # place of origin
        item['place_from'] = self.get_place_from(response)
        # education
        item['education'] = self.get_education(response)
        # age
        item['age'] = self.get_age(response)
        # height
        item['height'] = self.get_height(response)
        # salary
        item['salary'] = self.get_salary(response)
        # hobbies
        item['hobby'] = self.get_hobby(response)
        # album picture URLs
        item['pic_urls'] = self.get_pic_urls(response)
        # personal monologue
        item['monologue'] = self.get_monologue(response)
        # source URL of the profile page
        item['source_url'] = response.url
        # site of origin: youyuan
        item['source'] = "youyuan"
        # spider name
        item['spider'] = "youyuan"
        yield item

    # Extract the profile picture URL
    def get_header_url(self, response):
        header = response.xpath('//dl[@class="personal_cen"]/dt/img/@src').extract()
        if len(header):
            header_url = header[0]
        else:
            header_url = ""
        return header_url.strip()

    # Extract the username
    def get_username(self, response):
        username = response.xpath('//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()').extract()
        if len(username):
            username = username[0]
        else:
            username = ""
        return username.strip()

    # Extract the age
    def get_age(self, response):
        age = response.xpath('//dl[@class="personal_cen"]//p[@class="local"]/text()').extract()
        if len(age):
            age = age[0].split()[1]
        else:
            age = ""
        return age

    # Extract the height
    def get_height(self, response):
        height = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()').extract()
        if len(height):
            height = height[0]
        else:
            height = ""
        return height.strip()

    # Extract the salary
    def get_salary(self, response):
        salary = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()').extract()
        if len(salary):
            salary = salary[0]
        else:
            salary = ""
        return salary.strip()

    # Extract hobbies
    def get_hobby(self, response):
        hobby = response.xpath('//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
        if len(hobby):
            hobby = ",".join(hobby).replace(" ", "")
        else:
            hobby = ""
        return hobby.strip()

    # Extract album picture URLs
    def get_pic_urls(self, response):
        pic_urls = response.xpath('//div[@class="ph_show"]/ul/li/a/img/@src').extract()
        if len(pic_urls):
            # join the URL list into a single string
            pic_urls = ",".join(pic_urls)
        else:
            pic_urls = ""
        return pic_urls

    # Extract the personal monologue
    def get_monologue(self, response):
        monologue = response.xpath('//div[@class="pre_data"]/ul/li/p/text()').extract()
        if len(monologue):
            monologue = monologue[0]
        else:
            monologue = ""
        return monologue.strip()

    # Extract the place of origin
    def get_place_from(self, response):
        place_from = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()').extract()
        if len(place_from):
            place_from = place_from[0]
        else:
            place_from = ""
        return place_from.strip()

    # Extract education
    def get_education(self, response):
        education = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()').extract()
        if len(education):
            education = education[0]
        else:
            education = ""
        return education.strip()
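Because this is a RedisCrawlSpider, started workers sit idle until a start URL appears under redis_key ('youyuan:start_urls'). A minimal sketch of how the crawl could be seeded, assuming Redis runs locally (host, port and the seed URL here are just the values used elsewhere in this post; adjust as needed):

# seed_start_urls.py -- hypothetical helper, not part of the original project
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
# push the first list page; every idle spider instance will pick work up from here
r.lpush('youyuan:start_urls',
        'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/')

The same thing can be done from redis-cli with lpush. Each crawling node is then started with something like `scrapy crawl youyuan -a domain=www.youyuan.com`, so the `domain` keyword argument fills the dynamic allowed_domains list in __init__.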
For comparison, the same spider as a plain (non-distributed) CrawlSpider, with the scrapy-redis pieces commented out:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
# from scrapy_redis.spiders import RedisCrawlSpider
from youyuanwang.items import YouyuanwangItem


class YouyuanSpider(CrawlSpider):
# class YouyuanSpider(RedisCrawlSpider):
    name = 'youyuan'
    allowed_domains = ['www.youyuan.com']
    # Youyuan search result (list) page
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # redis_key = 'YouyuanSpider:start_urls'

    # Build allowed_domains dynamically (only needed for the Redis version)
    # def __init__(self, *args, **kwargs):
    #     # Dynamically define the allowed domains list.
    #     domain = kwargs.pop('domain', '')
    #     self.allowed_domains = filter(None, domain.split(','))
    #     super(YouyuanSpider, self).__init__(*args, **kwargs)

    # Match the whole country
    # list_page = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # Rule for the search result pages: Beijing, female, 18-25 only
    page_links = LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Rule for individual profile pages
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        # Follow list-page links; they are only stepping stones
        Rule(page_links),
        # Profile-page links are handled by parse_profile_page(), without further following
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )

    # parse_profile_page() and all of the get_*() helper methods are identical to the
    # RedisCrawlSpider version above, so they are not repeated here.
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# import json
#
# class YouyuanwangPipeline(object):
#     def __init__(self):
#         self.filename = open("youyuanwang.json", "wb")
#     def process_item(self, item, spider):
#         jsontext = json.dumps(dict(item), ensure_ascii=False) + ",\n"
#         self.filename.write(jsontext.encode("utf-8"))
#         return item
#     def close_spider(self, spider):
#         self.filename.close()

import pymysql
from .models.es_types import YouyuanType


# Write each item straight into MySQL
class XiciPipeline(object):
    def process_item(self, item, spider):
        # DBKWARGS = spider.settings.get('DBKWARGS')
        con = pymysql.connect(host='127.0.0.1', user='root', passwd='229801', db='yunyuan', charset='utf8')
        cur = con.cursor()
        sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (item['header_url'], item['username'], item['monologue'], item['pic_urls'], item['place_from'],
               item['education'], item['age'], item['height'], item['salary'], item['hobby'], item['source'])

        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()
        return item


# Write each item into Elasticsearch through the YouyuanType document type
class ElasticsearchPipeline(object):
    def process_item(self, item, spider):
        youyuan = YouyuanType()
        youyuan.header_url = item["header_url"]
        youyuan.username = item["username"]
        youyuan.age = item["age"]
        youyuan.salary = item["salary"]
        youyuan.monologue = item["monologue"]
        youyuan.pic_urls = item["pic_urls"]
        youyuan.place_from = item["place_from"]

        youyuan.save()

        return item
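The models/es_types.py module that defines YouyuanType is not shown in the post. A minimal sketch of what it could look like, assuming elasticsearch_dsl in the 5.x/6.x DocType style with Elasticsearch on localhost; the index name, field types, and connection settings are all assumptions:

# models/es_types.py -- hypothetical sketch, not the author's original module
from elasticsearch_dsl import DocType, Keyword, Text
from elasticsearch_dsl.connections import connections

# register a default Elasticsearch connection (host is an assumption)
connections.create_connection(hosts=["127.0.0.1"])


class YouyuanType(DocType):
    # field types are guesses; everything the spider extracts is a string
    header_url = Keyword()
    username = Text()
    age = Keyword()
    salary = Keyword()
    monologue = Text()
    pic_urls = Keyword()
    place_from = Keyword()

    class Meta:
        index = "youyuan"


if __name__ == "__main__":
    # create the index and mapping once, before the first crawl
    YouyuanType.init()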
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for youyuanwang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'youyuanwang'

SPIDER_MODULES = ['youyuanwang.spiders']
NEWSPIDER_MODULE = 'youyuanwang.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'youyuanwang (+http://www.yourdomain.com)'

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'youyuanwang.middlewares.YouyuanwangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'youyuanwang.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'youyuanwang.pipelines.XiciPipeline': 300,
    'youyuanwang.pipelines.ElasticsearchPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
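Two notes on these settings. First, scrapy-redis connects to Redis on localhost:6379 by default; if the shared Redis instance lives on another machine, it can be pointed at explicitly. Second, the process_item_mongo.py / process_item_mysql.py scripts below read serialized items from the "youyuan:items" list in Redis, which is only populated when scrapy_redis.pipelines.RedisPipeline is enabled, so that pipeline must be uncommented for those scripts to receive anything. A sketch of the extra settings (host and port values are placeholders):

# optional additions to settings.py
REDIS_HOST = '127.0.0.1'   # where the shared Redis instance runs
REDIS_PORT = 6379
# or, equivalently: REDIS_URL = 'redis://127.0.0.1:6379'

ITEM_PIPELINES = {
    'youyuanwang.pipelines.ElasticsearchPipeline': 300,
    # required if items are to be consumed from the "youyuan:items" Redis list
    'scrapy_redis.pipelines.RedisPipeline': 400,
}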
Saving from Redis to MongoDB: create a new file process_item_mongo.py in the project directory (the name is arbitrary).
# coding=utf-8

import pymongo
import redis
import json


def process_item():
    Redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    Mongo_conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = Mongo_conn["youyuan"]
    table = db["beijing_18_25"]
    while True:
        # blpop blocks until RedisPipeline pushes an item onto the list
        source, data = Redis_conn.blpop(["youyuan:items"])
        data = json.loads(data.decode("utf-8"))
        # insert() was removed in pymongo 4; insert_one() also works on pymongo 3.x
        table.insert_one(data)


if __name__ == "__main__":
    process_item()
Saving from Redis to MySQL: create a new file process_item_mysql.py in the project directory (the name is arbitrary).
# coding=utf-8

import pymysql
import redis
import json


def process_item():
    Redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    # charset='utf8' added so the Chinese text fields are stored correctly
    MySql_conn = pymysql.connect(host='127.0.0.1', user='root', passwd='229801', port=3306,
                                 db='yunyuan', charset='utf8')
    while True:
        # blpop blocks until RedisPipeline pushes an item onto the list
        source, data = Redis_conn.blpop("youyuan:items")
        data = json.loads(data.decode("utf-8"))
        cur = MySql_conn.cursor()
        sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (data['header_url'], data['username'], data['monologue'], data['pic_urls'], data['place_from'],
               data['education'], data['age'], data['height'], data['salary'], data['hobby'], data['source'])
        cur.execute(sql, lis)
        MySql_conn.commit()
        cur.close()
        # keep the MySQL connection open; closing it inside the loop would make
        # the next iteration fail on a closed connection


if __name__ == "__main__":
    process_item()
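The youyuanwang table itself is never shown in the post. A one-off sketch that would create a compatible table, assuming the same local MySQL instance used above; the column types and lengths are guesses, since every field arrives as a string:

# create_table.py -- hypothetical helper, not part of the original project
import pymysql

con = pymysql.connect(host='127.0.0.1', user='root', passwd='229801', db='yunyuan', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS youyuanwang (
        id INT AUTO_INCREMENT PRIMARY KEY,
        header_url VARCHAR(255),
        username   VARCHAR(64),
        monologue  TEXT,
        pic_urls   TEXT,
        place_from VARCHAR(64),
        education  VARCHAR(64),
        age        VARCHAR(16),
        height     VARCHAR(16),
        salary     VARCHAR(64),
        hobby      VARCHAR(255),
        source     VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()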
Data:
Disclaimer: the above is for reference, learning, and exchange purposes only! More at: https://github.com/huwei86/Spideryouyuanwang