
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-07-27 15:08:01
# Project: 36_ke

from pyspider.libs.base_handler import *
from msxflibs.pyspider.public.database.tomysql import ToMysql
from msxflibs.pyspider.projects.newmedia.images import extract_img_url
from datetime import datetime
import hashlib
import time
import json


# 36氪
# Crawler for 36kr.com (36氪) — fetches column search results via the site's
# JSON search API, then crawls each article's detail page.
class Handler(BaseHandler):
    # Browser-like headers used for regular (HTML) page fetches.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
    }

    # Headers for the JSON search-API (ajax) requests; a per-column
    # Referer is filled in at crawl time in on_start().
    ajax_headers = {
        "Host": "36kr.com",
        "Connection": "keep-alive",
        "Accept": "*/*",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    # Applied by pyspider to every crawl issued by this handler.
    crawl_config = {
        "headers": headers,
        "timeout": 60000
    }

    # Site display name stored with each scraped result.
    web_name = "36氪"

    # Column name -> [search-API URL prefix (expects ms timestamp appended),
    #                 referer page URL prefix (expects seconds timestamp appended)].
    # The %-escaped query terms are: 明星公司 / 行业新闻 / 行业研究.
    columns = {
        "明星公司": ["http://36kr.com/api/search/articles/%E6%98%8E%E6%98%9F%E5%85%AC%E5%8F%B8?page=1&pageSize=40&_=",
                 "http://36kr.com/search/articles/%E6%98%8E%E6%98%9F%E5%85%AC%E5%8F%B8?page=1&ts="],
        "行业新闻": ["http://36kr.com/api/search/articles/%E8%A1%8C%E4%B8%9A%E6%96%B0%E9%97%BB?page=1&pageSize=40&_=",
                 "http://36kr.com/search/articles/%E8%A1%8C%E4%B8%9A%E6%96%B0%E9%97%BB?page=1&ts="],
        "行业研究": ["http://36kr.com/api/search/articles/%E8%A1%8C%E4%B8%9A%E7%A0%94%E7%A9%B6?page=1&pageSize=40&_=",
                 "http://36kr.com/search/articles/%E8%A1%8C%E4%B8%9A%E7%A0%94%E7%A9%B6?page=1&ts="]
    }

    # strptime/strftime format for the site's "published_at" timestamps.
    datetime_format_to_space_s = '%Y-%m-%d %H:%M:%S'

@every(minutes=60)
def on_start(self):
#self.crawl(self.newsflashes_url, callback=self.parse_newsflashes_page)
for i in self.columns:
current_seconds = int(time.time())
current_millis = int(time.time() * 1000)
ajax_url = self.columns[i][0] + str(current_millis)
page_url = self.columns[i][1] + str(current_seconds - 3)
self.ajax_headers['Referer'] = page_url
self.crawl(ajax_url, headers=self.ajax_headers, callback=self.index_page)

# @config(age=60 * 60, priority=6)
# def parse_newsflashes_page(self, response):
# props = self.get_script_props(response)
# if props is None:
# return
# newsflash_list = props.get('newsflashList|newsflash')
# for newsflash in newsflash_list:
# newsflash_r = self.get_newsflash_result(newsflash)
# self.on_result(newsflash_r)

# def get_newsflash_result(self, newsflash):
# url = 'http://36kr.com/newsflashes?column_id=' + (newsflash.get('column_id') or '') + '&id=' + (newsflash.get('id') or '')
# publish_time_str = newsflash.get('published_at')
# publish_time = datetime.strptime(publish_time_str, self.datetime_format_to_space_s)
# return {
# "url": url,
# "url_hash_code": hashlib.sha256(url).hexdigest(),
# "title": newsflash.get('title') or '',
# "keywords": '',
# "description": '',
# "publish_time": publish_time or '',
# "article_resouce": '36氪 7x24 快讯',
# "article_resouce_link": newsflash.get('news_url') or '',
# "content": newsflash.get('description') or '',
# "gmt_create_time": datetime.now(),
# "gmt_update_time": datetime.now(),
# "web_name": self.web_name,
# "article_type": 1,
# "image_url": '',
# "is_image_inside": False
# }

def get_script_props(self, response):
for script in response.doc("script").items():
if script is None:
continue
script_str = script.text().encode('UTF-8').strip()
if script_str is None or len(script_str) <= 0:
continue
if not str(script_str).startswith('var props='):
continue
index = script_str.find('{')
if index == -1:
continue
script_str = script_str[index:]
index = script_str.find(',locationnal=')
if index == -1:
continue
script_str = script_str[:index]
if len(script_str) <= 0:
continue
props = json.loads(script_str)
if props is None:
continue
return props

@config(age=60 * 60, priority=6)
def index_page(self, response):
print(response.url)
data_dict = response.json
print('data_dict: ' + ((data_dict and str(data_dict)) or 'None'))
if data_dict is None:
return
data_dict = data_dict.get(u'data')
if data_dict is None:
return
datas = data_dict.get(u'data')
if datas is None or len(datas) == 0:
return
for data_d in datas:
article_id = data_d.get(u'id')
if article_id is None or len(str(article_id).strip()) <= 0:
continue
article_url = "http://36kr.com/p/" + str(article_id) + ".html"
article_img_url = data_d.get(u'img')
self.crawl(article_url, save={'article_image': article_img_url or ''}, callback=self.detail_page)

@config(age=60 * 60, priority=10)
def detail_page(self, response):
props = self.get_script_props(response)
if props is None:
return
detailArticle = props.get('detailArticle|post')

title = response.doc('title').text().strip()
image_url = self.get_image_url(response)
image_url = image_url or detailArticle.get('cover')

publish_time_str = detailArticle.get('published_at')
publish_time = datetime.strptime(publish_time_str, self.datetime_format_to_space_s)

article_resource = (detailArticle.get('user') and detailArticle.get('user').get('name')) or ''
return {
"url": response.url,
"url_hash_code": hashlib.sha256(response.url).hexdigest(),
"title": title,
"keywords": response.doc('head meta[name="keywords"]').attr('content') or '',
"description": response.doc('head meta[name="description"]').attr('content') or '',
"publish_time": publish_time or '',
"article_resouce": article_resource or '',
"article_resouce_link": detailArticle.get('source_urls') or '',
"content": self.get_content(response) or '',
"gmt_create_time": datetime.now(),
"gmt_update_time": datetime.now(),
"web_name": self.web_name,
"article_type": 1,
"image_url": image_url or '',
"is_image_inside": bool(image_url)
}

def on_result(self, result):
if not result or not result['title']:
return
sql = ToMysql()
sql.into('web_page_content', **result)

# 获取文章整个页面
def get_content(self, response):
content = response.content.replace('\n', '').replace('\r', '').replace('\t', '').replace('&amp;', '&')
content = unicode(content, response.encoding)
print("content:")
print(content)
return content

# 获取文章图片 image_url
def get_image_url(self, response):
# extract_img_url(response.url, response.doc('div.art_context'), 'div[align="center"]>img')
article_image = response.save.get('article_image')
if article_image is not None and len(article_image) > 0:
image_url = article_image
print("image_url:")
print(image_url)
return image_url

 

posted @ 2017-08-30 19:18  二梦非凡  阅读(572)  评论(0编辑  收藏  举报