Memo: a simple Sina news scraper

1. A simple approach to scraping Sina news

       1. requests
       2. pyquery for parsing
       3. a custom logging module, logger.py

The code compares three ways of fetching the article details: Python's multiprocessing module, gevent coroutines, and a plain loop.
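As a rough illustration of that comparison, here is a minimal sketch (not part of the project code; the URL list and fetch function are placeholders) that times the plain loop against a gevent pool:

# concurrency_demo.py -- minimal sketch; urls and fetch() are placeholders
import gevent.monkey
gevent.monkey.patch_all()       # patch the socket module before any network I/O

import time
import requests
import gevent.pool

urls = ['http://finance.sina.com.cn/forex/'] * 5    # stand-in for the de-duplicated newsSet

def fetch(url):
    # stand-in for get_page_detail(); only measures the HTTP round trip
    return requests.get(url).status_code

start = time.time()
[fetch(u) for u in urls]                            # sequential loop
print('loop:   {0:.2f}s'.format(time.time() - start))

start = time.time()
gevent.pool.Pool(5).map(fetch, urls)                # 5 concurrent greenlets
print('gevent: {0:.2f}s'.format(time.time() - start))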

First extract the article lists from the index page, then call the methods in spiderDetail.py to fetch each article's full content:
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import requests
from pyquery import PyQuery as pq
from logger import *
from spiderDetail import *
import time
from multiprocessing import Pool
import gevent.pool
import gevent.monkey

gevent.monkey.patch_all()

sina_forex_url = 'http://finance.sina.com.cn/forex/'
def get_index_pages():
    response = requests.get(sina_forex_url)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        content = response.text
        newsSet = set()  # holds all article URLs; the same story may appear in several sections, so deduplicate
        # top headline
        hot_ad_link = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > h1 > a')
        hot_ad_url = hot_ad_link.attr('href')
        hot_ad_title = hot_ad_link.text()
        logger.debug('Top headline')
        print(hot_ad_url)
        newsSet.add(hot_ad_url)

        # focus news list
        logger.debug('Focus news')
        focus_news_lists = get_focus_news(content)
        for new in focus_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)
        # rolling news
        logger.debug('Rolling news')
        roll_news_lists = get_roll_news(content)
        for new in roll_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)
        # 24-hour news express
        logger.debug('24-hour news')
        hours24_news_lists = get_24hours_news(content)
        for new in hours24_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)
        # analysis & data news
        logger.debug('Analysis & data')
        analysis_news_lists = get_analysis_news(content)
        for new in analysis_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)
        # institutional opinion news
        logger.debug('Institutional opinions')
        institution_opinion_news_lists = get_institution_opinion_news(content)
        for new in institution_opinion_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)
        # expert opinion news
        logger.debug('Expert opinions')
        specialist_opinion_news_lists = get_specialist_opinion_news(content)
        for new in specialist_opinion_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)
        # RMB exchange-rate news
        logger.debug('RMB exchange rate')
        rmb_exchange_news_lists = get_RMB_exchange_news(content)
        for new in rmb_exchange_news_lists:
            newsSet.add(new['url'])
            logger.debug(new)

        # fetch the full content of every article
        logger.debug('Total articles to fetch: ' + str(len(newsSet)))
        # option 1: plain loop
        #for url in newsSet:
        #    get_page_detail(url)
        
        # option 2: multiprocessing pool
        #pool = Pool(5)
        #pool.map(get_page_detail,newsSet)
        #pool.close()
        #pool.join()
        
        # option 3: gevent coroutine pool
        pool = gevent.pool.Pool(5)
        data = pool.map(get_page_detail,newsSet)
        
        return len(newsSet)
    else:
        logger.info('Failed to request the Sina forex home page')
        return 0

# focus news
def get_focus_news(content):
    focus_news_list = pq(content)('#wrap > div:nth-child(24) > div.Center.headline > div.ListB > ul > li')
    for li in focus_news_list.items():
        yield {
            'title':li.text(),
            'url':li('a').attr('href')
        }
# rolling news
def get_roll_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(25) > div.Center > div.ListB > ul > li')
    for li in roll_news_list.items():
        yield {
            'title':li.text(),
            'url':li('a').attr('href')
        }
# 24-hour news express
def get_24hours_news(content):
    roll_news_list = pq(content)('#wrap > div.PartA.Top10 > div.CenterB > div.ListB.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title':li('a').text() + li('span').text(),
            'url':li('a').attr('href')
        }
# analysis & data news
def get_analysis_news(content):

    roll_news_list = pq(content)('#wrap > div:nth-child(28) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title':li('a').text() + li('span').text(),
            'url':li('a').attr('href')
        }
# institutional opinion news
def get_institution_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(29) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title':li('a').text() + li('span').text(),
            'url':li('a').attr('href')
        }
# expert opinion news
def get_specialist_opinion_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(30) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title':li('a').text() + li('span').text(),
            'url':li('a').attr('href')
        }
# RMB exchange-rate news
def get_RMB_exchange_news(content):
    roll_news_list = pq(content)('#wrap > div:nth-child(31) > div.Center > div.ListE > ul > li')
    for li in roll_news_list.items():
        yield {
            'title':li('a').text() + li('span').text(),
            'url':li('a').attr('href')
        }

#get_index_pages()
spiderDetail.py, which extracts the full article content:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
import re
from logger import *
from DBHelper import *
from hashlib import md5
import json
from bs4 import BeautifulSoup as bs


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            content = response.text
            pqContent = pq(content)
            title = pqContent('#artibodyTitle').text()
            print(title)
            date = pqContent('#wrapOuter > div > div.page-info > span').text()[:16]
            print(date)
            cnt = bs(content,"lxml")
            body = cnt.find(id='artibody')
            # Sina articles embed a source blockquote; swap it for our own notice
            if body and body.blockquote:
                new_tag = cnt.new_tag("b")
                new_tag.string = "<a href='www.mysite.com'>replace with your own site name and address</a>"
                body.blockquote.replace_with(new_tag)
            #print(str(body))

            articleContent = pq(''.join(str(body)))
            #print(type(articleContent))
            #print(articleContent)
            if body:
                # extract the article body between Sina's original-content markers
                regex = re.compile('<!-- 原始正文start -->(.*)<!-- 原始正文end -->',re.S)
                match = re.findall(regex,str(body))
                if match:
                    match = match[0].strip()
                    images = pq(match)('img')
                    for img in images:
                        img_name = get_page_img(pq(img).attr('src'))
                        if img_name:
                            r = re.subn(pq(img).attr('src'), 'img/' + img_name, match)
                            match = r[0]
                    content_url = write_to_file(match, url)
                    dict = {'title':title,'content':content_url,'date':date,'expired':'false'}
                    insert(dict)
                else:
                    logger.info('Could not extract the article body: [%s]' % url)

            else:
                logger.info('No artibody element found on the article page: [%s]' % url)

    except RequestException:
        logger.info('Error requesting the article page: %s', url)

# download an image referenced in the article body
def get_page_img(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            actual_img_path = save_image(response.content)
            return actual_img_path
        else:
            return None
    except RequestException:
        logger.info('Error requesting image: %s', url)
        return None

# save an article image to the local img/ folder, named by the MD5 of its content
def save_image(content):
    img_folder = os.path.join(os.getcwd(), 'img')
    img_name = md5(content).hexdigest()
    img_path = '{0}/{1}.{2}'.format(img_folder,img_name,'jpg')
    if not os.path.exists(img_path):
        with open(img_path,'wb') as f:
            f.write(content)
            f.close()
            return img_name+'.jpg'
    else:
        return img_name + '.jpg'
# save the article body to a local text file, named by the MD5 of the article URL
def write_to_file(content,url):
    content_folder = os.path.join(os.getcwd(),'files')
    file_name = md5(url.encode('utf-8')).hexdigest()
    file_path = '{0}/{1}.{2}'.format(content_folder, file_name,'txt')
    if not os.path.exists(file_path):
        with open(file_path,'w',encoding='utf-8') as f:
            f.write(json.dumps(content,ensure_ascii=False))
            f.close()
            logger.info('Article body saved --- source url: ' + url)
            return file_name
    else:
        return file_name
The logging module, logger.py:
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import os
import logging
import time

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# one log file per day
logFile = './log/log_{0}.txt'.format(time.strftime("%Y%m%d", time.localtime()))

# FileHandler fails if the log directory does not exist yet
os.makedirs(os.path.dirname(logFile), exist_ok=True)
fh = logging.FileHandler(logFile, mode='a')
fh.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s:%(message)s")
fh.setFormatter(formatter)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)
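
Other modules pull this in with from logger import * and log through the shared logger object; a quick standalone check of the setup (a sketch, not part of the project) would be:

# check_logger.py -- tiny sketch to verify the shared logger
from logger import logger

logger.debug('debug messages go to the console only (the file handler is INFO level)')
logger.info('info messages go to both the console and ./log/log_YYYYMMDD.txt')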

Other modules: Models.py, DBHelper.py, and config.py.

DBHelper.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import pymysql

from config import *
from logger import *

#
# database helper functions (extend as needed)
#

def insert(article):
    db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                         db=DATABASE, charset='utf8', use_unicode=True)
    cursor = db.cursor()
    # parameterized query: pymysql escapes the values, so quotes in titles cannot break the SQL
    sql = "insert into articles(title,content,date,expired) values(%s,%s,%s,%s)"
    params = (article['title'], article['content'], article['date'], article['expired'])
    try:
        cursor.execute(sql, params)
        db.commit()
        logger.info('Article inserted: %s' % article['title'])
    except Exception:
        logger.error('Failed to insert article: %s' % article['title'])
        db.rollback()
    db.close()
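
insert() assumes an articles table already exists; the original post does not show the schema, so the column types below are my assumption (content holds the file name returned by write_to_file()):

# create_table.py -- one-off setup sketch; the schema is an assumption, not the original
import pymysql
from config import *

db = pymysql.connect(host=HOST, port=PORT, user=USERNAME, passwd=PASSWORD,
                     db=DATABASE, charset='utf8')
with db.cursor() as cursor:
    cursor.execute("""
        create table if not exists articles (
            id      int auto_increment primary key,
            title   varchar(255),
            content varchar(64),
            date    varchar(32),
            expired varchar(8)
        ) default charset=utf8
    """)
db.commit()
db.close()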
Database configuration, config.py:
#database config items
HOST = '127.0.0.1'
PORT = 6000            # database port (I changed mine to 6000)
DATABASE = 'your_database_name'
USERNAME = 'your_database_user'
PASSWORD = 'your_database_password'
Models.py (not used in this example; kept as a starting point for a class-based article model):
#!/usr/bin/python3
# -*- coding: utf-8 -*-

class Article:
    ID = None
    Title = ''
    SubTitle = ''
    Summary = ''
    Content = ''
    Date = ''
    Author = ''
    ForumID = 0
    StickyPost = 'false'
    Expired = 'false'

2. Configuring nginx

The request entry point, app.py (served with nginx + uWSGI):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from sinaForex import *
import time
from logger import *

def application(env, start_response):
    start_response('200 OK', [('Content-Type', 'text/html')])
    s = time.time()
    count = get_index_pages()
    e = time.time()
    logger.info("----------------- fetched {0} articles in {1}s -----------------".format(count, round(e - s, 3)))
    rst = "Fetched {0} articles in {1}s".format(count, round(e - s, 3))
    print(time.localtime(time.time()))
    return [rst.encode('utf-8')]
nginx configuration, for example in ../nginx/conf/vhost/py.mysite.com.conf:

server {
    listen 80;
    root /www/web/sina_forex;
    server_name py.mysite.com;
    index index.html index.php index.htm;
    error_page 400 /errpage/400.html;
    error_page 403 /errpage/403.html;
    error_page 404 /errpage/404.html;
    error_page 503 /errpage/503.html;

    location /spider {
        uwsgi_pass 127.0.0.1:8001;
        include uwsgi_params;
    }
    location / {
        try_files $uri @apache;
    }
}

uWSGI configuration (/etc/uwsgi8001.ini):
[uwsgi]
socket = :8001                  # web service port
chdir = /www/web/sina_forex     # site root directory
wsgi-file = app.py              # WSGI entry file
vhost = true
master = true
processes = 5
pidfile = /www/web/sina_forex/uwsgi8001.pid
daemonize = /www/web/sina_forex/log/uwsgi8001.log
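With that ini in place, start uWSGI with uwsgi --ini /etc/uwsgi8001.ini; because daemonize is set, the process detaches and writes its log to /www/web/sina_forex/log/uwsgi8001.log.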

With this in place, the scraper can be triggered over HTTP: per the nginx location rule above, the entry point is http://py.mysite.com/spider.

3. Collecting news on a schedule

Write a small timer script, autoSpiderTimer.py, that issues the web request on a schedule; the process itself is managed by supervisor (see the sketch after the code).

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
import time

def timer():
    while True:                 # collect once per day
        response = requests.get('http://py.mysite.com/spider')
        print(response.text)
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        time.sleep(3600 * 24)

timer()
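The supervisor configuration is not shown here; a minimal program section (the program name, python path, and log path below are assumptions based on the directories used above) might look like:

[program:autoSpiderTimer]
command=/usr/bin/python3 /www/web/sina_forex/autoSpiderTimer.py   ; python path is an assumption
directory=/www/web/sina_forex
autostart=true
autorestart=true
stdout_logfile=/www/web/sina_forex/log/autoSpiderTimer.log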

 
