Daily Learning

Today I started scraping data:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import re
import pymysql

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

def getnewsdetail(newsurl):
    # Fetch one article page and extract title, publish time, source, editor, and body text.
    res = requests.get(newsurl, headers=HEADERS, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    if soup.select('.main-title'):
        title = soup.select('.main-title')[0].text
    else:
        title = 'scrape failed'
    if soup.select('.date-source span'):
        timesource = soup.select('.date-source span')[0].text  # publish time, e.g. '2021年04月29日 19:43'
        dt = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
        timesource = dt.strftime('%Y-%m-%d %H:%M')  # keep the normalized string; the original discarded strftime's return value
    else:
        timesource = 'scrape failed'
    if soup.select('.date-source a'):
        place = soup.select('.date-source a')[0].text  # news source
    elif soup.select('#top_bar > div > div.date-source > span.source'):
        place = soup.select('#top_bar > div > div.date-source > span.source')[0].text
    else:
        place = 'scrape failed'
    if soup.select('#artibody'):
        article = soup.select('#artibody')[0]  # article body
    else:
        article = soup.select('#article')[0]
    articleall = article.text.strip()  # store plain text rather than the bs4 Tag so it inserts cleanly
    if soup.select('#article p'):
        # The last paragraph reads '责任编辑:NAME'; split on that label instead of
        # str.strip(), which removes a *set of characters* from both ends, not a prefix.
        editor = soup.select('#article p')[-1].text.split('责任编辑:')[-1].strip()
    else:
        editor = 'scrape failed'
    return [title, timesource, place, editor, articleall]
def parseListLinks(url):
    # The roll API returns JSONP wrapped as try{callback(<json>);}catch(e){};
    # extract the JSON between the callback's parentheses instead of hard-coding
    # a slice like res.text[47:-14], which breaks if the callback name changes.
    newsdetail = []
    res = requests.get(url, headers=HEADERS, timeout=10)
    jd = json.loads(re.search(r'\((.*)\);}catch', res.text, re.S).group(1))
    for ent in jd['result']['data']:
        newsdetail.append(getnewsdetail(ent['url']))
    return newsdetail
url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}&r=0.7778780795677847&callback=jQuery1112046350965709357705_1620651288029&_=1620651288032'
news_total = []
for i in range(1, 3):  # the page parameter starts at 1; range(2) would request page 0
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
    print(i)  # progress marker
db = pymysql.connect(host='localhost', user='root', password='1229', database='lianxi', charset='utf8')
cursor = db.cursor()
sql_xinwen2 = 'INSERT INTO xinwen2 VALUES (%s, %s, %s, %s, %s)'
try:
    cursor.executemany(sql_xinwen2, news_total)  # executemany takes the list of rows directly; no tuple() conversion needed
    db.commit()
except Exception as e:
    print('insert failed, rolling back:', e)  # a bare except would swallow the error message
    db.rollback()
finally:
    db.close()
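
The INSERT statement assumes a five-column table named xinwen2, but the script never shows its schema. Below is a minimal sketch of one that would accept these rows; the column names and types are my own assumption, matching the row order produced by getnewsdetail (title, timesource, place, editor, articleall):

import pymysql

# Assumed schema -- the original script never defines it.
db = pymysql.connect(host='localhost', user='root', password='1229', database='lianxi', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS xinwen2 (
        title   VARCHAR(255),
        pubtime VARCHAR(32),
        source  VARCHAR(128),
        editor  VARCHAR(64),
        content TEXT
    ) CHARACTER SET utf8
""")
db.commit()
db.close()

TEXT is used for the last column because article bodies easily exceed VARCHAR limits.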

This is a crawler for Sina's rolling news feed (the feed.mix.sina.com.cn API above); later I'll scrape other news sites as well — a possible shape for that is sketched below.
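
For the multi-site version, one option is to pass the site-specific steps into a generic driver. This is only a sketch of one possible shape, not anything from the script above; scrape_site, parse_list, parse_detail, and sina_list are hypothetical names, and it reuses HEADERS and getnewsdetail from earlier:

def scrape_site(list_urls, parse_list, parse_detail):
    # Generic driver: parse_list maps a list-page URL to article URLs,
    # parse_detail maps an article URL to a row; both are site-specific.
    rows = []
    for listurl in list_urls:
        for article_url in parse_list(listurl):
            rows.append(parse_detail(article_url))
    return rows

def sina_list(listurl):
    # Sina-specific list step: unwrap the JSONP payload and collect article URLs.
    res = requests.get(listurl, headers=HEADERS, timeout=10)
    jd = json.loads(re.search(r'\((.*)\);}catch', res.text, re.S).group(1))
    return [ent['url'] for ent in jd['result']['data']]

# Equivalent to the loop above:
# news_total = scrape_site([url.format(i) for i in range(1, 3)], sina_list, getnewsdetail)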

posted @ 2021-04-29 19:43  哦心有