python爬取凤凰网站的新闻,及其链接地址,来源,时间和内容,用selenium自动化和requests处理数据

有些规则需要自己定义判断。
import requests
from selenium import webdriver
import time

def grasp(urlT, limit=29):
    """Crawl the news feed at ``urlT`` and append every article to ``./news.txt``.

    The feed is fetched once with ``requests``; its JSON ``data`` list supplies
    each article's title, time, source and URL. Every article page is then
    opened in Chrome through selenium to read the body text.

    Args:
        urlT: URL of the JSON feed. The response must hold a ``data`` list
            whose items carry ``title``, ``source`` and ``url`` keys
            (``newsTime`` is optional and recorded as ``'None'`` if absent).
        limit: Maximum number of feed items to process; ``None`` processes
            all of them. Defaults to 29, the count the original loop used.

    Returns:
        A list with one dict per processed article (keys ``title``,
        ``newsTime``, ``source``, ``url``, ``contend``).

    Raises:
        KeyError: If the feed or one of its items lacks a required key.
    """
    # Parse the feed a single time instead of re-decoding the JSON per field.
    # The timeout keeps a stalled connection from hanging the script forever.
    entries = requests.get(urlT, timeout=30).json()['data'][:limit]

    driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')  # local path of the chromedriver executable
    resAll = []  # one record (dict) per article
    try:
        for i, entry in enumerate(entries):
            print(f'第{i+1}条新闻开始')
            # A fresh dict per article: reusing one dict would make every
            # element of resAll point at the same (last) record.
            rest = {
                'title': entry['title'],
                'newsTime': entry.get('newsTime', 'None'),
                'source': entry['source'],
                'url': entry['url'],
            }
            print(rest['title'])
            print(rest['newsTime'])
            print(rest['source'])
            print(rest['url'])

            contend = ''
            standard_layout = False
            try:
                driver.get(rest['url'])
                time.sleep(4)  # fixed wait before reading the page (presumably for rendering)
                contend = driver.find_element_by_class_name('text-3zQ3cZD4').text
                standard_layout = True
                print(contend)
            except Exception:
                # The regular article container was not available; try the
                # container used by the alternative page layout.
                try:
                    contend = driver.find_element_by_class_name('topic_column-5QvrwcWi').text
                    print(contend)
                except Exception as err:
                    # Neither layout matched: keep an empty body and carry on
                    # with the remaining articles instead of aborting the crawl.
                    print(f'第{i+1}条新闻正文获取失败: {err}')
            rest['contend'] = str(contend)

            driver.back()
            if standard_layout:
                print(f'第{i+1}条新闻结束')
                time.sleep(6)  # pause before the next article
            else:
                time.sleep(6)  # pause before the next article
                print(f'第{i+1}条新闻格式不同')
                print('#-----------------------某些格式不符合------------------------#')

            resAll.append(rest)
            # Append right away so already-crawled articles survive a later failure.
            try:
                with open('./news.txt', 'a+', encoding='utf-8') as f:
                    # str() guards against non-string JSON values (e.g. null).
                    f.write(''.join(str(value) for value in rest.values()) + '\n')
            except OSError:
                print('写入失败')
    finally:
        # Always shut the browser down, even when the crawl is interrupted.
        driver.quit()
    return resAll

# JSON feed endpoint on shankapi.ifeng.com; grasp() reads the article list
# from the 'data' field of its response.
url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"
# Start the crawl as soon as the script is executed.
grasp(urlT=url)
#
#
# class Grasp:
#
#     def __init__(self):
#         self.driver = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
#         self.resAll = []#用于存储单条数据
#         self.rest = {}#用于存储单个数据
#         self.res = requests.get("https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219")#目标链接
#
#     def run(self):
#         for i in range(0, len(self.res.json()['data'])):
#             print(f'第{i+1}条新闻开始')
#             print(self.res.json()['data'][i]['title']) #输出标题
#             try:
#                 print(self.res.json()['data'][i]['newsTime']) #输出时间
#             except:
#                 print('None')
#             print(self.res.json()['data'][i]['source']) #输出来源
#             print(self.res.json()['data'][i]['url']) #输出链接地址
#             self.rest['title'] = self.res.json()['data'][i]['title'] #获取标题
#             try:
#                 self.rest['newsTime'] = self.res.json()['data'][i]['newsTime'] #获取时间
#             except:
#                 self.rest['newsTime'] = 'None'
#             self.rest['source'] = self.res.json()['data'][i]['source'] #获取来源
#             self.url = self.res.json()['data'][i]['url']
#             self.rest['url'] = self.res.json()['data'][i]['url']#获取链接地址
#             try:
#                 self.driver.get(self.url)
#                 time.sleep(4)
#                 self.contend = self.driver.find_element_by_class_name('text-3zQ3cZD4').text#获取网页标签下的文本
#                 self.rest['contend'] = str(self.contend)#插入单条数据
#                 print(f'第{i}条新闻成功')
#                 self.driver.back()
#                 time.sleep(4)
#             except:
#                 self.contend = self.driver.find_element_by_class_name('topic_column-5QvrwcWi').text
#                 self.rest['contend'] = str(self.contend)
#                 self.driver.back()
#                 time.sleep(6)
#                 print(f'第{i+1}条新闻格式不同')
#                 print('#-----------------------某些格式不符合------------------------#')
#             self.resAll.append(self.rest)
#             with open('./news.txt', 'a+', encoding='utf-8') as f:
#                 try:
#                    
#                     f.write(''.join(self.resAll[i].values()) + '\n') #写入数据
#                     f.write(f'第{i+1}条新闻结束')
#                 except:
#                     print('写入失败')
#
# g = Grasp()
# g.run()
View Code

电脑性能差,如若想获取其他页面的数据,将规则写在except中,即可

希望能帮到大家。
posted @ 2019-09-15 00:20  King~~~  阅读(871)  评论(0)  编辑  收藏  举报