Web Scraping -- ChinaMoney (中国货币网) Bond Financial Reports

Batch-downloading bond financial reports from ChinaMoney with Python; the task touches many Python scraping techniques worth learning.

The code from the referenced post no longer downloads the files correctly; the version below can be used instead.

from bs4 import BeautifulSoup
import os
import bs4
import requests

def getHtml(url):
    # implicit string concatenation: the original backslash continuation
    # embedded the next line's indentation into the user-agent value
    headers = {'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/93.0.4577.63 Safari/537.36')}
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return res.text

def get_file(title):
    filename = f'{title}.PDF'

    sub_dir = r'D:\Desktop\公告'  # folder that stores the reports
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)  # os.makedir does not exist; makedirs also creates parents

    filepath = os.path.join(sub_dir, filename)
    return filepath

nums = 10
url = f'https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo?year=&type=&orgName=&pageSize={nums}&pageNo=1&inextp=3%2C5&limit=1&'
r = requests.get(url)
r.encoding = r.apparent_encoding  # guard against garbled characters
data = r.json()  # renamed from `json` to avoid shadowing the json module
# collect the records
records = data['records']

items = []
for d in records:
    
    title = d['title']
    releaseDate = d['releaseDate']

    draftPath = d['draftPath']
    child_url = 'https://www.chinamoney.com.cn/'+draftPath

    item = [title, releaseDate, child_url]
    items.append(item)

    html = getHtml(child_url)
    soup = BeautifulSoup(html, 'html.parser')
    notice_info = soup.find('div', class_='article-a-attach-body')

    main_url = 'https://www.chinamoney.com.cn/dqs/cm-s-notice-query/' # constant

    for i in notice_info.select('li'):
        if isinstance(i, bs4.element.Tag):
            # the tail of the PDF link is embedded in the onclick attribute
            temp = i.a['onclick'].split('+')[1].split('\'')[1]
            url_end = main_url + temp  # main_url already ends with '/'
            print(url_end)

            url_end_r = requests.get(url_end)
            # download the PDF

            title = i.a.text.split('\n')[-2].split('.')[0]
            filepath = get_file(title)
            with open(filepath, 'wb') as f:
                f.write(url_end_r.content)
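
Before running the full download, it is worth confirming that the API still returns the fields the script relies on. The sketch below is not from the original post; the endpoint and the field names records, title, releaseDate, and draftPath are taken from the code above, and the layout may change if the site is updated.

# Sanity-check sketch (an assumption based on the script above, not the
# original post): fetch a single record and inspect the JSON layout.
import requests

api = ('https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo'
       '?year=&type=&orgName=&pageSize=1&pageNo=1&inextp=3%2C5&limit=1&')
resp = requests.get(api)
resp.raise_for_status()
payload = resp.json()
print(payload.keys())                    # 'records' should appear here
if payload.get('records'):
    print(payload['records'][0].keys())  # expect 'title', 'releaseDate', 'draftPath'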

Entering the company name directly

from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import os
from datetime import datetime
import time
from bs4 import BeautifulSoup

os.chdir(r'D:\Desktop\公告')
companies = ["重庆两江新区开发投资集团有限公司", "重庆城市交通开发投资(集团)有限公司", "重庆渝隆资产经营(集团)有限公司",
             "重庆市江北嘴中央商务区投资集团有限公司", "重庆市地产集团有限公司", "重庆市涪陵国有资产投资经营集团有限公司", "重庆市江津区华信资产经营(集团)有限公司"]

Main routine (one pass per company):

for company in companies:
    try:
        # initialize the browser object
        browser = webdriver.Chrome()
        # open the query page
        browser.get('https://www.chinamoney.com.cn/chinese/cqcwbglm/')
        # wait for the page to load
        time.sleep(5)
        # locate the search box (renamed so it does not shadow the built-in input)
        search_box = browser.find_element(By.ID, "bond-finance-org")
        # type in the company name
        search_box.send_keys(company)
        # locate the query button and click it
        browser.find_elements(By.CLASS_NAME, "san-btn.san-btn-primary")[2].click()
        # switch to the newly opened tab
        browser.switch_to.window(browser.window_handles[1])
        # wait for the page to load
        time.sleep(5)
        # company names and links on the current page
        elements = browser.find_elements(By.CLASS_NAME, "san-grid-m")
        # release dates of the notices
        dates = browser.find_elements(By.CLASS_NAME, "text-date")
        # x counts the files downloaded for this company
        x = 0
        # iterate over the notices on the current page
        for i, j in zip(dates, elements):
            date = datetime.strptime(i.text, '%Y-%m-%d')
            # keep only notices released on or after 2022-01-01
            if date >= datetime(year=2022, month=1, day=1):
                title = j.text
                # link to the page that contains the notice
                link = j.find_element(By.LINK_TEXT, title).get_attribute('href')
                # fetch the notice page (content is bytes, so no encoding fix is needed)
                html = requests.get(link).content
                # parse the page
                soup = BeautifulSoup(html, 'lxml')
                # find returns a single element; find_all returns a ResultSet
                # that has to be iterated over
                info = soup.find_all(name='li')
                x += len(info)

                constant_url = "https://www.chinamoney.com.cn/dqs/cm-s-notice-query/"
                for k in range(len(info)):
                    # tail of the PDF download link, embedded in the onclick attribute
                    sub_url = info[k].a['onclick'].split('+')[1].split('\'')[1]
                    # url is the full PDF download link
                    url = constant_url + sub_url
                    # name is the PDF file name
                    name = info[k].find_all(name='span')[1].string
                    # write the content to a PDF file
                    with open(f'{name}.pdf', 'wb') as f:
                        f.write(requests.get(url).content)
        print(f'{company}: {x}')
        browser.quit()
    except Exception:
        print(f'{company}: an exception occurred')
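
The fixed time.sleep(5) pauses are brittle: too short on a slow connection, wasted time on a fast one. A more robust pattern is Selenium's explicit wait, sketched below as an alternative (not from the original post), reusing the same page URL and the bond-finance-org element ID from the script above.

# Sketch: an explicit wait instead of a fixed sleep. WebDriverWait polls the
# page until the condition holds (or times out after 10 seconds), so the
# script no longer depends on a hard-coded delay. Page URL and element ID
# are reused from the script above.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.chinamoney.com.cn/chinese/cqcwbglm/')
search_box = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.ID, "bond-finance-org"))
)
search_box.send_keys("重庆市地产集团有限公司")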

posted on 2022-06-22 23:15 by RankFan