Web scraping -- bond financial reports from ChinaMoney (中国货币网)
Batch-download bond financial reports from ChinaMoney (chinamoney.com.cn) with Python; the scripts cover a lot of Python web-scraping techniques worth learning from.
The code in the original post no longer downloads correctly; you can use the version below instead.
from bs4 import BeautifulSoup
import os
import bs4
import requests

def getHtml(url):
    # fetch a page with a browser User-Agent and return the decoded HTML
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return res.text

def get_file(title):
    # build the local path for a PDF, creating the target folder if needed
    filename = f'{title}.PDF'
    sub_dir = r'D:\Desktop\公告'
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)  # note: os.makedir does not exist, it must be os.makedirs
    # saving_path = r'D:\Desktop\公告'  # folder where the reports are stored
    filepath = os.path.join(sub_dir, filename)
    return filepath

nums = 10
url = f'https://www.chinamoney.com.cn/ags/ms/cm-u-notice-issue/financeRepo?year=&type=&orgName=&pageSize={nums}&pageNo=1&inextp=3%2C5&limit=1&'
r = requests.get(url)
r.encoding = r.apparent_encoding  # avoid mojibake in the response
data = r.json()

# collect the notice metadata
records = data['records']
items = []
for d in records:
    title = d['title']
    releaseDate = d['releaseDate']
    draftPath = d['draftPath']
    child_url = 'https://www.chinamoney.com.cn/' + draftPath
    item = [title, releaseDate, child_url]
    items.append(item)

    # open the notice page and locate the attachment list
    html = getHtml(child_url)
    soup = BeautifulSoup(html, 'html.parser')
    notice_info = soup.find('div', class_='article-a-attach-body')
    main_url = 'https://www.chinamoney.com.cn/dqs/cm-s-notice-query/'  # constant prefix of the PDF links
    for i in notice_info.select('li'):
        if isinstance(i, bs4.element.Tag):
            # the second half of the PDF URL is hidden in the onclick attribute
            temp = i.a['onclick'].split('+')[1].split('\'')[1]
            url_end = main_url + temp
            print(url_end)
            url_end_r = requests.get(url_end)
            # download the PDF
            title = i.a.text.split('\n')[-2].split('.')[0]
            filepath = get_file(title)
            with open(filepath, 'wb') as f:
                f.write(url_end_r.content)
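The loop above collects each notice's title, release date, and detail-page URL in items, but never saves that metadata anywhere. As a small addition of my own (not in the original post), here is a minimal sketch that writes the list to a CSV with the standard library, assuming the loop above has already populated items; the file name notices.csv is hypothetical:

import csv

# write the collected metadata next to the downloaded PDFs (notices.csv is a hypothetical name)
with open(r'D:\Desktop\公告\notices.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'releaseDate', 'url'])  # header row
    writer.writerows(items)                           # one row per notice

The utf-8-sig encoding keeps the Chinese titles readable when the file is opened in Excel.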
Entering company names directly:
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import os
from datetime import datetime
import time
from bs4 import BeautifulSoup
os.chdir(r'D:\Desktop\公告')
companies = ["重庆两江新区开发投资集团有限公司", "重庆城市交通开发投资(集团)有限公司", "重庆渝隆资产经营(集团)有限公司",
"重庆市江北嘴中央商务区投资集团有限公司", "重庆市地产集团有限公司", "重庆市涪陵国有资产投资经营集团有限公司", "重庆市江津区华信资产经营(集团)有限公司"]
Main loop (iterates over the companies listed above):
for company in companies:
    try:
        # start the browser
        browser = webdriver.Chrome()
        # open the query page
        browser.get('https://www.chinamoney.com.cn/chinese/cqcwbglm/')
        # wait for the page to load
        time.sleep(5)
        # locate the search box and type the company name
        input_box = browser.find_element(By.ID, "bond-finance-org")
        input_box.send_keys(company)
        # locate the query button and click it (compound class names need a CSS selector)
        browser.find_elements(By.CSS_SELECTOR, ".san-btn.san-btn-primary")[2].click()
        # switch to the newly opened tab
        browser.switch_to.window(browser.window_handles[1])
        # wait for the results to load
        time.sleep(5)
        # company names and links in the result list
        elements = browser.find_elements(By.CLASS_NAME, "san-grid-m")
        # release dates of the notices
        dates = browser.find_elements(By.CLASS_NAME, "text-date")
        # x counts the files downloaded for this company
        x = 0
        # walk through the notices on the current page
        for i, j in zip(dates, elements):
            date = datetime.strptime(i.text, '%Y-%m-%d')
            # keep only notices released on or after 2022-01-01
            if date >= datetime(year=2022, month=1, day=1):
                title = j.text
                # link to the page that contains the notice attachments
                link = j.find_element(By.LINK_TEXT, title).get_attribute('href')
                # fetch and parse that page
                res = requests.get(link)
                res.encoding = res.apparent_encoding
                soup = BeautifulSoup(res.content, 'lxml')
                # find() returns a single Tag; find_all() returns a ResultSet that must be iterated
                info = soup.find_all(name='li')
                x += len(info)
                constant_url = "https://www.chinamoney.com.cn/dqs/cm-s-notice-query/"
                for k in range(len(info)):
                    # second half of the PDF download link, hidden in the onclick attribute
                    sub_url = info[k].a['onclick'].split('+')[1].split('\'')[1]
                    # url is the PDF download link
                    url = constant_url + sub_url
                    # name is the PDF file name
                    name = info[k].find_all(name='span')[1].string
                    # write the PDF to disk
                    with open(f'{name}.pdf', 'wb') as f:
                        f.write(requests.get(url).content)
        print(f'{company}: {x}')
        browser.quit()
    except Exception as e:
        print(f'{company} raised an exception: {e}')
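The fixed time.sleep(5) calls make the Selenium version slow and brittle: if the page takes longer than five seconds to load, the element lookups fail. Below is a sketch of how the same opening steps could use Selenium's explicit waits instead; WebDriverWait and expected_conditions are standard Selenium APIs, the locators are the IDs and classes already used above, and the 15-second timeout is an arbitrary choice of mine:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.chinamoney.com.cn/chinese/cqcwbglm/')
wait = WebDriverWait(browser, 15)  # wait up to 15 seconds instead of sleeping a fixed time

# block until the search box is actually present, then type the company name
input_box = wait.until(EC.presence_of_element_located((By.ID, "bond-finance-org")))
input_box.send_keys("重庆两江新区开发投资集团有限公司")

# wait until the query buttons are clickable before clicking the third one
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".san-btn.san-btn-primary")))
browser.find_elements(By.CSS_SELECTOR, ".san-btn.san-btn-primary")[2].click()

# wait for the results tab to open before switching to it
wait.until(EC.number_of_windows_to_be(2))
browser.switch_to.window(browser.window_handles[1])

From that point on, the rest of the loop (reading dates and links, downloading the PDFs) can stay exactly as written above.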