# -*- coding: utf-8 -*- """ Created on Aug 5 2020 @author: lizitingxue """ #基础包 import numpy as np import pandas as pd import os import time ##爬虫 import re import requests from bs4 import BeautifulSoup import urllib.request from selenium import webdriver ##pdf处理 import winerror from win32com import client as wc from win32com.client.dynamic import Dispatch, ERRORS_BAD_CONTEXT ##word信息处理 from docx import Document from openpyxl import Workbook #定义函数 # 定义函数 def get_code(key_word): url = 'http://www.cmbchina.com/cfweb/Personal/Default.aspx' js_bottom = "var q=document.documentElement.scrollTop=10000" browser = webdriver.Chrome() browser.get(url) time.sleep(3) browser.find_element_by_xpath('//*[@id="tbCondition"]/table[1]/tbody/tr[9]/td[2]/ul/li[2]/input[1]').click() browser.find_element_by_xpath('//*[@id="inputPrdKey"]').clear() browser.find_element_by_xpath('//*[@id="inputPrdKey"]').send_keys(key_word) browser.find_element_by_xpath('//*[@id="column_content"]/div[8]/input[1]').click() time.sleep(2) browser.execute_script(js_bottom) time.sleep(2) browser.execute_script(js_bottom) time.sleep(2) ##产品名称和编号 temp_for_code = browser.find_elements_by_css_selector("[class='leftArea']>[class='row']>[class='code']") print(key_word+'系列产品数量为'+str(len(temp_for_code))) temp_for_name = browser.find_elements_by_css_selector('[class="inlineBgImage"]>[class="inlineBriefName"]') temp_code = [] temp_name = [] for j in temp_for_code: temp_code.append(j.text) for i in temp_for_name: temp_name.append(i.text) ###产品pdf的下载链接 url_lst = [] counts = len(browser.find_elements_by_css_selector("[class='viewMore']")) # temp = browser.find_elements_by_css_selector("[class='viewMore']") for i in range(counts): #需要返回上一级 browser.find_elements_by_css_selector("[class='viewMore']")[i].click() time.sleep(3) browser.switch_to.window(browser.window_handles[-1]) browser.find_element_by_id('fmenu_7').click() temp_for_pdf = browser.find_elements_by_css_selector("[class='c_title']>[target = '_blank']") if len(temp_for_pdf)!= 0: pdf = temp_for_pdf[0].get_attribute('href') url_lst.append(pdf) else: url_lst.append("未披露") browser.close() browser.switch_to.window(browser.window_handles[0]) ######保存为excel tarDir = folder_path+"//"+key_word #希望生成code列表文件的位置", if not os.path.exists(tarDir): os.mkdir(tarDir) df = pd.DataFrame({"产品名称":temp_name,"产品代码":temp_code,"投资报告下载链接":url_lst}) print(df) df.to_excel(tarDir+"\\"+key_word+"产品代码和链接.xlsx",index = False) return url_lst def get_pdf(link,key_word): #保存pdf try: r = requests.get(link) r.encoding = r.apparent_encoding html = r.text soup = BeautifulSoup(html,'html.parser') pro_name = soup.find(name='head',attrs={"id":"Head1"}).text.replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '').split("--")[-1] link = soup.find_all(name='a',attrs={"href":re.compile(r'.pdf$')}) if len(link) != 0: mylink = link[0] # name = mylink.string # print(name) url = mylink.get('href') # print(url) filename = pro_name filepath = folder_path + '/' +key_word+'/'+ pro_name +".pdf" urllib.request.urlretrieve(url, filename=filepath) print(pro_name+"报告下载完成~~~~~O(∩_∩)O~") else: print(pro_name+'报告下载失败01') except: print(pro_name+"报告下载失败02") def pdf_to_word(my_dir): ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL) os.chdir(my_dir) file_lst = [x for x in os.listdir('.') if os.path.isfile(x) and os.path.splitext(x)[1]=='.pdf'] for f in file_lst: src = os.path.abspath(f)#原来的地址 # my_pdf = "招商银行零售青葵系列一年定开1号理财计划(产品代码:301025301025B301025C301025D)2020年第2季度报告.pdf" try: AvDoc = Dispatch("AcroExch.AVDoc") if AvDoc.Open(src, ""): pdDoc = AvDoc.GetPDDoc() jsObject = pdDoc.GetJSObject() new = os.path.join(my_dir, f.split(".")[0]+'.docx') jsObject.SaveAs(new, "com.adobe.acrobat.docx")#新的地址 #word格式转换 word = wc.Dispatch('Word.Application') doc = word.Documents.Open(new) doc.SaveAs(new.split(".")[0]+'T'+".docx",16) #16对应于下表中的默认docx文件文件 doc.Close() word.Quit() #把原来的doc文件删除 if os.path.exists(new): os.remove(new) except Exception as e: print(str(e)) finally: AvDoc.Close(True) jsObject = None pdDoc = None AvDoc = None def date_trans(start): publish_Time = start.replace(' ','') array = time.strptime(publish_Time, u"%Y年%m月%d日") publishTime = time.strftime("%Y-%m-%d", array) return publishTime def basic_info(file): wb = Workbook() sheet = wb.active header = ['产品名称', '业绩比较基准', '产品风险评级', '杠杆上限', '理财产品份额总额(亿元)', '理财产品成立日','理财产品到期日', '直接_债券','直接_一般债权','直接_资产支持证券','直接_现金及货币工具','直接_含权债权','直接_权益类','直接_商品及金融衍生品类','直接_合计', '间接_债券','间接_一般债权','间接_资产支持证券','间接_现金及货币工具','间接_含权债权','间接_权益类','间接_商品及金融衍生品类','间接_合计',] sheet.append(header) for doc in file: document = Document(folder_path + "\\"+key_word + "\\"+doc) # 读取word中的所有表格 tables = document.tables #基本信息 table0= tables[0] name = table0.cell(1, 1).text.strip().replace(' ','')[4:][:-4] compe = table0.cell(6, 1).text.strip().replace(' ','')[2:] risk = table0.cell(7,1).text.strip().replace(' ','') leverage = table0.cell(8,1).text.strip().replace(' ','') total = table0.cell(9,1).text.strip().replace(' ','') start = date_trans(table0.cell(10,1).text.strip().replace(' ','')) end = date_trans(table0.cell(11,1).text.strip().replace(' ','')[0:11]) #投资标的 invest = tables[1] print(doc+"投资表格行数"+str(len(invest.rows))+"-"*20) temp_direct = [] temp_indirect = [] for i in range(8): try: # 直接投资 # name = invest.cell(i+3,1) amount = invest.cell(i+1,2).text.replace("\n","") temp_direct.append(amount) # 间接投资 # name = invest.cell(i+3,1) amount = invest.cell(i+1,4).text.replace("\n","") temp_indirect.append(amount) except Exception as error: # 捕获异常,也可以用log写到日志里方便查看和管理 print(error) continue #格式化输出基本信息 row = [name,compe,risk,leverage,total,start,end]+temp_direct+temp_indirect sheet.append(row) wb.save(file_path+'\\'+key_word+'基本信息.xlsx') def nonstan_info(file): res = pd.DataFrame(columns = ['产品名称','项目名称','融资客户名称','交易日','项目剩余融资期限(单位:天)','报告期内到期收益率预估(%)','交易结构']) for doc in file: document = Document(folder_path + "\\"+key_word + "\\"+doc) # 读取word中的所有表格 tables = document.tables #非标信息 name = tables[0].cell(1, 1).text.strip().replace(' ','')[4:][:-4] nonstan = tables[-2] # print(len(nonstan.rows))#带表头 nonstan_all = [[] for _ in range(len(nonstan.rows)-1)] if nonstan.cell(1,1).text != '-': for l in range(len(nonstan.rows)-1): for num in range(6): nonstan_all[l].append(nonstan.cell(l+1,num).text.replace("\n",'').replace(" ","")) # print(nonstan_all)#不带表头 print(name + '非标项目数量为{0}'.format(len(nonstan_all))) # print("-"*20) if nonstan_all != [[]]: for x in range(len(nonstan_all)): temp = {'产品名称':name,'项目名称':nonstan_all[x][0],'融资客户名称':nonstan_all[x][1],'交易日':nonstan_all[x][2],'项目剩余融资期限(单位:天)':nonstan_all[x][3],'报告期内到期收益率预估(%)':nonstan_all[x][4],'交易结构':nonstan_all[x][5]} res = res.append(temp,ignore_index=True) res.to_excel (file_path+'\\'+key_word +'非标投资情况.xlsx', index = False, header=True) ##爬取过程 #定义路径和爬取产品名称 folder_path = 'F:\\PY\\爬虫\\投资报告备份' key_word = "卓远" #爬取数据 url_lst = get_code(key_word) #下载报告 for link in url_lst: if link != "未披露": get_pdf(link,key_word) #pdf解析 my_dir = folder_path+'/'+key_word pdf_to_word(my_dir) #提取投资报告信息 file_path = folder_path+"\\"+key_word file = [x for x in os.listdir(file_path) if os.path.isfile(x) and os.path.splitext(x)[1]=='.docx'] file #保存信息 basic_info(file) nonstan_info(file)
有问题欢迎留言哦~~~O(∩_∩)O