# 股票数据定向爬虫.py (亲测有效) -- targeted stock/fund data crawler (tested and working)
import requests
from bs4 import BeautifulSoup
import traceback
import re

# One 't' or 'r' followed by six digits; applied to the id of each <tr> row.
# For an id such as 'tr000001' the first match is 'r000001'.
_CODE_PATTERN = re.compile(r'[tr]\d{6}')


def getHTMLText(url, code='utf-8'):
    """Fetch *url* with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        code: Encoding used to decode the response body.

    Returns:
        The decoded body, or an empty string when the request fails
        (connection error, timeout, invalid URL, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Treat 4xx/5xx as failures so callers never try to parse an error page.
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        return ""


def getStockList(lst, stockURL):
    """Append the codes found on the listing page to *lst* (in place).

    Every <tr> element whose ``id`` attribute contains a match for
    ``[tr]\\d{6}`` contributes the first such match, i.e. seven characters:
    one prefix letter plus six digits. Rows without a matching id are skipped.

    Args:
        lst: List that receives the matched strings.
        stockURL: URL of the listing page.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for row in soup.find_all('tr'):
        row_id = row.attrs.get('id')
        if not isinstance(row_id, str):
            continue
        match = _CODE_PATTERN.search(row_id)
        if match:
            lst.append(match.group())


def _parseFundPage(html):
    """Build the record dict (name plus dt/dd pairs) from one detail page.

    Raises:
        ValueError: If the 'merchandiseDetail' container or the
            'fundDetail-tit' element is missing.
        IndexError: If the title element contains no text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    stockInfo = soup.find('div', attrs={'class': 'merchandiseDetail'})
    if stockInfo is None:
        raise ValueError("detail page has no div with class 'merchandiseDetail'")
    name = stockInfo.find(attrs={'class': 'fundDetail-tit'})
    if name is None:
        raise ValueError("detail page has no element with class 'fundDetail-tit'")

    infoDict = {'股票名称': name.text.split()[0]}
    # zip() pairs labels with values and stops at the shorter list, so a page
    # with unequal <dt>/<dd> counts still yields the pairs that do line up.
    for dt, dd in zip(stockInfo.find_all('dt'), stockInfo.find_all('dd')):
        key = dt.text
        print(key)  # echo each field label as it is read
        infoDict[key] = dd.text
    return infoDict


def getStockInfo(lst, stockURL, fpath):
    """Download the detail page for every entry of *lst* and append one
    record per page to the file at *fpath*.

    The page URL is ``stockURL + entry[1:] + ".html"`` (the one-character
    prefix of each entry is dropped). Each record is written as the ``str()``
    of a dict followed by a newline, UTF-8 encoded. Pages that cannot be
    fetched are skipped; pages that cannot be parsed or written have their
    traceback printed and are skipped. A progress percentage is printed
    after every entry, including skipped ones.

    Args:
        lst: Entries as produced by ``getStockList``.
        stockURL: Base URL that the code and ".html" are appended to.
        fpath: Path of the output file, opened in append mode.
    """
    total = len(lst)
    for done, stock in enumerate(lst, start=1):
        html = getHTMLText(stockURL + stock[1:] + ".html")
        if html != "":
            try:
                infoDict = _parseFundPage(html)
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
            except Exception:
                # Best effort: one bad page must not abort the whole crawl,
                # but Ctrl-C (KeyboardInterrupt) is allowed to propagate.
                traceback.print_exc()
        print('\r当前速度:{:.2f}%'.format(done * 100 / total), end='')


def main():
    """Crawl the listing page, then write every detail record to disk."""
    stock_list_url = 'https://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1'
    stock_info_url = 'https://fund.eastmoney.com/'
    output_file = 'D://桌面//BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()