# 爬虫案例 提取股票信息 (Crawler example: extract stock information)
# Ref: https://blog.csdn.net/weixin_50437588/article/details/119481864
import requests
from bs4 import BeautifulSoup
import re
import traceback
def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection,
            which would freeze the whole crawl.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the server
            answers with an error status.
        requests.Timeout: Raised when the server does not respond within
            *timeout* seconds.
    """
    # Send a browser-like user agent with the request.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
        ),
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # Decode with the encoding detected from the body content rather than
    # the one requests guessed from the response headers.
    res.encoding = res.apparent_encoding
    return res.text
def getStock(lst, html):
    """Extract stock codes from the stock-list page.

    Every ``<a>`` that is a direct child of ``<section class="stockTable">``
    contributes the last non-empty path segment of its ``href``.

    Args:
        lst: Unused; kept so existing callers keep working.
        html: Markup of the stock-list page.

    Returns:
        A list of code strings. The list is empty when the page cannot be
        parsed or the stock table is missing, so callers can always iterate
        over the result.
    """
    sh_stock = []
    try:
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('section', class_='stockTable')
        if table is None:
            print('获取股票代码错误')
            return sh_stock
        for node in table.children:
            # Text nodes between the anchors have no tag name; skip them.
            if node.name != 'a':
                continue
            href = node.get('href')
            if not href:
                # An anchor without a link must not discard the codes
                # collected so far.
                continue
            segments = [part for part in href.split('/') if part]
            if segments:
                sh_stock.append(segments[-1])
    except Exception:
        # Best effort: report the failure and return whatever was collected.
        print('获取股票代码错误')
    return sh_stock
def save_stock(lst, path_file):
    """Fetch the detail page of every stock in *lst* and append it to a file.

    For each code the page ``https://hq.gucheng.com/<code>/`` is downloaded,
    the stock name is read from the first ``<h1>``, and the ``<dt>``/``<dd>``
    pairs inside ``<div class="stock_top clearfix">`` are collected into a
    dict. The dict is written as one ``str(dict)`` line to *path_file*.
    A progress percentage is printed after every stock.

    Args:
        lst: Sized collection of stock code strings. ``None`` or an empty
            collection is a no-op.
        path_file: Path of the UTF-8 text file to append to.

    Returns:
        None.
    """
    if not lst:
        return

    host = 'https://hq.gucheng.com/'
    total = len(lst)
    count = 0
    for each in lst:
        try:
            html = getHtml(host + each + '/')
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', class_='stock_top clearfix')
            info_stock = {
                '股票名称': soup.find('h1').string.strip(),
                '股票代码': each,
            }
            # zip pairs labels with values up to the shorter list, so a
            # count mismatch no longer discards the whole record.
            pairs = zip(stockInfo.find_all('dt'), stockInfo.find_all('dd'))
            for label, value in pairs:
                info_stock[label.string] = value.string
            # Reopen per record so finished records are on disk even if a
            # later stock aborts the run.
            with open(path_file, 'a', encoding='utf-8') as f:
                f.write(str(info_stock) + '\n')
            count += 1
        except Exception:
            # One bad page must not stop the crawl; log it and move on.
            traceback.print_exc()
        print('\r当前完成度: {:.2f} %'.format(count * 100 / total), end=" ")
def main():
    """Crawl the stock list page, then save every listed stock's details.

    The stock codes are read from the list page and each stock's detail
    record is appended to ``p44_stock_information.txt``.
    """
    stock_list_url = r'https://hq.gucheng.com/gpdmylb.html'
    path_file = 'p44_stock_information.txt'

    html = getHtml(stock_list_url)
    # getStock ignores its first argument; an empty list satisfies the
    # signature.
    stock_info = getStock([], html)
    if not stock_info:
        # getStock yields nothing usable when parsing fails; there is
        # nothing to save in that case.
        return
    save_stock(stock_info, path_file)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()