欢迎来到RankFan的Blogs

扩大
缩小

爬虫案例 提取股票信息

Ref:https://blog.csdn.net/weixin_50437588/article/details/119481864

import requests
from bs4 import BeautifulSoup
import re
import traceback

def getHtml(url, timeout=10):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The page content as ``str``, decoded with the encoding that
        ``requests`` detects from the body (``apparent_encoding``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Adjacent literals are used instead of a backslash continuation inside the
    # string, which would embed the next line's indentation in the header value.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
        )
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # Use the encoding detected from the body rather than the one guessed from
    # the HTTP headers before decoding.
    res.encoding = res.apparent_encoding
    return res.text

def getStock(lst, html):
    """Extract stock codes from the stock-list page.

    The codes are taken from the ``href`` of every ``<a>`` that is a direct
    child of ``<section class="stockTable">``: the last non-empty path segment
    of the link is used as the code.

    Args:
        lst: Unused; kept so existing callers keep working.
        html: HTML text of the stock-list page.

    Returns:
        A list of stock-code strings. The list is empty when the section is
        missing or the page cannot be parsed, so callers can always iterate
        the result.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('section', class_='stockTable')
        if table is None:
            print('获取股票代码错误')
            return []

        sh_stock = []
        for each in table.children:
            # Text nodes between the anchors have no tag name; skip them.
            if getattr(each, 'name', None) != 'a':
                continue
            href = each.get('href')
            if not href:
                continue
            segments = [part for part in href.split('/') if part]
            # A link such as "/" has no usable segment; skip it rather than
            # discarding the whole list.
            if segments:
                sh_stock.append(segments[-1])
        return sh_stock
    except Exception:
        # Best effort: report the problem but still hand back a list.
        print('获取股票代码错误')
        traceback.print_exc()
        return []

def save_stock(lst, path_file):
    """Fetch the detail page of each stock and append its fields to a file.

    For every code in *lst* the page ``https://hq.gucheng.com/<code>/`` is
    downloaded with ``getHtml``. The stock name is read from the first
    ``<h1>``, and the ``<dt>``/``<dd>`` elements inside
    ``<div class="stock_top clearfix">`` are paired in document order to form
    the remaining key/value entries. Each record is appended to *path_file*
    as one line containing ``str()`` of the resulting dict.

    A failure on a single stock (network error, unexpected page layout)
    prints the traceback and moves on to the next code. Progress is printed
    after every code and counts only successfully saved records.

    Args:
        lst: List of stock-code strings. ``None`` or an empty list means
            there is nothing to do.
        path_file: Path of the UTF-8 text file to append to.
    """
    if not lst:
        return

    host = 'https://hq.gucheng.com/'
    total = len(lst)
    count = 0

    for each in lst:
        try:
            html = getHtml(host + each + '/')
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', class_='stock_top clearfix')

            info_stock = {
                '股票名称': soup.find('h1').string.strip(),
                '股票代码': each,
            }
            # zip pairs labels with values and stops at the shorter list, so a
            # surplus <dt> no longer aborts the whole record.
            for key_tag, value_tag in zip(stockInfo.find_all('dt'),
                                          stockInfo.find_all('dd')):
                info_stock[key_tag.string] = value_tag.string

            # Append right away so finished records survive a later failure.
            with open(path_file, 'a', encoding='utf-8') as f:
                f.write(str(info_stock) + '\n')
            count += 1
        except Exception:
            # Exception (not a bare except) keeps Ctrl-C able to stop the crawl.
            traceback.print_exc()
        print('\r当前完成度: {:.2f} %'.format(count * 100 / total), end=" ")

def main():
    """Run the scrape: fetch the stock-list page, extract codes, save details.

    The list page is downloaded with ``getHtml``, the codes are extracted with
    ``getStock``, and ``save_stock`` writes one record per code to
    ``p44_stock_information.txt``.
    """
    output_path = 'p44_stock_information.txt'
    listing_html = getHtml(r'https://hq.gucheng.com/gpdmylb.html')

    # getStock takes a list as its first argument; an empty one is supplied.
    codes = getStock([], listing_html)
    save_stock(codes, output_path)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

posted on 2021-09-12 18:34  RankFan  阅读(73)  评论(0)  编辑  收藏  举报

导航