Sharing constituent-stock data for 同花顺 (10jqka) concept sectors
I scraped the constituent stocks of the concept sectors on 同花顺 (10jqka).
The main motivation was that none of the quant platforms I looked at offer this data source, so many strategies simply can't be built. I figure quite a few people could use this data, so I'm posting the link here; download it if you need it:
http://pan.baidu.com/s/1eSGSS5W
The data has four columns: sector code, sector name, constituent stock code, and the corresponding company name.
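As a quick usage sketch, here is how you might load the file and pull out one sector's constituents with pandas (this assumes the columns match the BK_ID / BK_NAME / S_ID / S_NAME fields that the script below writes out; the sector name 人工智能 is just an illustrative example):

import pandas as pd

# Load the constituent-stock table (keep stock codes as strings to preserve leading zeros)
df = pd.read_csv('chengfengu.csv', dtype={'S_ID': str})

# List the available sector names, then filter one of them
print(df['BK_NAME'].unique())
ai = df[df['BK_NAME'] == '人工智能'][['S_ID', 'S_NAME']]
print(ai.head())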
The source code is posted below. I'm still new to web scraping, so it's pretty rough...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 17 19:41:44 2017

@author: Mr.ZeroW

同花顺 (10jqka) sector constituent stocks
"""

# Each sector has its own paginated URL, e.g.
# http://q.10jqka.com.cn/gn/detail/order/desc/page/1/ajax/1/code/300018
# We first need to find out how many pages each sector has before building the URLs.
import urllib.request
from lxml import etree
import pandas as pd
import time

# Scrape the sector names and codes, then save them to a file
with urllib.request.urlopen('http://q.10jqka.com.cn/gn/') as f:
    text = f.read().decode('gb2312')

html = etree.HTML(text)

# Sector names
gnbk = html.xpath('/html/body/div[2]/div[1]/div//div//div//a')
thsgnbk = []
for i in range(len(gnbk)):
    thsgnbk.append(gnbk[i].text)

# Sector codes (second-to-last path segment of each sector link)
bkcode = html.xpath('/html/body/div[2]/div[1]/div//div//div//a/@href')
bkcode = list(map(lambda x: x.split('/')[-2], bkcode))
data = {'Name': thsgnbk}

# Save to disk
gnbk = pd.DataFrame(data, index=bkcode)
gnbk.to_csv('gnbk.csv')

print('Sector names and codes scraped; saved to gnbk.csv')

# Load the sector names and codes back in
data = pd.read_csv('gnbk.csv')

# Build four lists: sector id, sector name, stock id, stock name
bk_id = []
bk_name = []
s_id = []
s_name = []
iCount = 1
print('Scraping started!')
start = time.time()
for i in range(len(data)):
    bk_code = str(data.iloc[i, 0])
    name = str(data.iloc[i, 1])
    url = 'http://q.10jqka.com.cn/gn/detail/code/' + bk_code + '/'
    print('%d: %s' % (iCount, name))
    iCount += 1

    with urllib.request.urlopen(url) as f:
        text = f.read().decode('GBK', 'ignore')

    # Find how many pages of constituents this sector has
    html = etree.HTML(text)
    result = html.xpath('//*[@id="m-page"]/span/text()')
    try:
        page = int(result[0].split('/')[-1])
        for j in range(page):
            page_n = str(j + 1)
            curl = ('http://q.10jqka.com.cn/gn/detail/order/desc/page/'
                    + page_n + '/ajax/1/code/' + bk_code)
            with urllib.request.urlopen(curl) as f:
                text = f.read().decode('GBK')
            html = etree.HTML(text)
            # Constituent stock codes
            stock_code = html.xpath('/html/body/table/tbody/tr/td[2]/a/text()')
            # Constituent stock names
            stock_name = html.xpath('/html/body/table/tbody/tr/td[3]/a/text()')
            s_id += stock_code
            s_name += stock_name
            bk_id.extend([bk_code] * len(stock_code))
            bk_name.extend([name] * len(stock_name))
    except IndexError:
        # No pager element: the sector fits on a single page,
        # so read the table straight from the detail page
        with urllib.request.urlopen(url) as f:
            text = f.read().decode('GBK')
        html = etree.HTML(text)
        stock_code = html.xpath('//*[@id="maincont"]/table/tbody/tr/td[2]/a/text()')
        stock_name = html.xpath('//*[@id="maincont"]/table/tbody/tr/td[3]/a/text()')
        s_id += stock_code
        s_name += stock_name
        bk_id.extend([bk_code] * len(stock_code))
        bk_name.extend([name] * len(stock_name))

data_dict = dict(BK_ID=bk_id, BK_NAME=bk_name, S_ID=s_id, S_NAME=s_name)
cdata = pd.DataFrame(data_dict)
cdata.to_csv('chengfengu.csv')
end = time.time()
print('Scraping finished!!\nStart time: %s\nEnd time: %s\n' % (time.ctime(start), time.ctime(end)))
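One caveat if you rerun the script yourself: sites like 10jqka may throttle or reject bare urllib requests, so it can help to send a browser-like User-Agent, set a timeout, and pause between requests. A minimal fetch helper along those lines (the header value, retry count, and delay are my assumptions, not part of the original script):

import time
import urllib.request

def fetch(url, retries=3, delay=1.0):
    # Attach a browser-like User-Agent; bare urllib requests are easy to block
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(req, timeout=10) as f:
                return f.read().decode('gbk', 'ignore')
        except Exception:
            if attempt == retries - 1:
                raise
        time.sleep(delay)  # brief pause before retrying

# Example: text = fetch('http://q.10jqka.com.cn/gn/')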