Sharing 同花顺 (10jqka) Concept Sector Constituent Stock Data

I scraped the constituent stocks of the concept sectors on 同花顺 (10jqka.com.cn).

The main reason is that none of the quant platforms offer this data source, so many strategies simply can't be built. I figure quite a few people could use this data, so I'm putting the link here; feel free to download it if you need it.

http://pan.baidu.com/s/1eSGSS5W

The data has four columns: sector code, sector name, constituent stock code, and the corresponding company name.
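For reference, here is a minimal sketch of loading the file and pulling out one sector's constituents with pandas. It assumes the shared CSV has the same layout as the chengfengu.csv written by the script below (columns BK_ID, BK_NAME, S_ID, S_NAME plus an index column); the sector name used for filtering is only an illustrative value.

import pandas as pd

# Read codes as strings so leading zeros (e.g. 000001) are not dropped.
cdata = pd.read_csv('chengfengu.csv', index_col=0,
                    dtype={'S_ID': str, 'BK_ID': str})

# All constituents of one concept sector, looked up by sector name.
picked = cdata[cdata['BK_NAME'] == '人工智能']
print(picked[['S_ID', 'S_NAME']])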

The source code is posted below. I'm just starting out with web scraping, so it's pretty ugly...

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 17 19:41:44 2017

@author: Mr.ZeroW

同花顺 (10jqka) sector constituent stocks
"""

# Each sector has its own paged URL, e.g.
# http://q.10jqka.com.cn/gn/detail/order/desc/page/1/ajax/1/code/300018
# so we first need to work out how many pages a sector has before we can
# build all of its page URLs.
import urllib.request
from lxml import etree
import pandas as pd
import time
# Scrape the sector names and codes from the index page and save them to a file.
with urllib.request.urlopen('http://q.10jqka.com.cn/gn/') as f:
    text = f.read().decode('gb2312')

html = etree.HTML(text)

# Sector names come from the link text of each sector entry.
gnbk = html.xpath('/html/body/div[2]/div[1]/div//div//div//a')
thsgnbk = []
for i in range(len(gnbk)):
    thsgnbk.append(gnbk[i].text)

# Sector codes are the second-to-last path segment of each sector link.
bkcode = html.xpath('/html/body/div[2]/div[1]/div//div//div//a/@href')
bkcode = list(map(lambda x: x.split('/')[-2], bkcode))
data = {'Name': thsgnbk}

# Save the sector list.
gnbk = pd.DataFrame(data, index=bkcode)
gnbk.to_csv('gnbk.csv')

print('Sector names and codes scraped; saved to gnbk.csv')
# Reload the sector names and codes.
data = pd.read_csv('gnbk.csv')
# Collect four columns: sector id, sector name, constituent stock id,
# constituent stock name.

bk_id = []
bk_name = []
s_id = []
s_name = []
iCount = 1
print('Scraping started!')
start = time.time()
for i in range(len(data)):

    bk_code = str(data.iloc[i, 0])
    name = str(data.iloc[i, 1])
    url = 'http://q.10jqka.com.cn/gn/detail/code/' + bk_code + '/'
    print('%d: %s' % (iCount, name))
    iCount += 1

    with urllib.request.urlopen(url) as f:
        text = f.read().decode('GBK', 'ignore')

    # Work out how many pages of constituents this sector has.
    html = etree.HTML(text)

    result = html.xpath('//*[@id="m-page"]/span/text()')
    try:
        page = int(result[0].split('/')[-1])
        for j in range(page):
            page_n = str(j + 1)
            curl = ('http://q.10jqka.com.cn/gn/detail/order/desc/page/'
                    + page_n + '/ajax/1/code/' + bk_code)
            with urllib.request.urlopen(curl) as f:
                text = f.read().decode('GBK')
            html = etree.HTML(text)
            # Constituent stock codes
            stock_code = html.xpath('/html/body/table/tbody/tr/td[2]/a/text()')
            # Constituent stock names
            stock_name = html.xpath('/html/body/table/tbody/tr/td[3]/a/text()')
            s_id += stock_code
            s_name += stock_name
            bk_id.extend([bk_code] * len(stock_code))
            bk_name.extend([name] * len(stock_name))

    except IndexError:
        # No pager element: the sector fits on a single page, so parse the
        # detail page we already fetched the URL for.
        curl = url
        with urllib.request.urlopen(curl) as f:
            text = f.read().decode('GBK')
        html = etree.HTML(text)
        # Constituent stock codes
        stock_code = html.xpath('//*[@id="maincont"]/table/tbody/tr/td[2]/a/text()')
        # Constituent stock names
        stock_name = html.xpath('//*[@id="maincont"]/table/tbody/tr/td[3]/a/text()')
        s_id += stock_code
        s_name += stock_name
        bk_id.extend([bk_code] * len(stock_code))
        bk_name.extend([name] * len(stock_name))

# Save everything to one file.
data_dict = dict(BK_ID=bk_id, BK_NAME=bk_name, S_ID=s_id, S_NAME=s_name)
cdata = pd.DataFrame(data_dict)
cdata.to_csv('chengfengu.csv')
end = time.time()
print('Scraping finished!!\nStart time: %s\nEnd time: %s\n'
      % (time.ctime(start), time.ctime(end)))
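
Since the point of the data is to drive strategies, here is a minimal sketch of turning the output back into a sector-to-constituents mapping; the file and column names come from the script above, everything else is illustrative.

import pandas as pd

# Rebuild a {sector name: [stock codes]} mapping from the scraped file.
cdata = pd.read_csv('chengfengu.csv', index_col=0,
                    dtype={'S_ID': str, 'BK_ID': str})
sector_map = cdata.groupby('BK_NAME')['S_ID'].apply(list).to_dict()

# Example: the ten sectors with the most constituents.
largest = sorted(sector_map.items(), key=lambda kv: len(kv[1]), reverse=True)
for bk, codes in largest[:10]:
    print(bk, len(codes))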

 

posted @ 2017-11-18 01:12  Mr.ZeroW