This week I picked up web scraping with Python. Since the goal was only to extract data, the code is rough; it crawls the news and research reports for every stock on 同花顺财经 (10jqka), 新浪财经 (Sina Finance), and 东方财富网 (Eastmoney).
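Every page reader in the script below follows the same pattern: send the request with a browser User-Agent, gunzip the response if the server compressed it, then hand the HTML to BeautifulSoup. Here is a minimal sketch of that pattern with the site-specific selectors stripped out; fetch_soup is a hypothetical helper name of mine, not something the script defines:

import gzip
import urllib.request
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # pretend to be a browser

def fetch_soup(url):
    # Hypothetical helper: request the page, decompress if gzipped, parse.
    req = urllib.request.Request(url=url, headers=HEADERS)
    raw = urllib.request.urlopen(req).read()
    try:
        raw = gzip.decompress(raw)  # some servers gzip the body anyway
    except OSError:
        pass                        # already plain HTML
    return BeautifulSoup(raw.decode('utf-8'), 'html5lib')

# e.g. soup = fetch_soup('http://finance.eastmoney.com/yaowen.html')
#      print(soup.title.string)

The full script repeats this fetch-and-parse step in every reader method: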
# -*- coding: utf-8 -*-
import urllib.request
import re
import time
import random
import gzip
from bs4 import BeautifulSoup


def get_stock_code():
    # Fetch the SZ50 constituent list via tushare and return the code column.
    import tushare as ts
    df_stock = ts.get_sz50s()
    code_list = df_stock['code']
    return code_list


def saveFile(data, i):
    # Dump a list of records to a GBK-encoded text file, one item per line.
    path = "E:\\projects\\paper_" + str(i + 1) + ".txt"
    file = open(path, 'wb')
    page = '第' + str(i + 1) + '\n'
    file.write(page.encode('gbk'))
    for d in data:
        d = str(d) + '\n'
        file.write(d.encode('gbk'))
    file.close()


def ungzip(data):
    # Transparently decompress gzip responses; pass plain responses through.
    try:
        data = gzip.decompress(data)
    except OSError:
        print("Response is not gzip-compressed, skipping decompression...")
    return data


# Spider class (the name is left over from an earlier CSDN blog crawler).
class CSDNSpider:
    def __init__(self, pageIdx=1, url="http://blog.csdn.net/fly_yr/article/list/1"):
        # Default to the current page.
        self.pageIdx = pageIdx
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
        self.headers = {
            # "Connection": "keep-alive",
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            # "Accept-Encoding": "gzip, deflate, sdch",
            # "Accept-Language": "zh-CN,zh;q=0.8",
            # "Host": "data.eastmoney.com"
        }

    def readData_tonghuashun_report(self, urlweb, code):
        # Parse a 10jqka research-report snapshot page: date, broker, body.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read()).decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        info = soup.find('div', 'kuaizhaolefttitle aDBlue nu')
        info = info.find('p').string.split(' ')
        time = info[0]
        source = info[1].split('机构:')[1]
        source = source.split('\xa0')[0]
        content = str(soup.find('div', 'kuaizhao_contant aDBlue nu'))
        content = content.split('<br/><br/>')
        contenttemp = ''
        for i in content:
            contenttemp = contenttemp + i + '\n'
        # Keep only the text between the first '>' and the next '<'.
        contenttemp = contenttemp.split('>')[1]
        contenttemp = contenttemp.split('<')[0]
        print(contenttemp)
        article = time + '\n' + source + '\n' + contenttemp
        return article

    def readData_tonghuashun_news(self, urlweb, code):
        # Parse a 10jqka news page: timestamp, source name, body text.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        soup = BeautifulSoup(data, "html5lib")
        time = soup.find('span', 'time').string
        source = soup.find('span', {"id": 'source_baidu'})
        source = str(source.find('a')).split('>')[1]
        # split() collapses the nested space/newline/tab splitting the
        # original used to isolate the first non-blank token.
        source = source.split()[0]
        contents = soup.find('div', 'atc-content').find_all('p')
        content = ''
        for i in contents:
            content = content + str(i) + '\n'
        # Keep only CJK characters, digits and basic punctuation, which
        # strips the HTML tags that str(i) left in the text.
        contenttemp = ''
        for s in content:
            if s == '。' or s == ',' or s == '\n' or s.isdigit():
                contenttemp += s
                continue
            if u'\u4e00' <= s <= u'\u9fff':
                contenttemp += s
        article = time + '\n' + source + '\n' + contenttemp
        return article

    def readData_sina_report(self, urlweb, code):
        # Parse a Sina Finance research-report page.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        soup = BeautifulSoup(data, "html5lib")
        time = ''
        source = ''
        infos = soup.find_all('div', 'creab')
        for info in infos:
            for i in info.find_all('span'):
                for j in i:
                    if ":" not in j:
                        source = j.string
                    if "日期" in j:
                        time = j.split(':')[-1]
        contents = soup.find_all('p')
        contentAll = ""
        for content in contents:
            contentAll += str(content)
        print(contentAll)
        # The original never returned anything, but save_file_sina() below
        # expects the same "code\ntime\nsource\ncontent" layout as the
        # news reader, so return that here.
        return code + '\n' + time + '\n' + source + '\n' + contentAll

    def readData_sina_news(self, urlweb, code):
        # Parse a Sina Finance news page.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read()).decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        # The "time-source" span holds the timestamp and the source name,
        # separated by whitespace.
        tokens = soup.find('span', 'time-source').string.split()
        time = tokens[0]
        source = tokens[1]
        contents = soup.find('div', 'article article_16').find_all('p')
        content = ''
        for i in range(1, len(contents)):  # skip the leading <p>
            try:
                content += contents[i].string + '\n'
            except TypeError:  # .string is None for nested markup
                pass
        article = code + '\n' + time + '\n' + source + '\n' + content
        print(article)
        return article

    def readData_eastmoney_news(self, urlweb):
        # Parse an Eastmoney news page.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read()).decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        time = soup.find_all('div', 'time')[0].string
        source = str(soup.find('div', 'source').find("img").attrs["alt"])
        contents = soup.find_all('p')
        contentStr = ''
        for p in contents:
            try:
                contentStr += p.string + '\n'
            except TypeError:
                pass
        article = time + '\n' + source + '\n' + contentStr
        return article

    def readData_eastmoney_report(self, urlweb):
        # Parse an Eastmoney research-report page.
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        soup = BeautifulSoup(data, "html5lib")
        infos = soup.find('div', 'report-infos').find_all('span')
        # First non-blank token of each span is the value we want; split()
        # replaces the nested space/newline/tab loops of the original.
        time = infos[1].string.split()[0]
        source = infos[2].string.split()[0]
        contents = soup.find_all('p')
        contentStr = ''
        for p in contents:
            try:
                contentStr += p.string + '\n'
            except TypeError:
                pass
        article = time + '\n' + source + '\n' + contentStr
        return article

    def getAllUrl(self, url):
        # Collect every href on a page with a simple regex.
        import requests
        r = requests.get(url)
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", r.text)
        return link_list

    def getAllUrl2(self, url):
        # Gather links from the Eastmoney headline/comment/must-read pages.
        url_list = []
        for page in ('http://finance.eastmoney.com/yaowen.html',
                     'http://finance.eastmoney.com/pinglun.html',
                     'http://stock.eastmoney.com/bidu.html'):
            url_list.extend(self.getAllUrl(page))
        return url_list


def save_file_sina():
    # Crawl every linked article for each SZ50 stock on Sina Finance and
    # file it under abc/sina/<code>/<date>/<source>.txt.
    import os
    import csv
    cs = CSDNSpider()
    code_list = get_stock_code()
    for code in code_list:
        code = str(code)
        URL = "http://money.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh" + code + "&Page=1"
        print(URL)
        url_list = cs.getAllUrl(URL)
        times = []
        sources = []
        count = 0
        for i in url_list:
            try:
                if 'search' in i:
                    article = cs.readData_sina_report(i, code)
                else:
                    article = cs.readData_sina_news(i, code)
                article = article.split('\n')
                code = article[0]
                # Normalise "2017年02月28日" style dates to "2017-02-28".
                timetemp = article[1].split(' ')[0].split('日')[0]
                if "年" in timetemp:
                    timetemp = timetemp.replace('年', '-')
                    timetemp = timetemp.replace('月', '-')
                sourcetemp = article[2]
                path = 'abc/sina/' + code + '/' + timetemp + '/'
                print(path)
                if timetemp in times:
                    if sourcetemp in sources:
                        # Seen this date and source before: append.
                        csvfile = open(path + '/' + sourcetemp + '.txt', 'a')
                    else:
                        # New source on a known date: start a new file.
                        sources.append(sourcetemp)
                        csvfile = open(path + '/' + sourcetemp + '.txt', 'w')
                else:
                    # First article for this date: create the directory.
                    os.makedirs(path)
                    times.append(timetemp)
                    csvfile = open(path + '/' + sourcetemp + '.txt', 'w')
                writer = csv.writer(csvfile)
                writer.writerow(article)
                csvfile.close()
                print(count)
                count = count + 1
            except Exception:
                pass


def save_file_eastmoney():
    # Same filing scheme for Eastmoney, keyed on whether a link points to
    # a news page or a report page.
    import os
    import csv
    cs = CSDNSpider()
    code_list = get_stock_code()
    for code in code_list:
        code = str(code)
        url = "http://quote.eastmoney.com/sh" + code + ".html"
        url_list = cs.getAllUrl(url)
        times = []
        mechanisms = []
        count = 0
        for url in url_list:
            try:
                if "news" in url:
                    article = cs.readData_eastmoney_news(url)
                elif "report" in url:
                    article = cs.readData_eastmoney_report(url)
                else:
                    continue
                article = article.split('\n')
                timetemp = article[0].split('日')[0]
                if "年" in timetemp:
                    timetemp = timetemp.replace('年', '-')
                    timetemp = timetemp.replace('月', '-')
                mechanismtemp = article[1]
                path = 'abc/eastmoney/' + code + '/' + timetemp
                print(path)
                if timetemp in times:
                    if mechanismtemp in mechanisms:
                        csvfile = open(path + '/' + mechanismtemp + '.txt', 'a')
                    else:
                        mechanisms.append(mechanismtemp)
                        csvfile = open(path + '/' + mechanismtemp + '.txt', 'w')
                else:
                    os.makedirs(path)
                    times.append(timetemp)
                    csvfile = open(path + '/' + mechanismtemp + '.txt', 'w')
                writer = csv.writer(csvfile)
                writer.writerow(article)
                csvfile.close()
                print(count)
                count = count + 1
            except Exception:
                pass


cs = CSDNSpider()
cs.readData_tonghuashun_report("http://search.10jqka.com.cn/snapshot/report_pdf/ea02eda3880f930e.html", '100000')
# cs.readData_sina_news('http://cj.sina.com.cn/article/detail/5966752440/177997','60000')
# cs.readData_sina2('http://finance.sina.com.cn/stock/hyyj/2017-02-28/doc-ifyavvsk3874186.shtml', '10000')
# cs.readData_sina('http://finance.sina.com.cn/stock/hyyj/2016-11-23/doc-ifxxwrwk1751619.shtml')
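As posted, the script only exercises the Tonghuashun report reader on a single snapshot URL. To run the full crawl, the two batch savers defined above would be invoked instead; a minimal entry point might look like the following (this driver block is my addition, and it assumes tushare, requests, beautifulsoup4 and html5lib are installed):

if __name__ == '__main__':
    save_file_sina()       # file Sina Finance articles under abc/sina/
    save_file_eastmoney()  # file Eastmoney articles under abc/eastmoney/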