This week I picked up web scraping with Python. Since the goal was only to extract data, the code is rough; it crawls the news and research reports for every stock from 同花顺财经 (10jqka), 新浪财经 (Sina Finance), and 东方财富网 (Eastmoney).

# -*- coding: utf-8 -*-
import urllib.request, re, gzip
from bs4 import BeautifulSoup

def get_stock_code():
    # Pull the SSE 50 (上证50) constituent list via tushare; only the code
    # column is needed
    import tushare as ts
    df_stock = ts.get_sz50s()
    code_list = df_stock['code']
    return code_list
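
# A quick way to see what get_stock_code() returns (this assumes the pre-1.0
# tushare API, where ts.get_sz50s() yields a DataFrame of SSE 50 constituents):
#   codes = get_stock_code()
#   print(codes.head())   # 6-digit Shanghai codes such as 600000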


def saveFile(data, i):
    # Write a list of lines to a GBK-encoded text file (helper; the crawl
    # functions below write csv files instead and never call this)
    path = "E:\\projects\\paper_" + str(i+1) + ".txt"
    f = open(path, 'wb')
    page = '' + str(i+1) + '\n'
    f.write(page.encode('gbk'))
    for d in data:
        d = str(d) + '\n'
        f.write(d.encode('gbk'))
    f.close()
  
def ungzip(data):
    # decompress gzipped responses; plain payloads fall through unchanged
    try:
        data = gzip.decompress(data)
    except OSError:
        print("Response was not gzipped; using it as-is...")
    return data
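
# A hedged alternative to the try/except above: inspect the Content-Encoding
# response header before decompressing (sketch only; the script keeps ungzip).
def ungzip_checked(res):
    # res is the http.client.HTTPResponse returned by urllib.request.urlopen
    data = res.read()
    if res.getheader('Content-Encoding') == 'gzip':
        data = gzip.decompress(data)
    return data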
  
# Spider class: the CSDNSpider name and default URL are leftovers from a CSDN
# blog-crawling example; every method below targets the finance sites
class CSDNSpider:  
    def __init__(self,pageIdx=1,url="http://blog.csdn.net/fly_yr/article/list/1"):  
        # default current page index
        self.pageIdx = pageIdx  
        self.url = url[0:url.rfind('/') + 1] + str(pageIdx)  
        self.headers = {  
            #"Connection": "keep-alive",  
            'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            #"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",  
            #"Accept-Encoding": "gzip, deflate, sdch",  
            #"Accept-Language": "zh-CN,zh;q=0.8",  
            #"Host": "data.eastmoney.com"  
        }  
        
    def readData_tonghuashun_report(self, urlweb, code):
        # Fetch a 10jqka (同花顺) report snapshot page and return "time\nsource\nbody"
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        data = data.decode('utf-8')
        soup=BeautifulSoup(data,"html5lib") 
        # the title block holds "<date> 机构:<name>..." separated by spaces
        info = soup.find('div', 'kuaizhaolefttitle aDBlue nu')
        info = info.find('p').string.split(' ')
        time = info[0]
        source = info[1].split('机构:')[1]
        source = source.split('\xa0')[0]
        # body paragraphs are separated by <br/><br/>; strip the outer tags
        content = str(soup.find('div', 'kuaizhao_contant aDBlue nu'))
        content = content.split('<br/><br/>')
        contenttemp = ''
        for i in content:
            contenttemp = contenttemp+i+'\n'
        contenttemp = contenttemp.split('>')[1]
        contenttemp = contenttemp.split('<')[0]
        print(contenttemp)
        article = time+'\n'+source+'\n'+contenttemp
        return article
    
    def readData_tonghuashun_news(self, urlweb, code):
        # Fetch a 10jqka news page and return "time\nsource\nbody"
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        soup = BeautifulSoup(data, "html5lib")
        time = (soup.find('span','time')).string
        source = soup.find('span', {"id": 'source_baidu'})
        source = str(source.find('a')).split('>')
        source = source[1].split(' ')
        sourcetemp = ''
        # keep the first non-empty token (strip spaces, newlines and tabs)
        for i in source:
            i = i.split('\n')
            for j in i:
                j = j.split('\t')
                for k in j:
                    if(k!=''):
                        sourcetemp = k
                        break
                if(sourcetemp!=''):
                    break
            if(sourcetemp!=''):
                break
        source = sourcetemp
        content = soup.find('div','atc-content')
        contents = content.find_all('p')
        contenttemp = ''
        content = ''
        for i in contents:
            content=content +str(i)+'\n'
        for s in content:
            # keep digits, newlines and punctuation; the two punctuation literals
            # were garbled in the original, full-width '，' and '。' are assumed
            if s in ('，', '。', '\n') or s.isdigit():
                contenttemp += s
                continue
            # keep CJK ideographs
            if u'\u4e00' <= s <= u'\u9fff':
                contenttemp += s
        article = time+'\n'+source+'\n'+contenttemp
        return article
#         An earlier variant (Python 2 syntax in the original) filtered the same
#         CJK range with re.findall(ur'[\u4e00-\u9fa5]', content); a Python 3
#         sketch follows as keep_cjk below.
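
    # The character-by-character loop above can collapse into one regex pass.
    # This helper is a sketch (hypothetical; nothing in the original calls it):
    def keep_cjk(self, text):
        # keep CJK ideographs, digits, newlines and common Chinese punctuation
        return ''.join(re.findall(r'[\u4e00-\u9fff0-9\n，。]', text))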
        
    
    def readData_sina_report(self, urlweb, code):
        # Fetch a Sina Finance research-report page. Parsing is unfinished:
        # time/source are picked out of the 'creab' block, but only the raw
        # paragraphs are printed and nothing is returned
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        soup = BeautifulSoup(data, "html5lib")
        #items = soup.find_all('div','newsContent')
        infos = soup.find_all('div', 'creab')
        for info in infos:
            for i in info.find_all('span'):
                for j in i:
                    # spans read like "机构：XXX" and "日期：2017-xx-xx"; the matching
                    # literals were garbled in the original and are assumed here
                    if "日期" in j:
                        time = j.split('：')[-1]
                    else:
                        source = j.string
        
        
        contents = soup.find_all('p')
        contentAll = "" 
        for content in contents:
            contentAll+=str(content)
        print(contentAll)
    def readData_sina_news(self, urlweb, code):
        # Fetch a Sina Finance news page and return "code\ntime\nsource\nbody"
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        data = data.decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        # the 'time-source' span holds the timestamp and the source name together;
        # the first non-empty token is the timestamp, the next one is the source
        time = soup.find('span', 'time-source')
        time = time.string.split(' ')
        count = 0
        for i in time:
            i = i.split('\n')
            for j in i:
                j = j.split('\t')
                for k in j:
                    if(k!="")and(count==0):
                        timetemp = k
                        count+=1
                    elif(k!="")and(count==1):
                        source = k
                        break
            
        time = timetemp
        contents = soup.find('div','article article_16')
        contents = contents.find_all('p')
        content = ''
        # skip the first <p> and join the rest; paragraphs whose .string is None
        # (nested markup) are skipped by the except
        for i in range(len(contents) - 1):
            try:
                content += contents[i+1].string + '\n'
            except:
                pass
        article = code+'\n'+time+'\n'+source+'\n'+content
        print(article)
        return article
    
    
    
    
    def readData_eastmoney_news(self, urlweb):
        # Fetch an Eastmoney news page and return "time\nsource\nbody"
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        data = data.decode('utf-8')
        soup = BeautifulSoup(data, "html5lib")
        time = soup.find_all('div', 'time')[0].string
        # the source name is the alt text of the logo image inside div.source
        sources = soup.find('div', 'source')
        source = str(sources.find("img").attrs["alt"])
        contents = soup.find_all('p')
        contentStr = ''
        for i in range(len(contents)):
            try:
                contentStr += contents[i].string
                contentStr += '\n'
            except:
                pass
        artical = time+'\n'+source+'\n'+contentStr
        return artical
    
    def readData_eastmoney_report(self, urlweb):
        # Fetch an Eastmoney research-report page; the 'report-infos' spans hold
        # the date (index 1) and the issuing institution (index 2)
        req = urllib.request.Request(url=urlweb, headers=self.headers)
        res = urllib.request.urlopen(req)
        data = ungzip(res.read())
        soup = BeautifulSoup(data, "html5lib")
        infos = soup.find('div', 'report-infos')
        infos = infos.find_all('span')
        time = infos[1].string
        time = time.split(' ')
        timetemp = ""
        # keep the first non-empty token (the same pattern recurs below; a
        # factored-out helper is sketched after save_file_eastmoney)
        for i in time:
            i = i.split('\n')
            for j in i:
                j = j.split('\t')
                for k in j:
                    if(k!=""):
                        timetemp = k
                        break
                if(timetemp!=""):
                    break
            if(timetemp!=""):
                break
        time = timetemp
        source = infos[2].string
        source = source.split(' ')
        for i in source:
            i = i.split('\n')
            for j in i:
                j = j.split('\t')
                for k in j:
                    if(k!=""):
                        sourcetemp = k
                        break
        source = sourcetemp
        contents = soup.find_all('p')
        contentStr = ''
        for i in range(len(contents)):
            try:
                contentStr += contents[i].string
                contentStr += '\n'
            except:
                pass
        artical = time+'\n'+source+'\n'+contentStr
        return artical
    
    
                 
                 
    def getAllUrl(self, url):
        # Collect every href attribute on the page with a regex over raw HTML
        import re
        import requests
        r = requests.get(url)
        data = r.text
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", data)
        return link_list
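
    # A more robust sketch (not the author's method): let BeautifulSoup resolve
    # the anchors instead of regexing raw HTML. Name and approach are assumptions.
    def getAllUrl_bs(self, url):
        import requests
        soup = BeautifulSoup(requests.get(url).text, "html5lib")
        return [a['href'] for a in soup.find_all('a', href=True)]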
    
    def getAllUrl2(self, url):
        # Aggregate links from three Eastmoney index pages (the url argument is
        # accepted but unused, as in the original)
        url_list = []
        for page in ('http://finance.eastmoney.com/yaowen.html',
                     'http://finance.eastmoney.com/pinglun.html',
                     'http://stock.eastmoney.com/bidu.html'):
            url_list.extend(self.getAllUrl(page))
        return url_list
            


def save_file_sina():
    # Crawl Sina Finance news/reports for every SSE 50 code and write each
    # article to abc/sina/<code>/<date>/<source>.txt
    import os
    import csv
    cs = CSDNSpider()
    code_list = get_stock_code()
    for code in code_list:
        code = str(code)
        URL = "http://money.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh" + code + "&Page=1"
        print(URL)
        url_list = cs.getAllUrl(URL)
        time = []
        source = []
        count = 0
        for i in url_list:
            #artical = cs.readData_sina(i,code)
            try:
                if 'search' in i:
                    artical = cs.readData_sina_report(i,code)
                else:
                    artical = cs.readData_sina_news(i,code)
                artical = artical.split('\n')
                code = artical[0]
                timetemp = artical[1].split(' ')[0]
                # normalize dates like 2017年03月04日 to 2017-03-04 (the Chinese
                # date markers were garbled in the original and are assumed here)
                timetemp = timetemp.split('日')[0]
                if "年" in timetemp:
                    timetemp = timetemp.replace('年', '-')
                    timetemp = timetemp.replace('月', '-')
                sourcetemp = artical[2]
                path = 'abc/sina/' + code + '/' + timetemp + '/'
        
                print(path)
                if timetemp in time:
                    if sourcetemp in source:
                        print('ss3')
                        csvFile = open(path+'/'+sourcetemp+'.txt', "a")
                        writer = csv.writer(csvFile)
                        print(artical)
                        writer.writerow(artical)
                        csvFile.close()
                        print('end3')
                    else:
                        print('ss1')
                        source.append(sourcetemp)
                        csvfile = open(path+'/'+sourcetemp+'.txt', 'w')
                        writer = csv.writer(csvfile)
                        print(artical)
                        writer.writerow(artical)
                        csvfile.close()
                        print('end1')
                else:
                    os.makedirs(path)
                    csvfile = open(path+'/'+sourcetemp+'.txt', 'w')
                    print('ss2')
                    time.append(timetemp)
                    #csvfile = open(path, 'w')
                    writer = csv.writer(csvfile)
                    print(artical)
                    writer.writerow(artical)
                    csvfile.close()
                    print('end2')
                print(count)
                count = count+1
            except:
                # any page that fails to download or parse is skipped silently
                pass
            




def save_file_eastmoney():
    # Crawl Eastmoney news and reports for every SSE 50 code and write each
    # article to abc/eastmoney/<code>/<date>/<source>.txt
    import os
    import csv
    cs = CSDNSpider()
    code_list = get_stock_code()
    for code in code_list:
        code = str(code)
        url = "http://quote.eastmoney.com/sh" + code + ".html"
        url_list = cs.getAllUrl(url)
        time = []
        mechanism = []
        count = 0
        for url in url_list:
            try:        
                if "news" in url:
                    artical = cs.readData_eastmoney_news(url)
                elif "report" in url:
                    artical = cs.readData_eastmoney_report(url)
                else:
                    continue                
                artical = artical.split('\n')
                # normalize dates like 2017年03月04日 to 2017-03-04 (the Chinese
                # date markers were garbled in the original and are assumed here)
                timetemp = artical[0].split('日')[0]
                if "年" in timetemp:
                    timetemp = timetemp.replace('年', '-')
                    timetemp = timetemp.replace('月', '-')
                mechanismtemp = artical[1]
                path = 'abc/eastmoney/' + code + '/' + timetemp
            
                print(path)
                if timetemp in time:
                    if mechanismtemp in mechanism:
                        print('ss3')
                        csvFile = open(path+'/'+mechanismtemp+'.txt', "a")
                        writer = csv.writer(csvFile)
                        print(artical)
                        writer.writerow(artical)
                        csvFile.close()
                        print('end3')
                    else:
                        print('ss1')
                        mechanism.append(mechanismtemp)
                        csvfile = open(path+'/'+mechanismtemp+'.txt', 'w')
                        writer = csv.writer(csvfile)
                        print(artical)
                        writer.writerow(artical)
                        csvfile.close()
                        print('end1')
                else:
                    os.makedirs(path)
                    csvfile = open(path+'/'+mechanismtemp+'.txt', 'w')
                    print('ss2')
                    time.append(timetemp)
                    #csvfile = open(path, 'w')
                    writer = csv.writer(csvfile)
                    print(artical)
                    writer.writerow(artical)
                    csvfile.close()
                    print('end2')
                print(count)
                count = count+1
            except:
                # skip pages that fail to download or parse
                pass
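
# Both save functions repeat the same token-stripping and date munging; the
# helpers below sketch how that could be factored out (hypothetical; nothing
# in the original calls them).
def first_token(text):
    # first non-empty token after splitting on any whitespace; stands in for
    # the nested split('\n') / split('\t') loops used above
    tokens = text.split()
    return tokens[0] if tokens else ''

def normalize_date(raw):
    # '2017年03月04日' -> '2017-03-04'
    return re.sub(r'[年月]', '-', raw.split('日')[0])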
# Single-page smoke test against a 10jqka report snapshot
cs = CSDNSpider()
cs.readData_tonghuashun_report("http://search.10jqka.com.cn/snapshot/report_pdf/ea02eda3880f930e.html", '100000')
#cs.readData_sina_news('http://cj.sina.com.cn/article/detail/5966752440/177997','60000')
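
# To run the full crawls instead of the single-page test, call the two entry
# points (assumes the abc/sina and abc/eastmoney output trees are writable):
#save_file_sina()
#save_file_eastmoney()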

 
