Python Project Wrap-Up

Today I'm going to review and summarize a small task I did while teaching myself Python. I'll start by revisiting each step of the task and then work through it again step by step, following my own line of thinking.

Requirement: starting from the main site, scrape Dalian's gross regional product for each year and observe the trend from 2010 to 2015.

1. Fetch the HTML source of the page

2. Parse the <a> tags out of the HTML source

3. Filter the <a> tags and return a dict mapping each tag's text to its URL

4. Visit each URL and parse the page's text content

5. Use regular expressions to extract the information of interest from each page

6. Store the processed information in the database

Step 1: Fetch the main page's HTML source from its URL

The code:

import urllib2

# Fetch the page's HTML
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
print html
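One caveat on getHTML: urllib2.urlopen is called with no timeout and with urllib2's default User-Agent, which some servers reject. Below is a minimal, more defensive sketch; the browser-style header and the 10-second timeout are illustrative choices of mine, not something the original task specifies.

import urllib2

# A more defensive fetch (hypothetical variant): browser-style User-Agent
# plus a timeout; network failures surface as urllib2.URLError/HTTPError
def getHTMLSafe(url, timeout=10):
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    response = urllib2.urlopen(request, timeout=timeout)
    try:
        return response.read()
    finally:
        response.close()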

Step 2: Parse the <a> tags out of the HTML source

The code:

import urllib2
import re

# Fetch the page
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Extract every complete <a>...</a> tag from the page
def findLinks3(html):
    # re.findall already returns a list of the full matches
    return re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S)
html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
l = findLinks3(html)
for val in l:
    print val
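To sanity-check the pattern without touching the network, it can be fed a small handcrafted snippet; the HTML below is made up purely for illustration.

# Hypothetical test snippet, for illustration only
sample = '''
<div><a href="/report2015.html">report 2015</a>
<span><a href='/report2014.html'>report 2014</a></span></div>
'''
for tag in findLinks3(sample):
    print tag   # prints each complete <a>...</a> element in turn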

Step 3: Filter the <a> tags and return a dict mapping each tag's text to its URL

import urllib2
import re

# Fetch the page
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: 1 if the string mentions the Dalian statistical bulletin, -1 otherwise
def choose(html):
    return 1 if "大连市国民经济和社会发展" in html else -1

# Extract every complete <a>...</a> tag from the page (filtered)
def findLinks3(html):
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S)
    return [i for i in urls if choose(i) == 1]
html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
l = findLinks3(html)
for val in l[1:]:   # skip the first matched link
    print val
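choose() is nothing more than a substring test that keeps the 1/-1 contract the other helpers compare against. A quick check on invented inputs (both link texts below are made up):

# Hypothetical inputs, for illustration only
print choose('<a href="/a">2015年大连市国民经济和社会发展统计公报</a>')   # -> 1
print choose('<a href="/b">某条无关新闻</a>')                            # -> -1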
    
Building on the filter, the rest of Step 3 extracts each link's text and href and collects them into a dict:

import urllib2
import re

# Fetch the page
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: 1 if the string mentions the Dalian statistical bulletin, -1 otherwise
def choose(html):
    return 1 if "大连市国民经济和社会发展" in html else -1

# Extract every complete <a>...</a> tag from the page (filtered)
def findLinks3(html):
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S)
    return [i for i in urls if choose(i) == 1]

# Extract the text of the first matching <a> tag
def findLinks1(html):
    txts = re.findall(r"<a.*?>(.*?)</a>", html, re.I | re.S)
    text = [i for i in txts if choose(i) == 1]
    return text[0]

# Extract just the URL from the href attribute, e.g. <a href="http://example.com">...</a>
def findLinks2(html):
    urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=').+?(?=')", html, re.I | re.S)
    return urls[0]

# Build a dict mapping each bulletin's link text (key) to its URL (value)
def getEveryYear(html):
    yearUrl = {}
    l = findLinks3(html)    # all matching hyperlinks
    for i in l[1:]:         # skip the first matched link
        text = findLinks1(i)
        url = findLinks2(i)
        yearUrl[text] = url
    return yearUrl

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
d = getEveryYear(html)
for key, val in d.items():
    print key
    print val
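On a listing shaped like the real page, getEveryYear ends up with a title-to-URL dict. Here is a self-contained check against invented data; the titles and URLs below are made up, and the first matching link is deliberately a column link so the l[1:] skip makes sense:

# Hypothetical listing fragment, for illustration only
sample = '''
<a href="/list">大连市国民经济和社会发展统计公报专栏</a>
<a href="/2015.html">2015年大连市国民经济和社会发展统计公报</a>
<a href="/2014.html">2014年大连市国民经济和社会发展统计公报</a>
'''
print getEveryYear(sample)
# two entries: the 2015 and 2014 bulletin titles mapped to /2015.html
# and /2014.html; the first match (the column link) is skipped by l[1:]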
   

Steps 4 and 5: Visit each URL and extract the information

import urllib2
import re
from sgmllib import SGMLParser

# Fetch the page
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: 1 if the string mentions the Dalian statistical bulletin, -1 otherwise
def choose(html):
    return 1 if "大连市国民经济和社会发展" in html else -1

# Extract every complete <a>...</a> tag from the page (filtered)
def findLinks3(html):
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S)
    return [i for i in urls if choose(i) == 1]

# Extract the text of the first matching <a> tag
def findLinks1(html):
    txts = re.findall(r"<a.*?>(.*?)</a>", html, re.I | re.S)
    text = [i for i in txts if choose(i) == 1]
    return text[0]

# Extract just the URL from the href attribute
def findLinks2(html):
    urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=').+?(?=')", html, re.I | re.S)
    return urls[0]

# Build a dict mapping each bulletin's link text (key) to its URL (value)
def getEveryYear(html):
    yearUrl = {}
    l = findLinks3(html)    # all matching hyperlinks
    for i in l[1:]:         # skip the first matched link
        text = findLinks1(i)
        url = findLinks2(i)
        yearUrl[text] = url
    return yearUrl

class GetIdList(SGMLParser):
    def reset(self):
        self.IDlist = []
        self.flag = False       # True while inside <div class="newsBody">
        self.getdata = False    # True while inside a <p> within that div
        self.verbatim = 0       # nesting depth of child divs
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim += 1  # entered a nested div: one level deeper
            return
        for k, v in attrs:      # scan the div's attributes and values
            if k == 'class' and v == 'newsBody':  # entered <div class="newsBody">
                self.flag = True
                return

    def end_div(self):          # on </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:   # leaving a nested div: one level back up
            self.verbatim -= 1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):            # on </p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):    # collect text while inside a target <p>
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        # Return the collected text fragments
        return self.IDlist
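# Aside: sgmllib only exists in Python 2 (it was removed in Python 3).
# A roughly equivalent parser built on the stdlib HTMLParser class is
# sketched below; NewsBodyParser and its attribute names are my own
# illustrative choices, not part of the original script.
from HTMLParser import HTMLParser

class NewsBodyParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.depth = 0      # div nesting depth once inside the target div
        self.in_p = False   # True while inside a <p> within the target div
        self.texts = []     # collected text fragments

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            if self.depth > 0:
                self.depth += 1             # nested div: one level deeper
            elif ('class', 'newsBody') in attrs:
                self.depth = 1              # entered <div class="newsBody">
        elif tag == 'p' and self.depth > 0:
            self.in_p = True

    def handle_endtag(self, tag):
        if tag == 'div' and self.depth > 0:
            self.depth -= 1                 # leaving a div inside the target
        elif tag == 'p':
            self.in_p = False

    def handle_data(self, data):
        if self.in_p:
            self.texts.append(data)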



def printtext(url):
    # Fetch a bulletin page and return the first 500 characters of its body text
    html = getHTML(url)
    lister = GetIdList()
    lister.feed(html)
    s = " " + "".join(lister.printID())
    return s[:500]
def matchRe2(string):
    # Match a figure like "1234.5亿元"; \. is the literal decimal point
    # (the original pattern's unescaped '.' matched any character)
    pattern = re.compile(r'([0-9]{1,5}\.[0-9])亿元')
    match = pattern.search(string)
    if match is None:
        print 'No match!'
        return None
    return match.group()

# Map each bulletin title to the GDP figure extracted from its page
def getData(html):
    data = {}
    d = getEveryYear(html)
    for key, value in d.items():
        s = printtext(value)
        gdp = matchRe2(s)
        if gdp is not None:    # skip pages where no figure was found
            data[key] = gdp
    return data

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")


data = getData(html)
for key, val in data.items():
    print key, val
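To see what matchRe2 actually extracts, it can be run on a sentence shaped like a bulletin's opening line; the figure below is invented, purely for illustration.

# Hypothetical bulletin sentence; the figure is made up
sentence = "全年实现地区生产总值1234.5亿元,比上年增长8.0%"
print matchRe2(sentence)   # -> 1234.5亿元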
Step 6: Store the data in the database

# -*- coding: cp936 -*-
import urllib2
import re
import pymssql
import time
from sgmllib import SGMLParser

# Fetch the page
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: 1 if the string mentions the Dalian statistical bulletin, -1 otherwise
def choose(html):
    return 1 if "大连市国民经济和社会发展" in html else -1

# Extract every complete <a>...</a> tag from the page (filtered)
def findLinks3(html):
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S)
    return [i for i in urls if choose(i) == 1]

# Extract the text of the first matching <a> tag
def findLinks1(html):
    txts = re.findall(r"<a.*?>(.*?)</a>", html, re.I | re.S)
    text = [i for i in txts if choose(i) == 1]
    return text[0]

# Extract just the URL from the href attribute
def findLinks2(html):
    urls = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=').+?(?=')", html, re.I | re.S)
    return urls[0]

# Build a dict mapping each bulletin's link text (key) to its URL (value)
def getEveryYear(html):
    yearUrl = {}
    l = findLinks3(html)    # all matching hyperlinks
    for i in l[1:]:         # skip the first matched link
        text = findLinks1(i)
        url = findLinks2(i)
        yearUrl[text] = url
    return yearUrl


class GetIdList(SGMLParser):
    def reset(self):
        self.IDlist = []
        self.flag = False       # True while inside <div class="newsBody">
        self.getdata = False    # True while inside a <p> within that div
        self.verbatim = 0       # nesting depth of child divs
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim += 1  # entered a nested div: one level deeper
            return
        for k, v in attrs:      # scan the div's attributes and values
            if k == 'class' and v == 'newsBody':  # entered <div class="newsBody">
                self.flag = True
                return

    def end_div(self):          # on </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:   # leaving a nested div: one level back up
            self.verbatim -= 1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):            # on </p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):    # collect text while inside a target <p>
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        # Return the collected text fragments
        return self.IDlist



def printtext(url):
    # Fetch a bulletin page and return the first 500 characters of its body text
    html = getHTML(url)
    lister = GetIdList()
    lister.feed(html)
    s = " " + "".join(lister.printID())
    return s[:500]
def matchRe2(string):
    # Match a figure like "1234.5亿元"; \. is the literal decimal point
    # (the original pattern's unescaped '.' matched any character)
    pattern = re.compile(r'([0-9]{1,5}\.[0-9])亿元')
    match = pattern.search(string)
    if match is None:
        print 'No match!'
        return None
    return match.group()

# Map each bulletin title to the GDP figure extracted from its page
def getData(html):
    data = {}
    d = getEveryYear(html)
    for key, value in d.items():
        s = printtext(value)
        gdp = matchRe2(s)
        if gdp is not None:    # skip pages where no figure was found
            data[key] = gdp
    return data

class MSSQL:
    def __init__(self, host, user, pwd, db):
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def __GetConnect(self):
        if not self.db:
            raise NameError("database information is not set")
        self.conn = pymssql.connect(host=self.host, user=self.user, password=self.pwd, database=self.db, charset="utf8")
        cur = self.conn.cursor()
        if not cur:
            raise NameError("failed to connect to the database")
        return cur

    def ExecQuery(self, sql):
        cur = self.__GetConnect()
        cur.execute(sql)
        resList = cur.fetchall()
        # Always close the connection once the query is done
        self.conn.close()
        return resList

    def ExecNonQuery(self, sql):
        cur = self.__GetConnect()
        cur.execute(sql)
        self.conn.commit()
        self.conn.close()

# Strip everything except digits and the decimal point (e.g. "1234.5亿元" -> "1234.5")
def OnlyCharNum(s):
    keep = '.0123456789'
    for c in s.lower():
        if c not in keep:
            s = s.replace(c, '')
    return s

# Build the INSERT statement
# NOTE: string concatenation is open to SQL injection; a parameterized
# alternative is sketched after this script
def makesql(year, money, ts):
    return "insert into Innocellence_TimeSheet_GDPRecord values(" + year + "," + money + "," + ts + ")"

# Create the database helper
ms = MSSQL(host="localhost", user="sa", pwd="1234", db="testdb")

# Clear out the old rows first
ms.ExecNonQuery("delete from Innocellence_TimeSheet_GDPRecord")

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
data = getData(html)
for key, val in data.items():
    print key, val
    now = "'" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) + "'"
    sql = makesql(key[:4], OnlyCharNum(val), now)   # key starts with the four-digit year
    print sql
    ms.ExecNonQuery(sql)
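Since makesql builds the statement by string concatenation, any scraped value containing a quote or comma would break the SQL, and the pattern is open to injection in general. pymssql supports %s placeholders, so a safer variant, assuming the same table layout as above, might look like this (insertGDP is my own illustrative helper, not part of the original script):

# Hypothetical parameterized insert
def insertGDP(conn, year, amount, recorded_at):
    cur = conn.cursor()
    # pymssql substitutes the %s placeholders safely
    cur.execute(
        "insert into Innocellence_TimeSheet_GDPRecord values (%s, %s, %s)",
        (year, amount, recorded_at))
    conn.commit()

# usage sketch:
# conn = pymssql.connect(host="localhost", user="sa", password="1234", database="testdb")
# insertGDP(conn, "2015", "1234.5", time.strftime("%Y-%m-%d %H:%M:%S"))
# conn.close()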
