Python Project Wrap-Up
Today I'm going to review and summarize a small task I built earlier while teaching myself Python. I'll start by revisiting each step of the task and working through it step by step, following my own train of thought.
Requirement: starting from the main site, scrape Dalian's gross regional product (GDP) for each year and observe the trend from 2010 to 2015. The plan breaks down into six steps; a sketch of how they fit together follows the list.
1. Fetch the page's HTML source
2. Parse the <a> tags out of the HTML source
3. Filter the <a> tags and return a dictionary of the matching tags' text and URLs
4. Visit each URL and parse the page's text content
5. Use regular expressions to extract the information of interest from each page
6. Store the processed information in the database
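Before walking through the steps one by one, here is a minimal sketch of how they fit together; getHTML and getData are defined in the steps below, while the top-level wiring itself is my summary rather than code from the original task:

# Overall flow (sketch): steps 1-5 end in a {link text: GDP figure} dict;
# step 6 then writes each pair to the database
def main():
    # step 1: fetch the listing page
    html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
    # steps 2-5: filter the year links, visit each one, regex out the GDP figure
    data = getData(html)
    for key, val in data.items():
        print key, val      # step 6 replaces this print with a database insert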
Step 1: fetch the main page's source code from its URL
The code is as follows:
import urllib2
import re
import pymssql
import time
from sgmllib import SGMLParser

# Fetch the page's HTML source
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
print html
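getHTML assumes every request succeeds. A slightly more defensive variant (my own addition, not part of the original task) adds a timeout and a browser-like User-Agent header, since some servers reject the default urllib2 agent:

import urllib2

# Defensive variant of getHTML: times out after 10 seconds and sends a
# browser-like User-Agent (the header value is illustrative)
def getHTML2(url):
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        response = urllib2.urlopen(request, timeout=10)
        return response.read()
    except urllib2.URLError as e:
        print "request failed:", e
        return ""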
Step 2: parse the <a> tags from the HTML source
The code is as follows:
import urllib2
import re
import pymssql
import time
import datetime
from sgmllib import SGMLParser

# Fetch the page's HTML source
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Collect every complete <a ... href=...>...</a> element in the page
def findLinks3(html):
    l = []
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S | re.M)
    for i in urls:
        l.append(i)
    return l

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
l = findLinks3(html)
for val in l:
    print val
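Matching <a> tags with a regex works on this page, but it is fragile against unusual markup. Since sgmllib is already imported, an alternative sketch (mine, not from the original task) collects the href values with an SGMLParser subclass instead:

from sgmllib import SGMLParser

# Let the HTML parser find <a href=...> instead of a regex
class LinkCollector(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.links = []
    def start_a(self, attrs):          # called for every <a ...> tag
        for k, v in attrs:
            if k == "href":
                self.links.append(v)

collector = LinkCollector()
collector.feed(html)                   # html from getHTML(...) above
for link in collector.links:
    print link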
Step 3: filter the <a> tags and return a dictionary of tag text and URLs
# -*- coding: cp936 -*-
import urllib2
import re
import pymssql
import time
import datetime
from sgmllib import SGMLParser

# Fetch the page's HTML source
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: return 1 if the marker string is present, otherwise -1
def choose(html):
    if html.find("大连市国民经济和社会发展") == -1:
        return -1
    return 1

# Collect every complete <a>...</a> element in the page (with filtering)
def findLinks3(html):
    l = []
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S | re.M)
    for i in urls:
        if choose(i) == 1:
            l.append(i)
    return l

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
l = findLinks3(html)
for val in l[1:]:    # skip the first matched link
    print val
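choose() just wraps str.find() into a 1/-1 flag. A quick check of its behavior on two made-up inputs:

# Hypothetical inputs, for illustration only
print choose('<a href="#">2015年大连市国民经济和社会发展统计公报</a>')  # 1: marker present
print choose('<a href="#">其他新闻</a>')                              # -1: marker absent

Returning a plain boolean ("大连市国民经济和社会发展" in html) would be more idiomatic, but the 1/-1 flag matches the original code.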
Building on that filter, the complete version of this step extracts each tag's text and its href, then collects them into a dictionary:

# -*- coding: cp936 -*-
import urllib2
import re
import pymssql
import time
import datetime
from sgmllib import SGMLParser

# Fetch the page's HTML source
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: return 1 if the marker string is present, otherwise -1
def choose(html):
    if html.find("大连市国民经济和社会发展") == -1:
        return -1
    return 1

# Collect every complete <a>...</a> element in the page (with filtering)
def findLinks3(html):
    l = []
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S | re.M)
    for i in urls:
        if choose(i) == 1:
            l.append(i)
    return l

# Extract the text content of an <a> tag
def findLinks1(html):
    text = []
    txts = re.findall(r"<a.*?>(.*?)</a>", html, re.I | re.S | re.M)
    for i in txts:
        if choose(i) == 1:
            text.append(i)
    return text[0]

# Extract only the url from the href attribute, e.g. <a href="www.www.com"></a>
def findLinks2(html):
    url = []
    txts = re.findall("(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", html, re.I | re.S | re.M)
    for i in txts:
        url.append(i)
    return url[0]

# Map each link's text to its url (key-value pairs in a dict)
def getEveryYear(html):
    yearUrl = {}
    l = findLinks3(html)      # find all matching hyperlinks
    for i in l[1:]:           # skip the first matched link
        text = findLinks1(i)
        url = findLinks2(i)
        yearUrl[text] = url
    return yearUrl

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
d = getEveryYear(html)
for key, val in d.items():
    print key
    print val
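One thing to keep in mind: Python 2 dicts are unordered, so the printed pairs will not come out chronologically. Since the goal is the 2010-2015 trend, here is a small sketch that prints the pairs sorted by link text, assuming each link text starts with the four-digit year:

# Assumes link texts begin with the year, e.g. "2015年大连市..."
for text in sorted(d.keys()):
    print text, d[text]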
Steps 4 and 5: visit each URL and extract the information
# -*- coding: cp936 -*-
import urllib2
import re
import pymssql
import time
import datetime
from sgmllib import SGMLParser

# Fetch the page's HTML source
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: return 1 if the marker string is present, otherwise -1
def choose(html):
    if html.find("大连市国民经济和社会发展") == -1:
        return -1
    return 1

# Collect every complete <a>...</a> element in the page (with filtering)
def findLinks3(html):
    l = []
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S | re.M)
    for i in urls:
        if choose(i) == 1:
            l.append(i)
    return l

# Extract the text content of an <a> tag
def findLinks1(html):
    text = []
    txts = re.findall(r"<a.*?>(.*?)</a>", html, re.I | re.S | re.M)
    for i in txts:
        if choose(i) == 1:
            text.append(i)
    return text[0]

# Extract only the url from the href attribute, e.g. <a href="www.www.com"></a>
def findLinks2(html):
    url = []
    txts = re.findall("(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", html, re.I | re.S | re.M)
    for i in txts:
        url.append(i)
    return url[0]

# Map each link's text to its url (key-value pairs in a dict)
def getEveryYear(html):
    yearUrl = {}
    l = findLinks3(html)
    for i in l[1:]:
        text = findLinks1(i)
        url = findLinks2(i)
        yearUrl[text] = url
    return yearUrl

# Pull the text out of <div class='newsBody'><p>...</p></div> on a bulletin page
class GetIdList(SGMLParser):
    def reset(self):
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim += 1                    # entered a nested div, depth + 1
            return
        for k, v in attrs:                        # walk the div's attributes
            if k == 'class' and v == 'newsBody':  # reached <div class='newsBody'>
                self.flag = True
                return

    def end_div(self):                            # hit </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:                     # left a nested div, depth - 1
            self.verbatim -= 1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):                              # hit </p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):                  # collect text inside the target <p> tags
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        return self.IDlist

# Fetch a bulletin page and return the first 500 characters of its body text
def printtext(url):
    html = getHTML(url)
    lister = GetIdList()
    lister.feed(html)
    l = lister.printID()
    s = " "
    for text in l:
        s = s + text
    return s[0:500]

# Extract the first "NNNN.N亿元" figure (the GDP value) from the text
def matchRe2(string):
    pattern = re.compile(r'([0-9]{1,5}\.[0-9])亿元')
    match = pattern.search(string)
    if match is None:
        print 'No match!'
        return None
    return match.group()

# Visit every year's url and build a {link text: GDP figure} dict
def getData(html):
    data = {}
    d = getEveryYear(html)
    for key, value in d.items():
        s = printtext(value)
        data[key] = matchRe2(s)
    return data

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
data = getData(html)
for key, val in data.items():
    print key, val
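To see what matchRe2 extracts, here is a run on a made-up bulletin-style sentence (both the wording and the figure are invented for illustration; only the "NNNN.N亿元" pattern matters):

# Hypothetical sentence in the style of the statistical bulletins
s = "全年实现地区生产总值1234.5亿元,比上年增长8.8%"
print matchRe2(s)   # prints: 1234.5亿元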
Step 6: store the data in the database
# -*- coding: cp936 -*-
import urllib2
import re
import pymssql
import time
import datetime
from sgmllib import SGMLParser

# Fetch the page's HTML source
def getHTML(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

# Fuzzy filter: return 1 if the marker string is present, otherwise -1
def choose(html):
    if html.find("大连市国民经济和社会发展") == -1:
        return -1
    return 1

# Collect every complete <a>...</a> element in the page (with filtering)
def findLinks3(html):
    l = []
    urls = re.findall(r"<a.*?href=.*?</a>", html, re.I | re.S | re.M)
    for i in urls:
        if choose(i) == 1:
            l.append(i)
    return l

# Extract the text content of an <a> tag
def findLinks1(html):
    text = []
    txts = re.findall(r"<a.*?>(.*?)</a>", html, re.I | re.S | re.M)
    for i in txts:
        if choose(i) == 1:
            text.append(i)
    return text[0]

# Extract only the url from the href attribute, e.g. <a href="www.www.com"></a>
def findLinks2(html):
    url = []
    txts = re.findall("(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", html, re.I | re.S | re.M)
    for i in txts:
        url.append(i)
    return url[0]

# Map each link's text to its url (key-value pairs in a dict)
def getEveryYear(html):
    yearUrl = {}
    l = findLinks3(html)
    for i in l[1:]:
        text = findLinks1(i)
        url = findLinks2(i)
        yearUrl[text] = url
    return yearUrl

# Pull the text out of <div class='newsBody'><p>...</p></div> on a bulletin page
class GetIdList(SGMLParser):
    def reset(self):
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim += 1                    # entered a nested div, depth + 1
            return
        for k, v in attrs:                        # walk the div's attributes
            if k == 'class' and v == 'newsBody':  # reached <div class='newsBody'>
                self.flag = True
                return

    def end_div(self):                            # hit </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:                     # left a nested div, depth - 1
            self.verbatim -= 1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):                              # hit </p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):                  # collect text inside the target <p> tags
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        return self.IDlist

# Fetch a bulletin page and return the first 500 characters of its body text
def printtext(url):
    html = getHTML(url)
    lister = GetIdList()
    lister.feed(html)
    l = lister.printID()
    s = " "
    for text in l:
        s = s + text
    return s[0:500]

# Extract the first "NNNN.N亿元" figure (the GDP value) from the text
def matchRe2(string):
    pattern = re.compile(r'([0-9]{1,5}\.[0-9])亿元')
    match = pattern.search(string)
    if match is None:
        print 'No match!'
        return None
    return match.group()

# Visit every year's url and build a {link text: GDP figure} dict
def getData(html):
    data = {}
    d = getEveryYear(html)
    for key, value in d.items():
        s = printtext(value)
        data[key] = matchRe2(s)
    return data

# Simple pymssql wrapper
class MSSQL:
    def __init__(self, host, user, pwd, db):
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def __GetConnect(self):
        if not self.db:
            raise NameError("database info is not set")
        self.conn = pymssql.connect(host=self.host, user=self.user,
                                    password=self.pwd, database=self.db,
                                    charset="utf8")
        cur = self.conn.cursor()
        if not cur:
            raise NameError("failed to connect to the database")
        return cur

    def ExecQuery(self, sql):
        cur = self.__GetConnect()
        cur.execute(sql)
        resList = cur.fetchall()
        self.conn.close()                 # always close after the query
        return resList

    def ExecNonQuery(self, sql):
        cur = self.__GetConnect()
        cur.execute(sql)
        self.conn.commit()
        self.conn.close()

# Keep only digits and the decimal point, e.g. "1234.5亿元" -> "1234.5"
def OnlyCharNum(s, oth=''):
    keep = '.0123456789'
    for c in s.lower():
        if c not in keep:
            s = s.replace(c, '')
    return s

# Build the insert statement (year, amount and timestamp arrive as strings)
def makesql(year, money, t):
    sql = "insert into Innocellence_TimeSheet_GDPRecord values(" + year + "," + money + "," + t + ")"
    return sql

# Create the database helper
ms = MSSQL(host="localhost", user="sa", pwd="1234", db="testdb")

# Clear out the old rows first
ms.ExecNonQuery("delete from Innocellence_TimeSheet_GDPRecord")

# To check the stored rows afterwards:
# for row in ms.ExecQuery("select year, round(amount,1) from Innocellence_TimeSheet_GDPRecord order by year"):
#     print row

html = getHTML("http://www.stats.dl.gov.cn/index.php?m=content&c=index&a=lists&catid=52")
data = getData(html)
for key, val in data.items():
    print key, val
    sql = makesql(key[:4], OnlyCharNum(val),
                  "'" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) + "'")
    print sql
    ms.ExecNonQuery(sql)
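The script assumes the table Innocellence_TimeSheet_GDPRecord already exists. Here is a guessed schema, inferred from the insert and the earlier select of year and round(amount, 1), plus a parameterized rewrite of the insert that avoids building SQL by string concatenation. Both are sketches under those assumptions, not code from the original task; the column names and types are guesses:

# Guessed schema; column names/types inferred from the SQL used above
ms.ExecNonQuery("""
create table Innocellence_TimeSheet_GDPRecord (
    year int,
    amount decimal(10, 1),
    createtime datetime
)
""")

# Parameterized insert: pymssql fills in the %s placeholders itself,
# so no manual quoting or concatenation is needed (values are illustrative)
conn = pymssql.connect(host="localhost", user="sa", password="1234",
                       database="testdb", charset="utf8")
cur = conn.cursor()
cur.execute("insert into Innocellence_TimeSheet_GDPRecord values (%s, %s, %s)",
            (2015, 1234.5, time.strftime("%Y-%m-%d %H:%M:%S")))
conn.commit()
conn.close()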