爬取字段 spider_text

__author__ = 'sus'
import urllib
import urllib2
import re

def getPage(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a unicode string.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any error raised by ``urllib2.urlopen`` (for example
        ``urllib2.URLError``) is propagated unchanged.
    """
    # Function-scope import keeps this block independent of the file-level
    # import list.
    from contextlib import closing

    request = urllib2.Request(url)
    # urllib2 responses are not context managers on Python 2; closing()
    # guarantees the connection is released even if read()/decode() raises.
    with closing(urllib2.urlopen(request)) as response:
        return response.read().decode('utf-8')

def getTitle(page):
    """Return the text of the first link on ``page`` to news.bistu.edu.cn.

    Args:
        page: HTML source to search, as a string.

    Returns:
        The text between the opening tag and ``</a>`` of the first anchor
        whose ``href`` starts with ``http://news.bistu.edu.cn``, or ``None``
        when no such anchor exists.
    """
    # Raw string with escaped dots so the host name is matched literally
    # (an unescaped "." would accept any character in its place).
    # re.S lets the link text span several lines.
    pattern = re.compile(
        r'<a href="http://news\.bistu\.edu\.cn.*?>(.*?)</a>', re.S)
    # Only the first hit is ever returned, so stop scanning as soon as it
    # is found instead of collecting every match on the page.
    match = pattern.search(page)
    if match is None:
        return None
    return match.group(1)


# Fetch the BISTU home page and print the first news-link title found on it
# (prints "None" when the page contains no matching link).
page = getPage("http://www.bistu.edu.cn")
print(getTitle(page))

posted on 2016-12-07 16:47  BruceSue  阅读(205)  评论(0)  编辑  收藏  举报

导航