# Crawler 2
#coding=utf-8
import urllib
import re
import os
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open; passed unchanged to ``urlopen``.

    Returns:
        Whatever ``read()`` on the response yields, undecoded (``bytes`` on
        Python 3, ``str`` on Python 2).

    Errors raised by ``urlopen`` or ``read`` propagate to the caller.
    """
    # Function-local imports keep this block self-contained and let it run on
    # both Python 2 (urllib.urlopen) and Python 3 (urllib.request.urlopen).
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() guarantees the connection is released even if read() fails;
    # the Python 2 response object is not itself a context manager.
    with closing(urlopen(url)) as page:
        return page.read()
def getImg(html):
    r"""Download the page's ``.JPG`` images and print its SimSun-styled spans.

    Every ``src="..<path>.JPG"`` attribute in *html* has ``<path>`` appended
    to ``http://www.cust.edu.cn`` and is saved as ``D:\img\<n>.jpg``, where
    ``<n>`` counts up from 0 in match order.  Both the captured path and the
    resulting URL are printed before each download.  Only ``src`` values that
    start with a literal ``..`` are handled; other forms are skipped.

    Afterwards the contents of each ``<span style="font-family:宋体">`` element
    is printed, followed by a dashed separator line.

    Args:
        html: Page markup, either text or (on Python 3) bytes.

    Returns:
        None.  Results are reported via ``print`` and files written to disk.

    Errors raised by ``urlretrieve`` propagate to the caller.
    """
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # On Python 3 a fetched body is bytes and cannot be searched with a text
    # pattern.  On Python 2 ``bytes is str``, so this branch never runs there.
    if isinstance(html, bytes) and not isinstance(html, str):
        # assumes the page is UTF-8 encoded -- TODO confirm against the site
        html = html.decode('utf-8', 'replace')

    # The dots are escaped so only a literal "../" style prefix is stripped;
    # unescaped they matched any two characters and mangled other paths.
    img_pattern = re.compile(r'src="\.\.(.+?\.JPG)"')
    for index, path in enumerate(img_pattern.findall(html)):
        print(path)
        imgurl = "http://www.cust.edu.cn" + path
        print(imgurl)
        # Raw string: same value as before without relying on "\i" being an
        # unrecognised escape sequence.
        urlretrieve(imgurl, r'D:\img\%s.jpg' % index)

    # Non-greedy so that several spans on one line are reported separately
    # instead of being merged together with the markup between them.
    par_pattern = re.compile(r'<span style="font-family:宋体">(.*?)</span>')
    for item in par_pattern.findall(html):
        print(item)
        # NOTE(review): the original indentation was lost; the separator is
        # assumed to follow every paragraph rather than the whole list.
        print('-----------------------------------')
# Script entry point: fetch the news page and process it.  Guarded so that
# importing this module does not trigger any network access.
if __name__ == "__main__":
    html = getHtml("http://www.cust.edu.cn/lgxw/32913.htm")
    # getImg() reports through print and has no return value, so printing its
    # result would only emit a stray "None".
    getImg(html)