爬虫2

#coding=utf-8
import urllib
import re
import os

def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The undecoded body exactly as read from the response: ``bytes`` on
        Python 3, ``str`` on Python 2.

    Any error raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() releases the connection even when read() raises; the
    # Python 2 response object is not a context manager on its own.
    with closing(urlopen(url)) as page:
        return page.read()

def getImg(html, base_url="http://www.cust.edu.cn",
           dest_pattern=r'D:\img\%s.jpg', retrieve=None):
    """Download the ``.JPG`` images referenced in *html* and print its text.

    Every ``src="../<path>.JPG"`` attribute is printed, turned into an
    absolute URL by prefixing *base_url*, printed again and downloaded.
    Afterwards the content of every ``<span style="font-family:宋体">``
    element is printed, each followed by a separator line.

    Args:
        html: Page source, either text or raw bytes.
        base_url: Prefix put in front of each captured image path.
        dest_pattern: ``%``-format template for the local file name; it
            receives the zero-based index of the image.
        retrieve: Callable ``retrieve(url, filename)`` that performs the
            download. Defaults to the standard library's ``urlretrieve``.

    Returns:
        None. Results are reported through ``print`` and the saved files.
    """
    # On Python 3 a downloaded page is bytes while the patterns below are
    # text, which would raise TypeError. On Python 2 ``bytes`` is ``str``,
    # so the page is left untouched there.
    # NOTE(review): UTF-8 is assumed from the file's coding line -- confirm.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    if retrieve is None:
        try:
            from urllib.request import urlretrieve as retrieve  # Python 3
        except ImportError:
            from urllib import urlretrieve as retrieve  # Python 2

    # The leading "../" dots are escaped so they match literally, and
    # [^"] keeps a match inside a single attribute value.
    img_pattern = re.compile(r'src="\.\.([^"]+?\.JPG)"')
    for index, img_path in enumerate(img_pattern.findall(html)):
        print(img_path)
        img_url = base_url + img_path
        print(img_url)

        dest = dest_pattern % index
        # Create the target directory on first use so the download does
        # not fail merely because it is missing.
        dest_dir = os.path.dirname(dest)
        if dest_dir and not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
        retrieve(img_url, dest)

    par_pattern = re.compile(r'<span style="font-family:宋体">(.*)</span>')
    for item in par_pattern.findall(html):
        print(item)
        print('-----------------------------------')

# Download the news page, then save its images and print its paragraphs.
html = getHtml("http://www.cust.edu.cn/lgxw/32913.htm")

# getImg() reports through print and returns nothing, so its result is not
# printed (that would only emit a stray "None").
getImg(html)

posted on 2017-06-28 15:58 天才程序猿阅读(123) 评论(0) 收藏举报

刷新页面返回顶部

天才程序猿

爬虫2

导航

公告