一个小爬虫：抓取贴吧网页中的图片（Python 2 版本）：
#!/usr/bin/python
"""Download every ``.jpg`` image referenced by a web page (Python 2 script).

The page is fetched with ``urllib.urlopen`` and the images are saved with
``urllib.urlretrieve``; both functions live on the top-level ``urllib``
module only in Python 2.
"""
import re
import urllib

# Captures a ``src`` attribute value ending in ".jpg" that is immediately
# followed by a ``width`` attribute. Compiled once at import time.
_IMG_PATTERN = re.compile(r'src="(.*?\.jpg)" width')


def getHtml(url):
    """Return the raw body of the page at *url*."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even if the read fails.
        page.close()


def getImg(html):
    """Download every image URL matched in *html*.

    Files are written relative to the current working directory and are
    named ``0.jpg``, ``1.jpg``, ... in the order the URLs appear in *html*.
    Nothing is downloaded when no URL matches. Returns ``None``.
    """
    for index, imgurl in enumerate(_IMG_PATTERN.findall(html)):
        urllib.urlretrieve(imgurl, '%s.jpg' % index)


if __name__ == "__main__":
    # Guarded so that importing this module does not hit the network.
    getImg(getHtml("http://image.baidu.com/"))
Python 3.4 版本：仿照别人的脚本修改后运行成功：
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Download the ``.jpg`` images embedded in a Baidu Tieba thread (Python 3)."""
import re
import urllib.request as urllib2

# Captures a ``src`` attribute value that starts with "http://imgsrc", ends
# in ".jpg" and is immediately followed by a ``size`` attribute.
_IMG_PATTERN = re.compile(r'src="(http://imgsrc.*?\.jpg)" size')


def getHtml(url):
    """Return the raw (undecoded) body of the page at *url* as bytes."""
    # The context manager closes the HTTP response once it has been read.
    with urllib2.urlopen(url) as page:
        return page.read()


def getImage(html):
    """Download every image URL matched in *html*.

    *html* may be the UTF-8 encoded bytes returned by ``getHtml`` or text
    that has already been decoded. Images are saved as ``0.jpg``,
    ``1.jpg``, ... in the order the URLs appear, using the hard-coded
    ``E:`` drive prefix. Nothing is downloaded when no URL matches.
    Returns ``None``.

    Raises:
        UnicodeDecodeError: if *html* is bytes that are not valid UTF-8.
    """
    if isinstance(html, bytes):
        html = html.decode('utf-8')
    for index, imgurl in enumerate(_IMG_PATTERN.findall(html)):
        urllib2.urlretrieve(imgurl, 'E:\\%s.jpg' % index)


if __name__ == "__main__":
    # Guarded so that importing this module does not hit the network.
    getImage(getHtml("http://tieba.baidu.com/p/4866459683"))
<wiz_tmp_tag id="wiz-table-range-border" contenteditable="false" style="display: none;">