"""Fetch a gallery page and list the ``.jpg`` image URLs it references.

Written for Python 2 (``urllib2``); the import fallback and the
single-argument ``print(...)`` form keep it importable on Python 3 too.
"""
import os
import re
import socket
import sys
import urllib
import zlib

try:
    import urllib2
except ImportError:  # Python 3: urllib2 was merged into urllib.request
    import urllib.request as urllib2

# Headers sent with every request.  A value of None marks a header that is
# currently unset; getHtml() skips such entries instead of sending them.
req_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': None,
}

# Timeout, in seconds, passed to urlopen().
req_timeout = 5

# Page caption: "name ..." on one line, followed by "... DATE: ... PICS"
# on the next line.
_COMMENT_RE = re.compile(r'name.+?\n.+?DATE:.+?PICS')

# A src attribute whose value ends in ".jpg".  [^"] keeps the match inside
# one attribute value; a plain "." could run past the closing quote and
# swallow the following tags up to the next ".jpg".
_IMAGE_SRC_RE = re.compile(r'src="([^"]+?\.jpg)"')


def _decode_body(raw, content_encoding):
    """Return the response body as ``str``, undoing gzip if it was applied.

    raw              -- body exactly as read from the response
    content_encoding -- value of the Content-Encoding response header
                        ('' when the header is absent)
    """
    if content_encoding.strip().lower() in ('gzip', 'x-gzip'):
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        raw = zlib.decompress(raw, 16 + zlib.MAX_WBITS)
    if not isinstance(raw, str):
        # Python 3 only (bytes is not str there); on Python 2 the body is
        # already a str and is returned untouched.
        raw = raw.decode('utf-8', 'replace')
    return raw


def getHtml(url):
    """Download *url* and return its body as a string.

    The request carries the headers in ``req_header`` (entries whose value
    is None are left out) and uses ``req_timeout`` as the timeout.  Because
    those headers ask for gzip, a gzip-encoded response is decompressed
    before it is returned.

    Errors raised by ``urlopen`` (for example ``URLError`` or
    ``socket.timeout``) are not caught here.
    """
    headers = dict(
        (name, value) for name, value in req_header.items()
        if value is not None
    )
    request = urllib2.Request(url, None, headers)
    page = urllib2.urlopen(request, None, req_timeout)
    try:
        raw = page.read()
        content_encoding = page.info().get('Content-Encoding') or ''
    finally:
        # Release the connection even if reading fails.
        page.close()
    return _decode_body(raw, content_encoding)


def get_coment(html):
    """Return the page caption with all whitespace runs collapsed to one space.

    Returns "default_name" unless the caption pattern matches exactly once
    in *html* (zero matches and multiple matches both give the default).
    """
    matches = _COMMENT_RE.findall(html)
    if len(matches) != 1:
        return "default_name"
    return " ".join(matches[0].split())


def get_image_list(html):
    """Return the value of every ``src="..."`` attribute ending in ``.jpg``.

    Values are returned in document order; an empty list means none found.
    """
    return _IMAGE_SRC_RE.findall(html)


def getImg(html):
    """Print the page caption, then each ``.jpg`` URL found in *html*."""
    # Page comment: name / date / picture count.
    print(get_coment(html))

    # TODO: delete or replace the unwanted URLs before downloading.
    for img_url in get_image_list(html):
        print(img_url)

    # TODO: download the images to a folder.
    # download_img(imglist)


if __name__ == '__main__':
    html = getHtml("http://www.image.com/sample.php?no=517")
    getImg(html)