import urllib
import urllib2
import socket  
import re  
import sys  
import os  

# Headers attached to every page request; the values imitate a desktop
# Chrome browser.
req_header = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
                   '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    # Advertises gzip support, so a server may answer with a compressed body.
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    # Placeholder: no referer is configured.
    'Referer': None,
}

# Timeout handed to urllib2.urlopen for each request.
req_timeout = 5

def getHtml(url):
    """Download *url* and return the response body as a byte string.

    The request carries the module-level ``req_header`` headers and uses
    ``req_timeout`` as the ``urlopen`` timeout.  Headers whose value is
    ``None`` are treated as unset placeholders and are not sent.  When the
    response is marked ``Content-Encoding: gzip`` the body is decompressed
    before it is returned, so callers always receive the page markup.

    Errors raised by ``urllib2`` or the socket layer propagate unchanged;
    ``zlib.error`` is raised if a gzip-marked body cannot be decompressed.
    """
    import zlib

    # None marks a header that has no value yet; it must not reach urllib2
    # as if it were a real header value.
    headers = dict((key, value) for key, value in req_header.items()
                   if value is not None)
    request = urllib2.Request(url, None, headers)

    page = urllib2.urlopen(request, None, req_timeout)
    try:
        html = page.read()
        encoding = page.info().get('Content-Encoding') or ''
    finally:
        # Release the connection even when reading fails.
        page.close()

    if 'gzip' in encoding.lower():
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)

    return html

def get_coment(html):
    """Return the page's "name ... DATE: ... PICS" caption as a single line.

    The caption is the text matched by ``name.+?\\n.+?DATE:.+?PICS``.  When
    exactly one such match exists it is returned with every run of
    whitespace collapsed to a single space; when there are no matches or
    more than one, the string ``"default_name"`` is returned instead.
    """
    matches = re.findall(r'name.+?\n.+?DATE:.+?PICS', html)

    # Anything other than a single unambiguous caption falls back to the default.
    if len(matches) != 1:
        return "default_name"

    words = matches[0].split()
    return " ".join(words)

def get_image_list(html):
    """Return the ``.jpg`` URLs found in ``src="..."`` attributes of *html*.

    URLs are returned in document order, duplicates included.  An empty
    list is returned when nothing matches.
    """
    # [^"] keeps a match inside one attribute value; a plain "." could run
    # past the closing quote of a non-jpg src and swallow the markup up to
    # the next ".jpg".
    pattern = re.compile(r'src="([^"]+\.jpg)"')
    return pattern.findall(html)

def getImg(html):
    """Print the caption of *html* followed by each image URL it contains.

    The caption comes from ``get_coment`` and the URLs from
    ``get_image_list``; each is written on its own line.  Nothing is
    returned.
    """
    # Page caption (name / date / picture count).
    caption = get_coment(html)
    print(caption)

    # Image URLs; unusable URLs are not filtered out yet.
    for url in get_image_list(html):
        print(url)

    # Downloading the images to a folder is not implemented here
    # (a download_img(imglist) call was planned for this spot).

if __name__ == '__main__':
    # Script entry point: fetch the sample page, then print its caption
    # and image URLs.
    html = getHtml(
        "http://www.image.com/sample.php?no=517"
    )
    getImg(html)

 

# Posted on 2013-10-27 10:47 by shaivas -- views (231), comments (0)