No-Host-Given
import urllib
import urllib2
import re
import time
# Directory that holds the input text file; downloaded images are saved here too.
path = "G:/123/"
# Name of the text file inside `path` that is scanned for image URLs.
path_file = "1.txt"
# A URL that starts with "https:" and runs up to the first literal "hd.jpg".
# \S keeps a match from running across newlines or tabs, and the lazy
# quantifier stops it from swallowing several URLs that share one token.
_HD_JPG_URL = re.compile(r'https:\S*?hd\.jpg')


def geturllist(text):
    """Return every URL in ``text`` that starts with "https:" and ends in "hd.jpg".

    ``text`` may be a UTF-8 encoded byte string or an already-decoded text
    string.  Byte strings are decoded first; decoded text is used as-is,
    because calling ``.decode`` on it would fail.

    The result is a list of text strings in order of appearance.
    Duplicates are kept; an input with no match yields an empty list.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return _HD_JPG_URL.findall(text)
# Read the whole input file; the ``with`` block closes it on exit, so no
# explicit close() call is needed.
with open(path + path_file, "r") as f:
    textall = f.read()
urllist = geturllist(textall)

# Remove duplicate URLs while keeping their order of appearance, so the
# number given to each downloaded file follows the input rather than an
# arbitrary dict ordering.
seen = set()
urllistonly = []
for url in urllist:
    if url not in seen:
        seen.add(url)
        urllistonly.append(url)

for i, pic in enumerate(urllistonly):
    # Turn the backslash-style address found in the text into a fetchable
    # URL: drop every '002', switch the 'https:\\' prefix to 'http://',
    # then convert the remaining backslashes to forward slashes.
    pic = pic.replace('002', '')
    pic = pic.replace('https:\\\\', 'http://')
    pic = pic.replace('\\', '/')
    # Save each image as <path><index>.jpg.
    urllib.urlretrieve(pic, path + str(i) + ".jpg")
print('End!')
注意
https:\\pic2.com\50\vc9ddaa4c92da.jpg
这种用反斜杠书写的地址不是合法的 URL,urllib 无法从中解析出主机名,所以会报错:no host given。
http://pic2.com/50/vc9ddaa4c92da.jpg
把 https:\\ 换成 http://、其余反斜杠换成正斜杠,改成这样就不报错了。