No-Host-Given

import urllib
import urllib2
import re
import time
 
# Directory that holds the input file and receives the downloaded images.
path = "G:/123/"
# Name of the text file (inside `path`) that is scanned for image URLs.
path_file = "1.txt"
 
def geturllist(text):
    """Extract every ``https:`` URL that ends in ``hd.jpg`` from *text*.

    Args:
        text: Raw file contents, either UTF-8 encoded bytes or text that
            has already been decoded.

    Returns:
        A list of the matched URL strings in order of appearance.
        Duplicates are kept; de-duplication is left to the caller.
    """
    # Only decode when handed raw bytes; already-decoded text is used as is.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # The lazy \S*? stops at the first "hd.jpg" and never crosses whitespace,
    # so URLs separated by newlines or punctuation come back one by one
    # instead of being merged into a single greedy match. The dot is escaped
    # so only a literal ".jpg" suffix qualifies.
    urllist = re.findall(r'https:\S*?hd\.jpg', text)
    return urllist
 
# Read the whole input file. The with-statement closes the file on exit, so
# no explicit close() call is needed afterwards.
with open(path + path_file, "r") as f:
    textall = f.read()
urllist = geturllist(textall)

# Remove duplicate URLs while keeping first-seen order, so the numbered
# output files follow the order in which the URLs appear in the input.
seen = set()
urllistonly = []
for url in urllist:
    if url not in seen:
        seen.add(url)
        urllistonly.append(url)

# Download each unique image to "<path><index>.jpg".
for i, pic in enumerate(urllistonly):
    # NOTE(review): this strips every "002" substring from the URL --
    # presumably an artifact of how the input file escapes the addresses;
    # confirm it cannot damage legitimate file names.
    pic = pic.replace('002', '')
    # Turn the backslash form "https:\\host\path" into "http://host/path";
    # the backslash form makes the download fail with "no host given".
    pic = pic.replace('https:\\\\', 'http://')
    pic = pic.replace('\\', '/')
    urllib.urlretrieve(pic, path + str(i) + ".jpg")
print('End!')
 
 
注意:
https:\\pic2.com\50\vc9ddaa4c92da.jpg
这种使用反斜杠的地址是无效的,所以会报错:no host given。
 
http://pic2.com/50/vc9ddaa4c92da.jpg
改成这样就不报错了。
 
 
posted @ 2019-11-01 14:29  数之美  阅读(320)  评论(0)  编辑  收藏  举报