Automatically download every sub-post's pictures from each page of a picture forum
I often go to a picture BBS to look at pictures, and sometimes I want to download all the pictures from a good post.
But I hate saving them to the local disk one by one through the web browser. WE NEED TO DO IT AUTOMATICALLY.
Such a picture forum usually has a list page of posts, and each post contains the pictures, with their URLs embedded in its HTML source.
So we:
- download the list page
- fetch each post
- parse each post to extract the pictures' URLs (see the regex sketch after this list)
- download the pictures by URL
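The parsing in step 3 is plain regular-expression matching. Here is a minimal sketch of the idea (the sample line and URL are made up), using a capture group instead of the fixed-offset slicing that the full scripts below use:

#!/usr/bin/python
import re

# a made-up sample line from a post's html
line = '<td><img src="http://xxxxx.com/attachments/0001.jpg"></td>'
# capture the url between src=" and the closing quote
m = re.search(r'<img[^>]*src="([^"]+\.jpg)"', line, re.I)
if m:
    print m.group(1)    # prints http://xxxxx.com/attachments/0001.jpg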
Here is the Python code for steps 1-2 (htmladdr.py):
#!/usr/bin/python
import re
import os
import sys
import funimg

# BBS base url, with trailing slash (might need to change)
url_pre = 'http://xxxxx.com/bbs/'
# pattern of the sub-post links in the list page (might need to change)
p = r'<a\shref="thread-\d+-\d+-\d+\.html">'

def usage():
    print sys.argv[0] + ' filepath'

def getpicfromhtml(htmlpath):
    text = open(htmlpath, 'r').readlines()
    print "trying to analyse", htmlpath
    for element in text:
        m = re.findall(p, element)
        if m:
            try:
                for element_m in m:
                    # strip '<a href="' and '">' to get the relative url (might need to change)
                    url = element_m[9:-2]
                    print url
                    dirname = url
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    os.chdir(dirname)
                    url_sub = url_pre + dirname
                    print "get", url_sub
                    # fetch the post page with lwp-request's GET command
                    os.system('GET ' + url_sub + ' > htmlfile')
                    funimg.downloadpic('./htmlfile')
                    print url_sub
                    # go back to the parent directory (might need to change)
                    os.chdir('..')
            except Exception:
                pass
#        else:
#            print 'found no match'

# fetch at most 5 pages from the bbs
htmlindex = 1
while htmlindex <= 5:
    htmlfilename = 'page' + str(htmlindex)
    # forum list url (might need to change)
    html_url = url_pre + 'forum-4-' + str(htmlindex) + '.html'
    print html_url
    if not os.path.exists(htmlfilename):
        # download the list page to a file named pagex (x stands for the page index)
        print 'GET ' + html_url + ' > ' + htmlfilename
        os.system('GET ' + html_url + ' > ' + htmlfilename)
    getpicfromhtml('./' + htmlfilename)
    htmlindex = htmlindex + 1
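A note on the shell-out: `GET` here is the command-line client from libwww-perl (lwp-request), so the script depends on it being installed. If it is not, the same fetch can be done with Python 2's standard urllib2; a minimal sketch (the helper name `fetch` is mine, not part of the scripts):

#!/usr/bin/python
import urllib2

def fetch(url, outfile):
    # download url and write the response body to outfile
    data = urllib2.urlopen(url).read()
    open(outfile, 'wb').write(data)

# e.g. replace os.system('GET ' + url_sub + ' > htmlfile') with:
# fetch(url_sub, 'htmlfile')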
Here is the Python code for steps 3-4 (funimg.py):
#!/usr/bin/python
import re
import os
import string

# pattern of the picture tags in a post page (might need to change)
p = re.compile(r'.*<img.*src=".*\.jpg"', re.I)

def downloadpic(filename):
    # read the saved post page line by line
    text = open(filename, 'r').readlines()
    for element in text:
        m = re.match(p, element)
        if m:
            # m.group() ends with the closing quote after .jpg
            url = m.group()
            i = string.find(url, "src=")
            # skip 'src="' (5 chars) and drop the trailing quote
            url_sub = url[i + 5:-1]
            j = string.rfind(url, "/")
            # the picture's filename is everything after the last '/'
            jpgfile = url[j + 1:-1]
            if not os.path.exists(jpgfile):
                # download in the background, retry at most 3 times
                os.system('wget ' + url_sub + ' --tries=3 &')
                print url_sub
            else:
                print "exists already", jpgfile