爬虫:爬取 OJ 用户头像

import requests
import Queue
import urllib
import urllib2
import re
import requests
# ---- Crawl bookkeeping -------------------------------------------------
# alreadyImg: image paths getImg() has already handled.
# already:    hrefs putUrl() has already queued.
# urllist:    FIFO of pages still to visit (a non-positive maxsize means
#             the queue is unbounded).
alreadyImg = set()
already = set()
urllist = Queue.Queue(-1)

# ---- Patterns ----------------------------------------------------------
# httpre captures the target of every `a href="..."`.
reg = r'a href="(.+?)"'
httpre = re.compile(reg)
# imgre captures (image path, extension) for each "large_avatar" <img>.
reimg = r'img class="large_avatar" src="([^>]+?\.(png|jpg))>?"'
imgre = re.compile(reimg)

# ---- Authenticated session --------------------------------------------
# NOTE(review): the account name and password are hard-coded below;
# consider moving them out of the source file.
s = requests.session()
s.post(
    "http://acm.hrbust.edu.cn/index.php?m=User&a=login",
    data={"user_name": "1304020306", "password": "123456"},
)
# Fetch one user-info page and dump it to stdout.
r = s.get("http://acm.hrbust.edu.cn/index.php?m=User&a=userInfo&user_name=1404020214")
print(r.text)

# ---- Seed the crawl with the rating rank list --------------------------
url = "http://acm.hrbust.edu.cn/index.php?m=Ranklist&a=showRatingrank"
urllist.put(url)
def putUrl(html):
    """Queue every not-yet-seen link found in ``html``.

    Each ``a href="..."`` target matched by the module-level ``httpre``
    pattern is resolved to an absolute URL: targets that already start
    with ``http://`` or ``https://`` are kept as they are, anything else
    is appended to the site root. Resolved URLs are recorded in the
    module-level ``already`` set and pushed onto ``urllist``.

    Args:
        html: Page source to scan for links.
    """
    for href in httpre.findall(html):
        # Test the scheme prefix, not a substring: a relative link may
        # legitimately contain "http" (e.g. inside a query parameter).
        if href.startswith(('http://', 'https://')):
            realurl = href
        else:
            # Strip leading slashes so the join never yields "//path".
            realurl = "http://acm.hrbust.edu.cn/" + href.lstrip('/')
        # De-duplicate on the resolved URL so a page linked both
        # relatively and absolutely is only queued once.
        if realurl in already:
            continue
        already.add(realurl)
        urllist.put(realurl)
# Running index used to name the downloaded avatar files.
x = 0


def getImg(html):
    """Download every avatar image in ``html`` that has not been seen yet.

    Image paths are extracted with the module-level ``imgre`` pattern and
    de-duplicated through the module-level ``alreadyImg`` set. Each new
    image is saved as ``C:/<x>.<ext>``, where ``x`` is the module-level
    counter (incremented only after a successful download) and ``<ext>``
    is the extension taken from the image path.

    Args:
        html: Page source to scan for avatar ``<img>`` tags.
    """
    global x
    for match in imgre.findall(html):
        # imgre has capture groups; the first one is the image path.
        Img = match[0]
        if Img in alreadyImg:
            continue
        alreadyImg.add(Img)
        print(Img)
        # Keep the real extension so a PNG is not written out as ".jpg".
        ext = Img.rsplit('.', 1)[-1]
        # Test the scheme prefix rather than only the first character, so
        # a relative path that happens to start with "h" is still resolved.
        if not Img.startswith(('http://', 'https://')):
            Img = "http://acm.hrbust.edu.cn/" + Img.lstrip('/')
        try:
            urllib.urlretrieve(Img, 'C:/%s.%s' % (x, ext))
        except IOError as e:
            # urlretrieve reports network and file errors as IOError;
            # urllib2.URLError is a subclass, so it is still covered.
            print("skip %s: %s" % (Img, e))
        else:
            x += 1
# Drain the queue: fetch each page, harvest further links from rank-list
# pages only, and download the avatars found on every page.
while not urllist.empty():
    url = urllist.get()
    print(url)
    try:
        r = s.get(url)
    except requests.RequestException as e:
        # The session raises requests' own exception types (never
        # urllib2.URLError), so catch those to skip an unreachable or
        # malformed URL instead of aborting the whole crawl.
        print("skip %s: %s" % (url, e))
        continue
    html = r.text
    if "index.php?m=Ranklist&a=showRatingrank" in url:
        putUrl(html)
    getImg(html)
View Code

 

posted @ 2016-04-27 21:25  icodefive  阅读(690)  评论(0)  编辑  收藏  举报