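A small Python 2 crawler: it walks a set of Redis hashes (URL -> page HTML), re-fetches every URL whose value is still empty using urllib2 with a shared cookie jar and a redirect-aware handler, converts the GB18030 pages to UTF-8, and writes them back into the hash. Failed URLs are pushed onto an "errors" list and retried.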

#!/usr/bin/env python
#encoding=utf-8
import redis
import urllib2
import time
import StringIO
import gzip
import httplib
import cookielib
# Enable verbose HTTP debugging output from httplib.
httplib.HTTPConnection.debuglevel = 1
# Redis hash keys; each hash maps URL -> fetched page HTML.
files = ["12148", "12510", "15362", "11593", "11750"]

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    # Record the 301/302 status code on the redirected response.
    def http_error_301(self, req, fp, code, msg, headers): 
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result                                      
    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)             
        result.status = code                               
        return result

# Shared cookie jar so cookies persist across all requests.
ckjar = cookielib.MozillaCookieJar()
ckproc = urllib2.HTTPCookieProcessor(ckjar)
count = 0
def fetch(k, r1):
    # Download URL k and store the decoded page in the current Redis hash.
    try:
        request = urllib2.Request(k)
        # A browser-like User-Agent header must be set.
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:10.0.1) Gecko/20100101 Firefox/10.0.1')
        global ckproc
        global count
        # Reuse the shared cookie processor and record redirect status codes.
        opener = urllib2.build_opener(ckproc, SmartRedirectHandler())
        f = opener.open(request)
        #print f.status
        context = f.read()
        # Alternative approach, kept commented out: ask for gzip encoding and
        # decompress the response manually.
        #request.add_header("Accept-encoding", "gzip")
        #retval = urllib2.urlopen(request)
        #context = ""
        #if retval.headers.has_key('content-encoding'):
        #    fileobj = StringIO.StringIO()
        #    fileobj.write(retval.read())
        #    fileobj.seek(0)
        #    gzip_file = gzip.GzipFile(fileobj=fileobj)
        #    context = gzip_file.read()
        #else:
        #    context = retval.read()
        # The pages are GB18030-encoded; store them as UTF-8.
        html = context.decode("gb18030", "ignore").encode("utf-8")
        #print html
        if len(html.strip()) > 0:
            # 'file' is the Redis hash key of the current batch (set in the loop below).
            r1.hset(file, k, html)
            count += 1
            print "save %s" % count
        # Be polite: pause between requests.
        time.sleep(2)
    except urllib2.HTTPError, e:
        print "error->" + k
        r1.rpush("errors", k)
        print str(e)
        print e.getcode()
        print "rework"
        # Retry the same URL; note this recurses until the request succeeds.
        fetch(k, r1)


   
r1 = redis.Redis(db=1)
for file in files:
    # Re-fetch every URL in this hash that has no saved page yet.
    pages = r1.hgetall(file)
    for k, v in pages.iteritems():
        if v == "":
            print k
            fetch(k, r1)
print "done!"
