python 黄图机的爬虫脚本

import tornado.httpclient as httpclient
import urllib
from pyquery import PyQuery as pq
import torndb
import threading
# Page addresses are built as ``url`` + <numeric id>; image ``src`` values
# scraped from a page are appended to ``img_url`` to form the image address.
url = 'http://gmgard.us/gm'
img_url = 'http://gmgard.us'

# Local directory prefix used when building the filename for a saved image.
static_path = 'static/img/'

# Database handle and the statement used to record a stored picture's path.
db = torndb.Connection(
    host='localhost:3306',
    user='ypc',
    password='ypc',
    database='ypc',
)
InsertSQL = 'INSERT INTO yellow_pictures(path) VALUES(%s)'

# Number of crawler threads allowed to run at the same time.
cnt = 10

class crawl_picture(threading.Thread):
    """Worker thread that fetches one page and prints its first image URL.

    The page address is the module-level ``url`` with the numeric id ``i``
    appended.  Each worker hands one slot of the module-level ``slots``
    semaphore back when it finishes, so whoever starts a worker is expected
    to have acquired a slot first (the launch loop below does).
    """

    def __init__(self, i):
        """Remember the numeric page id ``i`` this worker will fetch."""
        threading.Thread.__init__(self)
        self.i = i

    def run(self):
        """Crawl the page, then always free this worker's concurrency slot."""
        try:
            self._crawl()
        finally:
            # Released even when the crawl raises, otherwise the launch loop
            # could block forever waiting for a slot that never comes back.
            slots.release()

    def _crawl(self):
        """Fetch the page for ``self.i`` and print the URL of its first image.

        HTTP errors are reported here because the fetch happens on this
        thread; an ``except`` around the launch loop would never see them.
        """
        # Use the id stored on the instance, not the launch loop's global
        # ``i``, which may already have moved on by the time this runs.
        page_url = url + str(self.i)
        http_client = httpclient.HTTPClient()
        try:
            print('Getting ' + page_url)
            response = http_client.fetch(page_url)

            # NOTE(review): landing on the blog list presumably means the site
            # redirected because the requested post does not exist - confirm.
            if response.effective_url == 'http://gmgard.us/Blog/List':
                print('Getting Failed.')
                return

            d = pq(response.body)
            src = d('#blog').find('img').eq(0).attr('src')
            if src is None:
                # No <img> inside #blog: nothing to report for this page.
                print('No image found for ' + page_url)
                return

            img_path = src.encode('utf-8')
            url_img = img_url + img_path
            print(url_img)
            # Downloading the image and recording it in the database are
            # currently disabled:
            #   filename = static_path + img_path.split('/')[2]
            #   urllib.urlretrieve(url_img, filename)
            #   db.execute(InsertSQL, img_path.split('/')[2])
        except httpclient.HTTPError as e:
            print("Error: %s" % e)
        finally:
            http_client.close()


# Maximum number of crawler threads running at the same time.
cnt = 10
# One slot per allowed worker: the launch loop takes a slot before starting a
# thread and the thread gives it back when it is done.  This replaces the
# original hand-rolled counter and busy-wait loop.
slots = threading.Semaphore(cnt)

for i in range(1000, 1010):
    print(i)
    # Blocks (without spinning) while ``cnt`` workers are already running.
    slots.acquire()
    crawl_picture(i).start()

 

posted @ 2014-04-05 19:53  ggaaooppeenngg  阅读(963)  评论(0)  编辑  收藏  举报