python 黄图机的爬虫脚本
# Crawler script: fetches a range of numbered pages from gmgard.us and prints
# the URL of the first image found inside each page's ``#blog`` element.
# Written for Python 2 (torndb / urllib.urlretrieve); print calls use the
# single-argument parenthesised form so the syntax is also valid on Python 3.
import threading
import urllib

import tornado.httpclient as httpclient
import torndb
from pyquery import PyQuery as pq

# Page URL prefix; the numeric page id is appended directly to it.
url = 'http://gmgard.us/gm'
# Site root, prepended to the (site-relative) image path found in a page.
img_url = 'http://gmgard.us'
# Local directory used by the (currently disabled) download step.
static_path = 'static/img/'
db = torndb.Connection(host='localhost:3306', user='ypc', password='ypc',
                       database='ypc')
InsertSQL = 'INSERT INTO yellow_pictures(path) VALUES(%s)'

# Maximum number of page fetches allowed to run at the same time.
cnt = 10
# Replaces the original hand-rolled global counter and busy-wait loop, which
# was neither valid Python (``cnt--``) nor thread-safe.
_fetch_slots = threading.BoundedSemaphore(cnt)


class crawl_picture(threading.Thread):
    """Worker thread that fetches one page and prints its first image URL.

    Args:
        i: Numeric page id; the page fetched is ``url + str(i)``.
    """

    def __init__(self, i):
        threading.Thread.__init__(self)
        self.i = i

    def run(self):
        """Fetch the page, locate the first ``#blog img`` and print its URL.

        Prints ``Getting Failed.`` when the request was redirected to the
        blog list page or the page contains no image, and ``Error: ...``
        when tornado raises ``HTTPError``. The HTTP client is always closed.
        """
        # Use this thread's own id; the original read the driver loop's
        # global ``i``, which may already have advanced to another page.
        page_url = url + str(self.i)
        with _fetch_slots:
            http_client = httpclient.HTTPClient()
            try:
                print('Getting ' + page_url)
                response = http_client.fetch(page_url)
                if response.effective_url == 'http://gmgard.us/Blog/List':
                    # Landing on the list page means the requested page
                    # was not served.
                    print('Getting Failed.')
                    return
                src = pq(response.body)('#blog').find('img').eq(0).attr('src')
                if src is None:
                    # No <img> under #blog: nothing to report.
                    print('Getting Failed.')
                    return
                img_path = src.encode('utf-8')
                print(img_url + img_path)
                # The download and DB bookkeeping steps were commented out
                # in the original script and are kept disabled here:
                #   filename = static_path + img_path.split('/')[2]
                #   urllib.urlretrieve(img_url + img_path, filename)
                #   db.execute(InsertSQL, img_path.split('/')[2])
            except httpclient.HTTPError as e:
                # Must be handled here: an exception raised inside a worker
                # never propagates to the thread that called start().
                print('Error: %s' % e)
            finally:
                http_client.close()


for i in range(1000, 1010):
    print(i)
    crawl_picture(i).start()