python多线程抓取网页信息

Posted on 2013-07-30 18:32  冰天雪域  阅读(254)  评论(0)  编辑  收藏  举报

#!/usr/bin/env python
#-*- coding: utf-8  -*-
import urllib 
import urllib2 
import random 
import requests
import os,sys 
import Queue
import threading
import time
import MySQLdb
from sgmllib import SGMLParser 
import re
# Work queue of app-detail page URLs: filled by main(), drained by ThreadUrl workers.
queue = Queue.Queue()
# Queue of downloaded page bodies: filled by ThreadUrl, drained by DatamineThread workers.
out_queue = Queue.Queue()
# Running counter used by DatamineThread to name the image files saved under "picture/".
num=0




class ThreadUrl(threading.Thread):
    """Worker thread that downloads pages named on one queue onto another.

    Each URL taken from ``queue`` is fetched with ``requests``; the raw
    response body is put on ``out_queue``.  The loop never returns, so the
    thread is meant to be started as a daemon.
    """

    # Seconds to wait for a server before giving up on one URL.  Without a
    # timeout a stalled connection blocks the worker forever, and
    # ``queue.join()`` in the caller would then never return.
    REQUEST_TIMEOUT = 30

    def __init__(self, queue, out_queue):
        """Remember the input queue of URLs and the output queue of bodies."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Fetch URLs forever, forwarding each downloaded body."""
        while True:
            host = self.queue.get()
            print(host)
            try:
                response = requests.get(host, timeout=self.REQUEST_TIMEOUT)
                try:
                    body = response.content
                finally:
                    response.close()
                # Hand the page body to the parsing stage.
                self.out_queue.put(body)
            except Exception as exc:
                # Thread boundary: report the failure, then back off briefly
                # before taking the next URL.  The failed URL is not retried.
                print("fetch failed for %s: %r" % (host, exc))
                time.sleep(5)
            finally:
                # Always signal completion so queue.join() cannot hang.
                self.queue.task_done()


class DatamineThread(threading.Thread):
    """Worker thread that parses downloaded app pages.

    For every page body taken from ``out_queue`` the thread extracts the app
    metadata with regular expressions, inserts it into the ``address`` table
    and saves the images found in the page's title block under ``picture/``.
    The loop never returns, so the thread is meant to be started as a daemon.
    """

    # Placeholder stored when a field is absent from the page.
    # NOTE(review): this is the literal string 'NULL', not SQL NULL.
    _MISSING = 'NULL'

    _INSERT_SQL = "insert into address(name,version,developer,pubtime,filesize,support,introduction) values(%s,%s,%s,%s,%s,%s,%s)"

    # Patterns are compiled once for the class instead of once per page.
    _NAME_RE = re.compile(r'<div class="appdiscrib">[\s\S]*?<h4>(.+?)</h4>')
    # Order matches the columns version, developer, pubtime, filesize, support.
    _FIELD_RES = (
        re.compile('版 本 号(.+?)</li>'),
        re.compile('开 发 者(.+?)</li>'),
        re.compile('发布时间(.+?)</li>'),
        re.compile('文件大小(.+?)</li>'),
        re.compile('支持固件(.+?)</li>'),
    )
    _INTRO_RE = re.compile(r'应用介绍</h3>[\s\S]*?<div class="intro">([\s\S]*?)</div>')
    _IMAGE_RE = re.compile(r' <div class="appTitle clearfix">[\s\S]*?<img src=(.+?)/>')

    # Seconds to wait for an image download before giving up.
    _REQUEST_TIMEOUT = 30

    # Serialises access to the module-level ``num`` counter so two threads
    # can never be handed the same image file name.
    _num_lock = threading.Lock()

    def __init__(self, out_queue):
        """Remember the queue of page bodies to consume."""
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        """Process page bodies forever: store metadata, then save images."""
        while True:
            result = self.out_queue.get()
            try:
                self._store_rows(self._extract_rows(result))
                self._save_images(result)
            except Exception as exc:
                # Thread boundary: one bad page must not kill the worker.
                print("error processing page: %r" % (exc,))
            finally:
                # Always signal completion so out_queue.join() cannot hang.
                self.out_queue.task_done()

    @classmethod
    def _first_group(cls, pattern, text):
        """Return group 1 of the first match, or the 'NULL' placeholder."""
        match = pattern.search(text)
        return match.group(1) if match else cls._MISSING

    def _extract_rows(self, result):
        """Return one 7-tuple of insert values per introduction block.

        The tuple order is (name, version, developer, pubtime, filesize,
        support, introduction).  A page without an introduction block yields
        an empty list; any other missing field becomes the placeholder.
        """
        intros = self._INTRO_RE.findall(result)
        if not intros:
            return []
        name = self._first_group(self._NAME_RE, result)
        fields = [self._first_group(pattern, result)
                  for pattern in self._FIELD_RES]
        return [tuple([name] + fields + [intro.replace('<br />', ' ')])
                for intro in intros]

    def _store_rows(self, rows):
        """Insert the rows into the ``address`` table, committing each one."""
        if not rows:
            return
        # NOTE(review): database credentials are hard-coded here.
        try:
            conn = MySQLdb.connect(host='localhost', user='root',
                                   passwd='123456', db='addressbookdb',
                                   charset="utf8")
        except Exception as exc:
            print("error2: %r" % (exc,))
            return
        try:
            cursor = conn.cursor()
            try:
                for values in rows:
                    # A failing row is reported and skipped; later rows are
                    # still attempted.
                    try:
                        cursor.execute(self._INSERT_SQL, values)
                        conn.commit()
                    except Exception as exc:
                        print("error2: %r" % (exc,))
            finally:
                cursor.close()
        except Exception as exc:
            print("error3: %r" % (exc,))
        finally:
            try:
                conn.close()
            except Exception as exc:
                print("error3: %r" % (exc,))

    def _save_images(self, result):
        """Download each image in the title block to ``picture/<counter>``."""
        global num
        for src in self._IMAGE_RE.findall(result):
            print(src)
            try:
                # The capture still carries the opening quote and the closing
                # quote plus one trailing character; slice them off.
                image = requests.get(src[1:-2], timeout=self._REQUEST_TIMEOUT)
                with self._num_lock:
                    index = num
                    num = num + 1
                print(index + 1)
                # Binary mode plus ``with``: the bytes are written unchanged
                # and the file handle is always closed.
                with open("picture/" + str(index), "wb") as handle:
                    handle.write(image.content)
            except Exception as exc:
                print("error4: %r" % (exc,))
def main():
    """Crawl the gfan listing pages and feed app-page URLs to the workers.

    Starts 20 ThreadUrl and 20 DatamineThread daemon workers once, then walks
    listing pages 1..2538, queues every app-detail link found on them, and
    finally blocks until both queues have been fully processed.
    """
    # Optional absolute prefix, then the site-relative app path.  Only the
    # path is captured, so prepending the site root below is always correct.
    link_pattern = re.compile(
        r'<a href="(?:http://apk\.gfan\.com)?(/Product/App\d{1,8}\.html)"')

    # Start the worker pools a single time; the workers loop forever, so
    # starting new ones for every listing page would pile up idle threads.
    for _ in range(20):
        fetcher = ThreadUrl(queue, out_queue)
        fetcher.daemon = True
        fetcher.start()
    for _ in range(20):
        miner = DatamineThread(out_queue)
        miner.daemon = True
        miner.start()

    for k in range(1, 2539):
        print(k)
        url = "http://apk.gfan.com/apps_7_1_" + str(k) + ".html"
        try:
            # The timeout keeps one stalled listing page from hanging the crawl.
            response = requests.get(url, timeout=30)
            try:
                listing = response.content
            finally:
                response.close()
        except Exception as exc:
            # Report the failed listing page, back off, and move on.
            print("failed to fetch %s: %r" % (url, exc))
            time.sleep(5)
            continue

        # set() drops links that appear more than once on the same page.
        for path in set(link_pattern.findall(listing)):
            queue.put("http://apk.gfan.com" + path)

    # Wait for all downloads first (they feed out_queue), then for parsing.
    queue.join()
    out_queue.join()





#sql="select * from address"
#cursor.execute(sql)
#conn.commit()
#finalresult=cursor.fetchall()
#if finalresult:
#for x in finalresult:
#pass #print x[0:]


    
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()

Copyright © 2024 冰天雪域
Powered by .NET 9.0 on Kubernetes