Python多线程实现抓取网页
Python实现抓取网页
下面的Python抓取网页的程序比较初级,只能抓取第一页的URL所属的页面;只要预定的URL足够多,就能保证你抓取的网页是无限级别的。下面是代码:
# coding: utf-8
"""Multi-threaded page grabber.

Each seed URL gets its own worker thread.  A worker fetches the seed page,
collects the absolute ``http://`` links found in its anchors and saves every
linked page to disk, repeating until it is told to stop.

The code is written to run on Python 2.6+ as well as Python 3.

@author wangbingyu
@date 2014-06-26
"""
import re
import sys
import threading
import time
from contextlib import closing

try:  # Python 2
    import thread
except ImportError:  # Python 3 renamed the module
    import _thread as thread

try:  # Python 2
    from urllib import urlopen, urlretrieve
except ImportError:  # Python 3 moved both helpers
    from urllib.request import urlopen, urlretrieve

# Default destination; ``%s`` is replaced by a unique file stem.  A raw string
# keeps the backslashes from ever being read as escape sequences.
DEFAULT_SAVE_PATTERN = r'E:\upload\download\%s.html'

# A whole anchor element, matched after spaces were stripped from the page.
ANCHOR_RE = re.compile(r'<a.*?href=.*?</a>', re.I)

# The double-quoted value of an href attribute.
HREF_RE = re.compile(r'href="([^"]*)"', re.I)


class download(threading.Thread):
    """Worker thread that keeps saving every page linked from one URL.

    Args:
        url: Page whose links are downloaded.
        threadName: Name given to the thread.
        savePattern: ``%``-style pattern with a single ``%s`` placeholder that
            yields the path each page is saved to.
    """

    def __init__(self, url, threadName, savePattern=DEFAULT_SAVE_PATTERN):
        threading.Thread.__init__(self, name=threadName)
        self.thread_stop = False
        self.url = url
        self.savePattern = savePattern

    def run(self):
        """Fetch and save the linked pages over and over until stopped.

        The same page (``self.url``) is re-read on every pass; the links it
        yields are saved but not followed any further.  An exception raised by
        ``getUrl`` is not caught here and therefore ends the thread.
        """
        while not self.thread_stop:
            self.list = self.getUrl(self.url)
            self.downloading(self.list)

    def stop(self):
        """Ask the worker to finish after the pass currently in progress."""
        self.thread_stop = True

    def downloading(self, list):
        """Save every URL in ``list`` to a file built from ``savePattern``.

        A failure on one URL is reported and the remaining URLs are still
        attempted.

        Args:
            list: Sequence of URLs to retrieve.  (The name shadows the builtin
                but is kept so keyword callers keep working.)
        """
        ident = threading.current_thread().ident
        for index, link in enumerate(list):
            # Timestamp alone is not unique: several workers, or two quick
            # saves in a row, would otherwise overwrite each other's file.
            stem = '%s-%s-%d' % (time.time(), ident, index)
            try:
                urlretrieve(link, self.savePattern % stem)
            except Exception as ex:
                print('%s _upload: %s' % (Exception, ex))

    def getUrl(self, url):
        """Return the absolute ``http://`` links found in the page at ``url``.

        Only anchors whose ``href`` value is wrapped in double quotes are
        considered; relative links and other schemes are skipped.

        Args:
            url: Address of the page to scan.

        Returns:
            List of link strings in the order they appear in the page.
        """
        # closing() releases the connection even if read() fails.
        with closing(urlopen(url)) as response:
            html = response.read()
        if not isinstance(html, str):
            # Python 3 hands back bytes.  Only the link text is needed, so
            # bytes that are not valid UTF-8 are simply replaced.
            html = html.decode('utf-8', 'replace')

        # Dropping spaces turns ``href = "..."`` into ``href="..."``.
        html = html.replace(' ', '')

        result = []
        for anchor in ANCHOR_RE.findall(html):
            # Read the href itself rather than the first quoted attribute,
            # which may be a class, id or title.
            found = HREF_RE.search(anchor)
            if found and found.group(1).startswith('http://'):
                result.append(found.group(1))
        return result


if __name__ == '__main__':
    seeds = [
        'http://www.baidu.com',
        'http://www.qq.com',
        'http://www.taobao.com',
        'http://www.sina.com.cn',
    ]

    workers = []
    for index, seed in enumerate(seeds):
        worker = download(seed, 'thread%s' % index)
        worker.start()
        workers.append(worker)

    # Python 2's input() evaluates whatever is typed (a bare Enter raises
    # SyntaxError); raw_input() only waits for the line.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter()

    # Without this the non-daemon workers would loop forever.
    for worker in workers:
        worker.stop()