# 2019.1.7
"""Crawl the Hupu BBS front page and append each linked post's title to a file.

The script downloads the forum index, collects every post link that matches
the index markup, fetches each post in turn and appends the text of its
``<title>`` element (one title per line) to ``./result.txt``.
"""
import re
import urllib.error
import urllib.parse
import urllib.request

# Page that lists the posts to crawl.
INDEX_URL = "http://bbs.hupu.com/"
# Base against which the hrefs found on the index page are resolved.
POST_BASE_URL = "https://bbs.hupu.com/"
# Titles are appended here, so repeated runs accumulate.
RESULT_PATH = "./result.txt"
# Without a timeout a single stalled connection would block the crawl forever.
REQUEST_TIMEOUT_SECONDS = 30

# The dot before "html" is escaped so it only matches a real ".html" suffix.
_POST_LINK_RE = re.compile(r'<a href="(.*?\.html)" target="_blank" title=')
# "\s*" tolerates any (or no) whitespace around the title; "." does not match
# newlines, so a captured title is always a single line.
_TITLE_RE = re.compile(r"<title>\s*(.*?)\s*</title>")


def fetch_html(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped rather than raising.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body as a ``str``.

    Raises:
        OSError: On network failure; this includes ``urllib.error.URLError``,
            ``urllib.error.HTTPError`` and socket timeouts.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8", "ignore")


def extract_post_urls(html, base_url=POST_BASE_URL):
    """Return the absolute URLs of all post links found in *html*.

    Args:
        html: Markup of the index page.
        base_url: Base used to resolve relative hrefs.

    Returns:
        A list of URLs in document order (duplicates are kept).
    """
    # urljoin handles "x.html", "/x.html" and already-absolute hrefs alike,
    # where plain concatenation would produce "//" or a malformed URL.
    return [
        urllib.parse.urljoin(base_url, href)
        for href in _POST_LINK_RE.findall(html)
    ]


def extract_title(html):
    """Return the text of the first ``<title>`` element in *html*.

    Leading and trailing whitespace inside the element is not included.

    Returns:
        The title string, or ``None`` when no single-line title is present.
    """
    match = _TITLE_RE.search(html)
    return match.group(1) if match else None


def main():
    """Crawl every post linked from the index and append its title to the file.

    Progress and failures are printed to stdout. A post that cannot be
    downloaded, or that has no usable title, is reported and skipped so the
    remaining posts are still processed.

    Raises:
        OSError: If the index page itself cannot be downloaded or the result
            file cannot be opened.
    """
    post_urls = extract_post_urls(fetch_html(INDEX_URL))

    # The context manager guarantees the file is closed even if the loop
    # is interrupted by an unexpected exception.
    with open(RESULT_PATH, "a", encoding="utf8") as result_file:
        for number, post_url in enumerate(post_urls, start=1):
            print("正在爬取第" + str(number) + "个帖子")
            print(post_url)

            try:
                post_html = fetch_html(post_url)
            except OSError as error:
                # OSError covers URLError/HTTPError as well as timeouts and
                # connection resets raised while reading the body.
                print("爬取第" + str(number) + "个帖子失败")
                has_detail = False
                if hasattr(error, "code"):
                    print(error.code)
                    has_detail = True
                if hasattr(error, "reason"):
                    print(error.reason)
                    has_detail = True
                if not has_detail:
                    print(error)
                continue

            title = extract_title(post_html)
            if not title:
                # Report a missing title as a failed post instead of letting
                # an IndexError abort the whole crawl.
                print("爬取第" + str(number) + "个帖子失败")
                print("未找到帖子标题")
                continue

            result_file.write(title + "\n")
            print("----打印成功----")


if __name__ == "__main__":
    main()