2019.1.7

import re
import urllib.error
import urllib.parse
import urllib.request
# Forum index page that lists the posts to crawl.
INDEX_URL = "http://bbs.hupu.com/"
# Base against which the hrefs found on the index page are resolved.
POST_BASE_URL = "https://bbs.hupu.com/"
# Titles are appended to this file, one per line.
RESULT_PATH = './result.txt'
# Seconds to wait on a connection before giving up on a page.
REQUEST_TIMEOUT = 10

# The dot before "html" is escaped and the href may not span a closing quote,
# so only genuine "*.html" links inside a single attribute are captured.
POST_LINK_RE = re.compile(r'<a href="([^"]*\.html)" target="_blank" title=')
# Tolerates any whitespace (LF, CRLF, or none) around the title text.
TITLE_RE = re.compile(r'<title>\s*(.*?)\s*</title>')


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped. Network errors (``urllib.error.URLError``,
    timeouts) propagate to the caller.
    """
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode("utf-8", "ignore")


def extract_post_urls(html, base_url=POST_BASE_URL):
    """Return the absolute URLs of all post links found in *html*.

    Relative, root-relative and already-absolute hrefs are all resolved
    against *base_url*; an empty list is returned when nothing matches.
    """
    return [
        urllib.parse.urljoin(base_url, href)
        for href in POST_LINK_RE.findall(html)
    ]


def extract_title(html):
    """Return the text of the first ``<title>`` element, or None if absent."""
    match = TITLE_RE.search(html)
    return match.group(1) if match else None


def main():
    """Crawl every post linked from the index page and append its title
    to ``RESULT_PATH``.

    A post that cannot be fetched or has no title is reported and skipped,
    so one bad page does not abort the rest of the crawl.
    """
    post_urls = extract_post_urls(fetch_html(INDEX_URL))
    # The context manager closes the file even if an unexpected error escapes.
    with open(RESULT_PATH, 'a', encoding='utf8') as fh:
        for number, nowurl in enumerate(post_urls, start=1):
            print('正在爬取第' + str(number) + '个帖子')
            print(nowurl)
            try:
                page = fetch_html(nowurl)
            except (OSError, ValueError) as e:
                # OSError covers URLError/HTTPError and socket timeouts;
                # ValueError is raised by urlopen for malformed URLs.
                print('爬取第' + str(number) + '个帖子失败')
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                continue
            title = extract_title(page)
            if title is None:
                # No <title> on the page: report it instead of crashing.
                print('爬取第' + str(number) + '个帖子失败')
                print('未找到标题')
                continue
            fh.write(title + '\n')
            print('----打印成功----')


if __name__ == "__main__":
    main()

 

posted @ 2019-01-07 20:14  Hesse  阅读(128)  评论(0)  编辑  收藏  举报