Python爬虫之——中国工程院信息
"""Scrape intro text from pages linked on a www.cae.cn index page.

The script downloads one index page, collects the numeric ids of the
``/cae/html/main/colys/<id>.html`` links found in it, fetches the first
``MAX_PAGES`` of those pages, extracts the contents of each page's
``<div class="intro">`` element, prints the cleaned text and appends it to
``OUTPUT_PATH``.
"""
import re
import sys

import requests

INDEX_URL = 'http://www.cae.cn/cae/html/main/col48/column_48_1.html'
DETAIL_URL = 'http://www.cae.cn/cae/html/main/colys/{}.html'
OUTPUT_PATH = r'C:\Users\Administrator\Desktop\888.txt'

# Only the first MAX_PAGES linked pages are fetched.
MAX_PAGES = 2
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Captures the numeric page id from each detail-page link on the index page.
DETAIL_LINK_RE = re.compile(
    r'<a href="/cae/html/main/colys/(\d+)\.html" target="_blank">'
)
# re.S lets '.' span newlines, since the intro block covers several lines.
INTRO_RE = re.compile('<div class="intro">(.*?)</div>', re.S)
# NOTE(review): the two identical-looking space alternatives may originally
# have been HTML entities (e.g. "&nbsp;") that were decoded when this listing
# was published -- confirm against the live page markup.
MARKUP_RE = re.compile(r' |<p>| |</p>')


def fetch_html(url):
    """Return the body of *url*, decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def extract_intro(page_html):
    """Return the cleaned text of the first intro block in *page_html*.

    Returns ``None`` when the page contains no ``<div class="intro">``
    element, instead of failing with ``IndexError``.
    """
    match = INTRO_RE.search(page_html)
    if match is None:
        return None
    return MARKUP_RE.sub('', match.group(1)).strip()


def main():
    """Fetch the index page, then print and save each linked page's intro."""
    page_ids = DETAIL_LINK_RE.findall(fetch_html(INDEX_URL))

    for page_id in page_ids[:MAX_PAGES]:
        detail_url = DETAIL_URL.format(page_id)

        try:
            page_html = fetch_html(detail_url)
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining ones.
            print('skipping {}: {}'.format(detail_url, exc), file=sys.stderr)
            continue

        intro = extract_intro(page_html)
        if intro is None:
            print('no intro block found in {}'.format(detail_url),
                  file=sys.stderr)
            continue

        print(intro)
        # Append mode keeps earlier entries. The explicit UTF-8 encoding keeps
        # the write from depending on the platform's locale codec, which may
        # be unable to encode some of the scraped characters.
        with open(OUTPUT_PATH, mode='a+', encoding='utf-8') as out_file:
            out_file.write(intro + '\n' * 2)


if __name__ == '__main__':
    main()