Parsing an HTML file with Python and extracting an element from its tags
1. For a local HTML file
# -*- coding: utf-8 -*-
# Parse the page with BeautifulSoup
from bs4 import BeautifulSoup

# Open the file whose tags we want to parse
with open('test.html', 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')   # pass in the file to parse
    print(soup)                             # show what was read into soup
    print("!--------------\n")
    # Pass in the CSS path of the target elements; it can be copied straight from the browser
    shot_name = soup.select('body > div > div > table > tbody > tr > td > a')
    # shot_name = soup.select('body > div > div > div > ol > li > a')
    print(*shot_name, sep='\n!!---------------\n')  # print the matched tags

# Extract tag content: use get_text() for the text, get('...') for an attribute value
hrefs = []
for shot in shot_name:
    data = shot.get('href').strip('/')   # href value with leading/trailing slashes removed
    hrefs.append(data)

with open('shot_names.txt', 'w+') as f:
    for i in hrefs:
        f.write(i + '\n')
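The comment above mentions get_text() as well, but the script only uses get('href'). A minimal sketch of pulling both the link text and the attribute from the same selection (it assumes the same test.html and the same CSS path as above):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

with open('test.html', 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

for a in soup.select('body > div > div > table > tbody > tr > td > a'):
    text = a.get_text(strip=True)   # visible text inside the <a> tag
    href = a.get('href')            # value of the href attribute (None if missing)
    print(text, href)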
2. For a web page
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests


url = 'https://hao.360.cn/?a1004'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')  # make wb_data parseable
# Parse the page elements: copy each element's CSS path from the site
# Here links are used as the example
# Copied path: #famous-section > ul.list.first.gclearfix > li:nth-child(7) > a
url_famous = soup.select('#famous-section > ul.list.first.gclearfix > li > a')
# Copied path: #famous-section > ul.list.last.gclearfix > li:nth-child(1) > a
# extend, not append, so the result stays one flat list of tags
url_famous.extend(soup.select('#famous-section > ul.list.last.gclearfix > li > a'))
print(url_famous)
# Copied path: #focus_news > ul > li:nth-child(1) > a
url_focus = soup.select('#focus_news > ul > li > a')
print(url_focus)
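The script above only prints the raw Tag objects. A minimal sketch of reducing them to (text, href) pairs and saving them to a file, in the same spirit as the local-file example (the output file name hao360_links.txt is an assumption for illustration):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

url = 'https://hao.360.cn/?a1004'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

links = soup.select('#famous-section > ul.list.first.gclearfix > li > a')
links.extend(soup.select('#famous-section > ul.list.last.gclearfix > li > a'))
links.extend(soup.select('#focus_news > ul > li > a'))

# Keep only the visible text and the href of each link
with open('hao360_links.txt', 'w', encoding='utf-8') as f:
    for a in links:
        f.write('%s\t%s\n' % (a.get_text(strip=True), a.get('href')))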