Parsing an HTML file with Python and extracting an element from its tags
1. For a local HTML file
# -*- coding: utf-8 -*-
# Parse the page with BeautifulSoup
from bs4 import BeautifulSoup

# Open the file whose tags we want to parse
with open('test.html', 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')   # pass in the file to parse
    print(soup)                             # show what was read into soup
    print("!--------------\n")
    # Pass in the CSS path of the target elements; it can be copied straight from the browser
    shot_name = soup.select('body > div > div > table > tbody > tr > td > a')
    # shot_name = soup.select('body > div > div > div > ol > li > a')
    print(*shot_name, sep='\n!!---------------\n')  # print the matched tags

# Extract tag content: use get_text() for the text, get('...') for an attribute value
hrefs = []
for shot in shot_name:
    data = shot.get('href').strip('/')   # href value with leading/trailing slashes removed
    hrefs.append(data)

with open('shot_names.txt', 'w+') as f:
    for i in hrefs:
        f.write(i + '\n')
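The comment above mentions get_text() as well, but the script only uses get('href'). A minimal sketch of pulling both the link text and the attribute from the same selection (it assumes the same test.html and the same CSS path as above):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

with open('test.html', 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

for a in soup.select('body > div > div > table > tbody > tr > td > a'):
    text = a.get_text(strip=True)   # visible text inside the <a> tag
    href = a.get('href')            # value of the href attribute (None if missing)
    print(text, href)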
2. For a web page
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests


url = 'https://hao.360.cn/?a1004'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')  # make wb_data parseable
# Parse the page elements: copy each element's CSS path from the site
# Here links are used as the example
# Copied path: #famous-section > ul.list.first.gclearfix > li:nth-child(7) > a
url_famous = soup.select('#famous-section > ul.list.first.gclearfix > li > a')
# Copied path: #famous-section > ul.list.last.gclearfix > li:nth-child(1) > a
# extend, not append, so the result stays one flat list of tags
url_famous.extend(soup.select('#famous-section > ul.list.last.gclearfix > li > a'))
print(url_famous)
# Copied path: #focus_news > ul > li:nth-child(1) > a
url_focus = soup.select('#focus_news > ul > li > a')
print(url_focus)
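The script above only prints the raw Tag objects. A minimal sketch of reducing them to (text, href) pairs and saving them to a file, in the same spirit as the local-file example (the output file name hao360_links.txt is an assumption for illustration):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

url = 'https://hao.360.cn/?a1004'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

links = soup.select('#famous-section > ul.list.first.gclearfix > li > a')
links.extend(soup.select('#famous-section > ul.list.last.gclearfix > li > a'))
links.extend(soup.select('#focus_news > ul > li > a'))

# Keep only the visible text and the href of each link
with open('hao360_links.txt', 'w', encoding='utf-8') as f:
    for a in links:
        f.write('%s\t%s\n' % (a.get_text(strip=True), a.get('href')))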