Web Page Parsing -- BeautifulSoup Practice
# coding = utf-8
# BeautifulSoup's main job is parsing and extracting data from HTML.
# Related tools: re, lxml, bs4

# pip install beautifulsoup4

# from bs4 import BeautifulSoup

html = '''
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

'''
############################################################################
# BeautifulSoup part
############################################################################

# soup = BeautifulSoup(html, 'lxml')

# Four object types: Tag, NavigableString, BeautifulSoup, Comment

# print(soup.a)              # get the first <a> tag
# print(soup.a.get('href'))  # read the tag's href attribute (the hyperlink)
# print(soup.a.text)         # all text under <a>, including text of any child tags
# print(soup.a.string)       # the tag's single string; None if <a> contains more than one child

# Searching the document: find / find_all match against a filter

# By string
# print(soup.find_all('a'))                       # all <a> tags in the document
# print(soup.find_all(attrs={'class': 'title'}))  # tags with class="title"

# By regular expression
# import re
# print(soup.find_all(re.compile('^p')))  # tags whose name starts with p
# print(soup.find_all(re.compile('y$')))  # tags whose name ends with y
# print(soup.find_all(re.compile('t')))   # tags whose name contains t

# By list
# for tag in soup.find_all(['a', 'b']):  # match <a> tags and <b> tags
#     print(tag)

# for tag in soup.find_all('p', class_='story'):  # <p> tags with class="story"
#     print(tag)

# By function: pass find_all a function as the filter
# def has_class_but_no_id(tag):
#     """Filter: tags that have a class attribute but no id attribute."""
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# for tag in soup.find_all(has_class_but_no_id):
#     print(tag)

# CSS selectors
# print(soup.select('title'))     # by tag name
# print(soup.select('.sister'))   # by class name
# print(soup.select('#link1'))    # by id
# print(soup.select('p #link2'))  # combined: the element with id="link2" inside a <p>

# > only matches direct children, one level at a time
# print(soup.select('body > p .sister'))  # elements with class "sister" inside a <p> that is a direct child of <body>

# Search Baidu for "python" and run attribute lookups on the returned page
# import requests
# url = 'http://www.baidu.com/s?wd=python'
# response = requests.get(url)  # this is the raw page source, not rendered by JS
#
# soup = BeautifulSoup(response.text, 'lxml')

# Find the search results on the returned page
# items = soup.find_all('div', class_='result c-container ')

# Print the search results
# for item in items:
#     print(item.select('h3 > a')[0].get('href'))  # the <a> tag's link
#     print(item.select('h3 > a')[0].get_text())
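To see those pieces run end to end, here is a minimal sketch against the html sample above, assuming beautifulsoup4 and lxml are installed; expected values are noted in the comments.

# Minimal runnable sketch of the BeautifulSoup steps above
# (assumes beautifulsoup4 and lxml are installed).
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

print(soup.a.get('href'))             # http://example.com/elsie
print(len(soup.find_all('a')))        # 3 -- the three sister links
print(soup.select('#link2')[0].text)  # Lacie
for tag in soup.find_all('p', class_='story'):
    print(tag.get_text()[:30])        # first 30 characters of each story paragraph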
#################################################################################
# XPath part
# Operators: /  //  @  *  .  ..
# / matches from the current node; // matches anywhere in the document;
# @ selects an attribute; * is a wildcard
#################################################################################
html = '''
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
# from lxml import etree
# e = etree.HTML(html)
# for i in e.xpath('//p'):  # search the whole document for <p> tags
#     # print(i.xpath('string(.)'))  # all text under the current tag, including text of nested child tags
#     print(i.text)  # text directly under the current tag, excluding child tags

"""
# for i in e.xpath('//p/@class'):          # select the class attribute of <p>
# for i in e.xpath('//p[@class="title"]'): # <p> tags with class="title"
# //title[@*]                              # <title> tags that have at least one attribute
"""

# Search Baidu for "python" and extract the results with XPath
import requests
from lxml import etree

url = 'http://www.baidu.com/s?wd=python'
response = requests.get(url)  # raw page source
tree = etree.HTML(response.text)

# Find the search results on the returned page
items = tree.xpath('//div[@class="result c-container "]')
for item in items:
    # print(item.xpath('h3/a/@href'))  # the result link
    print(item.xpath('h3/a')[0].xpath('string(.)'))  # the result title text
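As a quick check of the string(.) vs .text distinction noted above, this sketch (assuming lxml is installed) runs both against the sample html.

# Sketch contrasting string(.) with .text on the sample html (assumes lxml is installed).
from lxml import etree

e = etree.HTML(html)
title_p = e.xpath('//p[@class="title"]')[0]
print(title_p.xpath('string(.)'))  # "The Dormouse's story" -- includes the nested <b> text
print(title_p.text)                # None -- the text sits inside <b>, not directly under <p>
print(e.xpath('//p/@class'))       # ['title', 'story', 'story']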