[Python Beginner Web Scraping Series - 01] Detailed usage of the beautifulsoup4 HTML parser in Python

import re

from bs4 import BeautifulSoup, Comment

html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Or parse an HTML file instead: soup = BeautifulSoup(open("index.html"), "html.parser")
soup = BeautifulSoup(html_doc, "html.parser")
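# Other parsers can also be used, assuming the corresponding optional packages
# (lxml / html5lib) are installed; html.parser is used throughout this post
# because it ships with Python:
# soup = BeautifulSoup(html_doc, "lxml")
# soup = BeautifulSoup(html_doc, "html5lib")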

tag = soup.p
# Get the tag's type
print(type(tag)) # <class 'bs4.element.Tag'>
# Get the tag's name
print(tag.name) # p
# Get the value of the tag's class attribute
print(tag['class']) # ['title']
# Get all of the tag's attributes as a dict
print(tag.attrs) # {'class': ['title']}
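# Reading a missing attribute with tag['...'] raises KeyError; tag.get() is the
# safer alternative, returning None (like a dict) when the attribute is absent:
print(tag.get('id')) # None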

css_soup = BeautifulSoup('<p class="body strikeout"></p>', "html.parser")
# class is a multi-valued attribute, so its value is returned as a list
print(css_soup.p['class']) # ['body', 'strikeout']
# Get the tag's text content
print(tag.string) # The Dormouse's story
# The text content is wrapped in a NavigableString rather than a plain str
print(type(tag.string)) # <class 'bs4.element.NavigableString'>
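# When only the text is needed, a NavigableString can be converted to a plain
# Python string with str():
print(str(tag.string)) # The Dormouse's story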

# Replace the tag's text content
tag.string.replace_with("No longer bold")
print(tag) # <p class="title"><b>No longer bold</b></p>

# The BeautifulSoup object represents the document as a whole
print(soup.name) # [document]

markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "html.parser")
comment = soup.b.string
# Comments are parsed as Comment objects, a special kind of NavigableString
print(type(comment)) # <class 'bs4.element.Comment'>
# Pretty-print the markup:
# <b>
# <!--Hey, buddy. Want to buy a used parser?-->
# </b>
print(soup.b.prettify())

# Navigating the parse tree
soup = BeautifulSoup(html_doc, "html.parser")
# Get the head node directly
print(soup.head) # <head><title>The Dormouse's story</title></head>
# Get the title node directly
print(soup.title) # <title>The Dormouse's story</title>
# Get the first b tag inside body
print(soup.body.b) # <b>The Dormouse's story</b>
# Get the first a tag in the document
print(soup.a) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Get all a tags in the document
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.find_all('a'))
# .contents returns all of head's child nodes as a list
print(soup.head.contents) # [<title>The Dormouse's story</title>]
# .children iterates over head's child nodes
# Result: <title>The Dormouse's story</title>
for child in soup.head.children:
    print(child)
# .descendants iterates over all of head's descendants, recursively
# Result:
# <title>The Dormouse's story</title>
# The Dormouse's story
for child in soup.head.descendants:
    print(child)
# The document object itself has a single child: the html node
print(len(list(soup.children))) # 1
# Count all descendants of the whole document
print(len(list(soup.descendants))) # 26
# Get all strings in the document
# Result:
# "The Dormouse's story"
# '\n'
# '\n'
# "The Dormouse's story"
# '\n'
# 'Once upon a time there were three little sisters; and their names were\n'
# 'Elsie'
# ',\n'
# 'Lacie'
# ' and\n'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# '...'
# '\n'
for string in soup.strings:
    print(repr(string))
# Get all strings in the document with extra whitespace stripped
# Result:
# "The Dormouse's story"
# "The Dormouse's story"
# 'Once upon a time there were three little sisters; and their names were'
# 'Elsie'
# ','
# 'Lacie'
# 'and'
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '...'
for string in soup.stripped_strings:
    print(repr(string))

title_tag = soup.title
# .parent gives the title node's parent
print(title_tag.parent) # <head><title>The Dormouse's story</title></head>
# .parents iterates over all of title_tag's ancestors
# Result:
# head
# html
# [document]
for parent in title_tag.parents:
    print(parent.name)

sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "html.parser")
# .next_sibling gives the node's next sibling
print(sibling_soup.b.next_sibling) # <c>text2</c>
print(sibling_soup.c.next_sibling) # None
# .previous_sibling gives the node's previous sibling
print(sibling_soup.c.previous_sibling) # <b>text1</b>
print(sibling_soup.b.previous_sibling) # None
# .next_siblings iterates over all siblings after the first a tag
# Result:
# ',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# ';\nand they lived at the bottom of a well.'
for sibling in soup.a.next_siblings:
    print(repr(sibling))
# .previous_siblings iterates over all siblings before the element with id link3
# Result:
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# ',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 'Once upon a time there were three little sisters; and their names were\n'
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

last_a_tag = soup.find("a", id="link3")
# .next_element gives the object parsed immediately after the last a tag;
# unlike .next_sibling, it follows parse order, not tree position
print(last_a_tag.next_element) # Tillie
# .previous_element gives the object parsed immediately before the last a tag
print(repr(last_a_tag.previous_element)) # ' and\n'
# .next_elements iterates over everything parsed after the last a tag
# Result:
# 'Tillie'
# ';\nand they lived at the bottom of a well.'
# '\n'
# <p class="story">...</p>
# '...'
# '\n'
for element in last_a_tag.next_elements:
    print(repr(element))

# Searching the tree: find all b tags in the document
print(soup.find_all('b')) # [<b>The Dormouse's story</b>]
# Find all tags whose names start with b
# Result:
# body
# b
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# Find all tags whose names contain the letter t
# Result:
# html
# title
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
# Pass a list to match any of several tag names
# Result: [<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister"
# href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all(['a', 'b']))
# True matches every tag, but no string nodes
for tag in soup.find_all(True):
    print(tag.name)


# Define a filter function
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')


# Filter elements with the custom function
# Result: [<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were
# three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister"
# href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>,
# <p class="story">...</p>]
print(soup.find_all(has_class_but_no_id))
# Find the element with id link2
print(soup.find_all(id='link2')) # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Find elements whose href contains elsie
print(soup.find_all(href=re.compile("elsie"))) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Find all elements that have an id attribute
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.find_all(id=True))
# Combine several conditions in one search
# Result:
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.find_all(href=re.compile("elsie"), id='link1'))

data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
# Attributes like data-* cannot be passed as keyword arguments (a hyphen is not
# valid in a Python identifier), so search for them with the attrs dict instead
print(data_soup.find_all(attrs={"data-foo": "value"})) # [<div data-foo="value">foo!</div>]
# Search by CSS class; because class is a Python keyword, the argument is class_
# Result: [<a class="sister" href="http://example.com/elsie"
# id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister"
# href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.find_all('a', class_='sister'))
# class_ also accepts a regular expression
print(soup.find_all(class_=re.compile("itl"))) # [<p class="title"><b>The Dormouse's story</b></p>]


def has_six_characters(css_class):
    return css_class is not None and len(css_class) == 6


# Filter elements with a custom function applied to the class value
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.find_all(class_=has_six_characters))
# Find the strings in the document equal to Elsie
print(soup.find_all(text="Elsie")) # ['Elsie']
# Search strings with a regular expression
print(soup.find_all(text=re.compile("Dormouse"))) # ["The Dormouse's story", "The Dormouse's story"]
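# Note: since Beautiful Soup 4.4 the text argument is also available under the
# name string; text= still works, but string= is the preferred spelling:
print(soup.find_all(string="Elsie")) # ['Elsie']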
# limit caps the number of results returned
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
print(soup.find_all("a", limit=2))
# By default the search covers all descendants; recursive=False checks direct children only
print(soup.find_all('title', recursive=False)) # []
# Shorthand, equivalent to soup.find_all("a")
print(soup("a"))
# Shorthand, equivalent to soup.title.find_all(text=True)
print(soup.title(text=True))
# find works like find_all, with two differences:
# 1. find returns only the first matching element; find_all returns all of them
# 2. when nothing matches, find returns None while find_all returns an empty list
print(soup.find("a")) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
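# Demonstrating that difference on a tag that does not exist in the document:
print(soup.find("nosuchtag")) # None
print(soup.find_all("nosuchtag")) # []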

a_string = soup.find(text='Lacie')
# Find all ancestors of a_string that are a tags
print(a_string.find_parents("a")) # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# Find the first ancestor of a_string that is a p tag
# Result:
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
print(a_string.find_parent("p"))
# Find all ancestors of a_string that are p tags with class title
print(a_string.find_parents("p", class_="title")) # []

first_link = soup.a
# Find all later siblings of the first a tag that are themselves a tags
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister"
# href="http://example.com/tillie" id="link3">Tillie</a>]
print(first_link.find_next_siblings("a"))

first_story_paragraph = soup.find("p", "story")
# Find the next sibling of first_story_paragraph that is a p tag
print(first_story_paragraph.find_next_sibling("p")) # <p class="story">...</p>

last_link = soup.find("a", id="link3")
# Find all earlier siblings of last_link that are a tags
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(last_link.find_previous_siblings("a"))
# Find the nearest earlier sibling of last_link that is an a tag
print(last_link.find_previous_sibling("a")) # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

first_link = soup.a
# Find all string nodes that appear after first_link in the document
# Result: ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie', ';\nand they lived at the bottom of a well.', '\n', '...', '\n']
print(first_link.find_all_next(text=True))
# Find the first p tag after first_link
print(first_link.find_next("p")) # <p class="story">...</p>
# Find all p tags before first_link
# Result: [<p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>, <p class="title"><b>The Dormouse's story</b></p>]
print(first_link.find_all_previous("p"))
# Find the nearest title element before first_link
print(first_link.find_previous("title")) # <title>The Dormouse's story</title>


# CSS selectors
# Select elements with tag name title
print(soup.select("title")) # [<title>The Dormouse's story</title>]
# Select the third p element among its siblings
print(soup.select("p:nth-of-type(3)")) # [<p class="story">...</p>]
# Select all a tags anywhere under body
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.select("body a"))
# Select title elements under head under html, level by level
print(soup.select("html head title")) # [<title>The Dormouse's story</title>]
# Select title elements that are direct children of head
print(soup.select("head > title")) # [<title>The Dormouse's story</title>]
# Select the direct child of a p element whose id is link1
print(soup.select("p > #link1")) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Select direct a children of body; this does not descend further, so nothing matches
print(soup.select("body > a")) # []
# Select all siblings of the element with id link1 that have class sister
# Result: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select("#link1 ~ .sister"))
# Select elements by CSS class sister
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.select(".sister"))
# Select elements by id
print(soup.select("#link1")) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# Select all a tags that have an href attribute
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.select("a[href]"))
# Select a tags by exact href value
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
print(soup.select('a[href="http://example.com/elsie"]'))
# Select a tags by href prefix
# Result: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister"
# href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie"
# id="link3">Tillie</a>]
print(soup.select('a[href^="http://example.com"]'))
# Select a tags whose href ends with tillie
# Result: [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
print(soup.select('a[href$="tillie"]'))
# Select a tags whose href contains the given substring
print(soup.select('a[href*=".com/el"]')) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
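# select_one() returns only the first element matching a selector, or None when
# nothing matches, analogous to find() versus find_all():
print(soup.select_one(".sister")) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>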


# Modifying the tree
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', "html.parser")
tag = soup.b
# Change the tag's name
tag.name = "blockquote"
# Change the tag's class
tag['class'] = "verybold"
# Add an id attribute to the tag
tag['id'] = 1
print(tag) # <blockquote class="verybold" id="1">Extremely bold</blockquote>
# Change the tag's content through .string
tag.string = "New link text."
print(tag) # <blockquote class="verybold" id="1">New link text.</blockquote>

soup = BeautifulSoup("<a>Foo</a>", "html.parser")
# Append content to a tag
soup.a.append("Bar")
print(soup.a) # <a>FooBar</a>
# new_string() creates a new string object
new_string = soup.new_string("New content")
soup.a.append(new_string)
print(soup.a) # <a>FooBarNew content</a>
# Passing Comment as the second argument creates a comment object instead
new_comment = soup.new_string("I am comment.", Comment)
soup.a.append(new_comment)
print(soup.a) # <a>FooBarNew content<!--I am comment.--></a>
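# A sketch of going the other way: all comment nodes in a document can be
# collected by filtering string nodes on their type (uses the Comment import above):
comments = soup.find_all(string=lambda s: isinstance(s, Comment))
print(comments) # ['I am comment.']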

soup = BeautifulSoup("<b></b>", "html.parser")
original_tag = soup.b
# new_tag() creates a brand-new tag
new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
print(original_tag) # <b><a href="http://www.example.com"></a></b>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, "html.parser")
tag = soup.a
# insert() places content at the specified child index
tag.insert(1, "but did not endorse")
print(tag) # <a href="http://example.com/">I linked to but did not endorse<i>example.com</i></a>

soup = BeautifulSoup("<b>stop</b>", "html.parser")
tag = soup.new_tag("i")
tag.string = "Don't"
# insert_before() inserts content directly before a tag or text node
soup.b.string.insert_before(tag)
print(soup) # <b><i>Don't</i>stop</b>
# insert_after() inserts content directly after a tag or text node
soup.b.i.insert_after(soup.new_string(" no no "))
print(soup) # <b><i>Don't</i> no no stop</b>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
tag = soup.a
# clear() removes the tag's contents
tag.clear()
print(tag) # <a href="http://example.com/"></a>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# extract() removes the tag from the tree and returns it
i_tag = soup.i.extract()
print(a_tag) # <a href="http://example.com/">I linked to </a>
print(i_tag) # <i>example.com</i>

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# decompose() removes the tag from the tree and destroys it completely
i_tag = soup.i.decompose()
print(a_tag) # <a href="http://example.com/">I linked to </a>
print(i_tag) # None

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
new_tag = soup.new_tag("b")
new_tag.string = "example.net"
# replace_with() removes a piece of the tree and replaces it with a new tag or text node
a_tag.i.replace_with(new_tag)
print(a_tag) # <a href="http://example.com/">I linked to <b>example.net</b></a>

soup = BeautifulSoup("<p>I wish I was bold.</p>", 'html.parser')
# wrap() wraps the specified element in a new tag
soup.p.string.wrap(soup.new_tag("b"))
print(soup) # <p><b>I wish I was bold.</b></p>
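# wrap() works on tags as well as strings; a quick sketch wrapping the whole
# p tag in a div:
soup.p.wrap(soup.new_tag("div"))
print(soup) # <div><p><b>I wish I was bold.</b></p></div>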

markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
# unwrap() is the opposite of wrap(): it replaces a tag with its contents,
# which is handy for stripping out markup
a_tag.i.unwrap()
print(a_tag) # <a href="http://example.com/">I linked to example.com</a>

markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup, 'html.parser')
# get_text() returns only the text a tag contains, including the text of all
# its descendants, as a single Unicode string
print(repr(soup.get_text())) # '\nI linked to example.com\n'
# A separator used to join the pieces of text can be specified
print(repr(soup.get_text("|"))) # '\nI linked to |example.com|\n'
# strip=True removes leading and trailing whitespace from each piece of text
print(repr(soup.get_text("|", strip=True))) # 'I linked to|example.com'
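# get_text() also works on a single tag rather than the whole document:
print(repr(soup.i.get_text())) # 'example.com'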

 
