Python 3 调用 BeautifulSoup 进行简单的网页处理

Python 3 调用 BeautifulSoup 进行简单的网页处理

  # Parse a local HTML file and dump it three ways: the parsed document,
  # its text content only, and a pretty-printed rendering.
  from bs4 import BeautifulSoup

  # Pitfall flagged by the author: the encoding passed to open() has to match
  # how index.html was actually saved (presumably UTF-16 LE for this file).
  # The soup is built inside the with-block so the handle is closed right
  # after parsing instead of being left open for the rest of the script.
  with open('index.html', 'r', encoding='utf-16-le') as html_file:
      soup = BeautifulSoup(html_file, 'lxml')

  print(soup)  # print what was read
  print('\n ------------- \n')
  print(soup.get_text())  # text taken from all tags
  print('\n ------------- \n')
  print(soup.prettify())  # formatted output
  # Output individual elements as tags: the <title>, the <body>, and the
  # first <div> inside the body. Each separator is emitted by the same
  # print call as the element before it (sep='\n' keeps the line layout).
  print(soup.title, '\n ------------- \n', sep='\n')
  print(soup.body, '\n ------------- \n', sep='\n')
  print(soup.body.div)
  # Searching the tree with find_all(), then walking its text nodes.
  import re

  print(soup.find_all('br'))  # a plain string searches by tag name only
  print('\n ------------- \n')
  print(soup.find_all(re.compile('^b')))  # a regex also works: tags whose name starts with "b"
  print('\n ------------- \n')
  print(soup.find_all(id='wiz_custom_css'))  # keyword argument filters on the id attribute
  print('\n ------------- \n')
  # .strings yields every string in the document, descending through all
  # nested tags; .stripped_strings does the same with whitespace removed.
  for text_node in soup.strings:
      print(text_node)
  print('\n ------------- \n')
  # Strip the markup from <body> and save the remaining text to a Markdown
  # file named after the page title (rough approach, still to be improved).

  # Remove all <script> and <style> elements so their contents do not leak
  # into the extracted text.
  for tag in soup(["script", "style"]):
      tag.extract()  # detach the current tag from the tree

  # soup.title is None when the page has no <title>; treat that like an
  # empty title instead of crashing. Surrounding whitespace is dropped
  # because the title is used as a file name below.
  title_text = soup.title.get_text().strip() if soup.title is not None else ''

  # One string per line, descending through everything under <body>;
  # .stripped_strings could be used instead to drop the whitespace.
  str_text = ''.join(text_node + '\n' for text_node in soup.body.strings)
  print(str_text)

  # Fall back to index.md when there is no usable title.
  md_name = (title_text or 'index') + '.md'
  # Explicit UTF-8 so non-ASCII text is written the same way on every
  # platform; the with-block guarantees the file is flushed and closed.
  with open(md_name, 'w', encoding='utf-8') as md_file:
      md_file.write(str_text)
  # Approach found online. Note that <br/> tags are NOT turned into line
  # breaks this way; a different approach is described later.

  # Remove all <script> and <style> elements first.
  for tag in soup(["script", "style"]):
      tag.extract()  # detach the current tag from the tree

  # Get the text of whatever is left.
  text = soup.get_text()

  # Break into lines and remove leading/trailing whitespace on each.
  # splitlines() splits on line boundaries such as \r, \r\n and \n;
  # strip() with no argument removes surrounding whitespace.
  lines = (line.strip() for line in text.splitlines())
  # Break multi-headlines into a line each. The separator is a DOUBLE space:
  # splitting on a single space would put every word on its own line.
  chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
  # Drop blank lines: keep only the non-empty chunks, one per output line.
  text = '\n'.join(chunk for chunk in chunks if chunk)

  # To save the result instead of printing it, write `text` to a file
  # (the author's draft used 'aa.md').
  print(text)




posted @ 2017-05-18 11:31  extendswind  阅读(371)  评论(0)  编辑  收藏  举报