# Python bs4 (BeautifulSoup) example
# -*- coding: UTF-8 -*-
"""Parse a local HTML report with BeautifulSoup and print its test cases.

Reads ``report.html`` from the current working directory, prints the
pretty-printed document, and then prints every element whose CSS class is
``testcase``.

The statements run at import time, exactly like the original script.
"""
# Web scraper
import re
import sys

try:
    import urllib2  # Python 2 name; only used by the commented-out fetches
except ImportError:
    # Python 3 keeps urlopen() in urllib.request; alias it so the
    # commented-out examples below work unchanged when re-enabled.
    import urllib.request as urllib2

# import bs4
from bs4 import BeautifulSoup
# import time

if sys.version_info[0] == 2:
    # Python 2 only: reload(sys) brings back setdefaultencoding(), which is
    # removed from the module at interpreter start-up. With UTF-8 as the
    # default, implicit str/unicode conversions (e.g. when printing
    # non-ASCII markup) do not fail with the ASCII codec.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Alternative inputs: fetch the page over HTTP instead of reading a file.
# html = urllib2.urlopen("http://121.196.21.238/report.html")
# html = urllib2.urlopen("http://image.baidu.com/")

# Read raw bytes so BeautifulSoup detects the document encoding itself
# rather than relying on the platform's default text encoding.
with open("report.html", "rb") as f:
    content = f.read()

bsObj = BeautifulSoup(content, features='html.parser')
print(bsObj.prettify())

# Earlier experiments, kept for reference:
#
# Print the title
# print bsObj.title
#
# Example anchor the search below matches (link text is a news headline):
# <a href="http://baijiahao.baidu.com/s?id=1665731690282269956" target="_blank" mon="p=1&a=1&pn=1">G15 Shenhai Expressway car fire, flames light up the scene</a>
# titlist = bsObj.findAll("a", {"target":"_blank"})
#
# The following two calls do the same thing
# bsObj.findAll(id="text")
# bsObj.findAll("", {"id":"text"})
# for x in titlist:
#     print x.get_text()
#
# http://t8.baidu.com/it/u=3571592872,3353494284&fm=79&app=86&size=h300&n=0&g=4n&f=jpeg?sec=1589296136&t=e713d1fe058c0dcb1714f9bc0fd4ee92
# titlist = bsObj.find("table",{"id":"result_table"}).tr.next_siblings
# for i in titlist:
#     print i

# ``class`` is a reserved word in Python, so Beautiful Soup spells the
# CSS-class keyword filter ``class_``; ``class="testcase"`` is a SyntaxError.
# The result is also no longer bound to ``list``, which shadowed the builtin.
testcases = bsObj.find_all(class_="testcase")
for testcase in testcases:
    print(testcase)