解析网页的多种方式
1.使用正则表达式获取网页标题内容
In [ ]:
import requests
import re
# Download the TipDM index page and display its source as text.
url = 'http://www.tipdm.com/tipdm/index.html'
# Without a timeout requests waits indefinitely on an unresponsive server,
# which would leave this cell hanging.
rqq = requests.get(url, timeout=10)
# Decode the body as UTF-8 when ``.text`` is read.
rqq.encoding = 'utf-8'
rqq.text
In [ ]:
# Collect the link text of every item shaped like
# <li><a href="..." target="...">TEXT</a></li> in the downloaded page.
nav_item = re.compile('<li><a href="[a-z0-9.:/]+" target=".*">(.+)</a></li>')
nav_item.findall(rqq.text)
In [ ]:
# Looser variant of the previous pattern: it also tolerates attributes on the
# <li> tag and extra attributes before ``href`` on the <a> tag.  Only the
# captured link text (group 1) is kept for each match.
[hit.group(1) for hit in re.finditer('<li[ a-z="]*><a[a-z0-9 = "]* href="[a-z0-9.:/]+".*>(.+)</a></li>', rqq.text)]
2.使用XPath解析网页
In [ ]:
import requests
from lxml import etree
# Download the TipDM index page and parse the raw bytes into an lxml tree.
url = 'http://www.tipdm.com/tipdm/index.html'
# A timeout prevents the cell from blocking forever on a stalled connection.
rqq = requests.get(url, timeout=10)
# The rest of this notebook decodes this page as UTF-8, so tell the parser
# explicitly instead of relying on lxml's own charset detection.
html = etree.HTML(rqq.content, etree.HTMLParser(encoding='utf-8'))
html
In [ ]:
# Serialise the parsed tree to UTF-8 bytes, then decode those bytes to str.
serialized = etree.tostring(html, encoding='utf-8')
serialized.decode('utf-8')
# Decode the raw response as UTF-8 too and show the resulting text.
rqq.encoding = 'utf-8'
rqq.text
In [ ]:
# Absolute path from the document root down to the <title> element.
title_path = '/html/head/title'
html.xpath(title_path)
In [ ]:
# ``//`` matches <title> elements at any depth in the document.
any_title = '//title'
html.xpath(any_title)
In [ ]:
# Three XPath spellings for the text of links inside list items:
# a full absolute path, a ``//``-relative path starting at <header>,
# and a path anchored on the element whose id is "menu".
absolute_query = '/html/body/header/div/nav/ul/li/a/text()'
relative_query = '//header/div/nav/ul/li/a/text()'
by_id_query = '//*[@id="menu"]/li/a/text()'
html.xpath(absolute_query)
html.xpath(relative_query)
html.xpath(by_id_query)
In [ ]:
# The ``[last()]`` predicate keeps only the final <li> child of the <ul>.
last_item_query = '/html/body/header/div/nav/ul/li[last()]'
html.xpath(last_item_query)
In [ ]:
# Text of the navigation links whose ``target`` attribute equals "_blank".
blank_target_query = '/html/body/header/div/nav/ul/li/a[@target="_blank"]/text()'
html.xpath(blank_target_query)
In [ ]:
import requests
from lxml import etree
# Download the Sogou WeChat search home page.
url = 'https://weixin.sogou.com/'
# A timeout prevents the cell from blocking forever on a stalled connection.
rqq = requests.get(url, timeout=10)
In [ ]:
# Parse the raw response bytes with a parser pinned to UTF-8.
utf8_parser = etree.HTMLParser(encoding='utf-8')
html = etree.HTML(rqq.content, utf8_parser)
# Text of every <a> that carries a ``title`` attribute under #topwords > li.
html.xpath('//*[@id="topwords"]/li/a[@title]/text()')
In [ ]:
# Query the list items one position at a time (XPath positions 1 through 10),
# producing one result list per position.
[html.xpath(f'//*[@id="topwords"]/li[{rank}]/a[@title]/text()') for rank in range(1, 11)]
# The same links fetched with a single query over all list items.
html.xpath('//*[@id="topwords"]/li/a[@title]/text()')
In [ ]:
# Print the first 20 items of the decoded text (str) and of the raw
# body (bytes) so the two representations can be compared.
preview_len = 20
print(rqq.text[:preview_len])
print(rqq.content[:preview_len])
3.使用Beautiful Soup解析网页
In [ ]:
import requests
from bs4 import BeautifulSoup
# Download the TipDM index page and build a BeautifulSoup tree from the raw
# bytes using the lxml parser backend.
# A timeout prevents the cell from blocking forever on a stalled connection.
rqq = requests.get('http://www.tipdm.com/tipdm/index.html', timeout=10)
soup = BeautifulSoup(rqq.content, 'lxml')
In [ ]:
# ``soup.find(name)`` is the explicit form of the ``soup.<name>`` shortcut:
# each call returns the first tag with that name.
soup.find('head')
soup.find('body')
soup.find('li')
# Every <li> tag in the document, as a list.
soup.find_all('li')
In [ ]:
# Take the first <link> tag and inspect its name and attributes.
a = soup.find('link')
a.name
a.attrs
# Indexing a tag looks the key up in its attribute dictionary.
a.attrs['href']
In [ ]:
# Print the string of every <li> found inside the first <nav> element.
a = soup.find_all('nav')
for i in a[0].find_all('li'):
    # The loop body has to be indented, otherwise the cell fails with an
    # IndentationError.  Per the Beautiful Soup docs ``.string`` is None for
    # a tag that holds more than one child node.
    print(i.string)
In [ ]:
# CSS child selectors: the <title> directly under <head> directly under <html>.
title_matches = soup.select('html > head > title')
a = title_matches[0]
# ``get_text()`` is what the ``.text`` property returns.
a.get_text()
In [ ]:
# ``.menu`` selects by class and ``#menu`` selects by id; both queries ask for
# the direct <li> children of the matched element.
by_class = '.menu > li'
by_id = '#menu > li'
soup.select(by_class)
soup.select(by_id)
# Text content of each class-matched list item.
[item.get_text() for item in soup.select(by_class)]
In [ ]:
# <a> tags that are direct children of the <li> children of #menu.
menu_link_selector = '#menu > li > a'
soup.select(menu_link_selector)
In [ ]:
from bs4 import BeautifulSoup
import requests
# Download the Sogou WeChat search home page and parse it with BeautifulSoup.
# A timeout prevents the cell from blocking forever on a stalled connection.
rqq = requests.get('https://weixin.sogou.com/', timeout=10)
soup = BeautifulSoup(rqq.content, 'lxml')
# The element(s) whose id is "topwords".
soup.select('#topwords')
# Run the CSS query once and reuse the result for both views of the links.
hot_links = soup.select('.hot-news > li > a')
# Visible text of each link.
[link.text for link in hot_links]
# Value of each link's ``title`` attribute.
[link['title'] for link in hot_links]
In [ ]:
# First element whose id is "topwords".
a = soup.find_all(id='topwords')[0]
# Gather its <a> descendants once, then show them two ways:
# via ``get_text()`` (what ``.text`` returns) and via ``.string``.
anchors = a.find_all('a')
[link.get_text() for link in anchors]
[link.string for link in anchors]
4.参考文章
【创作不易，望点赞收藏，若有疑问，请留言，谢谢】