网络爬虫基础练习

import requests
from bs4 import BeautifulSoup

# Page to scrape. NOTE(review): this looks like a page served by a local
# IDE preview server and the `_ijt` query token appears session-specific --
# confirm the URL is still valid before re-running.
url = 'http://localhost:63342/new/news.html?_ijt=55294hg253a9s359i3e3f9kdku'

# Without a timeout, requests.get() may block forever if the server never
# answers; fail fast instead.
res = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer rather than silently parsing an error page.
res.raise_for_status()
# Force UTF-8 decoding of the body before reading res.text.
res.encoding = 'utf-8'

# Parse the decoded HTML with the standard-library based parser.
soup = BeautifulSoup(res.text, 'html.parser')

 

取出h1标签的文本

# Take the first <h1> element matched by the CSS selector and print its text.
first_h1 = soup.select('h1')[0]
soups = first_h1.get_text()
print(soups)

  

取出a标签的链接

# Look up the first <a> tag in the document, keep its attribute dictionary,
# and print the link target stored under 'href'.
soupa = soup.find('a').attrs
link_target = soupa['href']
print(link_target)

  

取出所有li标签的所有内容

# Print the list of child nodes (.contents) of every <li> element found.
li_tags = soup.find_all('li')
for li in li_tags:
    print(li.contents)

  

  

取出一条新闻的标题、链接、发布时间、来源

# First <div class="article-info"> block: print its full text content.
info_block = soup.select('div.article-info')[0]
print(info_block.text)

# First element with class "text-title" nested under a <div>: print the text
# of the <h1> it contains.
title_block = soup.select('div .text-title')[0]
title_h1 = title_block.find('h1')
print(title_h1.text)

  

posted @ 2018-03-29 20:51  226李汉昊  阅读(147)  评论(0)  编辑  收藏  举报