2. JSON Data Processing and BeautifulSoup (BS) Usage
JSON
Loading JSON data
import requests
import json
# from pprint import pprint

def main():
    url = "http://192.168.223.143/test.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
        "Referer": "https://www.baidu.com"
    }
    resp = requests.get(url=url, headers=headers)
    # Decode the raw bytes, then parse the JSON string into Python objects
    json_str = resp.content.decode("utf-8")
    # print(json_str)
    ret1 = json.loads(json_str)
    print(ret1['objects'][4]['EmailAddress'])
    # Pretty-print the parsed structure
    # pprint(ret1)

if __name__ == '__main__':
    main()
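Since the response body here is JSON, requests can also parse it in one step with resp.json(); a minimal sketch assuming the same test URL as above:

import requests

url = "http://192.168.223.143/test.json"  # the test endpoint from the example above
resp = requests.get(url)
# resp.json() parses the body as JSON, equivalent to json.loads(resp.text)
ret1 = resp.json()
print(ret1['objects'][4]['EmailAddress'])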
Saving to and reading from a file
with open("b.txt","w",encoding="utf-8") as file:
# ensure_ascii=False:可以显示中文
# indent=2:把子节点往后移两个空格,不移就会显示一行,影响美观
file.write(json.dumps(ret1,ensure_ascii=False,indent=2))
with open("b.txt","r",encoding="utf-8") as file:
ret2 = json.load(file)
print(ret2)
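For writing, json.dump serializes straight into an open file object, which saves the dumps-then-write step; a minimal sketch with placeholder data:

import json

ret1 = {"name": "火影忍者", "episodes": 720}  # placeholder standing in for the parsed response

with open("b.txt", "w", encoding="utf-8") as file:
    # json.dump writes directly to the file object, same ensure_ascii/indent options as dumps
    json.dump(ret1, file, ensure_ascii=False, indent=2)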
BeautifulSoup instead of regular expressions
Purpose: parse a page's HTML source.
Usage
pip install bs4
from bs4 import BeautifulSoup

# resp is the requests response from an earlier requests.get() call
html_doc = resp.content.decode("utf-8")
# Hand the page source to BeautifulSoup (name a parser explicitly to avoid a warning)
soup = BeautifulSoup(html_doc, "html.parser")
# Find the first div tag with class='m-hd'
print(soup.find('div', class_='m-hd'))
# find_all returns every <a> tag with class='u-card' as a list; [2] takes the third match
print(soup.find_all('a', class_='u-card')[2])
# Collect all <a> tags with class='u-card' on the page and print each one's text
cards = soup.find_all('a', class_='u-card')
for card in cards:
    # get_text() extracts the text content; strip() trims surrounding whitespace
    print(card.get_text().strip())
# select() takes CSS selectors: elements with id='j-anime-nav-collect' directly under a div, as a list; take the first one's text
print(soup.select("div > #j-anime-nav-collect")[0].get_text())
# Elements with class='item' directly under a ul, as a list; [1] takes the second match's text
print(soup.select("ul > .item")[1].get_text())
# Get the page title
print(soup.title.string)
print(soup.title.get_text())
# All <img> tags on the page, as a list
print(soup.find_all('img'))
# The first element with class='u-tt', whatever its tag name
print(soup.find(class_='u-tt').get_text())
cards = soup.find('div', class_='lst-item').find_all('a', class_='u-card')
for item in cards:
    # Pull the details out of each card
    name = item.find('p', class_='u-tt').get_text()
    # print(name)
    # Take the image URL from the img tag's data-src attribute: this site's src is only a
    # lazy-loading placeholder, so check the actual page case by case
    pic_url = item.find('img').get('data-src')
    print(name + '-----' + pic_url)
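Besides .get(), a tag's attributes can be read with dictionary-style indexing or via .attrs; a minimal sketch on a made-up HTML fragment:

from bs4 import BeautifulSoup

# Made-up fragment purely for illustration
html_doc = '<a class="u-card" href="/detail/1"><img src="placeholder.png" data-src="http://img.example/1.jpg"></a>'
soup = BeautifulSoup(html_doc, "html.parser")

img = soup.find('img')
print(img.get('data-src'))  # .get() returns None if the attribute is missing
print(img['data-src'])      # indexing raises KeyError if the attribute is missing
print(img.attrs)            # all attributes of the tag as a dict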
Two small BS examples
Crawl the anime names and image URLs from pages 1-14 in order
import requests
from bs4 import BeautifulSoup

def scrapy(page):
    url = "http://www.4399dmw.com/search/dh-1-0-0-0-0-{}-0/".format(page)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
        "Referer": "https://www.baidu.com"
    }
    resp = requests.get(url=url, headers=headers)
    html_doc = resp.content.decode("utf-8")
    soup = BeautifulSoup(html_doc, "html.parser")
    cards = soup.find('div', class_='lst').find_all('a', class_='u-card')
    for item in cards:
        name = item.find('p', class_='u-tt').get_text()
        pic_url = item.find('img').get('data-src')
        print(name + '—————' + pic_url)

def main():
    # Pages are numbered from 1, so iterate 1 through 14 inclusive
    for i in range(1, 15):
        print("Scraping page " + str(i))
        scrapy(i)

if __name__ == '__main__':
    main()
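The example only prints each name and image URL; a minimal sketch of actually downloading one image to disk, assuming the data-src value is a full, reachable URL (the URL below is hypothetical):

import requests

name = "火影忍者"                          # stand-ins for one (name, pic_url) pair
pic_url = "http://img.example/naruto.jpg"  # produced by the loop above

resp = requests.get(pic_url, headers={"User-Agent": "Mozilla/5.0"})
# Image bytes must be written in binary mode, so open with "wb" and no encoding
with open(name + ".jpg", "wb") as f:
    f.write(resp.content)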
Crawl the "Guess You Like" section on the Naruto page
import requests
from bs4 import BeautifulSoup

def main():
    url = "http://www.4399dmw.com/huoying/donghua/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
        "Referer": "https://www.baidu.com"
    }
    resp = requests.get(url=url, headers=headers)
    html_doc = resp.content.decode('utf-8')
    soup = BeautifulSoup(html_doc, "html.parser")
    # The fourth works__info block on this page is the "Guess You Like" section
    links = soup.find_all('div', class_='works__info')[3].find_all('a')
    for item in links:
        print(item.get_text())

if __name__ == '__main__':
    main()
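Both examples fire requests back to back; when looping over many pages it is gentler on the server (and less likely to get blocked) to pause between requests, e.g. with time.sleep. A sketch applied to the paging example:

import time

def main():
    for i in range(1, 15):
        print("Scraping page " + str(i))
        scrapy(i)      # scrapy(page) as defined in the paging example above
        time.sleep(1)  # pause one second between page requests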