【Python】 Web Scraping
---------------Example 1--------------
import requests
import os
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
url = 'http://jandan.net/ooxx/MjAyMTAyMDQtODY=#comments'

def get_html(url):
    # Fetch a page with a browser-like User-Agent so the site doesn't reject us
    response = requests.get(url, headers=headers)
    return response

def find_imgs(url):
    # Collect the full-size image addresses from the comment page
    html = get_html(url).text
    bsObj = BeautifulSoup(html, "html.parser")
    links = bsObj.find_all('a', class_='view_img_link')
    pic_list = []
    for link in links:
        img = link['href'][2:]  # drop the leading '//' of the protocol-relative URL
        pic_list.append(img)
    return pic_list

def save_imgs(folder, img_addrs):
    # Save each image under its original file name
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            img = get_html('http://' + each)
            f.write(img.content)

def download_mm(folder='OOXX'):
    os.mkdir(folder)   # raises FileExistsError if the folder is already there
    os.chdir(folder)
    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)

if __name__ == '__main__':
    download_mm()
【Notes】
Make sure there is no folder named "OOXX" in the directory containing this file (os.mkdir raises an error otherwise).
The global variable url is a page on jandan.net/ooxx with images, found by hand; if it goes stale, search the site again for a fresh one.
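If you would rather not delete the folder by hand before every run, one small variation (my own tweak, not in the original script) is os.makedirs with exist_ok=True:

def download_mm(folder='OOXX'):
    os.makedirs(folder, exist_ok=True)  # no error if the folder already exists
    os.chdir(folder)
    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)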
Syntax references:
【https://www.jb51.net/article/152899.htm】
Today we use BeautifulSoup to parse the scraped page data.
First import the package: from bs4 import BeautifulSoup
Then you can request the data with urllib.
Remember to import it:
import urllib.request
Then call urlopen and read the data:
f = urllib.request.urlopen('http://jingyan.baidu.com/article/455a9950bc94b8a166277898.html')
response = f.read()
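Note that f.read() returns bytes; to hand the result to BeautifulSoup as text you would typically decode it first. A minimal sketch, assuming the page is UTF-8 encoded:

import urllib.request
from bs4 import BeautifulSoup

f = urllib.request.urlopen('http://jingyan.baidu.com/article/455a9950bc94b8a166277898.html')
html_text = f.read().decode('utf-8')            # bytes -> str
soup = BeautifulSoup(html_text, 'html.parser')  # ready to parse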
Here we won't actually request anything; instead we use a local HTML string, as follows.
Note: triple quotes ('''xxx''') delimit a multi-line string.
# python3
from bs4 import BeautifulSoup

html = '''<html>
<head>
<title class='ceshi'>super 哈哈 star</title>
</head>
<body>
天下第一帅
<p class='sister'>
是不是
</p>
</body>
</html>'''

# Parse the data with BeautifulSoup. In Python 3 you must pass the second
# argument, 'html.parser'. This returns an object whose attributes we can read.
soup = BeautifulSoup(html, 'html.parser')

# Read the title tag
print(soup.title)

# Read the title tag's attributes
attrs = soup.title.attrs
print(attrs)

# attrs['class'] ----> ['ceshi'] is a list; index it to get the value
print(attrs['class'][0])

# Read the body
print(soup.body)

Data can also be read with BeautifulSoup's select method, soup.select():

# find by tag name
soup.select('title')
soup.select('body')
# find by class name
soup.select('.sister')
# find by id (here: a tag with id 'link' inside a p tag)
soup.select('p #link')
# get the text inside a tag
soup.p.string
# get a tag's attribute value, e.g. via href: tag['href']
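As a quick sanity check (my own addition), running those selectors against the sample document above prints roughly the following:

print(soup.select('title'))    # [<title class="ceshi">super 哈哈 star</title>]
print(soup.select('.sister'))  # [<p class="sister"> 是不是 </p>]
print(soup.title.string)       # super 哈哈 star
print(soup.title['class'])     # ['ceshi']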
【https://blog.csdn.net/qq_39360985/article/details/94437306】
A simple scraping question from a Python exam, for the record:
Given a URL, fetch the HTML and extract the text of the p tag whose class="title".
import requests   # would be used to fetch the page from the URL; a local string stands in here
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!--Elsie--></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>"""
bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.title)
print(bsObj.title.string)
print(bsObj.head)
print(bsObj.a)
print(bsObj.head.contents)
print(bsObj.head.contents[0])
print(bsObj.select('.title')[0].text)   # text of the tag with class="title"
print(bsObj.select('.story')[0].text)
Output:
<title>The Dormouse's story</title>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
<a class="sister" href="http://example.com/elsie" id="link1"><!--Elsie--></a>
[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
The Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
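Since the question actually gives a URL rather than a local string, the full answer presumably fetches the page first. A minimal sketch, assuming the target page contains a p tag with class="title" (the URL below is a placeholder):

import requests
from bs4 import BeautifulSoup

def get_title_text(url):
    # Fetch the page and fail loudly on HTTP errors
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Text of the first p tag whose class is "title"
    return soup.select('p.title')[0].text

# print(get_title_text('http://example.com/page.html'))  # placeholder URL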
I also came across this style in the Runoob (菜鸟教程) tutorial and learned from it:
import re
from bs4 import BeautifulSoup

# Create a BeautifulSoup parse object; html is the sample string from the exam
# example above (from_encoding="utf-8" only matters when the input is bytes)
soup = BeautifulSoup(html, "html.parser")

# Get all the links
links = soup.find_all('a')
print("All links:")
for link in links:
    print(link.name, link['href'], link.get_text())

print("Find a specific URL:")
link_node = soup.find('a', href="http://example.com/elsie")
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Regular-expression match:")
link_node = soup.find('a', href=re.compile(r"ti"))
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Text of the p paragraph:")
p_node = soup.find('p', class_='story')
print(p_node.name, p_node['class'], p_node.get_text())
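For comparison (my own note, not from the tutorial): the same queries can also be written with CSS selectors via select, which is often more compact; select returns a list, hence the [0]:

soup.find_all('a')                               # == soup.select('a')
soup.find('a', href="http://example.com/elsie")  # == soup.select('a[href="http://example.com/elsie"]')[0]
soup.find('p', class_='story')                   # == soup.select('p.story')[0]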