【Python】 Web scraping

---------------Example 1--------------

import requests
import os
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
url = 'http://jandan.net/ooxx/MjAyMTAyMDQtODY=#comments'

def get_html(url):
    response = requests.get(url, headers=headers)
    return response

def find_imgs(url):
    html = get_html(url).text
    bsObj = BeautifulSoup(html, "html.parser")
    links = bsObj.find_all('a', class_='view_img_link')  # image links on the page
    pic_list = []
    for link in links:
        img = link['href'][2:]  # drop the leading '//' of the protocol-relative URL
        pic_list.append(img)
    return pic_list

def save_imgs(folder, img_addrs):
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            img = get_html('http://' + each)
            f.write(img.content)

def download_mm(folder='OOXX', pages=10):
    # pages is currently unused: only the single hard-coded url is fetched
    os.mkdir(folder)
    os.chdir(folder)

    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)

if __name__ == '__main__':
    download_mm()

【Notes】
Make sure there is no folder named "OOXX" in the directory containing this file, since os.mkdir raises an error if the folder already exists.
The global url is a page with images found manually on jandan.net/ooxx; if it goes stale, look up a fresh page URL on the site.
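
Incidentally, the page key in the url ('MjAyMTAyMDQtODY=') is just Base64 for '20210204-86', i.e. a date plus a page number, so page URLs could be generated instead of copied by hand. A minimal sketch; the URL pattern is an assumption inferred from this one sample url, not a documented API:

import base64

def page_url(date_str, page_no):
    # Assumes jandan.net/ooxx page keys are Base64 of 'YYYYMMDD-<page>',
    # as suggested by the sample url above; unofficial and may change.
    key = base64.b64encode(f'{date_str}-{page_no}'.encode()).decode()
    return f'http://jandan.net/ooxx/{key}#comments'

print(page_url('20210204', 86))  # reproduces the hard-coded url above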

Syntax reference:

【https://www.jb51.net/article/152899.htm】

Today I used BeautifulSoup to parse web page data fetched by a crawler.

First import the package: from bs4 import BeautifulSoup

Then you can request the data with urllib; remember the import:

import urllib.request

Then call urlopen and read the data:

f = urllib.request.urlopen('http://jingyan.baidu.com/article/455a9950bc94b8a166277898.html')
response = f.read()
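
Note that f.read() returns bytes, not str; BeautifulSoup accepts the raw bytes directly and detects the encoding itself. A small sketch of this, reusing the URL above:

import urllib.request
from bs4 import BeautifulSoup

f = urllib.request.urlopen('http://jingyan.baidu.com/article/455a9950bc94b8a166277898.html')
raw = f.read()                             # bytes
soup = BeautifulSoup(raw, 'html.parser')   # encoding detected by BeautifulSoup
print(soup.title)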

Here we skip the network request and use a local HTML string instead, as follows.

Note: '''xxx''' is a triple-quoted string; it spans multiple lines and is often used as a multi-line comment.

#python3
from bs4 import BeautifulSoup

html = '''<html>
<head>
 <title class='ceshi'>super 哈哈 star</title>
</head>
<body>
 天下第一帅
 <p class='sister'>

  是不是
 </p>
</body>
</html>'''

# Parse the data with BeautifulSoup. In python3 the second argument 'html.parser'
# must be passed; we get back an object whose attributes we read next.
soup = BeautifulSoup(html, 'html.parser')
# Read the title tag
print(soup.title)
# Read the title tag's attributes
attrs = soup.title.attrs
print(attrs)
# attrs['class'] ----> ['ceshi'] is a list; index into it to get the value
print(attrs['class'][0])
# Read the body
print(soup.body)

Data can also be read via BeautifulSoup's select() method:

# Find by tag name
soup.select('title')
soup.select('body')
# Find by class name
soup.select('.sister')
# Find by id: tags with id 'link' nested inside a p tag
soup.select('p #link')
# Get the text inside a tag
soup.p.string
# Get an attribute value by subscripting a tag, e.g. via href
soup.a['href']
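
The sample document above has no <a> tag or id attributes, so the last few lines are only illustrative. Here is a tiny self-contained demo of those selectors; the HTML snippet is made up for illustration:

from bs4 import BeautifulSoup

demo = '<p class="sister">hi <a id="link" href="http://example.com">ex</a></p>'
soup = BeautifulSoup(demo, 'html.parser')

print(soup.select('.sister'))   # by class name
print(soup.select('p #link'))   # id 'link' nested inside a p tag
print(soup.a['href'])           # attribute lookup -> http://example.com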

【https://blog.csdn.net/qq_39360985/article/details/94437306】

Notes from a simple scraping question on a Python exam:
given a URL, fetch the HTML content and extract the text of the p tag with class="title".

import requests
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>"""

bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.title)
print(bsObj.title.string)
print(bsObj.head)
print(bsObj.a)

print(bsObj.head.contents)
print(bsObj.head.contents[0])

print(bsObj.select('.title')[0].text)  # get the text of the element with class="title"
print(bsObj.select('.story')[0].text)

Output:

<title>The Dormouse's story</title>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
The Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
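
The snippet imports requests but never uses it; the exam statement actually asks to fetch from a URL first. A minimal sketch of that variant; the URL is hypothetical:

import requests
from bs4 import BeautifulSoup

def title_text(url):
    # Fetch the page, then extract the text of the <p class="title"> element
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    node = soup.select_one('p.title')   # first p tag with class="title", or None
    return node.text if node else None

# print(title_text('http://example.com/some-page'))  # hypothetical URL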

I also came across this style on the Runoob (菜鸟教程) tutorial, which was new to me:

import re
from bs4 import BeautifulSoup

# Create a BeautifulSoup parse object ('html' is the Dormouse sample string above;
# the original also passed from_encoding="utf-8", which only applies to bytes input)
soup = BeautifulSoup(html, "html.parser")
# Get all the links
links = soup.find_all('a')
print("All links")
for link in links:
    print(link.name, link['href'], link.get_text())

print("Get a specific URL")
link_node = soup.find('a', href="http://example.com/elsie")
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Regular expression match")
link_node = soup.find('a', href=re.compile(r"ti"))
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Get the text of the p paragraph")
p_node = soup.find('p', class_='story')
print(p_node.name, p_node['class'], p_node.get_text())
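
For comparison: find()/find_all() filter on tag names and attributes, while select() takes CSS selectors, so many queries can be written either way. A quick equivalence check against the same sample:

# Both return the <a class="sister"> tags from the Dormouse html, in document order
assert soup.find_all('a', class_='sister') == soup.select('a.sister')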