【Python】 Web Scraping
---------------Example 1--------------
import requests
import os
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
url = 'http://jandan.net/ooxx/MjAyMTAyMDQtODY=#comments'

def get_html(url):
    # Fetch a page with a browser-like User-Agent so the site doesn't reject us
    response = requests.get(url, headers=headers)
    return response

def find_imgs(url):
    # Collect the full-size image addresses from the comment page
    html = get_html(url).text
    bsObj = BeautifulSoup(html, "html.parser")
    links = bsObj.find_all('a', class_='view_img_link')
    pic_list = []
    for link in links:
        img = link['href'][2:]  # drop the leading '//' of the protocol-relative URL
        pic_list.append(img)
    return pic_list

def save_imgs(folder, img_addrs):
    # Save each image under its original file name
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            img = get_html('http://' + each)
            f.write(img.content)

def download_mm(folder='OOXX'):
    os.mkdir(folder)   # raises FileExistsError if the folder is already there
    os.chdir(folder)
    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)

if __name__ == '__main__':
    download_mm()
【Notes】
Make sure there is no folder named "OOXX" in the directory containing this file (os.mkdir raises an error otherwise).
The global variable url is a page on jandan.net/ooxx with images, found by hand; if it goes stale, search the site again for a fresh one.
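If you would rather not delete the folder by hand before every run, one small variation (my own tweak, not in the original script) is os.makedirs with exist_ok=True:

def download_mm(folder='OOXX'):
    os.makedirs(folder, exist_ok=True)  # no error if the folder already exists
    os.chdir(folder)
    img_addrs = find_imgs(url)
    save_imgs(folder, img_addrs)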
Syntax references:
【https://www.jb51.net/article/152899.htm】
Today we use BeautifulSoup to parse the scraped page data.
First import the package: from bs4 import BeautifulSoup
Then you can request the data with urllib.
Remember to import it:
import urllib.request
Then call urlopen and read the data:
f = urllib.request.urlopen('http://jingyan.baidu.com/article/455a9950bc94b8a166277898.html')
response = f.read()
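Note that f.read() returns bytes; to hand the result to BeautifulSoup as text you would typically decode it first. A minimal sketch, assuming the page is UTF-8 encoded:

import urllib.request
from bs4 import BeautifulSoup

f = urllib.request.urlopen('http://jingyan.baidu.com/article/455a9950bc94b8a166277898.html')
html_text = f.read().decode('utf-8')            # bytes -> str
soup = BeautifulSoup(html_text, 'html.parser')  # ready to parse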
Here we won't actually request anything; instead we use a local HTML string, as follows.
Note: triple quotes ('''xxx''') delimit a multi-line string.
# python3
from bs4 import BeautifulSoup

html = '''<html>
<head>
<title class='ceshi'>super 哈哈 star</title>
</head>
<body>
天下第一帅
<p class='sister'>
是不是
</p>
</body>
</html>'''

# Parse the data with BeautifulSoup. In Python 3 you must pass the second
# argument, 'html.parser'. This returns an object whose attributes we can read.
soup = BeautifulSoup(html, 'html.parser')

# Read the title tag
print(soup.title)

# Read the title tag's attributes
attrs = soup.title.attrs
print(attrs)

# attrs['class'] ----> ['ceshi'] is a list; index it to get the value
print(attrs['class'][0])

# Read the body
print(soup.body)

Data can also be read with BeautifulSoup's select method, soup.select():

# find by tag name
soup.select('title')
soup.select('body')
# find by class name
soup.select('.sister')
# find by id (here: a tag with id 'link' inside a p tag)
soup.select('p #link')
# get the text inside a tag
soup.p.string
# get a tag's attribute value, e.g. via href: tag['href']
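As a quick sanity check (my own addition), running those selectors against the sample document above prints roughly the following:

print(soup.select('title'))    # [<title class="ceshi">super 哈哈 star</title>]
print(soup.select('.sister'))  # [<p class="sister"> 是不是 </p>]
print(soup.title.string)       # super 哈哈 star
print(soup.title['class'])     # ['ceshi']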
【https://blog.csdn.net/qq_39360985/article/details/94437306】
A simple scraping question from a Python exam, for the record:
Given a URL, fetch the HTML and extract the text of the p tag whose class="title".
import requests   # would be used to fetch the page from the URL; a local string stands in here
from bs4 import BeautifulSoup

html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!--Elsie--></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>"""
bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.title)
print(bsObj.title.string)
print(bsObj.head)
print(bsObj.a)
print(bsObj.head.contents)
print(bsObj.head.contents[0])
print(bsObj.select('.title')[0].text)   # text of the tag with class="title"
print(bsObj.select('.story')[0].text)
Output:
<title>The Dormouse's story</title>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
<a class="sister" href="http://example.com/elsie" id="link1"><!--Elsie--></a>
[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
The Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
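Since the question actually gives a URL rather than a local string, the full answer presumably fetches the page first. A minimal sketch, assuming the target page contains a p tag with class="title" (the URL below is a placeholder):

import requests
from bs4 import BeautifulSoup

def get_title_text(url):
    # Fetch the page and fail loudly on HTTP errors
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Text of the first p tag whose class is "title"
    return soup.select('p.title')[0].text

# print(get_title_text('http://example.com/page.html'))  # placeholder URL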
I also came across this style in the Runoob (菜鸟教程) tutorial and learned from it:
import re
from bs4 import BeautifulSoup

# Create a BeautifulSoup parse object; html is the sample string from the exam
# example above (from_encoding="utf-8" only matters when the input is bytes)
soup = BeautifulSoup(html, "html.parser")

# Get all the links
links = soup.find_all('a')
print("All links:")
for link in links:
    print(link.name, link['href'], link.get_text())

print("Find a specific URL:")
link_node = soup.find('a', href="http://example.com/elsie")
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Regular-expression match:")
link_node = soup.find('a', href=re.compile(r"ti"))
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Text of the p paragraph:")
p_node = soup.find('p', class_='story')
print(p_node.name, p_node['class'], p_node.get_text())
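For comparison (my own note, not from the tutorial): the same queries can also be written with CSS selectors via select, which is often more compact; select returns a list, hence the [0]:

soup.find_all('a')                               # == soup.select('a')
soup.find('a', href="http://example.com/elsie")  # == soup.select('a[href="http://example.com/elsie"]')[0]
soup.find('p', class_='story')                   # == soup.select('p.story')[0]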