爬虫基本操作、requests和BeautifulSoup

1. 爬虫基本操作

例如舆情系统:
  获取汽车之家新闻放到自己数据库里,创建自己的app,发布内容,注明来源,自己创业。

获取指定URL的内容
    - 发送Http请求:http://www.autohome.com.cn/news/
    - 基于正则表达式获取内容 

Python实现:

import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.text

obj = BeautifulSoup(response.text,...)
标签对象 = obj.find('a') # 找到匹配成功的第一个标签
标签对象.find(...)

[标签对象,标签对象,]= obj.find_all('a') # 找到匹配成功的所有标签

 

示例一:爬取汽车之家新闻

requests
	
	obj = requests.get("url")
	obj.content
	obj.encoding = "gbk"
	obj.text
	
	
	soup = BeautifulSoup(obj.text,'html.parser')
	标签对象 = soup.find(name='xx')
	[标签对象,标签对象,] = soup.find_all(...)
	
	
	标签对象.text
	标签对象.attrs
	标签对象.get(...)
	
import requests
from bs4 import BeautifulSoup

# Fetch the news index page. The socket delivers raw bytes:
#   response.content -> bytes, exactly as received
#   response.text    -> str, decoded using response.encoding
#                       (a wrong encoding shows up as garbled text)
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'  # make response.text decode the body as GBK

# 'html.parser' is the parser bundled with Python; BeautifulSoup uses it to
# turn the HTML text into a tree of tag objects.
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')

# find() returns None when nothing matches, so guard before searching inside
# the container instead of crashing with AttributeError.
# To filter by CSS class as well, write class_='c1' or attrs={'class': 'c1'};
# a plain class='c1' is a syntax error because `class` is a Python keyword.
h3 = tag.find(name='h3') if tag is not None else None
print(h3)
练习一:获取一个新闻
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')

# Collect every <li> of the news list; find_all('li') is shorthand for
# find_all(name='li'). A missing container yields an empty list instead of
# an AttributeError on None.
container = soup.find(id='auto-channel-lazyload-article')
li_list = container.find_all('li') if container is not None else []

for li in li_list:
    # Some <li> entries carry no <h3>; find() returns None for those.
    title = li.find('h3')  # a bs4.element.Tag such as <h3>...</h3>
    if title is None:
        continue

    # The other pieces are looked up the same way, so they can be missing too.
    summary_tag = li.find('p')
    link_tag = li.find('a')
    img_tag = li.find('img')

    summary = summary_tag.text if summary_tag is not None else ''
    # tag.attrs is a dict of all attributes; tag.get('href') reads one of them
    # and returns None when the attribute is absent.
    url = link_tag.get('href') if link_tag is not None else None
    img = img_tag.get('src') if img_tag is not None else None

    # Optional: download the picture.
    # NOTE: the headline is not a safe file name as-is; sanitise it first.
    # res = requests.get(img)
    # file_name = '%s.jpg' % (title.text,)
    # with open(file_name, 'wb') as f:
    #     f.write(res.content)

    print(title.text, summary, url, img)  # headline, summary, link, picture
    print('=============')
练习二:找到所有新闻,其中包括标题,简介,url,图片

 

示例二:python代码登录github

1. 登录页面发送请求GET,获取csrftoken
2. 发送POST请求:
  携带用户名、密码、csrftoken发送POST请求
  产生cookie,拿到后下次就不需要登录了
requests
	
	obj = requests.get("url")
	obj.content
	obj.encoding = "gbk"
	obj.text
	obj.cookies.get_dict()
	
	
	requests.get("url",cookies={'k1':"v1"})
	
	
	soup = BeautifulSoup(obj.text,'html.parser')
	标签 = soup.find(name='xx')
	[标签,] = soup.find_all(...)
	
	
	标签.text
	标签.attrs
	标签.get(...)
import os

import requests
from bs4 import BeautifulSoup

# Credentials are read from the environment so that no real account data is
# stored in the source file. Reading them first makes a missing variable fail
# before any request is sent.
login = os.environ['GITHUB_LOGIN']
password = os.environ['GITHUB_PASSWORD']

# Step 1: GET the login page to obtain the form token and the initial cookies.
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text, 'html.parser')
# The login form carries its token in a hidden <input name="authenticity_token">
# (the field is not called csrf_token).
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
print(token)
r1_token_dict = r1.cookies.get_dict()

# Step 2: POST user name, password and token to the server.
# The field names and the target URL were read from the browser's Network
# panel while submitting the form by hand:
#   utf8: ?
#   authenticity_token: <token>
#   login: <user name>
#   password: <password>
#   commit: Sign in
# NOTE(review): 'utf8' is kept as '?' exactly as in the original notes; it may
# be a mis-encoded character from the captured request - confirm.
r2 = requests.post(
    'https://github.com/session',  # HTTPS: the request body contains the password
    data={
        'utf8': '?',
        'authenticity_token': token,
        'login': login,
        'password': password,
        'commit': 'Sign in',
    },
    cookies=r1_token_dict,
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()
print(r1_token_dict)  # cookies from the GET; some sites set none at this stage
# ---> {'logged_in': 'no', '_gh_sess': '...'}
print(r2_cookie_dict)  # cookies from the POST
# ---> {'_gh_sess': '...'}

# Merge both cookie sets; values from the POST override those from the GET.
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# Step 3: request a page that is only available to a logged-in user,
# presenting the merged cookies.
r3 = requests.get(
    url='https://github.com/settings/emails',
    cookies=cookie_dict,
)
print(r3.text)
代码实现

 

示例三:对抽屉新闻点赞

# 1.登录,拿到cookie
# 2.找到标签url,看抽屉页面发送的点赞请求,首先看往哪个url发送请求。
# 发送的是post请求,发送的url地址:http://dig.chouti.com/login。返回的不是让浏览器直接跳转页面,返回的是字典

import os

import requests
from bs4 import BeautifulSoup

# Credentials are read from the environment so that no real account data is
# stored in the source file.
# NOTE(review): the original sample used a phone number prefixed with 86 -
# confirm the format the site expects.
phone = os.environ['CHOUTI_PHONE']
password = os.environ['CHOUTI_PASSWORD']

# 1. Visit the front page first to obtain the initial cookies.
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. Log in, sending phone number, password and the cookies from step 1.
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone': phone,
        'password': password,
        'oneMonth': 1,  # stay logged in for one month
    },
    cookies=r0_cookie_dict,
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
# ---> a JSON document; with a malformed phone number it looks like
#      {"result":{"code":"8887", "message":"...", "data":""}}
print(r1_cookie_dict)
# ---> {'gpsd': '...', 'route': '...'}

# Merge both cookie sets; values from the login response override earlier ones.
cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)

# Sending only the 'gpsd' cookie from step 1 worked as well in the author's
# test, but is not recommended:
# cookie_dict = {'gpsd': r0_cookie_dict['gpsd']}

# 3. Up-vote an article. The vote is a POST request; linksId is the article id.
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006', cookies=cookie_dict)
print(r2.text)
View Code

 

 

 

 

2. requests模块

requests模块中提供的方法

# requests.get()
# requests.post()
# requests.put()
# requests.request('post')

# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)
#
# # 以上方法均是在此方法的基础上构建
# requests.request(method, url, **kwargs)
调用关系
# url='xxx',
# params={'k1':'v1','nid':888},     #GET传参
# cookies={},
# headers={},
# data = {},        # data提供数据
# json = {}         # json提供数据


# requests.get(
#     url='xxx',
#     params={'k1':'v1','nid':888},
#     cookies={},
#     headers={}
# )
# http://www.baidu.com?k1=v1&nid=888

# Template: a POST that carries a query string (params) and a JSON body.
requests.post(
    url='xxx',
    params={'k1': 'v1', 'nid': 888},
    cookies={},
    headers={},
    json={},
)

# NOTE: when sending a request to a backend, pay attention to the
# Content-Type request header, because it decides how the body is parsed.

# With a non-empty dict, data= is sent as application/x-www-form-urlencoded:
# requests.post(url='', data={})

# An explicit JSON content type combined with data=: Django cannot read the
# values through request.POST here; they must be parsed from request.body.
requests.post(
    url='',
    data={},
    headers={'content-type': 'application/json'},
)

# json= sets the header content-type: application/json by itself.
requests.post(
    url='',
    json={},
)
常用参数
# auth
def param_auth():
    """Demonstrate the ``auth`` parameter with HTTP Basic and HTTP Digest auth.

    Sends one request per scheme to the GitHub user endpoint and prints the
    body of each response. Takes no arguments and returns None.

    These two schemes are the simple, standard ones; sites that defend
    against crawlers rarely protect their login with just one of them.
    """
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    url = 'https://api.github.com/user'

    # HTTP Basic auth: according to the author, the scheme most home routers
    # use for their admin page.
    basic_ret = requests.get(url, auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
    print(basic_ret.text)

    # HTTP Digest auth. Kept in its own variable so the Basic-auth response
    # above is not overwritten before it is shown.
    digest_ret = requests.get(url, auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))
    print(digest_ret.text)

    # Further examples, left disabled:
    # ret = requests.get('http://192.168.1.1', auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)


# timeout: how long to wait for the server before giving up.


# allow_redirects: whether redirects are followed automatically.
# Assume http://www.abc.com redirects to http://www.baidu.com:
#   allow_redirects=False -> the printed body is the one served by http://www.abc.com
#   allow_redirects=True  -> the printed body is the one served by http://www.baidu.com
for follow in (False, True):
    response = requests.get('http://www.abc.com', allow_redirects=follow)
    print(response.text)


# proxies: send the request through a proxy so that the crawler's own IP does
# not get banned. Proxies can be bought, or you can run your own proxy server.

# stream: when True the response body is not downloaded immediately; it is
# fetched when it is accessed (useful for large downloads).

# verify: controls verification of the server's TLS certificate.
# cert: path to a client-side certificate file sent to the server.
# The author's examples: 12306 needs its certificate, while Zhihu works with
# or without one.
# stream and cert are independent of each other; they merely appear in the
# same call here.
requests.get('http://httpbin.org/get', stream=True, cert='xxxx.pem')
其他参数

 

 

3. BeautifulSoup

beautifulsoup:把html结构化成对象,通过对象的方式取html内部元素

#html_doc = 
#"""
# <html><head><title>The Dormouse's story</title></head>
# <body>
# asdf
#     <div class="title">
#         <b>The Dormouse's story总共</b>
#         <h1>f</h1>
#     </div>
# <div class="story">Once upon a time there were three little sisters; and their names were
#     <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
#     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
#     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</div>
# ad<br/>sf
# <p class="story">...</p>
# </body>
# </html>
# """
#from bs4 import BeautifulSoup
#soup = BeautifulSoup(html_doc, features="lxml")		# 与BeautifulSoup(html_doc,'html.parser')不同的是使用的解析器不同,lxml性能更好,不过要安装lxml模块,推荐使用

#tag = soup.find(class_='story')
# print(tag)
# print(tag.name)
# #---> div
# # tag.name = 'span' # 设置
name属性
# print(tag.attrs)
# #---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# #---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# #---> {'class': ['story']}
attrs属性
# print(tag.children)
# #---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# #---> ['Once upon a time there were three little sisters; and their names were\n    ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n    ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n    ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
#     print(type(item),item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
#
#     # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
#     # <class 'bs4.element.NavigableString'> ,
#     #
#     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
#     # <class 'bs4.element.NavigableString'>  and
#     #
#     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
#     # <class 'bs4.element.NavigableString'> ;
#     # and they lived at the bottom of a well.
children属性
# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# tag.clear()
# print(tag)
# ---> <div class="story"></div>
clear方法,清空标签内的所有内容,但保留标签本身
# tag.decompose()
# print(tag)
# #---> <None></None>
decompose方法,递归地删除当前标签及其内部的所有标签
# taga = tag.find(name='a')
# taga.extract()
# print(tag)
extract方法,从文档树中移除当前标签(连同其内部的所有标签),并返回被移除的标签
# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>
decode,将标签对象转换为字符串(含当前标签);decode_contents(不含当前标签)
# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>
decode,将标签对象转换为字符串(含当前标签);decode_contents(不含当前标签)
# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# #---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n    <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(),type(tag.encode_contents()))
encode,转换为字节(含当前标签);encode_contents(不含当前标签)
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')    # recursive递归找;text文本内容,很少用
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
find,获取匹配的第一个标签
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a',limit=1)     # limit=1只找一个
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
find_all,获取匹配的所有标签
# v = soup.find_all(name=['a','div'])       # name=['a','div'] 查找‘a’标签和'div'标签

# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])   # class_=['sister0', 'sister']查找class='sister0'或者class='sister'
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))


# v = soup.find_all(id=['link1','link2'])
# print(v)

# v = soup.find_all(href=['link1','link2'])
# print(v)
列表
#import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
正则
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')       # 返回结果为True,就把结果给v = soup.find_all()
# v = soup.find_all(name=func)      # name=func把标签遍历一遍,每找到标签执行一次函数。
# print(v)
方法筛选,不常用
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
get,获取标签属性
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
has_attr,检查标签是否具有该属性
# tag = soup.find('a')
# v = tag.get_text()
# print(v)
get_text,获取标签内部文本内容
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
# tag = soup.find('body')
# for i,v in enumerate(tag):
#     print(i,v)
index,检查标签在某标签中的索引位置
is_empty_element,是否是空标签(是否可以是空)或者自闭合标签
# soup.next             # 找下一个,不管是标签还是文本
# soup.next_element     # 找下一个标签
# soup.next_elements
# soup.next_sibling     # 找兄弟姐妹
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents
当前的关联标签
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)
# 参数同find_all
查找某标签的关联标签
# soup.select("title")
#
# soup.select("p:nth-of-type(3)")
#
# soup.select("body a")
#
# soup.select("html head title")
#
# tag = soup.select("span,a")
#
# soup.select("head > title")
#
# soup.select("p > a")
#
# soup.select("p > a:nth-of-type(2)")
#
# soup.select("p > #link1")
#
# soup.select("body > a")
#
# soup.select("#link1 ~ .sister")
#
# soup.select("#link1 + .sister")
#
# soup.select(".sister")
#
# soup.select("[class~=sister]")
#
# soup.select("#link1")
#
# soup.select("a#link2")
#
# soup.select('a[href]')
#
# soup.select('a[href="http://example.com/elsie"]')
#
# soup.select('a[href^="http://example.com/"]')
#
# soup.select('a[href$="tillie"]')
#
# soup.select('a[href*=".com/el"]')
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
# print(type(tags), tags)
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
# print(type(tags), tags)
select, select_one, CSS选择器,select查找多个,select_one查找一个,但是参数类型不一样
# tag = soup.find('span')
# print(tag.string)          # 获取
# tag.string = 'new content' # 设置
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'            # tag.text不能修改标签内容
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings  # 递归内部获取所有标签的文本
# print(v)
标签的内容
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
# 如果实在想追加当前标签已经存在的,方法如下
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
append,在当前标签内部的末尾追加一个标签;如果要追加的标签已经存在于文档中,则只是把它移动到当前标签内部的最后
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
insert在当前标签内部指定位置插入一个标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
insert_after, insert_before在当前标签后面或前面插入
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
replace_with,将当前标签替换为指定标签
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
setup,创建标签之间的关系;关系创建后并不会改变标签在文档中的位置,实际用处不大
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
wrap,用指定标签把当前标签包裹起来
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
unwrap,去掉当前标签,但保留其内部包裹的内容
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
unwrap,去掉当前标签,但保留其内部包裹的内容

 

posted @ 2016-09-20 15:52  许二哈哈哈  阅读(3813)  评论(0编辑  收藏  举报