爬虫基本操作、requests和BeautifulSoup

1. 爬虫基本操作

例如舆情系统：
　　获取汽车之家新闻放到自己数据库里，创建自己的app，发布内容，注明来源，自己创业。

URL指定内容获取到
    - 发送Http请求：http://www.autohome.com.cn/news/
    - 基于正则表达式获取内容

Python实现：

import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.text

obj = BeautifulSoup(response.text,...)
标签对象 = obj.find('a') # 找到匹配成功的第一个标签
标签对象.find(...)

[标签对象,标签对象,]= obj.find_all('a') # 找到匹配成功的所有标签

示例一：爬取汽车之家新闻

requests
	
	obj = requests.get("url")
	obj.content
	obj.encoding = "gbk"
	obj.text
	
	
	soup = BeautifulSoup(obj.text,'html.parser')
	标签对象 = soup.find(name='xx')
	[标签对象,标签对象,] = soup.find_all(...)
	
	
	标签对象.text
	标签对象.attrs
	标签对象.get(...)

import requests
from bs4 import BeautifulSoup

# Example 1: download the autohome news page and print the first <h3> inside
# the article list container.
response = requests.get('http://www.autohome.com.cn/news/')     # the socket layer transfers raw bytes
# # print(response.text)    # str; comes out garbled when the encoding is set incorrectly
# print(response.content)     # response.content is the raw bytes of the body
response.encoding = 'gbk'
# print(response.text)        # response.text is the body decoded to text

# Python ships with a built-in parser, html.parser; it turns the page's
# <html lang='en'...></html> markup into objects.
soup = BeautifulSoup(response.text,'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')
# h3 = tag.find(name='h3',class_='c1')     # name is the tag name. `class` cannot be used directly as a keyword (class='c1' raises an error); write class_='c1' or attrs={'class':'c1'}
# h3 = tag.find(name='h3',attrs={'class':'c1'})
h3 = tag.find(name='h3')
print(h3)

练习一：获取一个新闻

# Exercise 1: list every news item (title, summary, link, image) found in the
# article container of the autohome news page.
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text,'html.parser')
li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')    # find_all('li') is shorthand for find_all(name='li')
for li in li_list:
    # Some <li> entries carry no <h3>, in which case find() returns None; skip them.
    title = li.find('h3')
    if title is None:
        continue
    # print(title,type(title))    # e.g. <h3>...</h3> <class 'bs4.element.Tag'>

    # find() returns None when a child is absent, so guard each lookup the same
    # way the title is guarded instead of letting one odd <li> abort the loop.
    p_tag = li.find('p')
    a_tag = li.find('a')
    img_tag = li.find('img')
    summary = p_tag.text if p_tag is not None else ''
    # a_tag.attrs is a dict of all attributes of the tag; a_tag.attrs['href']
    # works as well, but get() returns None rather than raising when it is missing.
    url = a_tag.get('href') if a_tag is not None else None
    img = img_tag.get('src') if img_tag is not None else None

    # # Download the image (left disabled):
    # res= requests.get(img)
    # file_name = '%s.jpg'%(title,)       # using the title as the file name is not a valid naming scheme; needs fixing
    # with open(file_name,'wb') as f:
    #     f.write(res.content)

    print(title.text, summary,url,img)  # title: title.text, summary: summary
    print('=============')

练习二：找到所有新闻，其中包括标题，简介，url，图片

示例二：python代码登录github

1. 登录页面发送请求GET，获取csrftoken
2. 发送POST请求：
　　携带用户名、密码、csrftoken发送POST请求
　　产生cookie，拿到后下次就不需要登录了

requests
	
	obj = requests.get("url")
	obj.content
	obj.encoding = "gbk"
	obj.text
	obj.cookies.get_dict()
	
	
	requests.get("url",cookies={'k1':"v1"})
	
	
	soup = BeautifulSoup(obj.text,'html.parser')
	标签 = soup.find(name='xx')
	[标签,] = soup.find_all(...)
	
	
	标签.text
	标签.attrs
	标签.get(...)

import requests
from bs4 import BeautifulSoup

# 获取token
# Step 1: GET the login page to obtain the form token and the initial cookies.
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text,'html.parser')
# The credential carried by GitHub's login form is a hidden input named
# authenticity_token (not csrf_token).
token = s1.find(name='input',attrs={'name':'authenticity_token'}).get('value')
print(token)    # prints a long base64 string
r1_token_dict = r1.cookies.get_dict()

# Step 2: POST the user name, password and token to the server.
# The field names below were taken from the form data of a real login request
# as shown in the browser's Network tab:
"""
utf8:?
authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
login:asdf
password:asdf
commit:Sign in
"""

# NOTE(review): credentials are hard-coded below; they should be supplied from
# outside the source (environment, prompt) before this is reused anywhere.
r2 = requests.post(
    # The target URL was read from the browser's Network tab. It must be https:
    # over plain http the credentials travel in cleartext, and if the server
    # answers with a redirect the POST is re-issued as a GET without the form body.
    'https://github.com/session',
    data={
        'utf8':'?',    # NOTE(review): presumably a mis-encoded check mark from the original form - confirm
        'authenticity_token':token,
        # 'login':'用户名',
        'login':'317828332@qq.com',
        'password':'alex3714',
        # 'password':'密码',
        'commit':'Sign in'
    },
    cookies = r1_token_dict
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()
print(r1_token_dict)        # cookies from the GET; some sites set cookies on GET, some do not
#---> {'logged_in': 'no', '_gh_sess': '...'}
print(r2_cookie_dict)          # cookies set by the POST response
#---> {'_gh_sess': '...'}

# Merge both cookie sets; values from the POST override those from the GET.
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# Step 3: send a follow-up request carrying the merged cookies.
r3 = requests.get(
    # url='xxxxxx',           # any GitHub page that requires a logged-in session
    url='https://github.com/settings/emails',
    cookies=cookie_dict
)
print(r3.text)

代码实现

示例三：对抽屉新闻点赞

# 1.登录，拿到cookie
# 2.找到标签url，看抽屉页面发送的点赞请求，首先看往哪个url发送请求。
# 发送的是post请求，发送的url地址：http://dig.chouti.com/login。返回的不是让浏览器直接跳转页面，返回的是字典

import requests
from bs4 import BeautifulSoup
# 1. Visit the home page first to obtain the initial cookies.
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. Log in: send the phone number and password together with those cookies.
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone':'8615131255089',
        'password':'woshiniba',
        'oneMonth':1    # stay logged in for one month
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
#---> {"result":{"code":"8887", "message":"手机号格式不对", "data":""}}    # sample output when the phone number is rejected ("invalid phone number format")
print(r1.cookies.get_dict())
#---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}

# Merge the cookies of both responses; login cookies override the initial ones.
cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)

# cookie_dict={'gpsd':r0_cookie_dict['gpsd']}      # has the same effect as the merged cookie_dict above, but is not recommended

# 3. Vote (upvote) for an article.
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006',cookies=cookie_dict)    # voting is a POST request; linksId=13911006 is the article id
print(r2.text)

View Code

2. requests模块

requests模块中提供的方法

# requests.get()
# requests.post()
# requests.put()
# requests.request('post')

# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)
#
# # 以上方法均是在此方法的基础上构建
# requests.request(method, url, **kwargs)

调用关系

# url='xxx',
# params={'k1':'v1','nid':888},     #GET传参
# cookies={},
# headers={},
# data = {},        # data提供数据
# json = {}         # json提供数据


# requests.get(
#     url='xxx',
#     params={'k1':'v1','nid':888},
#     cookies={},
#     headers={}
# )
# http://www.baidu.com?k1=v1&nid=888

# POST accepts the same common arguments; params still go into the query string,
# while the body is supplied here through json.
requests.post(
    url='xxx',
    params={'k1':'v1','nid':888},
    cookies={},
    headers={},
    json={}
)

# Note: when sending a request to a backend, pay attention to the request headers.

# requests.post(url='',data={})   # sends the header content-type: application/x-www-form-urlencoded by default

requests.post(url='',data={},headers={'content-type':'application/json'})   # written this way, Django cannot read the values from request.POST; they have to be taken from request.body manually

requests.post(url='',json={})       # sends headers={'content-type':'application/json'} by default

常用参数

# auth
def param_auth():
    """Demonstrate the ``auth`` argument with HTTP Basic and HTTP Digest authentication.

    Sends one GET request to https://api.github.com/user per scheme and prints
    each response body, so the result of both schemes is visible (previously the
    basic-auth response was overwritten before it was ever used).

    Takes no arguments and returns None.
    """
    # According to the original author, HTTPBasicAuth is what most home routers use.
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    # These two are the simple, common schemes. Sites with anti-scraping
    # measures will not verify accounts with anything this simple.
    for auth_cls in (HTTPBasicAuth, HTTPDigestAuth):
        ret = requests.get('https://api.github.com/user', auth=auth_cls('wupeiqi', 'sdfasdfasdf'))
        print(ret.text)

    # Router example (basic auth):
    # ret = requests.get('http://192.168.1.1',
    #                    auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # Digest-auth test endpoint:
    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)


# timeout   超时时间限制


# allow_redirects 允许重定向
# 假设访问http://www.abc.com跳转到http://www.baidu.com
# Issue the same request twice: first without following redirects, then with.
#   allow_redirects=False -> the body printed is that of http://www.abc.com itself
#   allow_redirects=True  -> the body printed is that of the redirect target
#                            (http://www.baidu.com in the assumed scenario)
for follow_redirects in (False, True):
    response = requests.get('http://www.abc.com', allow_redirects=follow_redirects)
    print(response.text)


# proxies   代理，防止爬网页时，把ip封了，加代理。可以买代理，也可以自己搭代理服务器，自己生成

# stream    when True, the response body is not downloaded until it is accessed;
#           it is unrelated to certificates

# verify    controls verification of the server's TLS certificate (True/False or
#           a CA bundle path); the original note cites 12306 as a site where this
#           matters and says Zhihu works with or without it
# cert      path to a client-side certificate that is sent to the server
requests.get('http://httpbin.org/get',stream=True,cert='xxxx.pem')  # stream and cert are independent: stream=True does not require a certificate

其他参数

3. BeautifulSoup

beautifulsoup：把html结构化成对象，通过对象的方式取html内部元素

#html_doc = 
#"""
# <html><head><title>The Dormouse's story</title></head>
# <body>
# asdf
#     <div class="title">
#         <b>The Dormouse's story总共</b>
#         <h1>f</h1>
#     </div>
# <div class="story">Once upon a time there were three little sisters; and their names were
#     <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
#     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
#     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</div>
# ad<br/>sf
# <p class="story">...</p>
# </body>
# </html>
# """
#from bs4 import BeautifulSoup
#soup = BeautifulSoup(html_doc, features="lxml")		# 与BeautifulSoup(html_doc,'html.parser')不同的是使用的解析器不同，lxml性能更好，不过要安装lxml模块，推荐使用

#tag = soup.find(class_='story')
# print(tag)

# print(tag.name)
# #---> div
# # tag.name = 'span' # 设置

name属性

# print(tag.attrs)
# #---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# #---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# #---> {'class': ['story']}

attrs属性

# print(tag.children)
# #---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# #---> ['Once upon a time there were three little sisters; and their names were\n    ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n    ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n    ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
#     print(type(item),item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
#
#     # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
#     # <class 'bs4.element.NavigableString'> ,
#     #
#     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
#     # <class 'bs4.element.NavigableString'>  and
#     #
#     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
#     # <class 'bs4.element.NavigableString'> ;
#     # and they lived at the bottom of a well.

children属性

# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# tag.clear()
# print(tag)
# ---> <div class="story"></div>

clear属性，清空，但保留标签名

# tag.decompose()
# print(tag)
# #---> <None></None>

decompose,递归的删除所有的标签

# taga = tag.find(name='a')
# taga.extract()
# print(tag)

extract属性,递归的删除所有的标签，并获取删除的标签

# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>

decode 将标签对象转为字符串类型.但decode_contents（不含当前标签）

# print(tag.decode())
# #---> <div class="story">Once upon a time there were three little sisters; and their names were
#     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
#     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#     # and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(),type(tag.decode_contents()))
# #---> Once upon a time there were three little sisters; and their names were
# #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# # and they lived at the bottom of a well. <class 'str'>

decode 将标签对象转为字符串类型.但decode_contents（不含当前标签）

# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# #---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n    <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(),type(tag.encode_contents()))

encode,转换为字节（含当前标签）；encode_contents（不含当前标签）

# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')    # recursive递归找；text文本内容，很少用
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

find,获取匹配的第一个标签

# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a',limit=1)     # limit=1只找一个
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

find_all,获取匹配的所有标签

# v = soup.find_all(name=['a','div'])       # name=['a','div'] 查找‘a’标签和'div'标签

# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])   # class_=['sister0', 'sister']查找class='sister0'或者class='sister'
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))


# v = soup.find_all(id=['link1','link2'])
# print(v)

# v = soup.find_all(href=['link1','link2'])
# print(v)

列表

#import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

正则

# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')       # 返回结果为True,就把结果给v = soup.find_all()
# v = soup.find_all(name=func)      # name=func把标签遍历一遍，每找到标签执行一次函数。
# print(v)

方法筛选，不常用

# tag = soup.find('a')
# v = tag.get('id')
# print(v)

get,获取标签属性

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)

has_attr,检查标签是否具有该属性

# tag = soup.find('a')
# v = tag.get_text()
# print(v)

get_text,获取标签内部文本内容

# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
# tag = soup.find('body')
# for i,v in enumerate(tag):
#     print(i,v)

index,检查标签在某标签中的索引位置

is_empty_element,是否是空标签(是否可以是空)或者自闭合标签

# soup.next             # 找下一个，不管是标签还是文本
# soup.next_element     # 找下一个被解析到的节点，不管是标签还是文本
# soup.next_elements
# soup.next_sibling     # 找兄弟姐妹
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents

当前的关联标签

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)
# 参数同find_all

查找某标签的关联标签

# soup.select("title")
#
# soup.select("p:nth-of-type(3)")
#
# soup.select("body a")
#
# soup.select("html head title")
#
# tag = soup.select("span,a")
#
# soup.select("head > title")
#
# soup.select("p > a")
#
# soup.select("p > a:nth-of-type(2)")
#
# soup.select("p > #link1")
#
# soup.select("body > a")
#
# soup.select("#link1 ~ .sister")
#
# soup.select("#link1 + .sister")
#
# soup.select(".sister")
#
# soup.select("[class~=sister]")
#
# soup.select("#link1")
#
# soup.select("a#link2")
#
# soup.select('a[href]')
#
# soup.select('a[href="http://example.com/elsie"]')
#
# soup.select('a[href^="http://example.com/"]')
#
# soup.select('a[href$="tillie"]')
#
# soup.select('a[href*=".com/el"]')
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
# print(type(tags), tags)
#
# from bs4.element import Tag
#
#
# def default_candidate_generator(tag):
#     for child in tag.descendants:
#         if not isinstance(child, Tag):
#             continue
#         if not child.has_attr('href'):
#             continue
#         yield child
#
#
# tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
# print(type(tags), tags)

select, select_one, CSS选择器，select查找多个，select_one查找一个，但是参数类型不一样

# tag = soup.find('span')
# print(tag.string)          # 获取
# tag.string = 'new content' # 设置
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'            # tag.text不能修改标签内容
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings  # 递归内部获取所有标签的文本
# print(v)

标签的内容

# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
# 如果实在想追加当前标签已经存在的，方法如下
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)

append在当前标签内部追加一个标签，当当前内部标签有追加的这个标签时，只是把当前标签内部位置被追加的标签移动到最后

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)

insert在当前标签内部指定位置插入一个标签

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)

insert_after, insert_before在当前标签后面或前面插入

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)

replace_with 将当前标签替换为指定标签

# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)

创建标签之间的关系，关系创建完后没什么用，不会改变标签间的位置

# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)

wrap，用指定标签把当前标签包裹起来

# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)

unwrap，去掉当前标签，将保留其包裹的标签

# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)

unwrap，去掉当前标签，将保留其包裹的标签

posted @ 2016-09-20 15:52 许二哈哈哈阅读(3814) 评论(0) 编辑收藏举报

刷新页面返回顶部

许二

爬虫基本操作、requests和BeautifulSoup

1. 爬虫基本操作

2. requests模块

3. BeautifulSoup

公告