Web Scraping Basics, requests and BeautifulSoup
1. Web scraping basics
Example: a public opinion monitoring system.
Fetch Autohome news into your own database, build your own app, publish the content with the source attributed, and build a business on it.
Fetch the content at a given URL: send an HTTP request to http://www.autohome.com.cn/news/, then extract the parts of interest (with regular expressions or an HTML parser).
Python implementation:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
response.text
obj = BeautifulSoup(response.text, ...)
tag = obj.find('a')                   # the first tag that matches
tag.find(...)
[tag, tag, ...] = obj.find_all('a')   # all the tags that match
Example 1: scraping Autohome news
requests
    obj = requests.get("url")
    obj.content
    obj.encoding = "gbk"
    obj.text

BeautifulSoup
    soup = BeautifulSoup(obj.text, 'html.parser')
    tag = soup.find(name='xx')
    [tag, tag, ...] = soup.find_all(...)
    tag.text
    tag.attrs
    tag.get(...)
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')   # what travels over the socket is bytes
# print(response.text)      # str; shows mojibake if the encoding is not set correctly
# print(response.content)   # response.content is the raw bytes
response.encoding = 'gbk'
# print(response.text)      # response.text is the decoded text

# Python ships with the built-in parser 'html.parser'; it turns the
# <html lang='en'...></html> page into a traversable object
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(id='auto-channel-lazyload-article')
# h3 = tag.find(name='h3', class_='c1')   # name is the tag name. class is a Python keyword, so
#                                         # class='c1' raises an error; write class_='c1'
#                                         # or attrs={'class': 'c1'} instead
# h3 = tag.find(name='h3', attrs={'class': 'c1'})
h3 = tag.find(name='h3')
print(h3)
response = requests.get('http://www.autohome.com.cn/news/')
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')
li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')   # find_all('li') defaults to find_all(name='li')
for li in li_list:
    # print(li.find('h3'))       # li.find('h3') is sometimes None
    title = li.find('h3')
    if not title:
        continue
    # print(title, type(title))  # <h3>将于第四季度上市 云度π1正式下线</h3> <class 'bs4.element.Tag'>
    summary = li.find('p').text
    # url = li.find('a').attrs['href']   # li.find('a').attrs is a dict of all the tag's attributes;
    #                                    # get() works just as well
    url = li.find('a').get('href')
    img = li.find('img').get('src')
    # # download the image
    # res = requests.get(img)
    # file_name = '%s.jpg' % (title,)    # using the title as the file name is not ideal; change it
    # with open(file_name, 'wb') as f:
    #     f.write(res.content)
    print(title.text, summary, url, img)   # title: title.text, summary: summary
    print('=============')
Example 2: logging in to GitHub with Python
1. Send a GET request to the login page and extract the CSRF token.
2. Send a POST request carrying the username, password and token. The response sets a cookie; once you have it, you do not need to log in again for later requests.
requests
    obj = requests.get("url")
    obj.content
    obj.encoding = "gbk"
    obj.text
    obj.cookies.get_dict()
    requests.get("url", cookies={'k1': "v1"})

BeautifulSoup
    soup = BeautifulSoup(obj.text, 'html.parser')
    tag = soup.find(name='xx')
    [tag, ...] = soup.find_all(...)
    tag.text
    tag.attrs
    tag.get(...)
import requests
from bs4 import BeautifulSoup

# Get the token
r1 = requests.get('https://github.com/login')
s1 = BeautifulSoup(r1.text, 'html.parser')
# The credential carried by the GitHub login page is not named csrf_token; it is authenticity_token
token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
print(token)
# ---> 4WLM4c+ilLUmmhsM8TEFiYXMX5evoTQaIxmhTc5FmUYetTseKP6Upx5jJkGOzjm3kCAg9sMv3ShMnz0UGzuGvA==
r1_token_dict = r1.cookies.get_dict()

# POST the username, password and token to the server.
# To see what needs to be sent, inspect the request captured in the browser's Network panel:
"""
utf8:?
authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
login:asdf
password:asdf
commit:Sign in
"""
r2 = requests.post(
    'https://github.com/session',   # the POST url is also taken from the browser's Network panel
    data={
        'utf8': '?',
        'authenticity_token': token,
        'login': '317828332@qq.com',   # username
        'password': 'alex3714',        # password
        'commit': 'Sign in'
    },
    cookies=r1_token_dict
)
# print(r2.text)
r2_cookie_dict = r2.cookies.get_dict()
print(r1_token_dict)    # some pages set cookies on the GET request, some do not
# ---> {'logged_in': 'no', '_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQi...'}
print(r2_cookie_dict)   # cookies set by the POST request
# ---> {'_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQi...'}

# Merge the two cookie dicts
cookie_dict = {}
cookie_dict.update(r1_token_dict)
cookie_dict.update(r2_cookie_dict)

# Send the next request with the merged cookies
r3 = requests.get(
    # url='xxxxxx',   # any page that is only accessible after logging in
    url='https://github.com/settings/emails',
    cookies=cookie_dict
)
print(r3.text)
Example 3: upvoting a news item on Chouti
# 1. Log in and get the cookie.
# 2. Find the vote URL: watch which URL the Chouti page sends the upvote request to.
#    The login is a POST request to http://dig.chouti.com/login; instead of telling the browser
#    to jump to another page, it returns a dict (JSON).
import requests
from bs4 import BeautifulSoup

# 1. Get the initial cookie
r0 = requests.get('http://dig.chouti.com/')
r0_cookie_dict = r0.cookies.get_dict()

# 2. Send username, password and cookie
r1 = requests.post(
    'http://dig.chouti.com/login',
    data={
        'phone': '8615131255089',
        'password': 'woshiniba',
        'oneMonth': 1          # stay logged in for a month
    },
    cookies=r0_cookie_dict
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
# ---> {"result":{"code":"8887", "message":"手机号格式不对", "data":""}}   # output when the phone number is invalid
print(r1.cookies.get_dict())
# ---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}

cookie_dict = {}
cookie_dict.update(r0_cookie_dict)
cookie_dict.update(r1_cookie_dict)
# cookie_dict = {'gpsd': r0_cookie_dict['gpsd']}   # equivalent to the dict above, but not recommended

# Upvote: the vote is a POST request; linksId=13911006 is the article id
r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006', cookies=cookie_dict)
print(r2.text)
2. The requests module
Methods provided by the requests module
# requests.get()
# requests.post()
# requests.put()
# requests.request('post')

# requests.get(url, params=None, **kwargs)
# requests.post(url, data=None, json=None, **kwargs)
# requests.put(url, data=None, **kwargs)
# requests.head(url, **kwargs)
# requests.delete(url, **kwargs)
# requests.patch(url, data=None, **kwargs)
# requests.options(url, **kwargs)

# All of the above are built on top of this one method:
# requests.request(method, url, **kwargs)
# url='xxx',
# params={'k1': 'v1', 'nid': 888},   # GET parameters
# cookies={},
# headers={},
# data={},    # send data as a form body
# json={}     # send data as a JSON body

# requests.get(
#     url='xxx',
#     params={'k1': 'v1', 'nid': 888},
#     cookies={},
#     headers={}
# )
# # ---> http://www.baidu.com?k1=v1&nid=888

requests.post(
    url='xxx',
    params={'k1': 'v1', 'nid': 888},
    cookies={},
    headers={},
    json={}
)

# Note: when sending a request to the backend, mind the request headers.
# requests.post(url='', data={})   # carries the header content-type: application/x-www-form-urlencoded by default
requests.post(url='', data={}, headers={'content-type': 'application/json'})
# written this way, Django cannot read the values from request.POST; you can only parse request.body yourself
requests.post(url='', json={})
# carries the header content-type: application/json by default
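To see that difference on the server side, here is a minimal sketch of a Django view (the view function and URL below are made up for illustration): request.POST only parses form-encoded bodies, so a JSON body has to be read from request.body.

# views.py -- hypothetical view, only to illustrate data= vs json= on the Django side
import json
from django.http import JsonResponse

def demo(request):
    if request.method == 'POST':
        print(request.POST)               # empty QueryDict when the client sent a JSON body
        data = json.loads(request.body)   # parse the raw JSON body yourself
        return JsonResponse({'received': data})
    return JsonResponse({'error': 'POST only'}, status=405)

# client side:
# requests.post('http://127.0.0.1:8000/demo/', data={'k1': 'v1'})   # readable via request.POST
# requests.post('http://127.0.0.1:8000/demo/', json={'k1': 'v1'})   # readable only via request.body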
# auth
def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    # HTTPBasicAuth: basically every home router authenticates with HTTPBasicAuth;
    # it is the simple, commonly used basic-auth scheme
    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))   # basic auth
    ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))  # digest auth
    # Real anti-scraping checks are never this simple; sites do not just verify a username and
    # password with these two schemes.
    print(ret.text)

    # ret = requests.get('http://192.168.1.1', auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)

# timeout: time limit for the request
# allow_redirects: whether to follow redirects
# Suppose http://www.abc.com redirects to http://www.baidu.com
response = requests.get('http://www.abc.com', allow_redirects=False)
print(response.text)   # redirects not followed: this is the response from http://www.abc.com itself
response = requests.get('http://www.abc.com', allow_redirects=True)
print(response.text)   # this is the content of http://www.baidu.com

# proxies: go through a proxy so the target site does not ban your IP while scraping;
#          you can buy proxies or set up your own proxy servers
# stream: download the response body lazily instead of all at once
# verify / cert: certificates, e.g. 12306's certificate; for a site like Zhihu the
#                certificate check is optional
requests.get('https://httpbin.org/get', stream=True, verify=True, cert='xxxx.pem')
# cert is the client-side certificate file; verify controls whether the server's certificate is checked
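A minimal sketch of proxies and timeout together (the proxy addresses below are placeholders, not real servers):

import requests

# placeholder proxy addresses -- substitute your own proxy servers
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

try:
    # route the request through the proxy and give up after 5 seconds
    response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(response.text)
except requests.exceptions.RequestException as e:
    print('request failed:', e)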
3. BeautifulSoup
BeautifulSoup turns an HTML document into a structured object, so that elements inside the HTML can be accessed through that object.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
    <div class="story">Once upon a time there were three little sisters; and their names were
        <a class="sister0" id="link1">Els<span>f</span>ie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</div>
    ad<br/>sf
    <p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup
# Unlike BeautifulSoup(html_doc, 'html.parser'), this uses the lxml parser: better performance,
# but the lxml module has to be installed separately; recommended.
soup = BeautifulSoup(html_doc, features="lxml")
tag = soup.find(class_='story')
# print(tag)
# print(tag.name)
# # ---> div
# tag.name = 'span'   # set the tag name
# print(tag.attrs)
# # ---> {'class': ['story']}
# tag.attrs['kkk'] = 'vvv'
# print(tag.attrs)
# # ---> {'class': ['story'], 'kkk': 'vvv'}
# del tag.attrs['kkk']
# print(tag.attrs)
# # ---> {'class': ['story']}
# print(tag.children)
# # ---> <list_iterator object at 0x0000000002EA32B0>
# print(list(tag.children))
# # ---> ['Once upon a time there were three little sisters; and their names were\n ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',\n ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
# for item in tag.children:
#     print(type(item), item)
# # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
# #      <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
# #      <class 'bs4.element.NavigableString'> ,
# #      <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# #      <class 'bs4.element.NavigableString'> and
# #      <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# #      <class 'bs4.element.NavigableString'> ;
# #      and they lived at the bottom of a well.
# print(tag)
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
# #      <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# #      and they lived at the bottom of a well.</div>
# tag.clear()   # empty the tag's contents but keep the tag itself
# print(tag)
# # ---> <div class="story"></div>
# tag.decompose()   # destroy the tag and everything inside it
# print(tag)
# # ---> <None></None>
# taga = tag.find(name='a')
# taga.extract()   # detach the <a> from the tree (and return it)
# print(tag)
# print(tag.decode())   # serialize the tag (itself included) to str
# # ---> <div class="story">Once upon a time there were three little sisters; and their names were
# #      <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# #      and they lived at the bottom of a well.</div>
# print(type(tag.decode()))
# # ---> <class 'str'>
# print(tag.decode_contents(), type(tag.decode_contents()))   # serialize only the children
# # ---> Once upon a time there were three little sisters; and their names were
# #      <a class="sister0" id="link1">Els<span>f</span>ie</a>,
# #      <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# #      <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# #      and they lived at the bottom of a well. <class 'str'>
# print(type(tag.encode()))
# # ---> <class 'bytes'>
# print(tag.encode())
# # ---> b'<div class="story">Once upon a time there were three little sisters; and their names were\n <a class="sister0" id="link1">Els<span>f</span>ie</a>,\n <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</div>'
# print(tag.encode_contents(), type(tag.encode_contents()))
# tag = soup.find('a')
# print(tag)
# # recursive: search recursively; text: match by text content, rarely used
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
# tags = soup.find_all('a')
# print(tags)
# tags = soup.find_all('a', limit=1)   # limit=1: stop after the first match
# print(tags)
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
# v = soup.find_all(name=['a', 'div'])              # name=['a', 'div']: match <a> tags and <div> tags
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])   # match class='sister0' or class='sister'
# print(v)
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
# v = soup.find_all(id=['link1', 'link2'])
# print(v)
# v = soup.find_all(href=['link1', 'link2'])
# print(v)
# import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
# def func(tag):
#     # when this returns True, the tag is included in the result of soup.find_all()
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)   # name=func walks every tag and calls the function once per tag
# print(v)
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
# tag = soup.find('a')
# v = tag.get_text()
# print(v)
# tag = soup.find('body')
# v = tag.index(tag.find('div'))   # position of the <div> among body's children
# print(v)

# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)
is_empty_element: whether the tag is an empty element (one that may have no content), i.e. a self-closing tag such as <br/> or <img/>.
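A quick check against the html_doc soup above (a sketch; the expected output is noted in the comments):

tag = soup.find('br')
print(tag.is_empty_element)   # expected: True, <br/> is self-closing
tag = soup.find('a')
print(tag.is_empty_element)   # expected: False, <a> wraps content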
# soup.next               # the next parsed node, whether it is a tag or a text node
# soup.next_element       # same as .next
# soup.next_elements      # generator over all following nodes
# soup.next_sibling       # the next sibling under the same parent
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
# tag.find_parent(...)
# tag.find_parents(...)
# All of these take the same parameters as find_all
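For example, a small sketch against the html_doc soup above (the expected results are noted in the comments, assuming that markup):

first_a = soup.find('a', id='link1')
print(first_a.find_next('a'))           # expected: the <a id="link2"> tag
print(first_a.find_next_sibling('a'))   # expected: the next <a> sibling inside the same div
print(first_a.find_parent('div'))       # expected: the enclosing <div class="story">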
# soup.select("title") # # soup.select("p nth-of-type(3)") # # soup.select("body a") # # soup.select("html head title") # # tag = soup.select("span,a") # # soup.select("head > title") # # soup.select("p > a") # # soup.select("p > a:nth-of-type(2)") # # soup.select("p > #link1") # # soup.select("body > a") # # soup.select("#link1 ~ .sister") # # soup.select("#link1 + .sister") # # soup.select(".sister") # # soup.select("[class~=sister]") # # soup.select("#link1") # # soup.select("a#link2") # # soup.select('a[href]') # # soup.select('a[href="http://example.com/elsie"]') # # soup.select('a[href^="http://example.com/"]') # # soup.select('a[href$="tillie"]') # # soup.select('a[href*=".com/el"]') # # from bs4.element import Tag # # # def default_candidate_generator(tag): # for child in tag.descendants: # if not isinstance(child, Tag): # continue # if not child.has_attr('href'): # continue # yield child # # # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator) # print(type(tags), tags) # # from bs4.element import Tag # # # def default_candidate_generator(tag): # for child in tag.descendants: # if not isinstance(child, Tag): # continue # if not child.has_attr('href'): # continue # yield child # # # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1) # print(type(tags), tags)
# tag = soup.find('span')
# print(tag.string)            # get the text
# tag.string = 'new content'   # set the text
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'           # tag.text cannot be used to modify the content; tag.string can
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings     # recursively yields the text of every tag inside, whitespace stripped
# print(v)
# tag = soup.find('body')
# tag.append(soup.find('a'))   # moves the existing <a> to the end of <body>
# print(soup)

# If you want the appended tag to exist in addition to the current one (rather than moving it), build a new Tag:
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)   # insert at position 2 among body's children
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)   # manually set up the relationship between the two nodes
# print(tag.previous_sibling)
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
# tag = soup.find('a')
# v = tag.wrap(obj1)   # wrap the <a> tag inside the new <div>
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
# tag = soup.find('a')
# v = tag.unwrap()   # remove the tag itself but keep its contents in place
# print(soup)