爬虫

编码问题:

import requests
# Fetch the Baidu home page.
res = requests.get('https://www.baidu.com')
# Tell requests to decode the body as UTF-8 when building res.text.
res.encoding = 'utf-8'
print(res.text)  # response body as text
# Write with an explicit encoding: without it open() uses the locale's
# default codec, which may be unable to encode the UTF-8 page text
# (e.g. GBK on Chinese Windows) and would corrupt or reject the output.
with open("a.html", 'w', encoding='utf-8') as f:
    f.write(res.text)

get请求携带参数

# Issue a GET request with a query string and a browser-like User-Agent.
res = requests.get(
    'https://www.baidu.com/s',
    # params are appended to the URL as the query string
    params={'wd': "泰迪"},
    # request header fields
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    },
)
res.encoding = 'utf-8'  # decode the body as UTF-8
print(res.text)  # response body as text
# Write with an explicit encoding so the saved file matches the decoded
# text regardless of the platform's default codec.
with open("a.html", 'w', encoding='utf-8') as f:
    f.write(res.text)

# post请求
# params 是拼在get请求后的
# data: 是 post 请求 body 体中的内容

模拟登录

1. 带请求头
# Header values sent with every request of the login flow below.
user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
referer = 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fgoods-4210.html'

headers = {'User-Agent': user_agent, 'Referer': referer}
2. 发送 POST 请求
# Form fields submitted as the POST body of the login request.
# NOTE(review): the username/password/captcha values look like
# placeholders -- fill in real values before running.
login_form = {
    'username': '@qq.com',
    'password': '',
    'captcha': 'sdfdswa',
    'remember': 1,
    'ref': 'http://www.aa7a.cn/goods-4210.html',
    'act': 'act_login',
}
res = requests.post('http://www.aa7a.cn/user.php', headers=headers, data=login_form)
3.如果登录成功,cookie会存在于res对象中
# Copy the cookies carried by the login response into a plain dict so
# they can be passed along with the follow-up request.
cookie = res.cookies.get_dict()
4.向首页发送get请求验证是否登录成功
# Request the home page with the login cookies attached; the page is
# treated as "logged in" when it contains the account's e-mail address.
res = requests.get('http://www.aa7a.cn/', headers=headers, cookies=cookie)
logged_in = 'a1632079340@qq.com' in res.text
print('登陆成功' if logged_in else '登陆失败')

爬取视频

import re


# Fetch one page of the category listing (HTML fragment with video links).
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')

# Captures the relative href of each video link in the listing.
reg_text = r'<a href="(.*?)" class="vervideo-lilink actplay">'

# Fixed: the pattern variable was misspelled as `reg/_text`, which raised
# NameError before any page could be parsed.
obj = re.findall(reg_text, res.text)
print(obj)
for url in obj:
    # Listing hrefs are relative; turn them into absolute detail-page URLs.
    url = 'https://www.pearvideo.com/' + url
    res1 = requests.get(url)

    # The detail page is searched for the video file address (srcUrl="...").
    obj1 = re.findall('srcUrl="(.*?)"', res1.text)
    print(obj1)
    if not obj1:
        # No video address on this page: skip it instead of crashing
        # with IndexError on obj1[0].
        continue
    video_url = obj1[0]
    print(video_url)
    # Use the part after the last '-' in the video URL as the file name.
    name = video_url.rsplit('-', 1)[1]
    print(name)
    res2 = requests.get(video_url)
    print(res2)
    with open(name, 'wb') as f:
        # iter_content() defaults to 1-byte chunks; request larger chunks
        # so the file is not written one byte at a time.
        for chunk in res2.iter_content(chunk_size=64 * 1024):
            f.write(chunk)

深浅拷贝

posted @ 2019-11-25 20:17  xg1321  阅读(151)  评论(0)  编辑  收藏  举报