爬虫案例

爬取汽车之家，指定页面的图片url

1. 爬取汽车之家指定页面中的图片 url，并将图片下载到本地

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Download every image found in a centered <p> of one autohome.com.cn news
# article and save each one into the current working directory.

PAGE_URL = 'https://www.autohome.com.cn/news/201801/912472.html#pvareaid=102624'

# requests does not time out unless told to; bound every call so a stalled
# connection cannot hang the script forever.
TIMEOUT = 10

# Fetch the article page with browser-like headers.
r1 = requests.get(
    url=PAGE_URL,
    headers={
        'Host':'www.autohome.com.cn',
        'Referer':"https://www.autohome.com.cn/",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    },
    timeout=TIMEOUT,
)
# Stop early on an HTTP error instead of parsing an error page.
r1.raise_for_status()

soup = BeautifulSoup(r1.text, "lxml")

# Locate the two containers the rest of the script reads from.
id_articlewrap = soup.find(name="div", id="articlewrap")
id_articleContent = soup.find(name="div", id="articleContent")
if id_articlewrap is None or id_articleContent is None:
    raise SystemExit("article containers not found; the page layout may have changed")

# Article title (empty string when the page has no <h1>).
title_tag = id_articlewrap.find(name="h1")
h1 = title_tag.text.strip() if title_tag is not None else ""

# All <p align="center"> tags inside the article body.
pp = id_articleContent.find_all(name="p", attrs={"align": "center"})
for i in pp:
    img = i.find(name="img")
    # Skip paragraphs without an <img>, and <img> tags without a src.
    src = img.get("src") if img is not None else None
    if not src:
        continue

    # urljoin resolves protocol-relative ("//host/a.jpg"), relative and
    # already-absolute src values against the page URL.
    img_url = urljoin(PAGE_URL, src)
    print(img_url)

    img_response = requests.get(img_url, timeout=TIMEOUT)
    if not img_response.ok:
        # Do not save an HTTP error body as if it were an image.
        print("skipped, HTTP status %s" % img_response.status_code)
        continue

    # The file name is the last path segment of the image URL.
    file_name = img_url.rsplit('/', maxsplit=1)[1]
    with open(file_name, 'wb') as f:
        f.write(img_response.content)

爬取汽车之家 news 页面的标题、链接、摘要和图片

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Print the title, link and summary of every entry in the autohome.com.cn
# news list and save each entry's image into the current working directory.

NEWS_URL = "https://www.autohome.com.cn/news/"

# requests does not time out unless told to; bound every call so a stalled
# connection cannot hang the script forever.
TIMEOUT = 10

response = requests.get(
    url=NEWS_URL,
    timeout=TIMEOUT,
)
# Stop early on an HTTP error instead of parsing an error page.
response.raise_for_status()

# Decode the body as GBK before reading response.text.
response.encoding = 'gbk'

soup = BeautifulSoup(response.text, "lxml")

tag = soup.find(attrs={"id": "auto-channel-lazyload-article"})
if tag is None:
    raise SystemExit("news list container not found; the page layout may have changed")

li_list = tag.find_all("li")

for li in li_list:
    h3 = li.find(name="h3")

    # Items without an <h3> are not articles; skip them.
    if not h3:
        continue
    # Title
    print(h3.text)

    # Article link: the href can be read with get("href") or attrs["href"].
    # urljoin handles protocol-relative, relative and absolute values alike.
    link = li.find(name="a")
    href = link.get("href") if link is not None else None
    if href:
        url = urljoin(NEWS_URL, href)
        print(url)

    # Summary paragraph
    article = li.find(name="p")
    if article is not None:
        print(article.text)

    # Image address; skip the download when the item has no usable <img src>.
    img = li.find(name="img")
    src = img.get("src") if img is not None else None
    if not src:
        continue
    # Resolved against the https page URL, so images use the same scheme as
    # the article links (the previous code hard-coded "http:" here).
    img_url = urljoin(NEWS_URL, src)
    print(img_url)

    # Download the image; the file name is the last path segment of its URL.
    file_name = img_url.rsplit('/', maxsplit=1)[1]
    img_res = requests.get(
        url=img_url,
        timeout=TIMEOUT,
    )
    if not img_res.ok:
        # Do not save an HTTP error body as if it were an image.
        print("skipped, HTTP status %s" % img_res.status_code)
        continue

    with open(file_name, "wb") as f:
        f.write(img_res.content)

通过requests 登录到github页面

"""
Request URL (login form target): https://github.com/session
Request method: POST
Request Content-Type: application/x-www-form-urlencoded (so the form fields are passed with data=, not json=)

"""

import requests
from bs4 import BeautifulSoup

# Log in to GitHub with requests and print the home page HTML.

# requests does not time out unless told to; bound every call so a stalled
# connection cannot hang the script forever.
TIMEOUT = 10

# A Session keeps a single cookie jar for all requests made through it, so
# cookies set by any response (including redirect responses after the login
# POST) are sent on later requests and newer values replace older ones.
# This replaces the hand-merged cookie dicts, where the pre-login cookies were
# applied last and overwrote the post-login ones.
session = requests.Session()

# 1. Load the login page and read the hidden authenticity_token form field.
r1 = session.get(
    url="https://github.com/login",
    timeout=TIMEOUT,
)
r1.raise_for_status()

s1 = BeautifulSoup(r1.text, "lxml")
token_input = s1.find(attrs={'name': 'authenticity_token'})
if token_input is None:
    raise SystemExit("authenticity_token not found on the login page")
token = token_input.get("value")

# 2. Submit the login form together with the token.
# NOTE(review): the credentials below are hard-coded placeholders; supply real
# values from a safe source before use and never commit them.
r2 = session.post(
    url="https://github.com/session",
    data={
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": token,
            "login": "baolin2200",
            "password": "*******"
            },
    timeout=TIMEOUT,
)

# 3. Snapshot of every cookie the session holds after the login attempt.
all_cookie_dict = session.cookies.get_dict()

# 4. Request a page with the session's cookies.
r3 = session.get(
    url="https://github.com/",
    timeout=TIMEOUT,
)

print(r3.text)

通过requests给抽屉文章点赞

import requests

# Log in to dig.chouti.com and send an up-vote for one article.

# requests does not time out unless told to; bound every call so a stalled
# connection cannot hang the script forever.
TIMEOUT = 10

# Browser-like headers sent with every request in this script.
HEADERS = {
    'Host': 'dig.chouti.com',
    'Referer': "http://dig.chouti.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

# 1. Load the front page and keep the cookies returned by this first visit.
r1 = requests.get(
    url='http://dig.chouti.com/',
    headers=HEADERS,
    timeout=TIMEOUT,
)
r1_cookie_dict = r1.cookies.get_dict()


# 2. Post the login form, sending the first-visit cookies along.
# NOTE(review): the account values below are hard-coded; supply real values
# from a safe source before use and never commit them.
r2 = requests.post(
    # Login endpoint
    url='http://dig.chouti.com/login',
    # Form fields of the login request
    data={
        'phone': '8613121758648',
        'password': '********',
        'oneMonth': 1,
    },
    headers=HEADERS,
    cookies=r1_cookie_dict,
    timeout=TIMEOUT,
)
# Cookies returned by the login response.
r2_cookie_dict = r2.cookies.get_dict()


# 3. Merge both cookie sets into one dict.
# The first-visit cookies are applied last, so they win on duplicate keys.
# NOTE(review): this order is kept exactly as written; presumably the site
# authorizes the first-visit cookie on login -- confirm before changing it.
all_cookie_dict = {}
all_cookie_dict.update(r2_cookie_dict)
all_cookie_dict.update(r1_cookie_dict)


# 4. Send the vote as a POST; linksId is the id of the article to up-vote.
r3 = requests.post(
    url='http://dig.chouti.com/link/vote?linksId=17703876',
    headers=HEADERS,
    cookies=all_cookie_dict,
    timeout=TIMEOUT,
)
# Show the server's reply.
print(r3.text)

posted @ 2018-01-29 18:19 叨客厨子 阅读(213) 评论(0) 收藏 举报

刷新页面返回顶部

叨客厨子

爬虫案例

爬虫案例

爬取汽车之家，指定页面的图片url

爬取汽车之家，news页面的标题图片摘要

通过requests 登录到github页面

通过requests给抽屉文章点赞

公告