day04---Using the requests module

'''
Scrape movie information from the Douban Top 250

Pages:
    page 1:
        https://movie.douban.com/top250?start=0&filter=
    page 2:
        https://movie.douban.com/top250?start=25&filter=
    page 3:
        https://movie.douban.com/top250?start=50&filter=
    page 4:
        https://movie.douban.com/top250?start=75&filter=
    page 10:
        https://movie.douban.com/top250?start=225&filter=
    (start advances by 25 per page, i.e. start = 25 * (page - 1))

    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re regex:
    # detail-page url, image link, title, director, cast, release date, rating, number of reviewers, one-line summary
    <div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>
'''
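
Before the full script, a quick look at how re.findall returns one tuple of captured groups per movie. This sketch runs the simpler of the two patterns (the one kept as a comment in the script below) against a synthetic, made-up HTML snippet, so the URLs and numbers here are invented:

import re

# synthetic snippet imitating one Douban list item (all values made up)
html = '''
<div class="item">
<a href="https://movie.douban.com/subject/1292052/">
<img src="https://img.example/poster.jpg" class="">
<span class="title">肖申克的救赎</span>
<span class="rating_num" property="v:average">9.7</span>
<span>2000000人评价</span>
'''

pattern = '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价'

# re.S makes "." match newlines too, so one pattern can span the whole item
print(re.findall(pattern, html, re.S))
# [('https://movie.douban.com/subject/1292052/', 'https://img.example/poster.jpg', '肖申克的救赎', '9.7', '2000000')]
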
import requests
import re


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='

n = 0
for page in range(10):
    url = base_url.format(n)
    n += 25
    print(url)

    # 1. Send a request to the Douban Top 250 page and get the response
    response = requests.get(url, headers=headers)

    # print(response.text)

    # 2. Parse and extract the data with a regex:
    # detail-page url, image link, title, director, release info, rating, number of reviewers, one-line summary
    movie_content_list = re.findall(
        # regex pattern (the commented line is the earlier, simpler version)
        # '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',

        # the text to parse
        response.text,

        # match flag: re.S lets "." match newlines as well
        re.S)

    for movie_content in movie_content_list:
        # unpack the fields of each movie
        detail_url, movie_jpg, name, daoyan, timer, point, num, desc = movie_content
        data = f'Title: {name}, detail url: {detail_url}, image url: {movie_jpg}, director: {daoyan}, released: {timer}, rating: {point}, reviewers: {num}, summary: {desc}\n'
        print(data)

        # 3. Save the data: append the movie info to a file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(data)
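
The loop above reopens douban.txt once for every movie. The following is a sketch of a tidier variant that opens the file once, computes start directly from the page number, and fails loudly on HTTP errors; the timeout value and the raise_for_status() call are additions of mine, not part of the original lesson:

import re
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
base_url = 'https://movie.douban.com/top250?start={}&filter='
pattern = '<div class="item">.*?href="(.*?)">.*?src="(.*?)" class="">.*?<span class="title">(.*?)</span>.*?<div class="bd">.*?导演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>'

# open the output file once instead of once per movie
with open('douban.txt', 'a', encoding='utf-8') as f:
    for page in range(10):
        url = base_url.format(page * 25)  # start = 0, 25, ..., 225
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # stop early instead of parsing an error page
        for movie in re.findall(pattern, response.text, re.S):
            f.write(', '.join(field.strip() for field in movie) + '\n')
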

 

The morning session was mainly a quick walkthrough of web scraping as a whole:
'''
What is a crawler
What is the internet
Why the internet was built

A crawler program
simulates a browser sending requests to a target site
-requests
-selenium

gets the response data  # without caring how the server produced it

parses and extracts the data
-BeautifulSoup4 (see the sketch after this outline)
-Xpath
saves the data
-MongoDB

crawler frameworks
-Scrapy
'''
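
As a preview of the BeautifulSoup4 approach named in the outline, here is a minimal sketch. It assumes the page keeps the div.item / span.title / span.rating_num markup that the morning's regex relied on, and that bs4 is installed (pip install beautifulsoup4):

import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
res = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')

# each movie sits in a <div class="item">; pull the name and rating out of it
for item in soup.find_all('div', class_='item'):
    name = item.find('span', class_='title').get_text()
    rating = item.find('span', class_='rating_num').get_text()
    print(name, rating)
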
# Today's list
# detailed usage of the requests module
# selenium
Below are today's teaching notes on the requests module.

First, accessing Zhihu: the request disguises itself with a browser User-Agent header (cookies can be attached the same way):
import requests
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
res = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(res)
with open('zhihu.html', 'wt', encoding='utf-8') as f:
    f.write(res.text)
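
The User-Agent header is the crucial part here: without a browser-like UA, Zhihu is likely to reject the request. A quick comparison sketch (the exact status codes depend on Zhihu's current anti-bot rules, so treat the 400-vs-200 comment as an assumption):

import requests

url = 'https://www.zhihu.com/explore'
bare = requests.get(url)  # identifies itself as python-requests
disguised = requests.get(url, headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
})
print(bare.status_code, disguised.status_code)  # typically 400 vs 200
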

from urllib.parse import urlencode
import requests
# UA
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

cookies = {
    'Cookie': 'BAIDUID=818CCB3A914DB4DC07CF0C10F173A77D:FG=1; PSTM=1555943005; BD_UPN=12314753; BIDUPSID=3257DBF41E8FCB9318C783A0F6A3B93F; BDUSS=XJlMEZ0SlpzYnRQQk5IWFExQjJibFJtUFoyQ0swRy11YTNqdWkzTmpaZ1JjLVpjSVFBQUFBJCQAAAAAAAAAAAEAAACxxUowzqiwrsPOtftmbHkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABHmvlwR5r5cY; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; MCITY=-%3A; sug=3; sugstore=1; ORIGIN=0; bdime=0; delPer=0; BD_CK_SAM=1; PSINO=7; COOKIE_SESSION=2145_0_2_0_0_3_0_0_0_2_1_0_0_0_0_0_0_0_1560496689%7C2%230_0_1560496689%7C1; H_PS_PSSID=26522_1440_21083_29135_29238_28519_29099_29139_28833_29220_26350_29131; H_PS_645EC=9cfa98lhXcfeqaI5ofqpuCm39vJFw9WiZ74jdiuiP4pHeqqOhtFPraAXeKw; WWW_ST=1560497620569; BDSVRTM=176'
}

# url = 'https://www.baidu.com/s?' + urlencode({'wd':'eva'})
url = 'https://www.baidu.com/s?'

print(url)
# res = requests.get(url,headers=headers)
# print(res.status_code)
res = requests.get(url, headers=headers, params={'wd': 'eva'})
with open('eva.html', 'wt', encoding='utf-8') as f:
    f.write(res.text)
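
Passing params={'wd': 'eva'} makes requests build and URL-encode the query string itself, which is exactly what the commented-out urlencode line above does by hand. Printing res.url shows the final URL that was actually fetched:

# the fully encoded URL requests actually sent (after any redirects)
print(res.url)
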

import requests
# accessing a page with cookies
# cookies can be passed inside the headers dict, or through the separate cookies argument
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

cookies = {
    'Cookie': 'BAIDUID=818CCB3A914DB4DC07CF0C10F173A77D:FG=1; PSTM=1555943005; BIDUPSID=3257DBF41E8FCB9318C783A0F6A3B93F; BDUSS=XJlMEZ0SlpzYnRQQk5IWFExQjJibFJtUFoyQ0swRy11YTNqdWkzTmpaZ1JjLVpjSVFBQUFBJCQAAAAAAAAAAAEAAACxxUowzqiwrsPOtftmbHkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABHmvlwR5r5cY; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; MCITY=-%3A; H_PS_PSSID=26522_1440_21083_29135_29238_28519_29099_29369_28833_29220_26350_29131; delPer=0; PSINO=7; PHPSESSID=r3qtro7c73v5d140v0mrjillc1; Hm_lvt_4010fd5075fcfe46a16ec4cb65e02f04=1560498230,1560498412,1560498430,1560498452; Hm_lpvt_4010fd5075fcfe46a16ec4cb65e02f04=1560498452'
}
url = 'http://i.baidu.com/'
res = requests.get(url, headers=headers, cookies=cookies)
print(res.status_code)

# with open('mybaidu.html', 'wt', encoding='utf-8') as f:
#     f.write(res.text)
print('唯爱梦蝶fly' in res.text)  # True when the logged-in nickname shows up, i.e. the cookies worked
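
Rather than pasting cookies from the browser each time, requests can also carry cookies across requests with a Session object: cookies set by one response are stored and sent back automatically on later requests. A minimal sketch:

import requests

session = requests.Session()
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
})

# cookies set by this response land in session.cookies ...
res = session.get('http://i.baidu.com/')
print(session.cookies.get_dict())

# ... and are sent automatically on every later request through the session
res = session.get('http://i.baidu.com/')
print(res.status_code)
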

