一、Crawler basics

1) Basic crawler knowledge: crawling images

# Step 1: fetch the page's text content
import requests
response = requests.get(url='http://...')
#......
# Step 2: parse the text content (BeautifulSoup here instead of raw regex)
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, features='html.parser')  # features='lxml' is slightly faster
# .....
# Step 3: save the content locally (img_response is the response from requesting the image URL)
import uuid
file_name = str(uuid.uuid1()) + '.jpg'
with open(file_name, 'wb') as f:
    f.write(img_response.content)
Basic steps for crawling a web page

 1.1) Crawling images

# Step 1: fetch the page's text content
import requests
import uuid   # used in step 3 to generate unique file names

response = requests.get(
    url='http://www.autohome.com.cn/news'
)
# print(response.text)
# Handling encoding issues
# Option 1: set the encoding manually, e.g. to gbk
# response.encoding = 'gbk'
# print(response.content)

# Option 2: let requests infer the encoding from the response body
response.encoding = response.apparent_encoding
# print(response.text)

# Step 2: parse the text content
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, features='html.parser')  # features='lxml' is slightly faster
target = soup.find(id='auto-channel-lazyload-article')  # find the tag with this id
# print(target)

li_list = target.find_all('li')  # all li tags under that tag; returned as a list of Tag objects
# print(li_list)

for i in li_list:
    a = i.find('a')     # for each li tag, find the a tag inside it
    # print(a)
    # print(a.attrs)   # the a tag's attributes
    if a:
        # print(a.attrs.get('href'))  # the href attribute
        # txt = a.find('h3')  # a bs4 Tag object
        txt = a.find('h3').text  # get the text content
        # print(txt)
        img = a.find('img').attrs.get('src')
        print(img)
        img_head = 'http:'               # the src in the page is protocol-relative
        img_url = img_head + img
        print(img_url, type(img_url))
# Step 3: save the content locally
        img_response = requests.get(url=img_url)   # fetch the image itself
        print(img_response)
        file_name = str(uuid.uuid1()) + '.jpg'     # unique file name
        with open(file_name, 'wb') as f:
            f.write(img_response.content)

 1.2) Summary of the image-crawling workflow

1. requests
    pip3 install requests
    
    response = requests.get('http://www.autohome.com.cn/news/')
    response.text
    
    
    Summary:
    
    response = requests.get('URL')
    response.text
    response.content
    response.encoding
    response.apparent_encoding
    response.status_code
    response.cookies.get_dict()

    
2. beautifulsoup module
    pip3 install beautifulsoup4
    
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, features='html.parser')
    target = soup.find(id='auto-channel-lazyload-article')
    print(target)

    Summary:
        soup = BeautifulSoup('<html>...</html>', features='html.parser')
        v1 = soup.find('div')
        v1 = soup.find(id='i1')
        v1 = soup.find('div',id='i1')
        
        v2 = soup.find_all('div')
        v2 = soup.find_all(id='i1')
        v2 = soup.find_all('div',id='i1')

        obj = v1
        obj = v2[0]
        
        obj.text
        obj.attrs
requests and BeautifulSoup methods
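
The attributes and lookups summarized above can be exercised directly; a minimal sketch (the inline HTML is a made-up snippet for illustration, and the network call assumes the autohome page used earlier is reachable):

import requests
from bs4 import BeautifulSoup

# the response attributes from the summary above
response = requests.get('http://www.autohome.com.cn/news/')
print(response.status_code)           # e.g. 200
print(response.encoding)              # encoding taken from the response headers
print(response.apparent_encoding)     # encoding guessed from the response body
print(response.cookies.get_dict())    # cookies as a plain dict

# find / find_all on a small made-up document
html = '<html><body><div id="i1"><a href="/x">hello</a></div><div>other</div></body></html>'
soup = BeautifulSoup(html, features='html.parser')
v1 = soup.find('div', id='i1')        # first matching tag (a bs4 Tag object)
v2 = soup.find_all('div')             # all matching tags (a list)
print(v1.text, v1.attrs)              # hello {'id': 'i1'}
print(v2[0].find('a').attrs.get('href'))   # /x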

2) Request headers: faking browser information. Most sites will not serve a request unless it carries browser-like request headers.

2.1) Without request headers

import requests

post_dict = {
    'phone':'1111111',
    'password':'1111111',
    'oneMonth':1
}
response = requests.post(
    url='https://dig.chouti.com/login',
    data = post_dict,
)
print(response)
Request without headers

<Response [403]>   access denied

2.2) With request headers

import requests
headers = {
    'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding':'gzip, deflate, br',
    'accept-language':'zh-CN,zh;q=0.9',
    'cache-control':'max-age=0',
    'cookie':'__guid=9498528.323570739154576450.1534233211072.796; gpid=7277f557513c4639b2928de61853af52; gpsd=72d76079e281788ad0075160a54990a9; JSESSIONID=aaaSaxN93bwl6gJQkLgxw; monitor_count=2',
    'upgrade-insecure-requests':'1',
    'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
post_dict = {
    'phone':'1111111',
    'password':'1111111',
    'oneMonth':1
}
response = requests.post(
    url='https://dig.chouti.com/login',
    data = post_dict,
    headers=headers
)
print(response)
Request with headers

 <Response [200]>  access allowed
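
In practice a browser-like User-Agent alone is often enough to get past this kind of check, though it depends on the site; a minimal sketch with the same placeholder form data as above:

import requests

headers = {
    # often the only header a site actually checks for
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
response = requests.post(
    url='https://dig.chouti.com/login',
    data={'phone': '1111111', 'password': '1111111', 'oneMonth': 1},
    headers=headers,
)
print(response.status_code)   # whether this is 200 depends on the site's other checks (e.g. cookies)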

 二、Crawler applications

1) Crawl the Maoyan top movies and store the text information: http://maoyan.com/board/4?offset=0

import random

import requests
import re

hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
       {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
       {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
       { 'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
       {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
       {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]

print(len(hds))
print(random.randint(0, len(hds) -1))
print(hds[random.randint(0, len(hds) - 1)])     # pick a request header at random
def get_page(url):
    # request the url and return the response body as a string (None on a non-200 status)
    # url = "http://maoyan.com/board/4?offset=10"
    headers = hds[random.randint(0, len(hds) - 1)]      # pick a header at random
    try:
        response = requests.get(url, headers=headers)   # inside try so request errors are also caught
        if response.status_code == 200:
            res = response.text
            return res
        return None
    except Exception as e:
        print(e)


'''
<div class="board-item-content">
    <div class="movie-item-info">
        <p class="name"><a href="/films/1203" title="霸王别姬" data-act="boarditem-click" data-val="{movieId:1203}">霸王别姬</a></p>
        <p class="star">
            主演:张国荣,张丰毅,巩俐
        </p>
        <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
    </div>
    <div class="movie-item-number score-num">
        <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
    </div>
</div>
'''


def get_movie(html):
    # extract the text fields we need with a regex
    # groups: movie title ('霸王别姬'), cast line ('主演:张国荣,张丰毅,巩俐'), release date ('上映时间:1993-01-01')
    pattern = '<p.*?><a.*?>(.*?)</a></p>.*?<p.*?>(.*?)</p>.*?<p.*?>(.*?)</p>'
    items = re.findall(pattern, html, re.S)
    print(items, '-=====')     # list of tuples with the p tags' text, e.g. ('霸王别姬', '\n  主演:张国荣,张丰毅,巩俐\n ', '上映时间:1993-01-01')
    return items


def write_file(items):
    # append the text info to movie.txt
    fileMovie = open('movie.txt', 'a+', encoding='utf8')
    try:
        for movie in items:
            fileMovie.write('电影名:' + movie[0] + '\r\n')       # movie[0] is the title
            fileMovie.write('电影主演:' + movie[1].strip() + '\r\n')
            fileMovie.write('上映时间:' + movie[2] + '\r\n\r\n')
        print('文件写入成功...')
    finally:
        fileMovie.close()


def main(url):
    html = get_page(url)    # html string returned by the request
    items = get_movie(html)
    write_file(items)


# if __name__ == '__main__':
#     for i in range(0, 100, 10):
#         url = "http://maoyan.com/board/4?offset=" + str(i)
#         print(url)
#         main(url)

if __name__ == '__main__':
    url= "http://maoyan.com/board/4?offset=0"
    main(url)
Maoyan movies
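
To make the capture groups concrete, here is the same pattern from get_movie applied to a slightly simplified copy of the sample HTML in the docstring above:

import re

sample = '''
<div class="board-item-content">
    <div class="movie-item-info">
        <p class="name"><a href="/films/1203" title="霸王别姬">霸王别姬</a></p>
        <p class="star">
            主演:张国荣,张丰毅,巩俐
        </p>
        <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
    </div>
</div>
'''
pattern = '<p.*?><a.*?>(.*?)</a></p>.*?<p.*?>(.*?)</p>.*?<p.*?>(.*?)</p>'
# re.S lets '.' match newlines, so the pattern can span the multi-line block
print(re.findall(pattern, sample, re.S))
# one tuple per movie: title, cast line (with surrounding whitespace), release time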

 2) Building on the above: several ways to store the crawled movie data

  1. Save to a plain text file
  2. Save to a NoSQL database (MongoDB) for further operations
  3. Save to CSV/Excel via pandas (see the sketch after this list)
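
A minimal sketch of options 2 and 3, assuming a MongoDB server on localhost:27017 and records shaped like the dicts that get_movie below produces; the database, collection, and file names are only illustrative:

from pymongo import MongoClient
import pandas as pd

records = [{'电影名': '霸王别姬', '主演': '张国荣,张丰毅,巩俐', '上映时间': '1993-01-01(中国香港)'}]

# Option 2: MongoDB (insert_many is the current pymongo API; the old insert() is deprecated)
client = MongoClient('localhost', 27017)
client['movies']['movieList'].insert_many(records)

# Option 3: CSV, which Excel opens directly; utf-8-sig keeps the Chinese headers readable in Excel
pd.DataFrame(records).to_csv('movies.csv', index=False, encoding='utf-8-sig')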

import random

import requests
import re
import pymongo
from  pymongo import MongoClient
import pandas as pd


hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
       {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
       {
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
       {
           'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {
           'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'},
       {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
       {
           'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
       {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
       {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]


def get_page(url):
    headers = hds[random.randint(0, len(hds) - 1)]
    try:
        response = requests.get(url, headers=headers)   # inside try so request errors are also caught
        if response.status_code == 200:
            res = response.text
            return res
        return None
    except Exception as e:
        print(e)


'''
<div class="board-item-content">
    <div class="movie-item-info">
        <p class="name"><a href="/films/1203" title="霸王别姬" data-act="boarditem-click" data-val="{movieId:1203}">霸王别姬</a></p>
        <p class="star">
            主演:张国荣,张丰毅,巩俐
        </p>
        <p class="releasetime">上映时间:1993-01-01(中国香港)</p>
    </div>
    <div class="movie-item-number score-num">
        <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
    </div>
</div>
'''


def get_movie(html):
    # extract the movie info with a regex and collect it into a list of dicts
    pattern = '<p.*?><a.*?>(.*?)</a></p>.*?<p.*?>(.*?)</p>.*?<p.*?>(.*?)</p>'
    items = re.findall(pattern, html, re.S)
    # print(items)
    movies_list = []

    for movie in items:
        info = {
            '电影名': movie[0],                 # title
            '主演': movie[1].strip()[3:],       # cast, with the '主演:' prefix sliced off
            '上映时间': movie[2][5:]             # release date, with the '上映时间:' prefix sliced off
        }
        movies_list.append(info)
        print(movies_list, 'movies_list')
    return movies_list

# def save_to_mongodb(movie):
#     conn = pymongo.MongoClient('localhost', )
#     db = conn['movies'] # use movies
#
#     if db['movieList'].insert(movie):
#         print('success...')
#     else:
#         print('error..')


def save_to_csv(movie):
    df = pd.DataFrame(movie)
    df.to_csv('movies.csv')   # CSV file that Excel can open directly


# def write_file(items):
#     fileMovie = open('movie.txt', 'a+', encoding='utf8')
#     try:
#         for movie in items:
#             fileMovie.write('电影排名:' + movie[0] + '\r\n')
#             fileMovie.write('电影主演:' + movie[1].strip() + '\r\n')
#             fileMovie.write('上映时间:' + movie[2] + '\r\n\r\n')
#         print('文件写入成功...')
#     finally:
#         fileMovie.close()


def main(url):
    html = get_page(url)    # html string returned by the request
    print(html)
    items = get_movie(html)
    # 1. save to a plain text file
    # write_file(items)
    # 2. save to a NoSQL database (MongoDB)
    # save_to_mongodb(items)
    # 3. save to CSV/Excel
    save_to_csv(items)



if __name__ == '__main__':
    url = "http://maoyan.com/board/4?offset=0"
    print(url)
    main(url)

 3) Simulating a GitHub login

import re
import requests
from bs4 import BeautifulSoup

r1 = requests.get('https://github.com/login')

# two ways to obtain authenticity_token
soup = BeautifulSoup(r1.text, 'html.parser')
res = soup.find('input', attrs={'name': 'authenticity_token'})

# res2 = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text)[0]  # grab the CSRF token from the page with a regex
print(res.attrs['value'])
r1_cookies = r1.cookies.get_dict()
print(r1_cookies)
# {'logged_in': 'no', '_gh_sess': 'bHlIxa26', 'has_recent_activity': '1'}
data = {
    "commit": "Sign in",
    "utf8":"",
    "authenticity_token": res.attrs['value'],
    "login": "loverying",
    "password": "caojing825"
}

r2 = requests.post('https://github.com/session', data = data, cookies = r1_cookies)

r2_cookies = r2.cookies.get_dict()

r3 = requests.get('https://github.com/settings/emails', cookies = r2_cookies)
# print(r3.text)
with open("result2.html","wb") as f:
    f.write(r2.content) # 生产网页
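As a side note, requests.Session can carry the cookies across the three requests automatically instead of passing cookies dicts by hand; a minimal sketch of the same flow (the username and password are placeholders):

import requests
from bs4 import BeautifulSoup

session = requests.Session()                      # keeps cookies between requests automatically

r1 = session.get('https://github.com/login')
token = BeautifulSoup(r1.text, 'html.parser').find(
    'input', attrs={'name': 'authenticity_token'})['value']

session.post('https://github.com/session', data={
    'commit': 'Sign in',
    'utf8': '',
    'authenticity_token': token,
    'login': 'your-username',                     # placeholder credentials
    'password': 'your-password',
})

r3 = session.get('https://github.com/settings/emails')
print(r3.status_code)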

 
