头条街拍

参考博文链接：【Python爬虫实战】2020最新无错误，头条爬取图片实战，Ajax异步加载，附有源码

老规矩，先上代码。主要分为 toutiao.py 和 config.py 。

config.py

# Host (or connection string) handed to pymongo.MongoClient in toutiao.py.
MONGO_URL = 'localhost'
# Database name, looked up as client[MONGO_DB].
MONGO_DB = 'toutiao'
# Collection name, looked up as db[MONGO_TABLE].
MONGO_TABLE = 'toutiao'


# Page range: toutiao.py requests the offsets i * 20 for every
# i in range(GROUP_START, GROUP_END).
GROUP_START = 0
GROUP_END = 15
# Search keyword sent to the search API.
KEY_WORD = '街拍'

toutiao.py

import os
import re
import time
from multiprocessing import Pool
from urllib.parse import urlencode
import pymongo
import requests
from requests.exceptions import RequestException
from config import *

# Module-level MongoDB handles built from the config constants; save_to_mongo
# writes through `db`.
# NOTE(review): these are created at import time, so every multiprocessing
# worker ends up with its own or an inherited client -- confirm this matches
# PyMongo's guidance for forked processes.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


# Fetch one page of search results
def get_page_index(offset, keyword, timeout=10):
    """Request one page of Toutiao search results and return the decoded JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, network failure, or a body that is not
        valid JSON).
    """
    # Kept as a module-level global because the original code published the
    # headers that way.
    global headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36 Edg/81.0.416.68',
        'cookie': '',  # remember to fill in your own cookie
    }
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        'timestamp': int(time.time() * 1000),
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('获取主页失败')
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # A 200 response is not guaranteed to carry JSON (e.g. an HTML
        # anti-crawler page); every JSON decode error is a ValueError.
        print('获取主页失败')
        return None


# Parse a result page into image records
def parse_index_page(json):
    """Yield one ``{'title', 'url'}`` record per image listed on a result page.

    Args:
        json: Decoded search response (a dict) or ``None``.

    Yields:
        dict: ``'title'`` is the item title followed by the 1-based position
        of the image inside that item's ``image_list``; ``'url'`` is the
        thumbnail URL rewritten to point at the large image.

    Items without a title, items without an ``image_list`` and image entries
    without a ``'url'`` are skipped instead of raising.
    """
    if not json or 'data' not in json:
        return
    # 'data' can be present but None, so fall back to an empty list.
    for item in json.get('data') or []:
        if not item:
            continue
        title = item.get('title')
        if not title:
            continue
        # Not every titled item carries an image list.
        for position, entry in enumerate(item.get('image_list') or [], start=1):
            thumb_url = entry.get('url') if entry else None
            if not thumb_url:
                continue
            # The listed URLs point at small thumbnails; the matching large
            # image lives under 'large' instead of 'list' / 'list/190x124'.
            if '190x124' in thumb_url:
                image_url = thumb_url.replace('list/190x124', 'large')
            else:
                image_url = thumb_url.replace('list', 'large')
            yield {
                'title': title + str(position),
                'url': image_url,
            }


# Download one image to disk
def save_image(image):
    """Download ``image['url']`` into ``images/<title>.jpg``.

    Args:
        image: Mapping with a ``'title'`` (used for the file name) and a
            ``'url'`` (the image to download).

    Nothing is returned. A file that already exists is not downloaded again,
    and network failures are reported on stdout instead of being raised.
    """
    path = 'images'
    # exist_ok avoids the check-then-create race between pool workers that
    # all start with a missing directory.
    os.makedirs(path, exist_ok=True)
    # Titles come from the web and may contain characters that are invalid in
    # file names (path separators, ':', '?', ...); replace them.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', str(image.get('title')))
    file_path = '{0}/{1}.{2}'.format(path, safe_title, 'jpg')
    # Check before downloading so an existing file costs no network traffic.
    if os.path.exists(file_path):
        print('已经下载', file_path)
        return
    try:
        response = requests.get(image.get('url'), timeout=10)
    except requests.RequestException:
        # Covers connection errors as well as timeouts and invalid URLs.
        print('下载出错')
        return
    if response.status_code == 200:
        print('...正在下载', file_path)
        with open(file_path, 'wb') as f:
            f.write(response.content)


# Store one image record in MongoDB
def save_to_mongo(image):
    """Insert ``image`` into the ``MONGO_TABLE`` collection.

    Args:
        image: The record (a dict) to store. PyMongo adds an ``'_id'`` key to
            it when the insert succeeds.

    Returns:
        ``True`` when the document was inserted, ``False`` when the driver
        reported an error or no id was produced.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is its replacement for a single document.
        result = db[MONGO_TABLE].insert_one(image)
    except pymongo.errors.PyMongoError as exc:
        print('存储到MongoDB失败---', image.get('title'), exc)
        return False
    if result.inserted_id is not None:
        print('存储到MongoDB成功---', image.get('title'))
        return True
    return False


def main(offset):
    """Fetch one search-result page and persist every image found on it.

    Args:
        offset: Result offset of the page to process; passed straight to
            ``get_page_index`` together with ``KEY_WORD``.
    """
    page = get_page_index(offset, KEY_WORD)
    # parse_index_page is a generator function: the object it returns is
    # always truthy and simply yields nothing for a missing page, so no
    # truthiness check is needed before iterating.
    for image in parse_index_page(page):
        save_image(image)
        save_to_mongo(image)


if __name__ == '__main__':
    # One task per result page: offsets 20 * GROUP_START .. 20 * (GROUP_END - 1).
    groups = [i * 20 for i in range(GROUP_START, GROUP_END)]
    # The context manager shuts the worker processes down even when map()
    # raises.
    with Pool() as pool:
        try:
            pool.map(main, groups)
        except Exception as exc:
            # Catch Exception rather than BaseException so Ctrl-C and
            # SystemExit still stop the program, and show the real cause.
            print('没有对应信息', exc)

代码的注释比较少，不过结合参考的视频和博客应该足以理解。本篇博客主要记录前面两个资料没有提到的几个坑点：

1、如果返回的数据中 'data' 为 None，说明当前使用的 cookie 暂时被禁用了，需要换一个 cookie，否则获取不到数据。

2、MongoDB 需要自行安装。存入的数据不会在程序中显示出来，需要自己打开 MongoDB 才能看到对应的记录。

3、视频中是从详情页提取图片，但尝试多次后都未能从详情页得到图片数据，因此直接在主页面提取。可是主页面显示的都是缩略图，因此在真正保存图片前，记得转换到大图的地址。

posted @ 2020-06-07 16:00 已是夕阳，不如放下 阅读(366) 评论(0) 收藏 举报

刷新页面返回顶部

放下

头条 街拍

公告

头条街拍