Practical Training Assignment 7

datawhale-Task7

# Crawl DXY (丁香园) forum data and store it in the database
import requests
from time import sleep
from sqlalchemy.orm import sessionmaker
from dxy.model import User, Subject, Comment, db_engine
from sqlalchemy.exc import IntegrityError


# Session factory bound to the project's engine; db is the working session
Session = sessionmaker(bind=db_engine)
db = Session()


def save(data):
    """Persist a model instance or a list of instances, skipping records that already exist."""
    if isinstance(data, list):
        for i in data:
            try:
                db.add(i)
                db.commit()
            except IntegrityError:
                # Duplicate row: discard the failed transaction and move on
                db.rollback()
    else:
        db.add(data)
        db.commit()


def crawler(url):
    headers = {
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'),
    }
    cookies = {}
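    # Hard-coded cookies (presumably copied from a logged-in browser session); they will expire over time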
    temp_cookies_string = ("DXY_USER_GROUP=49; __auc=f5db5ffc1693f4415a8b9b324af; _ga=GA1.2.406327704.1551544823; __"
                           "utmz=1.1551655682.5.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr="
                           "(not%20provided); __utmz=3004402.1551676197.1.1.utmcsr=(direct)|utmccn=(direct)|"
                           "utmcmd=(none); _gid=GA1.2.13027547.1551889001; JUTE_BBS_DATA=15fded56752024cd95f26a0e8df"
                           "09dabc0b65dec4e30437ed04c6b1520f500a3a8e02c6161f0a5bd48d0f3ef0959dd38b9276650ec31f4f0c1"
                           "59e419c1b97cd34c3a0891d95f2a3926ef6fb7c3b40b4a551ebbb281325a043e4082b5e123d2287015bdcf2e"
                           "4925add012fdb048e846953a845df43b4b505c; __utma=1.406327704.1551544823.1551889130.1551914"
                           "990.8; JSESSIONID=6D78A1886433974BF211D4EA7FFBDA91; __utma=3004402.406327704.1551544823."
                           "1551914699.1551963394.6; __utmc=3004402; JUTE_SESSION_ID=f4417a7d-8d0b-417d-8f62-70b7018"
                           "78879; JUTE_TOKEN=b72f68e1-a1d1-45e8-8f7f-974abcad9dc9; JUTE_SESSION=04c9d3a941888796762"
                           "cc09e4d6b56b7a4047b1d26ec72a4bbb0433ff00144852b0e659c4741b4b7151463e8a91fdd12db83bc5ecde"
                           "5622b66b04b11d64be607c44fe976b21f8170")
    # Parse the raw cookie string into a dict; split on the first '=' only,
    # because cookie values (e.g. the __utmz entries) contain '=' themselves
    for i in temp_cookies_string.split(';'):
        key_value = i.strip().split('=', 1)
        cookies[key_value[0]] = key_value[1]
    # Fetch the first page to learn how many pages there are and to create the Subject record
    response = requests.get(url.format(1), headers=headers, cookies=cookies).json()
    max_page = response['pageBean']['total']
    subject = Subject(title=response['subject'], user_id=response['items'][0]['userId'])
    save(subject)
    # Walk every page, collecting comment and user records
    for i in range(1, max_page+1):
        response = requests.get(url.format(i), headers=headers, cookies=cookies).json()
        item_user = [item['user'] for item in response['items']]
        # Users who posted the comments on this page
        user = [User(user_id=item['userId'], avatar=item['avatar'], nickname=item['nickname'])
                for item in item_user]
        # The comment bodies, each linked back to the subject and its author
        body = [Comment(user_id=item['userId'], content=item['body'], subject_id=subject.id)
                for item in response['items']]
        save(user)
        save(body)
        sleep(2)  # throttle requests so we do not hammer the server


def main(url):
    crawler(url)


if __name__ == '__main__':
    url = ('http://3g.dxy.cn/bbs/bbsapi/mobile?'
           's=view_topic&checkUserAction=1&with'
           'Good=1&order=0&size=20&id=509959&page={}')
    main(url)
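
The script imports User, Subject, Comment, and db_engine from dxy.model, which is not shown in this post. Below is a minimal sketch of what that module might look like, assuming SQLAlchemy declarative models backed by a local SQLite file; the table names, column types, and connection string are assumptions, and only the constructor arguments used above are taken from the crawler code.

# dxy/model.py -- hypothetical sketch, not the original module
from sqlalchemy import create_engine, Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
db_engine = create_engine('sqlite:///dxy.db')  # assumed storage backend


class User(Base):
    __tablename__ = 'user'
    id = Column(Integer, primary_key=True)
    user_id = Column(Integer, unique=True)  # unique, so re-saving a user raises IntegrityError
    avatar = Column(String(256))
    nickname = Column(String(64))


class Subject(Base):
    __tablename__ = 'subject'
    id = Column(Integer, primary_key=True)
    title = Column(String(256))
    user_id = Column(Integer)


class Comment(Base):
    __tablename__ = 'comment'
    id = Column(Integer, primary_key=True)
    user_id = Column(Integer)
    subject_id = Column(Integer)
    content = Column(Text)


# Create the tables on first import so the crawler can write immediately
Base.metadata.create_all(db_engine)

With a unique constraint on User.user_id, saving the same commenter from several pages hits the IntegrityError branch in save() and is simply skipped instead of being inserted twice.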
