# datawhale-Task7
# 爬取丁香园的数据并存储进数据库
import requests
from time import sleep
from sqlalchemy.orm import sessionmaker
from dxy.model import User, Subject, Comment, db_engine
from sqlalchemy.exc import IntegrityError
# Session factory bound to the project's engine; `db` is the shared
# module-level session used by save() below.
session = sessionmaker(bind=db_engine)
db = session()
def save(data):
    """Persist one ORM model instance, or a list of them, to the database.

    Each item is committed individually so that a duplicate violating a
    uniqueness constraint is skipped instead of aborting the whole batch.

    Args:
        data: a single model instance (User/Subject/Comment) or a list
            of such instances.
    """
    # Normalize to a list so single items get the same error handling
    # the original only gave to lists.
    items = data if isinstance(data, list) else [data]
    for item in items:
        try:
            db.add(item)
            db.commit()
        except IntegrityError:
            # A failed commit leaves the session in an invalid state;
            # rollback (not flush) is required before it can be reused.
            db.rollback()
def crawler(url):
    """Crawl a DXY (丁香园) forum topic and store subject, users and comments.

    Args:
        url: topic API URL template containing a ``{}`` placeholder for
            the page number.
    """
    headers = {
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'),
    }
    # Session cookies captured from a logged-in browser session.
    temp_cookies_string = ("DXY_USER_GROUP=49; __auc=f5db5ffc1693f4415a8b9b324af; _ga=GA1.2.406327704.1551544823; __"
                           "utmz=1.1551655682.5.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr="
                           "(not%20provided); __utmz=3004402.1551676197.1.1.utmcsr=(direct)|utmccn=(direct)|"
                           "utmcmd=(none); _gid=GA1.2.13027547.1551889001; JUTE_BBS_DATA=15fded56752024cd95f26a0e8df"
                           "09dabc0b65dec4e30437ed04c6b1520f500a3a8e02c6161f0a5bd48d0f3ef0959dd38b9276650ec31f4f0c1"
                           "59e419c1b97cd34c3a0891d95f2a3926ef6fb7c3b40b4a551ebbb281325a043e4082b5e123d2287015bdcf2e"
                           "4925add012fdb048e846953a845df43b4b505c; __utma=1.406327704.1551544823.1551889130.1551914"
                           "990.8; JSESSIONID=6D78A1886433974BF211D4EA7FFBDA91; __utma=3004402.406327704.1551544823."
                           "1551914699.1551963394.6; __utmc=3004402; JUTE_SESSION_ID=f4417a7d-8d0b-417d-8f62-70b7018"
                           "78879; JUTE_TOKEN=b72f68e1-a1d1-45e8-8f7f-974abcad9dc9; JUTE_SESSION=04c9d3a941888796762"
                           "cc09e4d6b56b7a4047b1d26ec72a4bbb0433ff00144852b0e659c4741b4b7151463e8a91fdd12db83bc5ecde"
                           "5622b66b04b11d64be607c44fe976b21f8170")
    cookies = _parse_cookie_string(temp_cookies_string)
    # Fetch page 1 explicitly; the original sent the raw template with a
    # literal "{}" as the page parameter.
    response = requests.get(url.format(1), headers=headers, cookies=cookies).json()
    # NOTE(review): assumes pageBean['total'] is the number of PAGES — if it
    # is actually the total item count, this should be ceil(total / size);
    # verify against the live API response.
    max_page = response['pageBean']['total']
    subject = Subject(title=response['subject'], user_id=response['items'][0]['userId'])
    save(subject)
    # Fetch every page: store each page's commenting users and comments.
    for page in range(1, max_page + 1):
        response = requests.get(url.format(page), headers=headers, cookies=cookies).json()
        users = [User(user_id=u['userId'], avatar=u['avatar'], nickname=u['nickname'])
                 for u in (item['user'] for item in response['items'])]
        comments = [Comment(user_id=item['userId'], content=item['body'], subject_id=subject.id)
                    for item in response['items']]
        save(users)
        save(comments)
        sleep(2)  # throttle requests to be polite to the server


def _parse_cookie_string(cookie_string):
    """Parse a raw ``Cookie`` header string into a dict.

    Splits each pair on the FIRST '=' only: cookie values (e.g. the
    ``__utmz`` analytics cookies above) can themselves contain '='
    characters, which the naive ``split('=')`` silently truncated.
    """
    cookies = {}
    for pair in cookie_string.split(';'):
        key, _, value = pair.strip().partition('=')
        cookies[key] = value
    return cookies
def main(url):
    """Entry point: run the crawler against the given topic URL template."""
    crawler(url)
if __name__ == '__main__':
    # Topic id 509959, 20 items per page; the crawler fills the trailing
    # '{}' placeholder with the page number.
    topic_url = ('http://3g.dxy.cn/bbs/bbsapi/mobile?'
                 's=view_topic&checkUserAction=1&with'
                 'Good=1&order=0&size=20&id=509959&page={}')
    main(topic_url)