创意抓取及导出
# _*_ coding=utf-8 _*_
"""Scrape ad-creative records page by page and export them to Excel.

The script prompts for a session cookie, a comma-separated list of account
ids and a date range, downloads every result page for each account from the
host configured in ``headers``, flattens each creative into one row and
writes all rows to an ``.xlsx`` workbook.
"""
import math
import os
import time

import pandas as pd
import requests

# The cookie is requested at import time, exactly as the original script did,
# because ``headers`` is a module-level global shared by every request.
cookies = input('请输入Cookie:')
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Cookie': cookies,
    'Host': 'xgop.in.zhihu.com',
    'Referer': '***',
}

# Lookup tables translating numeric ids from the API into display labels.
tempmap = {6: '大图', 7: '文字链', 8: '小图', 10: '多图', 11: '视频'}
zonemap = {
    8: '知乎 APP 信息流',
    152: '知乎极速版首页',
    153: '知乎极速版回答页',
    20: '推荐阅读',
    33: '搜索',
    30: 'App问题页信息流',
}
positionmap = {1: '首页', 3: '回答页', 2: '问题页'}
osmap = {1: '安卓', 2: '苹果'}
networkmap = {1: 'wifi', 2: '2G', 3: '3G', 4: '4G'}
equipmentPricemap = {
    1: '1500元以下',
    2: '1500-2500元',
    3: '2500-3500元',
    4: '3500元-4500元',
    5: '4500元以上',
}
mobileOperatormap = {0: '中国移动', 1: '中国联通', 2: '中国电信'}
gendermap = {0: '女', 1: '男'}

# Accumulates one dict per creative across all pages and accounts.
all_data = []

# Seconds to wait for a response before giving up on a request; without a
# timeout a stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Number of creatives per result page used to derive the page count.
# NOTE(review): presumably the API's fixed page size - confirm.
PAGE_SIZE = 10

# Errors raised when an optional field is absent or has an unexpected shape.
_LOOKUP_ERRORS = (KeyError, IndexError, TypeError)


def _join_mapped(values, mapping):
    """Return the labels for *values* joined by ', ', or '不限' when empty."""
    return ', '.join([mapping.get(v, '不限, ') for v in values]) or '不限'


def _optional(getter, fallback):
    """Return ``getter()``, or *fallback* when the field is missing/malformed."""
    try:
        return getter()
    except _LOOKUP_ERRORS:
        return fallback


def _interest_or_unlimited(targeting):
    """Return the interest targeting, or '不限' when the list is empty."""
    interest = targeting['interest']
    return interest if len(interest) > 0 else '不限'


def _fill_record(record, item):
    """Populate *record* in place with the export columns taken from *item*.

    Fields are written in export-column order and nested objects are looked
    up only when first needed, so if a required field is missing the
    exception propagates and *record* keeps every column filled so far.
    """
    record['创意id'] = item['id']
    record['账户id'] = item['userId']
    record['目标类型'] = item['targetType']

    asset = item['asset']
    record['标题'] = asset['title']['value']
    record['描述'] = asset['desc']['value']
    record['图片url'] = _optional(lambda: asset['main']['url'], '')
    record['cta'] = asset['cta']['value']
    record['状态'] = item['status']
    # NOTE(review): export of the landing-page URL (item['url']) was disabled
    # in the original script and is left out here as well.
    record['创意名称'] = item['name']

    counter = item['counter']
    record['曝光'] = counter['impression']
    record['点击'] = counter['click']
    record['点击率'] = counter['clickRatio']
    # Monetary values are divided by 100 before export.
    # NOTE(review): presumably the API reports amounts in cents - confirm.
    record['点击价格'] = counter['clickPrice'] / 100
    record['花费'] = counter['cost'] / 100

    ad = item['ad']
    record['样式'] = tempmap.get(ad['templateId'])  # templateId is numeric
    record['推广开始日期'] = ad['dateBegin']
    record['产品id'] = item['productId']
    record['出价'] = ad['price'] / 100

    targeting = ad['targeting']
    record['投放平台'] = _join_mapped(targeting['os'], osmap)
    record['app行为'] = _optional(lambda: targeting['appCategory'], '无')
    record['自定义人群'] = _optional(lambda: targeting['crowd'], '无')
    record['性别'] = _join_mapped(targeting['gender'], gendermap)
    record['兴趣'] = _optional(lambda: _interest_or_unlimited(targeting), '')
    record['网络'] = _join_mapped(targeting['network'], networkmap)
    record['运营商'] = _optional(
        lambda: _join_mapped(targeting['mobileOperator'], mobileOperatormap),
        '无',
    )
    record['设备价格'] = _optional(
        lambda: _join_mapped(targeting['equipmentPrice'], equipmentPricemap),
        '无',
    )
    record['关键词'] = ', '.join(targeting['keyword'])
    record['创意展现方式'] = ad['strategy']['creative']
    record['编辑页面地址'] = '****'.format(record['账户id'], record['创意id'])
    # Zone ids are resolved through ``zonemap`` first; ``positionmap`` is kept
    # as a fallback so ids that resolved before this fix still resolve.
    record['展现位置'] = ', '.join(
        [zonemap.get(z, positionmap.get(z, '未知')) for z in ad['zoneIds']]
    )


def get_single_data(url):
    """Download one result page and append its creatives to ``all_data``.

    Args:
        url: Address of a single result page.

    Request failures and undecodable responses are reported on stdout and the
    page is skipped. A creative that cannot be fully parsed is reported too,
    and whatever columns were extracted before the failure are still kept.
    """
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        data = res.json().get('result', 0)
    except Exception as e:  # best-effort: one bad page must not stop the run
        print('异常请求链接--->' + url + str(e))
        return

    if not data:
        return

    for item in data:
        single_data = {}
        try:
            _fill_record(single_data, item)
        except Exception as e:  # best-effort: keep the partial row
            print('异常解析链接--->' + url + str(e))
        # The threshold is >= 0, so zero-spend creatives and partially parsed
        # rows (which default to 0) are included as well.
        if float(single_data.get('花费', 0)) >= 0:
            all_data.append(single_data)
    # Progress indicator: running total of collected rows.
    print(len(all_data))


def get_all_urls(userid, start_time, end_time):
    """Return the URLs of every result page for one account and date range.

    Args:
        userid: Account id; converted with ``int``.
        start_time: Range start, passed through as a string.
        end_time: Range end, passed through as a string.

    Returns:
        A list of page URLs covering pages 1 through the last page. The list
        is empty when the first page cannot be fetched or decoded.

    Raises:
        ValueError: If *userid* is not an integer literal.
    """
    base_url = '******'

    def page_url(page):
        return base_url.format(
            page=int(page),
            userid=int(userid),
            start_time=str(start_time),
            end_time=str(end_time),
        )

    first_page_url = page_url(1)
    try:
        res = requests.get(
            first_page_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        total_count = res.json()['totalCount']
    except Exception as e:  # best-effort: skip this account
        print('异常all链接--->' + first_page_url + str(e))
        # Return an empty list (not None) so callers can always iterate.
        return []

    total_page = math.ceil(total_count / PAGE_SIZE)
    # range() excludes its stop value, so +1 is required to include the last
    # page (and to yield page 1 when there is only a single page).
    return [page_url(page) for page in range(1, int(total_page) + 1)]


def main():
    """Prompt for accounts and a date range, scrape them and write the export.

    The workbook is written into a directory named after the raw ``uids``
    input, created if necessary. Nothing is exported when no ids are entered.
    """
    uids = input('请输入uids(格式:111,222,333):')
    start_time = input('请输入开始时间(格式:2018-01-01):')
    end_time = input('请输入结束时间(格式:2018-07-03):')

    if uids:
        for raw_id in uids.split(','):
            userid = raw_id.strip()
            if not userid:
                # Tolerate stray commas such as "111,,222" or "111,".
                continue
            for url in get_all_urls(userid, start_time, end_time):
                time.sleep(3)  # throttle requests between pages
                get_single_data(url)

        df1 = pd.DataFrame(all_data)
        os.makedirs(uids, exist_ok=True)
        file_name = (
            str(uids) + "-" + time.strftime("%Y%m%d%H%M") + '有消费创意' + '.xlsx'
        )
        df1.to_excel(os.path.join(uids, file_name), index=False)

    print('done')


if __name__ == '__main__':
    main()