Home-made little scripts for Alibaba
A small trick for scraping sites that require login. The principle: these sites keep you logged in through cookies, so reusing a valid cookie from a logged-in browser session lets a script act as if it were logged in.
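Every script below follows the same recipe: log in with a browser, copy the raw Cookie header out of the developer tools, and replay it with requests. A minimal sketch of the idea (the URL is a placeholder, not a real endpoint):

#encoding=utf-8
import requests

# paste the raw Cookie string copied from your browser's developer tools
cookies = {'cookie': 'your cookie here'}
headers = {'user-agent': 'Mozilla/5.0'}

# placeholder URL standing in for any page that normally requires login
resp = requests.get('https://example.com/private/data.json',
                    cookies=cookies, headers=headers, timeout=10)
print(resp.status_code)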
Although I mostly tinker with Baidu SEO, I also do e-commerce, in particular B2B on the Alibaba.com international station. To cover some needs at work I wrote a few small scripts that scrape data from the Alibaba international station backend. Backing them up here.
Scraping "My Keywords"
#encoding=utf-8
import requests

op_csv = open('myword.csv', 'a')
op_csv.write('keyword,impressions,clicks\n')
for page in range(1, 1001):
    # the URL (the ctoken in particular) will need updating for your own account
    url = 'https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getKeywordsAndHasP4P&ctoken=cypr24i30ehf&statisticsType=month&orderBy=sumShowCnt&orderModel=desc&pageSize=10&pageNO=%d' % page
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
    cookies = {'cookie': 'your cookie here'}
    html = requests.post(url=url, cookies=cookies, headers=headers, timeout=10)
    rows = html.json()['value']['keywords']['data']
    for row in rows:
        print row['keyword'], row['sumShowCnt'], row['sumClickCnt']
        op_csv.write('%s,%s,%s\n' % (row['keyword'], row['sumShowCnt'], row['sumClickCnt']))
op_csv.close()
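One caveat with writing CSV by hand: search keywords can themselves contain commas, which silently corrupts the columns. A minimal sketch of the same output using the standard csv module instead (column names are my translations, the data row is made up):

#encoding=utf-8
import csv

# csv.writer quotes fields automatically, so commas inside keywords stay intact
f = open('myword.csv', 'ab')  # binary mode for Python 2's csv module
writer = csv.writer(f)
writer.writerow(['keyword', 'impressions', 'clicks'])
writer.writerow(['hair, human', 12345, 678])  # note the comma inside the keyword
f.close()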
Scraping visitor search terms
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack so non-ASCII keywords can be written to the file
import requests

op_csv = open('visitword.csv', 'a')
for page in range(1, 301):
    # adjust startDate/endDate (and the ctoken) to your own account and time window
    url = 'https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getVisitors&ctoken=cypr24i30ehf&pageSize=10&pageNO=%d&startDate=2016-03-26&endDate=2016-04-24' % page
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
    cookies = {'cookie': 'your cookie here'}
    html = requests.post(url=url, cookies=cookies, headers=headers, timeout=10)
    rows = html.json()['value']['data']
    for row in rows:
        print row['serKeywords']
        op_csv.write('%s\n' % row['serKeywords'])
op_csv.close()
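The startDate/endDate in that URL are hard-coded to one month in 2016; a small sketch for computing a rolling 30-day window instead (parameter names taken from the URL above):

#encoding=utf-8
import datetime

end = datetime.date.today()
start = end - datetime.timedelta(days=30)
# isoformat() gives YYYY-MM-DD, the same format the URL expects
print('startDate=%s&endDate=%s' % (start.isoformat(), end.isoformat()))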
Scraping inquiry keywords
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack for writing non-ASCII keywords
import requests

op_csv = open('p4p.csv', 'a')
op_csv.write('keyword,p4pViews,p4pClicks,views,clicks\n')
for page in range(1, 20):
    url = 'http://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getEffectiveProductsAndStats&ctoken=d1uvlnsn7bj3&statisticsType=month&pageNO=%d' % page
    cookies = {'Cookie': 'your cookie here'}
    html = requests.get(url=url, cookies=cookies, timeout=10)
    products = html.json()['value']['products']['data']
    for product in products:
        # each product carries a list of keywords with their P4P and organic stats
        for kw in product['keywordEffect']:
            row = (kw['keyword'], kw['p4pViews'], kw['p4pClicks'], kw['views'], kw['clicks'])
            print '%s,%s,%s,%s,%s' % row
            op_csv.write('%s,%s,%s,%s,%s\n' % row)
op_csv.close()
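One failure mode worth knowing: once the cookie expires, the endpoint still returns JSON but without the expected keys, and the chained lookups above die with a KeyError. A small defensive sketch (key names as in the script above):

#encoding=utf-8
def extract_products(datajson):
    # walk value -> products -> data defensively instead of chained [] lookups
    products = datajson.get('value', {}).get('products', {}).get('data')
    if products is None:
        raise SystemExit('unexpected response - the cookie has probably expired')
    return products

print(extract_products({'value': {'products': {'data': []}}}))  # prints []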
Scraping hot search keywords
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack for non-ASCII output
import requests

cook = {'cookie': 'your cookie here'}
# one header column per field; the original header only named 4 of the 15 values
fields = ['keywords', 'company_cnt', 'showwin_cnt',
          'srh_pv_last_10mon', 'srh_pv_last_11mon', 'srh_pv_last_1mon',
          'srh_pv_last_2mon', 'srh_pv_last_3mon', 'srh_pv_last_4mon',
          'srh_pv_last_5mon', 'srh_pv_last_6mon', 'srh_pv_last_7mon',
          'srh_pv_last_8mon', 'srh_pv_last_9mon', 'srh_pv_this_mon']
op_csv = open('aliword.csv', 'a')
op_csv.write(','.join(fields) + '\n')
for page in xrange(1, 501):
    # keywords=hair is the seed keyword; swap in your own, and update the ctoken
    url = 'http://hz.my.data.alibaba.com/industry/.json?action=CommonAction&iName=searchKeywords&0.6011645244434476&ctoken=1ek_faad2506u&keywords=hair&orderBy=srh_pv_this_mon&orderModel=desc&pageSize=10&pageNO=%d' % page
    rows = requests.get(url, cookies=cook, timeout=10).json()['value']['data']
    for each in rows:
        line = ','.join(str(each[f]) for f in fields)
        print line
        op_csv.write(line + '\n')
op_csv.close()
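Pulling 500 pages in a tight loop is a good way to get throttled or have the session dropped. A hedged sketch of a politeness wrapper with a delay and simple back-off (the numbers are arbitrary, tune to taste):

#encoding=utf-8
import time
import requests

def fetch(url, cookies, retries=3, delay=1.0):
    # pause between attempts and back off a little more on each failure
    for attempt in range(retries):
        try:
            resp = requests.get(url, cookies=cookies, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))
    raise SystemExit('giving up on %s' % url)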
Scraping P4P bidding keywords
#encoding=utf-8
import requests

op_csv = open('okey.csv', 'a')
op_csv.write('keyword,status,search volume\n')
for page in xrange(1, 190):
    # the json= parameter is URL-encoded JSON; only the page number changes per request
    url = 'http://www2.alibaba.com/asyGetAdKeyword.do?_t=1440205849059&cmd=showTable&ctoken=dnb2amfj9a86&json=%7B%22count%22%3A50%2C%22date%22%3A%227%22%2C%22delayShow%22%3Afalse%2C%22page%22%3A' + str(page) + '%2C%22recStrategy%22%3A1%2C%22recType%22%3A%22recommend%22%2C%22sort%22%3A%22asc%22%2C%22sortKey%22%3A%22keyword%22%2C%22tagId%22%3A%2250191900149%22%7D&_csrf_token_=14z1d7pfbefjg'
    cook = {'cookie': 'your cookie here'}
    data = requests.post(url=url, cookies=cook, timeout=10).json()
    for item in data['keywords']:
        word = item['adKeywordDO']['word']
        status = item['adKeywordDO']['status']
        search = item['search']
        print '%s,%s,%s' % (word, status, search)
        op_csv.write('%s,%s,%s\n' % (word, status, search))
op_csv.close()
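The json= blob in that URL is just URL-encoded JSON, so rather than splicing the page number into a wall of %22 escapes you can build it. A sketch using the same fields that appear in the URL above (Python 2's urllib.quote; key order may come out differently, which a JSON-parsing endpoint should not care about):

#encoding=utf-8
import json
import urllib

payload = {'count': 50, 'date': '7', 'delayShow': False, 'page': 1,
           'recStrategy': 1, 'recType': 'recommend', 'sort': 'asc',
           'sortKey': 'keyword', 'tagId': '50191900149'}
# compact separators, then percent-encode for use as a query parameter
print('json=' + urllib.quote(json.dumps(payload, separators=(',', ':'))))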
Congratulations on reading all the way to the end. Friendly tip: Baidu Fengchao (Baidu's PPC backend) can be scraped the same way!