爬虫经验记录
1. 最近想解决一个把s.click.taobao这种淘宝客链接转成detail.tmall.com/item.htm?id=xxx这种商品链接的问题。从浏览器上看,打开一个smzdm上的商品链接,从站内第一个请求到最后打开淘宝最终商品页,一共有4个请求。
第一个:站内请求 https://go.smzdm.com/53a152cd8d34f672/ca_aa_yh_191_53256422_10720_2437_199_0 ,返回的是一个网页,主要执行了一段js。
curl 'https://go.smzdm.com/53a152cd8d34f672/ca_aa_yh_191_53256422_10720_2437_199_0' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7' \
-H 'Connection: keep-alive' \
-H 'Cookie: smzdm_user_source=C23F486F6EF6FB431F3C29669D184807; shequ_pc_sug=b; _ga_271744817=GS1.1.1620825416.14.1.1620825784.0; _ga_271701614=GS1.1.1620838124.2.1.1620838126.0; device_id=66633830416213410837136330b05d6e4a075096605bc086241135855; homepage_sug=a; __gpi=00000000-0000-0000-0000-000000000000:c216ZG0uY29t:Lw==; r_sort_type=score; __jsluid_s=aef7e938f13c5329e8a598ce8806f4ed; _ga=GA1.2.1581842807.1585658612; _ga_09SRZM2FDD=GS1.1.1636644634.253.1.1636646572.0; __gads=ID=d77833754a98ff3f:T=1628263793:S=ALNI_MaTwZEvwytYuPKBY6iFcoqtOsSljg; __ckguid=9ls1p5pewF75yH7oA5t7R4; Hm_lvt_9b7ac3d38f30fe89ff0b8a0546904e58=1651347077; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229767684078%22%2C%22first_id%22%3A%2217325368a01ac6-0a1d4c8d407fdc-31617402-2007040-17325368a02ab3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_landing_page%22%3A%22https%3A%2F%2Fwww.smzdm.com%2F%22%7D%2C%22%24device_id%22%3A%2217325368a01ac6-0a1d4c8d407fdc-31617402-2007040-17325368a02ab3%22%7D; s_his=dsds%2C%E9%A5%BA%E5%AD%90%2C%E6%A4%B0%E6%A0%91%2C%E6%B0%B4%E9%A5%BA; ss_ab=ss49; _zdmA.uid=ZDMA.HdRKKE5ZT.1652802072.2419200; Hm_lpvt_9b7ac3d38f30fe89ff0b8a0546904e58=1652802073' \
-H 'Referer: https://www.smzdm.com/' \
-H 'Sec-Fetch-Dest: document' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-Site: same-site' \
-H 'Sec-Fetch-User: ?1' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36' \
-H 'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--compressed
----------------------------
200
<!DOCTYPE html>
<html>
<head>
<title>正在跳转至购买页面</title>
<meta charset="utf-8" />
<meta http-equiv="pragma" content="no-cache" />
<noscript><meta http-equiv="refresh" content="0; url=/"></noscript>
<script>
var from_url = document.referrer.toLowerCase();
if (from_url.indexOf('www.baidu.com')>0 || from_url.indexOf('www.google.')>0 || from_url.indexOf('.bing.com')>0
|| from_url.indexOf('www.sogou.com') >0|| from_url.indexOf('www.soso.com')>0 || from_url.indexOf('www.so.com') >0
|| from_url.indexOf('.yahoo.com')>0 || from_url.indexOf('www.jike.com')>0) {
window.location.replace("http://www.smzdm.com");
}
eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--){d[e(c)]=k[c]||e(c)}k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--){if(k[c]){p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c])}}return p}('4 K(D){3 z,1k=16 X("(^| )"+D+"=([^;]*)(;|$)");5(z=f.W.2i(1k)){9 28(z[2])}1a{9\'\'}}4 u(B,1h,1n,u){9 B.1g(0,1h-1)+u+B.1g(1n,B.Q)}3 G=7.C.F;3 b=K("b");5(b!=\'\'){b=20("("+b+")");q=b.q;v=b.v;5(G.I(q+"/"+v)<0){1A=u(G,26,2d,q+"/"+v);7.C.1I=1A}}(4(){3 1J=K(\'1z\');3 z=1J.1M(\'|\');3 L=o;3 1d=o;4 13(){5(L)9;L=j;5(1d){3 k=f.R(\'29\');k.S=1y;k.2h.1X=\'2e\';f.19.23(k);7.1v(4(){f.19.21(k);7.C.F=h},1c)}1a{7.C.F=h}}(4(g){3 p=g.T,n=g.D,w=7,d=f,s=\'22\',x=O,y=O;5(25(w[\'18\'])!==\'27\'){9 o}w[\'18\']=n;w[n]=w[n]||4(a){9 4(){(w[n].15=w[n].15||[]).2b([a,2c])}};3 A=[\'2f\',\'1D\',\'1Y\',\'1b\',\'2g\',\'2a\',\'1Z\',\'1U\',\'1L\',\'1O\',\'1K\',\'1P\',\'1S\',\'1T\',\'1C\',\'1W\',\'1Q\',\'1V\',\'1R\'];1N(3 i=0;i<A.Q;i++){w[n][A[i]]=w[n].2j(O,A[i])}5(!w[n].2S){x=d.R(s),y=d.2P(s)[0];x.2L=1;x.S=p;x.2T(\'2M\',\'2N-8\');w[n].g=g;y.2Q.2R(x,y)}})({T:\'c://U.N.6/V/Y/E/P-Z-11@1.14.24/2U.17.E\',2V:\'c://U.N.6/V/Y/E/P-Z-11@1.14.24/2W.17.E\',D:\'l\',2X:\'c://2O-2K.N.6/P?2w=2I\',2l:o,2m:{2J:j,2n:j,2o:j,2p:j}});4 r(12){3 10=16 X(\'(^|;)[ ]*\'+12+\'=([^;]*)\'),M=10.2q(f.W);9 M?2r(M[2]):\'\'}l.1b({2s:\'2t\',2u:r(\'2k\'),2v:r(\'1z\'),2x:1E.1G,2y:7.2z||\'被屏蔽了\',2A:\'\'});3 J=r(\'2B\');J&&l.1C(J);l.1D(\'2C\');h=\'c://s.1w.1f.6/t?e=m%1i%1j%1x%2F%1l%1e%1m%1o&1p=1q:1r@1s@1t@1u\';3 
H=1E.1G.2D();5(H.I(\'1F/2E\')>0||H.I(\'1F/2G\')>0){h=h.1I(\'c://1H.1B.6\',\'2H://1H.1B.6\')}1y=\'c://s.1w.1f.6/t?e=m%1i%1j%1x%2F%1l%1e%1m%1o&1p=1q:1r@1s@1t@1u\';1v(13,1c)})()',62,184,'|||var|function|if|com|window||return||zdm_track_info|https|||document|para|smzdmhref||true|ifr|sensors|||false||source|getSensorsCookie|||changeStr|channel||||arr|ifs|allstr|location|name|js|href|this_url|uaStr|indexOf|sensorsSmzdmId|getCookie|redirected|cookieMatch|smzdm|null|sa|length|createElement|src|sdk_url|res|resources|cookie|RegExp|public|sdk|cookiePattern|javascript|cookieName|redirect||_q|new|min|sensorsDataAnalytic201505|body|else|registerPage|1000|is_amazon|2FOwDMfXFgMfhIjP5Uhv22UTqMbFL|taobao|substring|start|3D2|26s|reg|2FivxmNmeo|2FUcdXzDmTpDqNxMoB6dRU2TtazxgxdTc00KD8|end|3D|union_lens|lensId|TAPI|1652802694|2133cdb8_0857_180d2b7ac05_637b|01|setTimeout|click|3DtNQUaBJH2S1w4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDqZisqmwqXfxxq3IhSJN6GSbBjBh3LodcDtMDMUFdHxsdRWpt5EofIXZYTh5WpEozYVwhIsr1DlAgb9ZeSZXs4pzwnsamUFXrlNyYzMa7ZkMOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc|smzdmhref1|user|go_url|linkstars|login|quick|navigator|chrome|userAgent|www|replace|cookie_user|incrementProfile|setOnceProfile|split|for|appendProfile|deleteProfile|trackLink|getAppStatus|unsetProfile|identify|setProfile|clearAllRegister|logout|display|register|trackAbtest|eval|removeChild|script|appendChild||typeof||undefined|unescape|iframe|trackSignup|push|arguments|30|none|track|registerOnce|style|match|call|device_id|show_log|preset_properties|title|latest_referrer_host|latest_landing_page|exec|decodeURIComponent|platForm|PC|deviceid|userid|project|useragent|ad_block_status|adBlockStatus|pc_abtest_collection|smzdm_id|autoTrack|toLowerCase|53||54|http|production|url|import|async|charset|UTF|shence|getElementsByTagName|parentNode|insertBefore|_t|setAttribute|sensorsdata|heatmap_url|heatmap|server_url'.split('|'),0,{}))
</script>
</head>
</html>
第二个请求访问淘宝,应该是第一个请求的response里的js执行跳到了这个链接,然后返回了一张网页,也是一段js。里面有个real_jump_address是再次跳转的地址,需要提取。
-H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \ -H 'accept-language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7' \ -H 'cookie: t=1faf610294f67554477762cc35d7eb7a; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; _tb_token_=e0e9e53bee73; _samesite_flag_=true; cookie2=1e61304530f3b7850cfc557b199fbe85; cna=wZUDF+WZlyoCAdpIf/QFvM0W; miid=619006313101642899; lgc=zjhgx163; cancelledSubSites=empty; dnk=zjhgx163; tracknick=zjhgx163; tkmb=e=VDsC305EZlr-AUPYJvYe3BevvGA4rTDc1n2u1uaGfFSncaqCdc-OabALjOSSyVAWTlizbGXrYGyL4MfOiC6uuAAwQNP5NSRdW3TS4w0IXZQM1m4JopoW_YcCv4jF85eEUjkSYW8Ga_52OicW_Jc8Os7yWM1UEh5eiL13Vh1U7Zn1XxJM-pR8lf1SarTXhIOTsgIpc1WFZiKxlOPDCqdXehEXQ1TAz9Rb10txVpCzxeC39G0lAxWYErIWxEn0wFIbqvXBlx7F-Z8XLw7xEshRs_NJFx3oe4S3wtPo__x-nYOVgHe22V1bq8C_-VjCkqPmXbgU7qleLxS7RlnsLpjp8gMM1rpiz38uyJQMYFKz9HgmLnqs22wSPGpGQI2Hvc4Cs8Zz5S6s_cDEqcXIV9oBnA&iv=0&et=1650541213&tk_cps_param=25282911&tkFlag=0&tk_cps_ut=2; enc=AYBmwqhldmZ2AAAAAHAK4uEBSEv9LAY9HP39%2FWdUdf0Bcyi55%2BrlDHJT%2BPZFFHa%2BuB7S9zlWgy2VPUcicQ6iES4D; v=0; sgcookie=E100dCHES2pHjjuZhnkptXb%2BoY6sueenIfLEsjYjWtNwDx8uTW6uo04o9exwqsR0gZYySjGUm3zb8A1AdWUlX3TOHDk2FlGPuvxXJedEL2X3tyqnjv93BnUPVAFgYmb%2FeI%2BK; uc3=vt3=F8dCvC6FmZSUjX1kOT8%3D&nk2=GcInnL%2FNB4A%3D&id2=VyUOV0TccOw%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; csg=275cd3b3; skt=f4d8e9b40b788fcc; existShop=MTY1MjA0MjIwOA%3D%3D; uc4=nk4=0%40GwsRJ8LwrreIzhlhb%2Fsyc%2B6x7Q%3D%3D&id4=0%40VXJ%2FGZFht1tAgkRGpV8BBABGRQ%3D%3D; _cc_=U%2BGCWk%2F7og%3D%3D; lLtC1_=1; mt=ci=-1_0; xlly_s=1; _m_h5_tk=8ec5cffc7caff1766bd312ef248cb13c_1652729272796; _m_h5_tk_enc=f39127cd52e9928460f0b02c8a8813a1; uc1=cookie14=UoexMNM4TVkFdg%3D%3D&pas=0&cookie21=WqG3DMC9Fb5mPLIQo9kR&cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&existShop=false; isg=BNPTDhHktXiC2kRZYVrvQHZeYlH9iGdKvriVgIXwNfIpBPOmDFtWmhkVPnRqpL9C; l=eBSVfi8ROw59CAUZBOfwourza77OjIRAguPzaNbMiOCPOHfp5cgdW6fqhAY9CnGVh6-yJ3-4a-MbBeYBc_C-nxvtIosM_QMmn; 
tfstk=cujfB211j1Iztx4Z0twzbTVI-dK1CMFWCrOVh7t7ToxqZqG2ac1DjDRQTgbi1hJ9F' \ -H 'referer: https://go.smzdm.com/' \ -H 'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"' \ -H 'sec-ch-ua-mobile: ?0' \ -H 'sec-ch-ua-platform: "macOS"' \ -H 'sec-fetch-dest: document' \ -H 'sec-fetch-mode: navigate' \ -H 'sec-fetch-site: cross-site' \ -H 'upgrade-insecure-requests: 1' \ -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36' \ --compressed
----------------------
200
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xmlns:th="http://www.w3.org/1999/xhtml"> <head> <meta content="always" name="referrer"/> </head> <body> <script th:inline="javascript"> /*<![CDATA[*/ function bol(){ if (top.location != self.location) { return false; } var real_jump_address = 'https://s.click.taobao.com/t?e=m%3D2%26s%3DtNQUaBJH2S1w4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDqZisqmwqXfxxq3IhSJN6GSbBjBh3LodcDtMDMUFdHxsdRWpt5EofIXZYTh5WpEozYVwhIsr1DlAgb9ZeSZXs4pzwnsamUFXrlNyYzMa7ZkMOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc%2F%2FivxmNmeo%2FOwDMfXFgMfhIjP5Uhv22UTqMbFL%2FUcdXzDmTpDqNxMoB6dRU2TtazxgxdTc00KD8%3D&union_lens=lensId:TAPI@1652802694@2133cdb8_0857_180d2b7ac05_637b@01&ref=https%3A%2F%2Fgo.smzdm.com%2F&et=SN%2FRV3mX3t%2BBsyYAu9%2FmSP5qcqKU7Y2J' if (!window.attachEvent) { document.write('<input style="display:none" type="button" id="exe" value="" onclick="window.location=\'' + real_jump_address + '\'">'); document.getElementById('exe').click(); } else { document.write('<a style="display:none" href="' + real_jump_address + '" id="exe"></a>'); document.getElementById('exe').click(); } }//end of bol() bol(); /*]]>*/ </script> </body> </html>
第三个:(返回302,跳转)
curl 'https://s.click.taobao.com/t?e=m%3D2%26s%3DQDvntgZuL4Jw4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDmYxvWnNml8B1aH1Hk3GeOibBjBh3LodcDtMDMUFdHxsbz7PH9xAGK3Suo0EwBvBwoVwhIsr1DlAgb9ZeSZXs4pzwnsamUFXrlNyYzMa7ZkMOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc%2F%2FivxmNmeo%2FOwDMfXFgMfhIjP5Uhv22UTqMbFL%2FUcdXzDmTpDqNxMoB6dRU2TtazxgxdTc00KD8%3D&union_lens=lensId:TAPI@1652802106@2133cdb8_0857_180d2aeb33a_9ad3@01&ref=https%3A%2F%2Fgo.smzdm.com%2F&et=ouM79o4p%2FYuQW5qhN1%2F8oPbtaObEJknZ' \ -H 'authority: s.click.taobao.com' \ -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \ -H 'accept-language: zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7' \ -H 'cookie: t=1faf610294f67554477762cc35d7eb7a; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; _tb_token_=e0e9e53bee73; _samesite_flag_=true; cookie2=1e61304530f3b7850cfc557b199fbe85; cna=wZUDF+WZlyoCAdpIf/QFvM0W; miid=619006313101642899; lgc=zjhgx163; cancelledSubSites=empty; dnk=zjhgx163; tracknick=zjhgx163; tkmb=e=VDsC305EZlr-AUPYJvYe3BevvGA4rTDc1n2u1uaGfFSncaqCdc-OabALjOSSyVAWTlizbGXrYGyL4MfOiC6uuAAwQNP5NSRdW3TS4w0IXZQM1m4JopoW_YcCv4jF85eEUjkSYW8Ga_52OicW_Jc8Os7yWM1UEh5eiL13Vh1U7Zn1XxJM-pR8lf1SarTXhIOTsgIpc1WFZiKxlOPDCqdXehEXQ1TAz9Rb10txVpCzxeC39G0lAxWYErIWxEn0wFIbqvXBlx7F-Z8XLw7xEshRs_NJFx3oe4S3wtPo__x-nYOVgHe22V1bq8C_-VjCkqPmXbgU7qleLxS7RlnsLpjp8gMM1rpiz38uyJQMYFKz9HgmLnqs22wSPGpGQI2Hvc4Cs8Zz5S6s_cDEqcXIV9oBnA&iv=0&et=1650541213&tk_cps_param=25282911&tkFlag=0&tk_cps_ut=2; enc=AYBmwqhldmZ2AAAAAHAK4uEBSEv9LAY9HP39%2FWdUdf0Bcyi55%2BrlDHJT%2BPZFFHa%2BuB7S9zlWgy2VPUcicQ6iES4D; v=0; sgcookie=E100dCHES2pHjjuZhnkptXb%2BoY6sueenIfLEsjYjWtNwDx8uTW6uo04o9exwqsR0gZYySjGUm3zb8A1AdWUlX3TOHDk2FlGPuvxXJedEL2X3tyqnjv93BnUPVAFgYmb%2FeI%2BK; uc3=vt3=F8dCvC6FmZSUjX1kOT8%3D&nk2=GcInnL%2FNB4A%3D&id2=VyUOV0TccOw%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D; csg=275cd3b3; skt=f4d8e9b40b788fcc; existShop=MTY1MjA0MjIwOA%3D%3D; 
uc4=nk4=0%40GwsRJ8LwrreIzhlhb%2Fsyc%2B6x7Q%3D%3D&id4=0%40VXJ%2FGZFht1tAgkRGpV8BBABGRQ%3D%3D; _cc_=U%2BGCWk%2F7og%3D%3D; lLtC1_=1; mt=ci=-1_0; xlly_s=1; _m_h5_tk=8ec5cffc7caff1766bd312ef248cb13c_1652729272796; _m_h5_tk_enc=f39127cd52e9928460f0b02c8a8813a1; uc1=cookie14=UoexMNM4TVkFdg%3D%3D&pas=0&cookie21=WqG3DMC9Fb5mPLIQo9kR&cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&existShop=false; isg=BNPTDhHktXiC2kRZYVrvQHZeYlH9iGdKvriVgIXwNfIpBPOmDFtWmhkVPnRqpL9C; l=eBSVfi8ROw59CAUZBOfwourza77OjIRAguPzaNbMiOCPOHfp5cgdW6fqhAY9CnGVh6-yJ3-4a-MbBeYBc_C-nxvtIosM_QMmn; tfstk=cujfB211j1Iztx4Z0twzbTVI-dK1CMFWCrOVh7t7ToxqZqG2ac1DjDRQTgbi1hJ9F' \ -H 'referer: https://s.click.taobao.com/t?e=m%3D2%26s%3DQDvntgZuL4Jw4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDmYxvWnNml8B1aH1Hk3GeOibBjBh3LodcDtMDMUFdHxsbz7PH9xAGK3Suo0EwBvBwoVwhIsr1DlAgb9ZeSZXs4pzwnsamUFXrlNyYzMa7ZkMOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc%2F%2FivxmNmeo%2FOwDMfXFgMfhIjP5Uhv22UTqMbFL%2FUcdXzDmTpDqNxMoB6dRU2TtazxgxdTc00KD8%3D&union_lens=lensId:TAPI@1652802106@2133cdb8_0857_180d2aeb33a_9ad3@01' \ -H 'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"' \ -H 'sec-ch-ua-mobile: ?0' \ -H 'sec-ch-ua-platform: "macOS"' \ -H 'sec-fetch-dest: document' \ -H 'sec-fetch-mode: navigate' \ -H 'sec-fetch-site: same-origin' \ -H 'upgrade-insecure-requests: 1' \ -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36' \ --compressed
----------------
HTTP/2 302 Found date: Tue, 17 May 2022 15:51:35 GMT content-length: 0 location: https://detail.tmall.com/item.htm?id=557059017425&ali_trackid=2:mm_25282911_3455987_108817150012:1652802695_095_1881908582&union_lens=lensId:TAPI@1652802694@2133cdb8_0857_180d2b7ac05_637b@01;recoveryid:1652802695_095_1881908582&ak=23340247&bxsign=tbkeQmghKs3aVX8yjS1INBkEEQxFGGMGguqcuWfFo2ouYwE15upeBXwmAbKMrVxbol0Wobk0KQJcUo/kcarKnz3xI99YVH9CMfgBTuDwynQ/qU= server: Tengine x-application-context: union-cps-httptrace:7001 set-cookie: lLtC1_=1;Domain=taobao.com;Path=/;Max-Age=86400;Expires=Wed, 18-May-2022 15:51:35 GMT;Secure;SameSite=None eagleeye-traceid-daily: 212d482716528026956285405e075b eagleeye-traceid: 212d482716528026956285405e075b strict-transport-security: max-age=0 timing-allow-origin: * X-Firefox-Spdy: h2
最后一个请求,跳到需要的最终商品页面
curl 'https://detail.tmall.com/item.htm?id=557059017425&ali_trackid=2:mm_25282911_3455987_108817150012:1652802695_095_1881908582&union_lens=lensId:TAPI@1652802694@2133cdb8_0857_180d2b7ac05_637b@01;recoveryid:1652802695_095_1881908582&ak=23340247&bxsign=tbkeQmghKs3aVX8yjS1INBkEEQxFGGMGguqcuWfFo2ouYwE15upeBXwmAbKMrVxbol0Wobk0KQJcUo/kcarKnz3xI99YVH9CMfgBTuDwynQ/qU=' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br' -H 'Referer: https://s.click.taobao.com/t?e=m%3D2%26s%3DtNQUaBJH2S1w4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDqZisqmwqXfxxq3IhSJN6GSbBjBh3LodcDtMDMUFdHxsdRWpt5EofIXZYTh5WpEozYVwhIsr1DlAgb9ZeSZXs4pzwnsamUFXrlNyYzMa7ZkMOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc%2F%2FivxmNmeo%2FOwDMfXFgMfhIjP5Uhv22UTqMbFL%2FUcdXzDmTpDqNxMoB6dRU2TtazxgxdTc00KD8%3D&union_lens=lensId:TAPI@1652802694@2133cdb8_0857_180d2b7ac05_637b@01' -H 'Connection: keep-alive' -H 'Cookie: cna=fopiF6kJJRYCAX14QZtmdu0U; isg=BGhowJq2noiaY49JzsSUOTqkOlB6kcybgNMa8SKZwOPWfQjnyqMgK_f0dZ2N1oRz; pnm_cku822=098%23E1hvXvvUvbZvjQCkvvvvvjiWRLqw0j1HRsSU6jrCPmPW0jEvPsFpAjYPn2MOzjyRvpvhvv2MMQvCvvXvppvvvvmgvpvIphvvXPMMpbYvpv9SvvC2MZCvjvvvvhpyphvwvvvvBHavpCQvvvChxg9Cvva2pQQWeC3QuvhvmhCvCvEI6u%2F0mvhvLhE4QQmFejyyX9nr1EkKfvyf8%2BBl5FGDN5HVafmAdcHCjLPClfy64v6f8%2BBlDCODN5HvaNoXe7%2BRVAll%2Bb8rakrYVVzyd3WDN5Hvafvgvpvhphvvv8OCvvBvpvpZ; tfstk=cizNBuXHFq2Cs4u-SVgVhYCUN3MOCyHmIel-jTXGSvOMmEKKZW5DnKhH8VhEMBJmj; l=eBPe6pA4Of-HlbvJBOfCKurza779kIOYYuPzaNbMiOCPOMCe5rK1W6f0ZfYwCn1Nh6vXR3y0PpGXBeYBc6CKnxvOa6Fy_wkmn; t=08bdcb33eba17d4b8aa2e8c2a6f66847; _tb_token_=edbd8ee938be7; cookie2=1820cb3e27618bdbd55bbdef91b2acde' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'Pragma: no-cache' -H 'Cache-Control: no-cache'
从上面看来也很清楚,只要得到第3个请求response header里面的location就大功告成了。
分析的过程是这样的:
首先第一个请求返回的那段js是这样的:
这是一段加密(混淆)过的代码,由于对前端不是很熟悉,一开始不知道这种其实是eval加密,是最简单的一种加密方法,破解也很简单。把eval()中间的代码拿出来,直接console.log(代码)就能打印出来,或者把提取出来的代码赋值给一个变量,然后这个变量的值就是解密后的原代码。原因是啥?因为eval()接收的参数是个字符串,它会直接执行这个字符串里的代码。而上面eval()括号里那一大堆代码是一个立即执行的函数,它返回的正是一个字符串,eval执行了这个字符串里的代码,跳转了页面。所以把那一大堆eval中间的代码赋值给变量,就是把一个字符串赋值给了变量,而这个字符串就是原代码。我没有用console.log的方法,因为console.log返回的是undefined,而我需要把这段代码变成字符串再进行提取。
解密后原代码是这样的:
function getCookie(name) { var arr, reg = new RegExp('(^| )' + name + '=([^;]*)(;|$)'); if ((arr = document.cookie.match(reg))) { return unescape(arr[2]); } else { return ''; } } function changeStr(allstr, start, end, changeStr) { return allstr.substring(0, start - 1) + changeStr + allstr.substring(end, allstr.length); } var this_url = window.location.href; var zdm_track_info = getCookie('zdm_track_info'); if (zdm_track_info != '') { zdm_track_info = eval('(' + zdm_track_info + ')'); source = zdm_track_info.source; channel = zdm_track_info.channel; if (this_url.indexOf(source + '/' + channel) < 0) { go_url = changeStr(this_url, 26, 30, source + '/' + channel); window.location.replace = go_url; } } (function () { var cookie_user = getCookie('user'); var arr = cookie_user.split('|'); var redirected = false; var is_amazon = false; function redirect() { if (redirected) return; redirected = true; if (is_amazon) { var ifr = document.createElement('iframe'); ifr.src = smzdmhref1; ifr.style.display = 'none'; document.body.appendChild(ifr); window.setTimeout(function () { document.body.removeChild(ifr); window.location.href = smzdmhref; }, 1000); } else { window.location.href = smzdmhref; } } (function (para) { var p = para.sdk_url, n = para.name, w = window, d = document, s = 'script', x = null, y = null; if (typeof w['sensorsDataAnalytic201505'] !== 'undefined') { return false; } w['sensorsDataAnalytic201505'] = n; w[n] = w[n] || function (a) { return function () { (w[n]._q = w[n]._q || []).push([a, arguments]); }; }; var ifs = [ 'track', 'quick', 'register', 'registerPage', 'registerOnce', 'trackSignup', 'trackAbtest', 'setProfile', 'setOnceProfile', 'appendProfile', 'incrementProfile', 'deleteProfile', 'unsetProfile', 'identify', 'login', 'logout', 'trackLink', 'clearAllRegister', 'getAppStatus', ]; for (var i = 0; i < ifs.length; i++) { w[n][ifs[i]] = w[n].call(null, ifs[i]); } if (!w[n]._t) { (x = d.createElement(s)), (y = d.getElementsByTagName(s)[0]); x.async = 1; 
x.src = p; x.setAttribute('charset', 'UTF-8'); w[n].para = para; y.parentNode.insertBefore(x, y); } })({ sdk_url: 'https://res.smzdm.com/resources/public/js/sa-sdk-javascript@1.14.24/sensorsdata.min.js', heatmap_url: 'https://res.smzdm.com/resources/public/js/sa-sdk-javascript@1.14.24/heatmap.min.js', name: 'sensors', server_url: 'https://shence-import.smzdm.com/sa?project=production', show_log: false, preset_properties: { url: true, title: true, latest_referrer_host: true, latest_landing_page: true, }, }); function getSensorsCookie(cookieName) { var cookiePattern = new RegExp('(^|;)[ ]*' + cookieName + '=([^;]*)'), cookieMatch = cookiePattern.exec(document.cookie); return cookieMatch ? decodeURIComponent(cookieMatch[2]) : ''; } sensors.registerPage({ platForm: 'PC', deviceid: getSensorsCookie('device_id'), userid: getSensorsCookie('user'), useragent: navigator.userAgent, ad_block_status: window.adBlockStatus || '被屏蔽了', pc_abtest_collection: '', }); var sensorsSmzdmId = getSensorsCookie('smzdm_id'); sensorsSmzdmId && sensors.login(sensorsSmzdmId); sensors.quick('autoTrack'); smzdmhref = 'https://s.click.taobao.com/t?e=m%3D2%26s%3Ddj4fWNWd7yFw4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDiAz8G2byJMpxq3IhSJN6GSbBjBh3LodcDtMDMUFdHxsPxxZDneUbvfEAAuCZ9AQJ4VwhIsr1DlAgb9ZeSZXs4pMNTz%2FEeSSKRsNgyJzvboJOemaFM5tHHZsAmR%2Ffhd2BT6EGxfJYunyFa91WAEa7M1NzuMLUNyvdLogkofNxAhKUCAclkAE%2FM8UwKMsvWj7aROGhEUGHWZ8&union_lens=lensId:TAPI@1652514754@0b16eaa6_084a_180c18e0f99_ba3b@01'; var uaStr = navigator.userAgent.toLowerCase(); if (uaStr.indexOf('chrome/53') > 0 || uaStr.indexOf('chrome/54') > 0) { smzdmhref = smzdmhref.replace('https://www.linkstars.com', 'http://www.linkstars.com'); } smzdmhref1 = 
'https://s.click.taobao.com/t?e=m%3D2%26s%3Ddj4fWNWd7yFw4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDiAz8G2byJMpxq3IhSJN6GSbBjBh3LodcDtMDMUFdHxsPxxZDneUbvfEAAuCZ9AQJ4VwhIsr1DlAgb9ZeSZXs4pMNTz%2FEeSSKRsNgyJzvboJOemaFM5tHHZsAmR%2Ffhd2BT6EGxfJYunyFa91WAEa7M1NzuMLUNyvdLogkofNxAhKUCAclkAE%2FM8UwKMsvWj7aROGhEUGHWZ8&union_lens=lensId:TAPI@1652514754@0b16eaa6_084a_180c18e0f99_ba3b@01'; setTimeout(redirect, 1000); })();
注意到这里面有个变量smzdmhref,它保存的链接就是我们需要的,只需要把它提取出来就可以了。
昨天晚上一开始不知道这个方法,我是把那个js直接放在node下跑,抛了一个错误:window is not defined。但在错误信息上面居然把解密后的代码也打出来了,才发现的。其实我一开始是想在python下直接执行这段代码,查到了一个PyExecJS的东西,可以用python直接运行js代码。但由于默认的环境是node,window是浏览器对象,找不到。我也没试过其他的环境是不是可以执行。那怎么办呢?后来又查到了jsdom这个东西,需要在python工作目录下用npm i jsdom(即npm install jsdom)安装,这个可以模拟一个浏览器环境。在需要运行的js代码前面加上一段代码(还需要先用 const { JSDOM } = require('jsdom') 引入JSDOM),像这样:
const dom = new JSDOM( `<!DOCTYPE html><head><script src="./sensors.js" charset="UTF-8"></script></head><p>Hello world</p>`, { runScripts: 'dangerously', resources: 'usable', url: 'https://www.smzdm.com', }, ); window = dom.window; document = window.document; navigator = window.navigator; XMLHttpRequest = window.XMLHttpRequest;
-------
把一些浏览器的对象定义好就不会报错了。但我这边运行下来还是报错:navigate not implement(jsdom的原始报错信息是 "Not implemented: navigation (except hash changes)")。一查,jsdom确实还没有实现页面跳转(navigation)。这样下来用这个行不通了。
后来一想我根本不需要执行js啊,我只需要把解密后的代码取到就可以了,然后把那个url提取出来。况且就算执行成功大概率就是个url跳转的功能。
于是我把eval()中间的代码提取出来,然后赋值给一个变量b,用PyExecJS可以把这个变量的值取出来。代码是这样的:
js_code = scrapy_selector.css('script::text') if js_code: self.logger.info("js_code = %s", js_code) pattern = r'\beval\(([\w\W]*)\)\s*$' v_source = js_code.re_first(pattern) if v_source: self.logger.info("source = %s", v_source) source = "var b = " + v_source + "\n" else: self.logger.error("smzdm javascript is missing,%s", response.url) return try: ctx = execjs.compile(source) origin_script = ctx.eval('b')
这样就可以把解密后的代码origin_script给弄出来了,然后再提取那个smzdmhref就可以了。
本以为这样后面就会比较顺利了,但没想到用requests.get()发请求时,到第3个请求,总是返回一张同样有real_jump_address的页面,而不是一个302重定向的response,header里没有location。而再次请求后返回的还是real_jump_address的页面。晕了!后来以为是header、cookie的问题,被淘宝检测出问题了,在那里折腾了一晚还是不行。等过后一想,可能是发的请求有问题,因为我发第3个请求的地址是第二个请求返回js里的real_jump_address原封不动的。然后我在浏览器上实验这两者是不是一样的。打开开发者工具,把第二个请求的response里的real_jump_address拷下来,然后把第3个请求的地址拷下来对比:
[real_jump_address]
https://s.click.taobao.com/t?e=m%3D2%26s%3DWXdMQNeUv6pw4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDoeRwk68doNn79%2FTFaMDK6SbBjBh3LodcDtMDMUFdHxsM9Cqlci19F9zTzJpAUG9mYVwhIsr1DlAgb9ZeSZXs4pPsBFOfSjttxUDBS0LO4rAOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc%2F%2FivxmNmeo%2FOwDMfXFgMfhIjP5Uhv22URCsByfO3crusKF4hNjN5z4B6dRU2TtazxgxdTc00KD8%3D&amp;union_lens=lensId:TAPI@1652772919@21050ddc_0b09_180d0f1584f_7ea9@01&ref=&et=y%2FteTh4WEUtc3ucfFXVmXVkbYc7MCDvW
[third request url] https://s.click.taobao.com/t?e=m%3D2%26s%3DWXdMQNeUv6pw4vFB6t2Z2ueEDrYVVa64yK8Cckff7TXLWlSKdGSYDoeRwk68doNn79%2FTFaMDK6SbBjBh3LodcDtMDMUFdHxsM9Cqlci19F9zTzJpAUG9mYVwhIsr1DlAgb9ZeSZXs4pPsBFOfSjttxUDBS0LO4rAOemaFM5tHHYxZyjQcbVDhf4AdzT7K6Uc%2F%2FivxmNmeo%2FOwDMfXFgMfhIjP5Uhv22URCsByfO3crusKF4hNjN5z4B6dRU2TtazxgxdTc00KD8%3D&union_lens=lensId:TAPI@1652772919@21050ddc_0b09_180d0f1584f_7ea9@01&ref=&et=y%2FteTh4WEUtc3ucfFXVmXVkbYc7MCDvW
经过对比发现,response里面real_jump_address=‘https://s.…’里的地址,比实际发出的请求地址多了html的转义字符:union_lens前面的&被写成了&amp;。
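问题找到,只要把real_jump_address里的html转义字符还原(把&amp;还原成&)就可以了,代码如下(注意:HTMLParser().unescape在Python 3.9中已被移除,新版本请改用标准库的html.unescape(taobao_jump_path)):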
html_parser = HTMLParser()
taobao_jump_path = html_parser.unescape(taobao_jump_path)
重新执行代码,这下第3个请求返回的就是header里有location的response了,至此大功告成。