node.js爬取ajax接口数据
爬取页面数据与爬取接口数据,我还是觉得爬取接口数据更加简单一点,主要爬取一些分页的数据。
爬取步骤:
1.明确目标接口地址,举个例子 : https://www.vcg.com/api/common/searchImage?phrase=%E6%98%A5%E5%A4%A9&graphicalStyle%5B0%5D=1&page=1 网上随便找到 视觉中国的一个网址
这个网址上的图片非常好看
2.接口返回的数据都是json数据。很统一,处理起来也很便捷。撇开奇葩接口不说。
3.只需要伪造好请求头就可访问,这是重点。
接口参数是
原始数据
下面是我爬取代码
//引入自己封装的链接数据模块 let mysql = require('./connectdatabase.js'); //SuperAgent是一个轻量级、灵活的、易读的、低学习曲线的客户端请求代理模块,使用在NodeJS环境中。 let superagent = require('superagent'); var fs = require('fs'); // .set( "Accept", "application/json") // .set( "Cookie", "acw_tc=276aedea15548801849954091e3094245e35937d42d912140fa37630751eeb; clientIp=122.235.188.119; source=baidusem; sajssdk_2015_cross_new_user=1; _ga=GA1.2.109353021.1554880168; _gid=GA1.2.1994909913.1554880168; Hm_lvt_0af14db9b5993b4879812c54f6cf147d=1554880168; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216a06147ef76c7-04ba8d1655b9ab-191f7059-2073600-16a06147ef8279%22%2C%22%24device_id%22%3A%2216a06147ef76c7-04ba8d1655b9ab-191f7059-2073600-16a06147ef8279%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidusem%22%2C%22%24latest_utm_medium%22%3A%22cpc%22%2C%22%24latest_utm_campaign%22%3A%22%E7%AB%99%E5%86%85%E6%90%9C%E7%B4%A2%E8%AF%8D-%E6%A0%B8%E5%BF%83%22%2C%22%24latest_utm_content%22%3A%22%E7%AB%99%E5%86%85%E6%90%9C%E7%B4%A2%E8%AF%8D-%E5%9B%BE%E7%89%87%22%2C%22%24latest_utm_term%22%3A%22%E5%9B%BE%E7%89%87%22%7D%7D; Hm_lpvt_0af14db9b5993b4879812c54f6cf147d=1554880467") // .set( "X-Forwarded-For","122.235.188.119") // .set( "Host", "www.vcg.com") // .set( "Accept-Encoding","gzip, deflate, br") // .set( "Referer", url) function get(url) { superagent.get(url) .set( { "Accept": "application/json", "Accept-Encoding": "gzip, deflate, br", "Host": "www.vcg.com", "Referer": 'https://www.vcg.com/api/common/search?phrase=%E6%98%A5%E5%A4%A9&graphicalStyle%5B0%5D=1&page=1', "X-Forwarded-For": "122.235.188.119", "Cookie": "acw_tc=276aedea15548801849954091e3094245e35937d42d912140fa37630751eeb; clientIp=122.235.188.119; source=baidusem; sajssdk_2015_cross_new_user=1; _ga=GA1.2.109353021.1554880168; _gid=GA1.2.1994909913.1554880168; Hm_lvt_0af14db9b5993b4879812c54f6cf147d=1554880168; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216a06147ef76c7-04ba8d1655b9ab-191f7059-2073600-16a06147ef8279%22%2C%22%24device_id%22%3A%2216a06147ef76c7-04ba8d1655b9ab-191f7059-2073600-16a06147ef8279%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidusem%22%2C%22%24latest_utm_medium%22%3A%22cpc%22%2C%22%24latest_utm_campaign%22%3A%22%E7%AB%99%E5%86%85%E6%90%9C%E7%B4%A2%E8%AF%8D-%E6%A0%B8%E5%BF%83%22%2C%22%24latest_utm_content%22%3A%22%E7%AB%99%E5%86%85%E6%90%9C%E7%B4%A2%E8%AF%8D-%E5%9B%BE%E7%89%87%22%2C%22%24latest_utm_term%22%3A%22%E5%9B%BE%E7%89%87%22%7D%7D; Hm_lpvt_0af14db9b5993b4879812c54f6cf147d=1554880467", } ) .end(function (err, res) { // 抛错拦截 if (err) { throw Error(err); return } // console.log(res); fs.appendFile("./接口数据.txt", JSON.stringify(res.text), 'utf-8', function (err) { if (err) { console.log(err); } }); }); } get('https://www.vcg.com/api/common/searchImage?phrase=%E6%98%A5%E5%A4%A9&graphicalStyle%5B0%5D=1&page=1');
这样就爬取到了第一页的数据 并且在本地创建了一个接口数据的txt文件 。
这样一个简单的接口数据就已经爬下来了,对于数据的处理就不多废话,各取所需。
本人qq : 981900309 加备注博客园