Several Ways to Implement a Web Crawler in Node.js
Getting a proxy IP
// Requires the axios module - npm install axios --save
const axios = require('axios')
// The id, secret, etc. parameters come from the "extract API" page on the 猿人云 (Apeyun) site
const queries = {
  id: 'xxx',
  secret: 'xxx',
  limit: 1,
  format: 'txt',
  auth_mode: 'auto'
};
axios.get('http://tunnel-api.apeyun.com/q', {
  params: queries,
}).then((response) => {
  console.log('IP:', response.data);
}).catch((e) => {
  console.error('Error:', e);
});
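With format set to txt, the response body is plain text. Below is a minimal sketch of turning that text into the host and port values used by the crawler examples in the next section; it assumes the API returns a single ip:port line, which you should verify against the Apeyun docs:

// Minimal sketch, assuming the txt response is one "ip:port" line
function parseProxy(text) {
  const [host, port] = String(text).trim().split(":")
  return { host: host, port: Number(port) }
}
// e.g. parseProxy("123.123.123.123:1234") -> { host: "123.123.123.123", port: 1234 }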
Crawler programs
axios
// Requires the axios module - npm install axios --save
const axios = require('axios')
// The target page to fetch
const targetUrl = "http://www.baidu.com"
// Proxy server; assume the extracted proxy IP is 123.123.123.123:1234
const proxyHost = "123.123.123.123"
const proxyPort = 1234
// Proxy credentials (from the 猿人云 (Apeyun) site)
const proxyUser = "xxx"
const proxyPass = "xxx"
const proxy = {
  host: proxyHost,
  port: proxyPort,
  auth: {
    username: proxyUser,
    password: proxyPass
  }
}
// See the official docs: https://github.com/axios/axios#request-config
axios.get(targetUrl, { proxy: proxy })
  .then(function (response) {
    // handle success
    console.log(response.data)
  })
  .catch(function (error) {
    // handle error
    console.log(error)
  })
  .finally(function () {
    // always executed
  });
http
const http = require("http") const url = require("url") // 要访问的目标页面 const targetUrl = "http://www.baidu.com" const urlParsed = url.parse(targetUrl) // 代理服务器,假设提取到的代理ip是123.123.123.123:1234 const proxyHost = "123.123.123.123" const proxyPort = "1234" // 代理隧道验证信息(猿人云官网获取) const proxyUser = "xxx" const proxyPass = "xxx" const base64 = Buffer.from(proxyUser + ":" + proxyPass).toString("base64") const options = { host: proxyHost, port: proxyPort, path: targetUrl, method: "GET", headers: { "Host": urlParsed.hostname, "Proxy-Authorization": "Basic " + base64 } } http.request(options, function(res) { console.log("got response: " + res.statusCode) }) .on("error", function(err) { console.log(err) }) .end()
request
// 需要安装 request 模块 - npm install request --save const request = require("request") // 要访问的目标页面 const targetUrl = "http://www.baidu.com" // 代理服务器,假设提取到的代理ip是123.123.123.123:1234 const proxyHost = "123.123.123.123" const proxyPort = 1234 // 代理隧道验证信息(猿人云官网获取) const proxyUser = "xxx" const proxyPass = "xxx" const proxyUrl = "http://" + proxyUser + ":" + proxyPass + "@" + proxyHost + ":" + proxyPort const proxiedRequest = request.defaults({'proxy': proxyUrl}) const options = { url: targetUrl, headers: {} } proxiedRequest.get(options, function (err, res, body) { console.log("got response: " + res.statusCode) console.log("got response: " + body) }) .on("error", function (err) { console.log(err); })
superagent
// Requires the superagent and superagent-proxy modules - npm install superagent superagent-proxy --save
const request = require("superagent")
require("superagent-proxy")(request)
// The target page to fetch
const targetUrl = "http://www.baidu.com"
// Proxy server; assume the extracted proxy IP is 123.123.123.123:1234
const proxyHost = "123.123.123.123"
const proxyPort = 1234
// Proxy tunnel credentials (from the 猿人云 (Apeyun) site)
const proxyUser = "xxx"
const proxyPass = "xxx"
const proxyUrl = "http://" + proxyUser + ":" + proxyPass + "@" + proxyHost + ":" + proxyPort
request.get(targetUrl).proxy(proxyUrl).end(function onResponse(err, res) {
  if (err) {
    return console.log(err)
  }
  console.log(res.status, res.headers)
  console.log(res.text)
});
Run the file with node. When the console prints a large block of HTML, the crawler is working: the request went out through the proxy and brought the target page back.
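For completeness, here is a hedged end-to-end sketch that chains the two steps above with axios: extract one proxy IP from the API, then fetch the target page through it. The id/secret and user/pass values are placeholders, and the assumption that the txt response is a single ip:port line is not confirmed here:

// End-to-end sketch: fetch a proxy, then crawl through it with axios
// npm install axios --save
const axios = require('axios')

const queries = { id: 'xxx', secret: 'xxx', limit: 1, format: 'txt', auth_mode: 'auto' }
const targetUrl = 'http://www.baidu.com'
// Proxy credentials from the 猿人云 (Apeyun) site
const proxyUser = 'xxx'
const proxyPass = 'xxx'

async function crawl() {
  // Step 1: extract one proxy IP (assumes the txt response is a single "ip:port" line)
  const res = await axios.get('http://tunnel-api.apeyun.com/q', { params: queries })
  const [host, port] = String(res.data).trim().split(':')

  // Step 2: request the target page through that proxy
  const page = await axios.get(targetUrl, {
    proxy: {
      host: host,
      port: Number(port),
      auth: { username: proxyUser, password: proxyPass }
    }
  })
  console.log(page.data)
}

crawl().catch((e) => console.error('Error:', e))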
Source: https://mp.weixin.qq.com/s/JA11NzbbHtKqgijmdmJPlw