Node.js 爬虫简单实现
const fs = require('fs');
const URL = require('url');
const gbk = require('gbk');
const { JSDOM } = require('jsdom');

/**
 * Minimal image crawler: downloads a listing page, finds the lazily loaded
 * images inside `.Left_list_cont2`, and stores each one under `img/`.
 */
class Getdata {
  /**
   * Picks the transport module for a URL and splits out the request target.
   *
   * @param {string} url - Address to request.
   * @returns {{http: object, hostname: ?string, port: ?string, path: ?string}}
   *   The `http` module for `http:` URLs and the `https` module for anything
   *   else, plus the parsed host name, port and path (including query string).
   */
  static http(url) {
    const { protocol, hostname, port, path } = URL.parse(url);
    const http = protocol === 'http:' ? require('http') : require('https');
    return { http, hostname, port, path };
  }

  /**
   * Issues a GET request and hands the complete body to `onBody` on HTTP 200.
   * Every attempt increments the `Getdata.a` request counter. Non-200
   * responses and connection errors are retried until `retriesLeft` runs out.
   *
   * @param {string} url - Address to request.
   * @param {number} retriesLeft - Remaining retry attempts after this one.
   * @param {function(Buffer): void} onBody - Receives the concatenated body.
   * @returns {void}
   */
  static request(url, retriesLeft, onBody) {
    Getdata.a += 1;
    const { http, hostname, port, path } = Getdata.http(url);

    const retry = (reason) => {
      if (retriesLeft > 0) {
        Getdata.request(url, retriesLeft - 1, onBody);
        return;
      }
      console.error(`giving up on ${url}: ${reason}`);
    };

    const req = http.request({ hostname, port, path }, (res) => {
      if (res.statusCode !== 200) {
        console.log(Getdata.a);
        console.log(res.statusCode, res.headers);
        // Drain the unused body so the underlying socket is released.
        res.resume();
        retry(`HTTP ${res.statusCode}`);
        return;
      }

      const chunks = [];
      res
        .on('data', (chunk) => {
          chunks.push(chunk);
        })
        .on('end', () => {
          onBody(Buffer.concat(chunks));
        });
    });

    // Without this listener a DNS or connection failure would be thrown as an
    // unhandled 'error' event and terminate the process.
    req.on('error', (err) => {
      console.error(`request to ${url} failed: ${err.message}`);
      retry(err.message);
    });
    req.end();
  }

  /**
   * Downloads a listing page and saves every `.Left_list_cont2 img` whose
   * `data-original` attribute is set as `img/<Filename><index>.jpg`.
   *
   * @param {string} url - Address of the listing page.
   * @param {string} Filename - Prefix for the saved image file names.
   * @param {number} [retries=Getdata.maxRetries] - Retry budget for the page.
   * @returns {void}
   */
  get(url, Filename, retries = Getdata.maxRetries) {
    Getdata.request(url, retries, (body) => {
      // The raw Buffer is passed through rather than a decoded string;
      // NOTE(review): this relies on jsdom detecting the page encoding itself.
      const dom = new JSDOM(body);
      const images = dom.window.document.querySelectorAll('.Left_list_cont2 img');
      for (let i = 0; i < images.length; i++) {
        const src = images[i].getAttribute('data-original');
        // An <img> without the lazy-load attribute has nothing to download.
        if (!src) {
          continue;
        }
        this.set(src, `${Filename}${i}`, 'jpg');
      }
    });
  }

  /**
   * Downloads a resource and writes it to `img/<Filename>.<kz>`.
   *
   * @param {string} url - Address of the resource.
   * @param {string} Filename - Target file name without extension.
   * @param {string} [kz='html'] - File extension to use.
   * @param {number} [retries=Getdata.maxRetries] - Retry budget for the download.
   * @returns {void}
   */
  set(url, Filename, kz = 'html', retries = Getdata.maxRetries) {
    Getdata.request(url, retries, (body) => {
      fs.writeFile(`img/${Filename}.${kz}`, body, (err) => {
        if (err) {
          console.error(`could not write img/${Filename}.${kz}: ${err.message}`);
          return;
        }
        console.log('成功了');
      });
    });
  }
}

// Number of HTTP requests issued so far (including retries).
Getdata.a = 0;
// Default number of retries before a failing URL is abandoned.
Getdata.maxRetries = 5;