[Node.js] Web Scraping with Pagination and Advanced Selectors

When web scraping, you'll often want to get more than just one page of data. Xray supports pagination by finding the "next" or "more" button on each page and cycling through each new page until it can no longer find that link. This lesson demonstrates how to paginate, and how to use more advanced selectors when the link you need is difficult to target.

 

/**
 * Created by Answer1215 on 8/22/2015.
 */
var Xray = require('x-ray');
var xray = new Xray();

// Fields extracted from every story row matched by the '.athing' scope.
var storyFields = {
    rank: '.rank',
    title: 'td:nth-child(3) a',
    link: "td:nth-child(3) a@href"
};

// The href of the last rel="nofollow" link is used as the "next page" URL
// (on the front page this is the "More" link).
var nextPageLink = 'a[rel="nofollow"]:last-child@href';

// Scrape the Hacker News front page, one result object per story row.
var frontPageStories = xray('https://news.ycombinator.com/', '.athing', [storyFields]);

// Follow the "next page" link, capped by limit(3), and write the collected
// stories to ./results2.json.
frontPageStories
    .paginate(nextPageLink)
    .limit(3)
    .write('./results2.json');

///////////////////////////////
//  test: selector experiments used to locate the "More" link
///////////////////////////////

// Selector experiment: list the text of every rel="nofollow" link on the
// front page, to see where the "More" pagination link falls among them.
//
// The output goes to its own file. Every xray(...) call in this script is
// started in the same run, so writing to the shared './results2.json' would
// let this experiment and the other scrapes overwrite each other's results.
xray('https://news.ycombinator.com/', 'a[rel="nofollow"]', [{
    // Empty selector: judging by the sample output below, this yields the
    // text of the matched link itself.
    show: ''
}]).write('./results-nofollow-links.json');
/**
 * Sample output (captured when this was written; "More" is the last entry):
 * [
 {
   "show": "Segment is hiring security engineers to help secure our container fleet"
 },
 {
   "show": "Modafinil for cognitive neuroenhancement: a systematic review"
 },
 {
   "show": "Ports and Power in the Indian Ocean"
 },
 {
   "show": "Natural and Artificial Intelligence (1988) [pdf]"
 },
 {
   "show": "Proofing Spirits with a Homemade Electrobalance"
 },
 {
   "show": "Seth Nickell on Replacing the Aging Init Procedure on Linux (2003)"
 },
 {
   "show": "More"
 }
 ]
 * */

// Selector experiment: narrow the rel="nofollow" links down with :last-child
// to check that only the "More" pagination link remains.
//
// The output goes to its own file. Every xray(...) call in this script is
// started in the same run, so writing to the shared './results2.json' would
// let this experiment and the other scrapes overwrite each other's results.
xray('https://news.ycombinator.com/', 'a[rel="nofollow"]:last-child', [{
    // Empty selector: judging by the sample output below, this yields the
    // text of the matched link itself.
    show: ''
}]).write('./results-more-link.json');
/*
 * Sample output (captured when this was written):
 * [
 {
 "show": "More"
 }
 ]
 * */

 

posted @ 2015-08-22 16:47  Zhentiw  阅读(574)  评论(0)  编辑  收藏  举报