phpspider PHP 爬虫

* 通过composer下载

composer require owner888/phpspider

// composer.json

{
    "require": {
        "owner888/phpspider": "^2.1"
    }
}

  

* 去掉讨厌的注释

   https://doc.phpspider.org/demo-start.html

 打开 ./vendor/owner888/phpspider/core/phpspider.php,找到下面这段代码:

/* Do NOT delete this comment */
        // 彩蛋
        $included_files = get_included_files();
        $content = file_get_contents($included_files[0]);
        if (!preg_match("#/\* Do NOT delete this comment \*/#", $content) || !preg_match("#/\* 不要删除这段注释 \*/#", $content))
        {
            $msg = "Unknown error...";
            log::error($msg);
            exit;
        }

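 删掉上面这段检查代码:它会读取入口文件的内容,只要其中缺少 "/* Do NOT delete this comment */" 或 "/* 不要删除这段注释 */" 任意一条注释,就会记录 "Unknown error..." 并直接退出。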

 

  * 导入数据库文件

    

cd ./vendor/owner888/phpspider/demo

  

mysql -uroot -hlocalhost -p

  

# Create the target database, select it, then load the dump.
# The dump file contains no USE statement, so without `use demo;` the import
# stops with "ERROR 1046: No database selected".
create database demo charset utf8 collate utf8_general_ci;
use demo;
\. qiushibaike.sql

  

# ************************************************************
# Sequel Pro SQL dump
# Version 4541
#
# http://www.sequelpro.com/
# https://github.com/sequelpro/sequelpro
#
# Host: 127.0.0.1 (MySQL 5.7.14)
# Database: demo
# Generation Time: 2016-10-20 16:55:11 +0000
# ************************************************************
#
# This script only (re)creates the `content` table. It contains no
# CREATE DATABASE or USE statement, so a database must already be selected
# when it is run.


# Save the current session settings in @OLD_* user variables so they can be
# restored at the end of the script, then switch to the values used for the
# import (utf8 names, foreign-key checks off, NO_AUTO_VALUE_ON_ZERO, notes off).
# The /*!NNNNN ... */ form is a MySQL version-conditional comment: the server
# executes the statement inside it when its version is at least NNNNN.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;


# Dump of table content
# ------------------------------------------------------------

# The table is rebuilt from scratch: an existing `content` table, including
# all of its rows, is dropped before the new one is created.
DROP TABLE IF EXISTS `content`;

# Every column except the auto-increment primary key `id` is nullable.
# NOTE(review): `article_publish_time` is int(10), presumably a Unix
# timestamp -- confirm against the code that writes it.
CREATE TABLE `content` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `depth` int(11) DEFAULT NULL,
  `url` varchar(200) DEFAULT NULL,
  `article_title` varchar(20) DEFAULT NULL,
  `article_headimg` varchar(150) DEFAULT NULL,
  `article_author` varchar(20) DEFAULT NULL,
  `article_content` text,
  `article_publish_time` int(10) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;




# Restore the session settings that were saved at the top of the script.
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
View Code

* 创建./index.php

<?php

declare(strict_types=1);

/**
 * Entry script: configures a phpspider crawl of qiushibaike.com and stores
 * the extracted fields in the `content` table of the `demo` MySQL database.
 *
 * Run from the project root with: php ./index.php
 */

require './vendor/autoload.php';

use phpspider\core\phpspider;

$configs = [
    'name' => '糗事百科',
    // Hosts the crawl is configured for.
    'domains' => [
        'qiushibaike.com',
        'www.qiushibaike.com',
    ],
    // Entry point(s) of the crawl.
    'scan_urls' => [
        'http://www.qiushibaike.com/',
    ],
    // URL patterns are regular expressions, so the dots in the host name are
    // escaped; an unescaped "." would match any character.
    'content_url_regexes' => [
        'http://www\.qiushibaike\.com/article/\d+',
    ],
    'list_url_regexes' => [
        'http://www\.qiushibaike\.com/8hr/page/\d+\?s=\d+',
    ],
    // Each field name matches a column of the `content` table.
    'fields' => [
        [
            // Extract the article body from the content page.
            'name' => 'article_content',
            'selector' => "//*[@id='single-next-link']",
            'required' => true,
        ],
        [
            // Extract the article author from the content page.
            'name' => 'article_author',
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true,
        ],
    ],
    'log_show' => true,
    'input_encoding' => 'utf-8',
    'output_encoding' => 'utf-8',
    // Connection settings for the `demo` database created earlier; adjust
    // user/pass to match the local MySQL account.
    'db_config' => [
        'host' => '127.0.0.1',
        'user' => 'root',
        'pass' => '',
        'name' => 'demo',
        'port' => 3306,
    ],
    /*
     * Alternative export target: write the results to a SQL file instead of
     * inserting them into the database.
     *
    'export' => [
        'type' => 'sql',
        'file' => './data/sql/qiushibaike.sql'
    ]
    */
    // Insert the results into the `content` table.
    'export' => [
        'type' => 'db',
        'table' => 'content',
    ],
];

$spider = new phpspider($configs);
$spider->start();

  

* Run

php ./index.php 

  

posted @ 2018-07-14 23:13  zhanghui_ming  阅读(332)  评论(1)  编辑  收藏  举报