phpspider PHP 爬虫
* 通过composer下载
composer require owner888/phpspider
// composer.json
{ "require": { "owner888/phpspider": "^2.1" } }
* 去掉讨厌的注释
https://doc.phpspider.org/demo-start.html
./vendor/owner888/phpspider/core/phpspider.php
/* Do NOT delete this comment */
// Easter egg (marked "彩蛋" in the vendor source): logs an error and exits
// unless the first included file (the entry script) contains BOTH marker comments.
$included_files = get_included_files();
$content = file_get_contents($included_files[0]);
if (!preg_match("#/\* Do NOT delete this comment \*/#", $content) || !preg_match("#/\* 不要删除这段注释 \*/#", $content)) {
    $msg = "Unknown error...";
    log::error($msg);
    exit;
}
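从 ./vendor/owner888/phpspider/core/phpspider.php 中删掉上面这段检查代码(注意:vendor 目录下的修改会在 composer update 或重新安装后被覆盖,需要重新删除)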
* 导入数据库文件
cd ./vendor/owner888/phpspider/demo
mysql -uroot -hlocalhost -p
create database demo charset utf8 collate utf8_general_ci;
-- Select the database before sourcing the dump: CREATE DATABASE does not make it the default.
use demo;
\. qiushibaike.sql
# ************************************************************
# Sequel Pro SQL dump
# Version 4541
#
# http://www.sequelpro.com/
# https://github.com/sequelpro/sequelpro
#
# Host: 127.0.0.1 (MySQL 5.7.14)
# Database: demo
# Generation Time: 2016-10-20 16:55:11 +0000
# ************************************************************


/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;


# Dump of table content
# ------------------------------------------------------------

DROP TABLE IF EXISTS `content`;

CREATE TABLE `content` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `depth` int(11) DEFAULT NULL,
  `url` varchar(200) DEFAULT NULL,
  `article_title` varchar(20) DEFAULT NULL,
  `article_headimg` varchar(150) DEFAULT NULL,
  `article_author` varchar(20) DEFAULT NULL,
  `article_content` text,
  `article_publish_time` int(10) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;


/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
* 创建./index.php
<?php

// Demo crawler for qiushibaike.com built on phpspider.
// Configured to export the extracted fields to the `content` table of the `demo` database.

// Resolve the Composer autoloader relative to this file so the script
// does not depend on the current working directory.
require __DIR__ . '/vendor/autoload.php';

use phpspider\core\phpspider;

$configs = [
    'name' => '糗事百科',
    'domains' => [
        'qiushibaike.com',
        'www.qiushibaike.com'
    ],
    // Entry URL(s) for the crawl.
    'scan_urls' => [
        'http://www.qiushibaike.com/'
    ],
    // URL patterns for article (content) pages.
    'content_url_regexes' => [
        "http://www.qiushibaike.com/article/\d+"
    ],
    // URL patterns for listing pages.
    'list_url_regexes' => [
        "http://www.qiushibaike.com/8hr/page/\d+\?s=\d+"
    ],
    'fields' => [
        [
            // Extract the article body from the content page.
            'name' => "article_content",
            'selector' => "//*[@id='single-next-link']",
            'required' => true
        ],
        [
            // Extract the article author from the content page.
            'name' => "article_author",
            'selector' => "//div[contains(@class,'author')]//h2",
            'required' => true
        ],
    ],
    'log_show' => true,
    'input_encoding' => 'utf-8',
    'output_encoding' => 'utf-8',
    // Connection settings for the `demo` database.
    'db_config' => [
        'host' => '127.0.0.1',
        'user' => 'root',
        'pass' => '',
        'name' => 'demo',
        'port' => 3306
    ],
    // Alternative export target: a SQL file instead of the database.
    /*
    'export' => [
        'type' => 'sql',
        'file' => './data/sql/qiushibaike.sql'
    ]
    */
    'export' => [
        'type' => 'db',
        'table' => 'content',
    ]
];

$spider = new phpspider($configs);
$spider->start();
* Run
php ./index.php