php爬虫 phpspider

<?php
/**
 * Created by PhpStorm.
 * User: brady
 * Date: 2016/12/9
 * Time: 17:32
 */
ini_set("memory_limit", "1024M");
require dirname(__FILE__).'/../core/init.php';

$url = "http://www.epooll.com/archives/806/";
$html = requests::get($url);
// 抽取文章标题
$selector = "//*[@id=\"content\"]/div[1]/div[1]/h1/a";

$title = selector::select($html, $selector);
// 检查是否抽取到标题
// 抽取文章作者
$selector = "//*[@id=\"content\"]/div[1]/div[1]/h6/span[1]";
$author = selector::select($html, $selector);
// 检查是否抽取到作者
// 去掉 作者:
$author = str_replace("作者:", "", $author);
//发布时间
$selector = "//*[@id=\"content\"]/div[1]/div[1]/h6/span[2]";
$time = selector::select($html, $selector);
$time = str_replace("发布时间:",'', $time);
$time  = date("Y-m-d H:i:s",strtotime($time));
// 抽取文章内容
$selector = "//*[@id=\"content\"]/div[1]/div[2]";
$content = selector::select($html, $selector);
// 检查是否抽取到内容
$data = array(
    'article_title' => $title,
    'article_author' => $author,
    'article_content' => $content,
);
// 查看数据是否正常
$res = db::insert("content", $data);
var_dump($res);

  

posted @ 2016-12-09 17:49  brady-wang  阅读(5113)  评论(0编辑  收藏  举报