curl抓取

<?php
// Declare the response as HTML encoded in UTF-8 ("utf-8" is the registered
// charset name; the bare "utf8" label is not understood by every client).
header("Content-Type: text/html; charset=utf-8");
// A crawl can take a long time, so remove PHP's execution time limit.
set_time_limit(0);
 
//=================================Utility functions=====================
/**
 * Turn a link found in a document into an absolute URL.
 *
 * Links that already carry a host or a scheme (http://..., //cdn..., mailto:...)
 * are returned untouched. Every other link is resolved against $base_url:
 *   - "/a/b"  is attached to the base's scheme, host and port;
 *   - "?p=2"  is attached to the base's path;
 *   - "#top"  is attached to the base's path and query;
 *   - "a/b"   is attached to the directory part of the base's path.
 * "." and ".." segments are not collapsed.
 *
 * @param string $current_url Link exactly as it appears in the document.
 * @param string $base_url    URL the link should be resolved against.
 * @return string The absolute URL, or $current_url unchanged when it is empty
 *                or no base was supplied.
 */
function real_url($current_url, $base_url='') {
    $current_url = (string) $current_url;
    $base_url = (string) $base_url;
    if ($current_url === '' || $base_url === '') {
        return $current_url;
    }

    $data = parse_url($current_url);
    if (is_array($data) && (isset($data['host']) || isset($data['scheme']))) {
        // Already absolute, or a non-HTTP link that must not be prefixed.
        return $current_url;
    }

    $base = parse_url($base_url);
    if ( ! is_array($base) || ! isset($base['host'])) {
        // The base is not a full URL either; plain prefixing is all that can be done.
        return $base_url . $current_url;
    }

    $root = (isset($base['scheme']) ? $base['scheme'] : 'http') . '://' . $base['host'];
    if (isset($base['port'])) {
        $root .= ':' . $base['port'];
    }
    $base_path = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

    switch ($current_url[0]) {
        case '/':
            return $root . $current_url;
        case '?':
            return $root . $base_path . $current_url;
        case '#':
            $base_query = isset($base['query']) ? '?' . $base['query'] : '';
            return $root . $base_path . $base_query . $current_url;
    }

    // Relative path: keep everything of the base path up to its last slash.
    $last_slash = strrpos($base_path, '/');
    $directory = ($last_slash === false) ? '/' : substr($base_path, 0, $last_slash + 1);
    return $root . $directory . $current_url;
}
 
 
// Third-party libraries this script needs: local file name => download URL.
// A missing file is downloaded once next to this script and then loaded.
// NOTE(review): the sources are fetched over plain HTTP and executed through
// require; shipping pinned copies of both libraries would be safer.
$dependencies = array(
    'CurlMulti.php' => 'http://curlmulti.com/index/download/CurlMulti',
    'phpQuery.php'  => 'http://curlmulti.com/index/download/phpQuery',
);
foreach ($dependencies as $dependency_file => $dependency_url) {
    // Anchor on this script's directory so the existence check, the write and
    // the require all look at the same file regardless of the working directory.
    $dependency_path = __DIR__ . '/' . $dependency_file;
    if ( ! is_file($dependency_path)) {
        $dependency_source = file_get_contents($dependency_url);
        // Never persist a failed or empty download: it would be picked up as a
        // valid library file on every later request.
        if ($dependency_source === false || $dependency_source === ''
            || file_put_contents($dependency_path, $dependency_source) === false) {
            throw new RuntimeException(sprintf(
                'Unable to download %s from %s',
                $dependency_file,
                $dependency_url
            ));
        }
    }
    require $dependency_path;
}
unset($dependencies, $dependency_file, $dependency_url, $dependency_path, $dependency_source);
 
/**
 * Minimal stopwatch kept in static state: mark a start, mark an end, then read
 * the elapsed time.
 *
 * All methods are static because the rest of the file calls them as
 * myDebug::set_start() / myDebug::set_end() / myDebug::report(); calling a
 * non-static method that way is a fatal error on PHP 8.
 */
class myDebug {
    /** @var float|null Timestamp recorded by set_start(). */
    public static $start;
    /** @var float|null Timestamp recorded by set_end(). */
    public static $end;
    /** Not read or written by any method of this class. */
    public static $times;

    /**
     * Current Unix time with microsecond precision.
     *
     * @return float Seconds since the epoch.
     */
    public static function microtime_float(){
        return microtime(true);
    }

    /** Record the moment the measured work begins. */
    public static function set_start() {
        self::$start = self::microtime_float();
    }

    /** Record the moment the measured work ends. */
    public static function set_end() {
        self::$end = self::microtime_float();
    }

    /**
     * Elapsed time between set_start() and set_end().
     *
     * @return float Seconds; a mark that was never set counts as 0.
     */
    public static function report() {
        return (float) self::$end - (float) self::$start;
    }
}
/**
 * Crawls a paginated list of links with CurlMulti and stores what it finds.
 *
 * The list URL, the CSS selectors and the cache location all come from the
 * request object handed to the constructor. Collected links are written to
 * "<cache_path>data/list.php" (a PHP file returning the array) and
 * "<cache_path>data/list.txt" (one readable line per link).
 */
class myCurl {

    /** @var CurlMulti HTTP client that performs the downloads. */
    public $curl;
    /** @var array Collected links keyed by md5(href); each entry is array('title' => ..., 'href' => ...). */
    public $article_list = array();
    /** @var string Directory handed to CurlMulti as its cache directory. */
    protected $cacheDir;
    /** @var string Directory that receives list.php and list.txt. */
    protected $cacheDataDir;
    /** @var int Number of list pages processed so far. */
    protected $pageCount = 0;
    /** @var int Number of article pages processed so far. */
    protected $articleCount = 0;
    /** @var request Crawl configuration. */
    protected $request;


    /**
     * @param request $request Crawl configuration (URL template, selectors, cache path).
     * @throws RuntimeException When a cache directory cannot be created.
     */
    public function __construct(request $request){
        $this->_init_var();
        $this->_init_request($request);
        $this->_init_curl();
    }

    /** Reset the counters and the collected list to their empty state. */
    protected function _init_var() {
        $this->pageCount = 0;
        $this->articleCount = 0;
        $this->article_list = array();
    }

    /**
     * Remember the crawl configuration.
     *
     * @param request $request
     */
    protected function _init_request(request $request) {
        $this->request = $request;
        // Example configuration:
        // $this->request->cache_path = __DIR__ . '/sjm_cache/';
        // $this->request->fetch_item_query = '#J_posts_list .subject .title a';
        // $this->request->fetch_page_current = '.J_page_wrap .pages strong';
        // $this->request->base_url = 'http://bbs.sijiaomao.com/index.php?m=bbs&c=thread&fid=10&page=%d';

    }

    /**
     * Create the HTTP client and the two cache directories.
     *
     * @throws RuntimeException When a cache directory cannot be created.
     */
    protected function _init_curl() {
        $this->curl = new CurlMulti();
        $this->cacheDir = $this->request->cache_path . 'cache';
        $this->cacheDataDir = $this->request->cache_path . 'data';
        $this->_ensure_dir($this->cacheDir);
        $this->_ensure_dir($this->cacheDataDir);

        $this->curl->cache = array(
            'dir' => $this->cacheDir,
            'on' => true,
            // presumably seconds (one day) - confirm against CurlMulti
            'expire' => 3600 * 24
        );
        $this->curl->maxThread = 10;
        $this->curl->opt[CURLOPT_CONNECTTIMEOUT] = 10;
    }

    /**
     * Create a directory (including parents) unless it already exists.
     *
     * @param string $dir
     * @throws RuntimeException When the directory cannot be created.
     */
    protected function _ensure_dir($dir) {
        // The mode must be written in octal: a decimal 777 is octal 1411 and
        // yields a directory its owner cannot write to.
        if ( ! is_dir($dir) && ! mkdir($dir, 0777, true) && ! is_dir($dir)) {
            throw new RuntimeException(sprintf('Unable to create directory "%s"', $dir));
        }
    }

    /** Crawl the list pages starting at page 1, then persist the collected links. */
    public function fetch_list(){
        $this->_add_fetch_list_url();
        $this->curl->start();
        $this->_save_article_list();
    }

    /** Download every collected link; each completed download only bumps the article counter. */
    public function fetch_article() {
        foreach ($this->article_list as $v) {
            $this->curl->add(array(
                'url' => $v['href']
            ), array($this, '_success_article'));
        }
        $this->curl->start();
    }

    /** Print the summary produced by fetch(). */
    public function display() {
        echo $this->fetch();
    }

    /**
     * Build the crawl summary.
     *
     * @return string Total pages fetched, list pages fetched, number of links
     *                collected and the location of list.php.
     */
    public function fetch() {
        return sprintf(
            "\n共抓取%d个页面\n文章列表%d篇\n相关文章%d篇\n文章目录存放在%s\n",
            $this->pageCount + $this->articleCount,
            $this->pageCount,
            count($this->article_list),
            $this->cacheDataDir . '/list.php'
        );
    }

    /**
     * Queue one list page for download.
     *
     * @param int $page Page number substituted into the request's base_url template.
     */
    public function _add_fetch_list_url($page = 1){
        $this->curl->add(
            array(
                'url' => sprintf($this->request->base_url, $page),
                // Handed back to _success_list() so it knows which page it parsed.
                'args' => array('page' => $page)
            ),
            array($this, '_success_list')
        );
    }

    /**
     * Write the collected links to list.php and list.txt.
     *
     * @return int|false Bytes written to list.txt, or false when either write failed.
     */
    protected function _save_article_list() {
        $php_written = file_put_contents(
            $this->cacheDataDir . '/list.php',
            sprintf("<?php\n return\t%s;",
            var_export($this->article_list, true))
        );
        // Disabled: sort the list by the Latin letters found in each title.
        /*uasort($this->article_list, function ($a, $b){
            preg_match_all('#([a-zA-Z]+)#is', $a['title'], $match);
            $a_title = strtoupper(implode("", $match[0]));

            preg_match_all('#([a-zA-Z]+)#is', $b['title'], $match);
            $b_title = strtoupper(implode("", $match[0]));
            return $a_title > $b_title;
        });*/
        $lines = array_map(function($a_list){
            return sprintf(
                "标题:%s\t超链接:%s \n",
                str_replace(" ", "", $a_list['title']),
                $a_list['href']
            );
        }, $this->article_list);
        $txt_written = file_put_contents($this->cacheDataDir . '/list.txt', $lines);

        return ($php_written === false) ? false : $txt_written;
    }

    /**
     * Download callback for an article page: only counts the page.
     *
     * @param array $r     Download result supplied by CurlMulti (unused).
     * @param array $param Arguments attached when the URL was queued (unused).
     */
    public function _success_article($r, $param){
        ++$this->articleCount;
    }

    /**
     * Download callback for a list page: collect its links and queue the next
     * page when the pager shows one.
     *
     * @param array $r     Download result; $r['content'] is parsed as the page HTML.
     * @param array $param Arguments attached in _add_fetch_list_url(); $param['page'] is the page number.
     */
    public function _success_list($r, $param){
        ++$this->pageCount;
        $html = phpQuery::newDocumentHTML($r['content']);
        $list = $html[$this->request->fetch_item_query];
        foreach ($list as $v) {
            $v = pq($v);

            $href = $v->attr('href');
            if ($href === null || $href === '') {
                // Nothing to follow for an anchor without a target.
                continue;
            }
            $title = $v->attr('title');
            $item = array(
                "title" => $title ? $title : $v->text(),
                "href" => real_url($href, $this->request->base_url)
            );
            // Keyed by the link's hash so a link seen on several pages is stored once.
            $this->article_list[md5($item['href'])] = $item;
        }
        // A non-empty sibling after the "current page" marker means there is a next page.
        $page_current = $html[$this->request->fetch_page_current];
        if ($page_current->next()->text()) {
            $page = ++ $param['page'];
            $this->_add_fetch_list_url($page);
        }

        phpQuery::unloadDocuments();
    }
}
/**
 * Crawl configuration read from the submitted form ($_POST), available as a
 * single shared instance through request::getInstance().
 */
class request{
    /** @var string List URL template; the crawler substitutes the page number via sprintf(). */
    public $base_url;
    /** @var string Cache directory below this script's directory, with a trailing slash. */
    public $cache_path;
    /** @var string CSS selector matching the links to collect. */
    public $fetch_item_query;
    /** @var string CSS selector matching the pager's "current page" element. */
    public $fetch_page_current;

    /** @var request|null The shared instance. */
    public static $instance;

    /**
     * Return the shared instance, creating it from $_POST on first use.
     *
     * @return request
     */
    public static function getInstance() {
        if (self::$instance === null) {
            self::$instance = new self;
        }

        return self::$instance;
    }

    private function __construct() {
        $this->_init_base();
    }

    /** Populate the public properties from $_POST; missing fields become ''. */
    public function _init_base() {
        $this->cache_path = __DIR__ . '/' . $this->_safe_relative_path($this->_post('cache_path'));
        $this->fetch_item_query = $this->_post('fetch_item_query');
        $this->fetch_page_current = $this->_post('fetch_page_current');
        $this->base_url = $this->_post('url');
    }

    /**
     * Assemble the request URL and the extra parameters submitted as
     * param[N][name|value|method].
     *
     * @return array array('url' => URL with "auth" and every GET parameter
     *               appended, 'get' => name => value map, 'post' => name =>
     *               value map or null when no POST parameter was submitted).
     */
    public function request() {
        // NOTE(review): nothing in this file defines where the auth token comes
        // from; it is sent empty, which is what the undefined variable produced.
        $auth = '';
        $target = $this->_post('url', (string) $this->base_url);
        $separator = (strpos($target, '?') !== false) ? '&' : '?';
        $url = $target . $separator . 'auth=' . rawurlencode($auth);

        $param = array();
        if (isset($_POST['param']) && is_array($_POST['param'])) {
            foreach($_POST['param'] as $item) {
                if (!empty($item['method']) && !empty($item['name'])) {
                    $param[$item['method']][$item['name']] = isset($item['value']) ? $item['value'] : '';
                }
            }
        }

        $get_data = empty($param['get']) ? array() : $param['get'];
        foreach ($get_data as $name => $value) {
            // Encode both sides so "&", "=" or spaces cannot corrupt the query string.
            $url .= '&' . rawurlencode($name) . '=' . rawurlencode($value);
        }
        $post_data = empty($param['post']) ? null : $param['post'];

        return array(
            'url' => $url,
            'get' => $get_data,
            'post' => $post_data
        );
    }

    /**
     * Read one string field from $_POST.
     *
     * @param string $key
     * @param string $default Returned when the field is missing or not a string.
     * @return string
     */
    private function _post($key, $default = '') {
        return (isset($_POST[$key]) && is_string($_POST[$key])) ? $_POST[$key] : $default;
    }

    /**
     * Reduce a user-supplied path to plain directory names so it cannot point
     * outside the directory it is appended to.
     *
     * @param string $path
     * @return string "a/b/" style relative path, or '' when nothing usable is left.
     */
    private function _safe_relative_path($path) {
        $segments = array();
        foreach (explode('/', str_replace('\\', '/', $path)) as $segment) {
            if ($segment === '' || $segment === '.' || $segment === '..') {
                continue;
            }
            $segments[] = $segment;
        }
        return $segments ? implode('/', $segments) . '/' : '';
    }
}
?>
 
 
 
 
<?php
if ( ! isset($_POST['submit'])) {
    // Nothing submitted yet: seed $_POST with a sample crawl so the form
    // further down is rendered pre-filled.
    $form_defaults = array(
        'url' => 'http://www.oschina.net/code/tag/php?show=time&lang=&catalog=&p=%d',
        'cache_path' => 'oschina',
        'fetch_item_query' => '.code_list ul li .code_title > a',
        'fetch_page_current' => '.pager li.current',
    );
    foreach ($form_defaults as $default_field => $default_value) {
        $_POST[$default_field] = $default_value;
    }
    unset($form_defaults, $default_field, $default_value);
} else {
    // Form submitted: build the crawler from the posted configuration and
    // time the crawl of the list pages.
    $request = request::getInstance();
    $myCurl = new myCurl($request);

    myDebug::set_start();
    $myCurl->fetch_list();
    // Downloading the individual articles is switched off:
    //$myCurl->fetch_article();
    myDebug::set_end();
}
?>
 
 
 
 
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <title>页面爬虫</title>
    <link href="http://cdn.bootcss.com/bootstrap/3.2.0/css/bootstrap.min.css" rel="stylesheet">
    <link href="http://cdn.bootcss.com/font-awesome/4.1.0/css/font-awesome.min.css" rel="stylesheet">
    <link href="http://static.bootcss.com/www/assets/css/site.min.css?v3" rel="stylesheet">
    <link href="http://static.bootcss.com/www/assets/ico/favicon.png" rel="shortcut icon">
    <script src="http://cdn.bootcss.com/jquery/1.11.1/jquery.min.js"></script>
</head>
<body>
<div class="container">
        <div class="row row-offcanvas row-offcanvas-right">
            <div class="col-xs-12 col-sm-12">
                <div class="row" >
                    <div class="col-xs-1 col-lg-4">
                        <h1>页面爬虫</h1>
                        <div class="thumbnail">
                        <form class="form-signin" action="" method="post">
                            <b>请填URL</b>:
                            <input value="<?php echo isset($_POST['url'])?$_POST['url']:'';?>" class="form-control" placeholder="填写完整地址,以http://开头" type="text" name="url" required><br>
                            <b>请填缓存文件路径</b>:
                            <input value="<?php echo isset($_POST['url'])?$_POST['cache_path']:'';?>" class="form-control" placeholder="填写缓存文件路径" type="text" name="cache_path" required><br>
                            <b>请填获取元素的CSS选择器</b>:
                            <input value="<?php echo isset($_POST['url'])?$_POST['fetch_item_query']:'';?>" class="form-control" placeholder="填写获取元素的CSS选择器" type="text" name="fetch_item_query" required><br>
                            <b>请填分页当前页面元素的CSS选择器</b>:
                            <input value="<?php echo isset($_POST['url'])?$_POST['fetch_page_current']:'';?>" class="form-control" placeholder="填写分页当前页面元素的CSS选择器" type="text" name="fetch_page_current" required><br>
                            <?php if (isset($_POST['param']) && !empty($_POST['param'])) :?>
                                <?php foreach ($_POST['param'] as $k => $item) :?>
                                    <?php if (!empty($item['method']) && !empty($item['name'])) :?>
                                        <div class="thumbnail">
                                            <b>参数name</b>:
                                            <input value="<?php echo $item['name'];?>" placeholder="请填写" type="text" name="param[<?php echo $k;?>][name]"><br>
                                            <b>参数value</b>:
                                            <input value="<?php echo $item['value'];?>" placeholder="请填写" type="text" name="param[<?php echo $k;?>][value]"><br>
                                            <b>请求方式</b>:
                                            <label><input <?php if($item['method']=='get'):?>checked<?php endif;?> value="get" type="radio" name="param[<?php echo $k;?>][method]">get</label>
                                            <label><input <?php if($item['method']=='post'):?>checked<?php endif;?> value="post" type="radio" name="param[<?php echo $k;?>][method]">post</label><br />
                                            <a href="#" onclick="del_param(this)">删除</a>
                                        </div>
                                    <?php endif;?>
                                <?php endforeach;?>
                            <?php endif;?>
                             
                            <input type="button" name="add_param" id="add_param" value="添加参数" class="btn btn-lg btn-primary btn-block"><br />
                            <input type="submit" name="submit" value="下载" class="btn btn-lg btn-primary btn-block"><br />
                        </form>
                        </div>
                    </div>
                    <div class="col-xs-1 col-lg-8">
                        <?php
                            if (isset($_POST['submit'])) {
                                echo "<pre>";
                                echo "请求时间:";
                                var_dump(myDebug::report());
                                 
                                echo "<br />请求url:";
                                isset($request->base_url) && var_dump($request->base_url);
                                 
                                echo "<br />请求参数:";
                                isset($param) && var_dump($param);
                                 
                                echo "<hr />结果:";
                                var_dump($myCurl->fetch());
                                 
                                echo "</pre>";
                            }
                        ?>
                    </div>
                </div>
            </div>
        </div>
        <hr />
    </div>
    <div class="blog-masthead">
        <div class="container">
            <nav class="blog-nav">
                <p class="blog-nav-item">&copy; Company 2014</p>
            </nav>
        </div>
    </div>
</body>
</html>
 
<script>
    $("#add_param").click(function(){
        var input_len = $("form input").size();
        input_len++;
        $(this).before('\
            <div class="thumbnail">\
                <b>参数name</b>:\
                <input value="" placeholder="请填写" type="text" name="param['+ input_len +'][name]"><br>\
                <b>参数value</b>:\
                <input value="" placeholder="请填写" type="text" name="param['+ input_len +'][value]"><br>\
                <b>请求方式</b>:\
                <label><input checked value="get" type="radio" name="param['+ input_len +'][method]">get</label>\
                <label><input value="post" type="radio" name="param['+ input_len +'][method]">post</label><br />\
                <a href="#" onclick="del_param(this)">删除</a>\
            </div>\
        ');
    });
    function del_param(obj) {
        $(obj).parent().remove();
    }
</script>

posted @ 2014-11-21 11:59  lemon66  阅读(341)  评论(0编辑  收藏  举报