PHP QueryList采集文章

1.安装querylist

     官网手册:http://www.querylist.cc/docs/guide/v4/overview

  如果是在本地环境测试,先在测试文件夹下用 composer init 初始化一下,再用 composer 安装 QueryList(composer require jaeger/querylist)。

  

 

2.index.html

<!DOCTYPE html>
<html lang="zh-CN">

<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>信息采集</title>
    <style>
        /* Fixed-size card centred in the viewport: absolute positioning with all
           four offsets at 0 plus margin:auto centres it both ways. */
        .form{
            text-align: center;
            border: solid 10px #BCF0F0;
            border-radius: 20px;
            width: 800px;
            height: 700px;
            margin: auto;
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            bottom: 0;
        }
        .form p input{width: 40%;margin: 10px;padding: 10px;border-radius: 5px;}
        /* Giving every label the same width makes the inputs line up on their left edge. */
        .label{display: inline-block;padding: 3px 6px; text-align: right; width: 15%; }
        /* Submit button (single rule; previously split across two #login_click blocks). */
        #login_click{
            margin-top: 32px;
            height: 40px;
            display: inline-block;
            cursor: pointer;
            text-decoration: none;
            background: #2f435e;
            color: #f2f2f2;
            padding: 10px 30px 10px 30px;
            font-size: 16px;
            font-family: "微软雅黑", "宋体", Arial, Helvetica, Verdana, sans-serif;
            font-weight: bold;
            border-radius: 3px;
            -webkit-transition: all linear 0.30s;
            -moz-transition: all linear 0.30s;
            transition: all linear 0.30s;
        }
        /* The button is an <input>, so the hover state belongs on the element itself,
           not on a descendant <a> (which does not exist). */
        #login_click:hover { background:#385f9e; }

    </style>
</head>

<body>
    <!-- Collection settings form; the values are posted to index.php. -->
    <div class='form'>
        <h1>信息采集</h1>
        <br>
        <form action='index.php' method="post">
            <!-- Each label's "for" matches the id of its input so clicking the label focuses the field. -->
            <p>
                <label for="main" class='label' >主域名:</label>
                <input type="text" id="main" name="main" value="https://www.wealink.com">
            </p>
            <p>
                <label for="url" class='label' >列表页链接:</label>
                <input type="text" id="url" name="url" value="https://www.wealink.com/know/?page=">
            </p>
            <p>
                <label for="listPre" class='label' >列表页规则:</label>
                <input type="text" id="listPre" name="listPre" value=".art-ask li .art-askbt a">
            </p>
            <p>
                <!-- Rule format: name:selector:extractor, rules separated by commas. -->
                <label for="conPre" class='label' >内容页规则:</label>
                <input type="text" id="conPre" name="conPre" value="title:h1:text,problem:.art-flk-text p:text,answer:.art-flq-text:html">
            </p>
            <p>
                <label for="num" class='label' >采集页数:</label>
                <input type="number" id="num" name="num" min="1" step="1" value="1">
            </p>


            <!-- Submit button -->
               <input type="submit" id="login_click" value="采 集" class="btn" />

        </form>

    </div>
</body>

</html>

3.index.php 接收表单信息并进行采集

<?php

/**
 * @Author: Hiker
 * @Date:   2021-11-30 16:55:34
 * @Last:   Modified by: I will never know what the next difficulty is
 */
require_once 'vendor/autoload.php';
use QL\QueryList;



/**
 * Form-driven scraper built on QueryList.
 *
 * Flow: main() validates the submitted form, run() walks the list pages and
 * gathers article links, show() fetches every article and extracts the
 * configured fields.
 *
 * NOTE(review): both the list URL and the base domain come straight from the
 * request, so this script will fetch whatever address the caller supplies.
 * Restrict access to it (or allow-list hosts) before exposing it publicly.
 */
class Collection
{
    /**
     * Base URL prepended to relative article links.
     *
     * Declared explicitly because run() assigns it; relying on a dynamic
     * property is deprecated as of PHP 8.2. Kept public so code that read
     * the former dynamic property keeps working.
     *
     * @var string
     */
    public $main = '';

    /**
     * Every record extracted by show() during run(), across all pages.
     * main() clears it before starting a new collection.
     *
     * @var array
     */
    public $results = [];

    /**
     * Form fields that must be present and non-blank.
     *
     * @var string[]
     */
    private static $requiredFields = ['main', 'url', 'listPre', 'conPre', 'num'];

    /**
     * Element methods a content rule may name as its extractor. The extractor
     * comes from user input and is invoked dynamically, so it must be
     * restricted to known read-only accessors.
     *
     * @var string[]
     */
    private static $allowedExtractors = ['text', 'html', 'texts', 'htmls', 'htmlOuter'];

    /**
     * Entry point: validate the submitted form and start the collection.
     *
     * Expected keys: main (base domain), url (list URL prefix the page number
     * is appended to), listPre (selector for article links), conPre (content
     * rules "name:selector:extractor" separated by commas) and num (number of
     * list pages, a positive integer).
     *
     * @param  array $post Submitted form values.
     * @return mixed       false (after echoing a message) when validation
     *                     fails, otherwise whatever run() returns.
     */
    public function main($post){
        $input = [];
        foreach(self::$requiredFields as $field){
            // Check each required key explicitly: looping over the submitted
            // values alone lets an empty or partial request through.
            $value = (is_array($post) && isset($post[$field]) && is_scalar($post[$field]))
                ? trim((string) $post[$field])
                : '';
            if($value === ''){
                echo "采集失败,缺少参数";
                return false;
            }
            $input[$field] = $value;
        }

        if(!preg_match('/^[1-9][0-9]*$/', $input['num'])){
            echo "采集失败,采集页数必须为正整数";
            return false;
        }

        $rules = $this->parseRules($input['conPre']);
        if($rules === false){
            echo "采集失败,内容页规则格式错误";
            return false;
        }

        // Start collecting
        $this->results = [];
        return $this->run($input['url'], $input['listPre'], $rules, $input['main'], (int) $input['num']);
    }

    /**
     * Collect list pages from page $num down to page 1 and scrape every
     * article linked from them.
     *
     * @param  string $url     List URL prefix; the page number is appended.
     * @param  string $listPre Selector matching the article links on a list page.
     * @param  array  $conPre  Content rules as [name, selector, extractor] triples.
     * @param  string $main    Base domain for relative links; ignored when empty.
     * @param  int    $num     Highest page number to collect.
     * @return mixed           The links found on the last page processed
     *                         (page 1 for any $num >= 1).
     */
    public function run($url,$listPre,$conPre,$main,$num){
        if($main){
            $this->main = $main;
        }
        $page = (int) $num;
        do {
            $urlM = $url.$page;
            $html = QueryList::get($urlM);
            $urls = $html->find($listPre)->map(function($row){
                return $row->href;
            });

            foreach($this->show($urls,$conPre) as $record){
                $this->results[] = $record;
            }
            // Show which list page was just processed; the URL is built from
            // user input, so escape it before printing.
            echo htmlspecialchars($urlM, ENT_QUOTES, 'UTF-8').'<hr>';
            $page--;
        } while($page >= 1);
        return $urls;
    }

    /**
     * Fetch each article page and extract the configured fields.
     *
     * @param  mixed $urls   Iterable of article hrefs taken from a list page.
     * @param  array $conPre Content rules as [name, selector, extractor] triples.
     * @return array         One associative array (rule name => value) per article.
     * @throws \InvalidArgumentException When a rule is malformed or names an
     *                                   extractor outside the allow-list.
     */
    public function show($urls,$conPre){
        $data = [];
        foreach($urls as $href){
            // Anchors without a usable href would otherwise refetch the base domain.
            if(!is_string($href) || trim($href) === ''){
                continue;
            }
            $href = trim($href);
            // Only relative links need the base domain in front of them.
            $target = preg_match('#^https?://#i', $href) ? $href : $this->main.$href;

            // Fetch the article page
            $ql = QueryList::get($target);
            $rt = [];
            foreach($conPre as $rule){
                if(!is_array($rule) || !isset($rule[0], $rule[1], $rule[2])){
                    throw new \InvalidArgumentException('Content rule must be [name, selector, extractor].');
                }
                $type = $rule[2];
                if(!in_array($type, self::$allowedExtractors, true)){
                    throw new \InvalidArgumentException('Unsupported extractor in content rule.');
                }
                $rt[$rule[0]] = $ql->find($rule[1])->$type();
            }
            // Progress output: the title is scraped from a remote page, so
            // escape it. A missing title prints an empty line.
            $title = (isset($rt['title']) && is_scalar($rt['title'])) ? (string) $rt['title'] : '';
            echo htmlspecialchars($title, ENT_QUOTES, 'UTF-8').'<br>';

            array_push($data,$rt);
        }
        return $data;
    }

    /**
     * Turn "name:selector:extractor,..." into [name, selector, extractor] triples.
     *
     * The first segment is the field name and the last one the extractor;
     * everything in between is the selector, so selectors may contain ':'.
     * Selectors containing ',' cannot be expressed in this format.
     *
     * @param  string $conPre Raw rule string from the form.
     * @return array|false    Parsed rules, or false when any rule is malformed
     *                        or names an extractor outside the allow-list.
     */
    private function parseRules($conPre){
        $rules = [];
        foreach(explode(',', $conPre) as $raw){
            $parts = explode(':', trim($raw));
            if(count($parts) < 3){
                return false;
            }
            $name = trim(array_shift($parts));
            $type = trim(array_pop($parts));
            $selector = trim(implode(':', $parts));
            if($name === '' || $selector === '' || !in_array($type, self::$allowedExtractors, true)){
                return false;
            }
            $rules[] = [$name, $selector, $type];
        }
        return $rules;
    }
}



// Entry point: run the collector only for a form submission. A plain GET (or a
// CLI call) carries no form data, so report the missing parameters instead of
// starting a run with an empty parameter set.
if (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST') {
    $post = $_POST;
    $r = new Collection;
    $req = $r->main($post);
} else {
    echo "采集失败,缺少参数";
}

 

posted @ 2021-12-04 15:28  时间掉飞机  阅读(84)  评论(0编辑  收藏  举报