PHP QueryList采集文章
1.安装querylist
官网手册:http://www.querylist.cc/docs/guide/v4/overview
如果是在本地环境测试,先在测试文件夹下执行 composer init 初始化项目,再通过 composer 安装 QueryList(composer require jaeger/querylist)。
2.index.html
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>信息采集</title>
    <!-- Embedded CSS styles -->
    <style>
        /* Fixed-size panel centred in the viewport via absolute positioning + auto margins. */
        .form {
            text-align: center;
            border: solid 10px #BCF0F0;
            border-radius: 20px;
            width: 800px;
            height: 700px;
            margin: auto;
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            bottom: 0;
        }
        .form p input {
            width: 40%;
            margin: 10px;
            padding: 10px;
            border-radius: 5px;
        }
        /* A shared label width makes the left edges of all inputs line up. */
        .label {
            display: inline-block;
            padding: 3px 6px;
            text-align: right;
            width: 15%;
        }
        #login_click {
            margin-top: 32px;
            height: 40px;
            display: inline-block;
            cursor: pointer;
            text-decoration: none;
            background: #2f435e;
            color: #f2f2f2;
            padding: 10px 30px 10px 30px;
            font-size: 16px;
            font-family: 微软雅黑,宋体,Arial,Helvetica,Verdana,sans-serif;
            font-weight: bold;
            border-radius: 3px;
            -webkit-transition: all linear 0.30s;
            -moz-transition: all linear 0.30s;
            transition: all linear 0.30s;
        }
        /* The button is an <input>, so the hover rule must target the element itself. */
        #login_click:hover {
            background: #385f9e;
        }
    </style>
</head>
<body>
    <div class="form">
        <h1>信息采集</h1>
        <br>
        <form action="index.php" method="post">
            <!-- Each label's "for" matches the id of its own input so clicking the label focuses the field. -->
            <p>
                <label for="main" class="label">主域名:</label>
                <input type="text" id="main" name="main" value="https://www.wealink.com" required>
            </p>
            <p>
                <label for="url" class="label">列表页链接:</label>
                <input type="text" id="url" name="url" value="https://www.wealink.com/know/?page=" required>
            </p>
            <p>
                <label for="listPre" class="label">列表页规则:</label>
                <input type="text" id="listPre" name="listPre" value=".art-ask li .art-askbt a" required>
            </p>
            <p>
                <label for="conPre" class="label">内容页规则:</label>
                <input type="text" id="conPre" name="conPre" value="title:h1:text,problem:.art-flk-text p:text,answer:.art-flq-text:html" required>
            </p>
            <p>
                <label for="num" class="label">采集页数:</label>
                <input type="number" id="num" name="num" min="1" step="1" value="1" required>
            </p>
            <!-- Submit button -->
            <input type="submit" id="login_click" value="采 集" class="btn" />
        </form>
    </div>
</body>
</html>
3.index.php:接收表单信息并进行采集
<?php
/**
 * @Author: Hiker
 * @Date:   2021-11-30 16:55:34
 * @Last:   Modified by: I will never know what the next difficulty is
 */
require_once 'vendor/autoload.php';

use QL\QueryList;

/**
 * Form-driven article collector built on QueryList.
 *
 * Walks a paginated list page, follows every link matched by the list
 * selector, extracts the configured fields from each content page and
 * echoes progress (article titles and list-page URLs) as HTML.
 */
class Collection
{
    /** Request fields that must all be present and non-empty. */
    const REQUIRED_FIELDS = ['main', 'url', 'listPre', 'conPre', 'num'];

    /**
     * Extraction methods a rule may name. The method name comes from the
     * request, so it is restricted to a fixed list instead of being called blindly.
     */
    const EXTRACTORS = ['text', 'html', 'htmlOuter'];

    /**
     * Prefix prepended to relative links found on the list page.
     * Declared explicitly because run() assigns it and show() reads it.
     *
     * @var string
     */
    public $main = '';

    /**
     * Entry point: validates the submitted form and starts the collection.
     *
     * Expected keys: main (site prefix), url (list URL without the page number),
     * listPre (list link selector), conPre (content rules, "name:selector:type"
     * separated by commas) and num (number of pages to collect).
     *
     * @param array $post Submitted form fields.
     * @return mixed False when validation fails, otherwise the value returned by run().
     */
    public function main($post)
    {
        if (!is_array($post)) {
            echo "采集失败,缺少参数";
            return false;
        }

        // Check the required keys themselves rather than whatever happened to be
        // posted, so an empty or partial request is rejected cleanly.
        $fields = [];
        foreach (self::REQUIRED_FIELDS as $field) {
            if (!isset($post[$field]) || !is_string($post[$field]) || trim($post[$field]) === '') {
                echo "采集失败,缺少参数";
                return false;
            }
            $fields[$field] = trim($post[$field]);
        }

        $rules = $this->parseRules($fields['conPre']);
        $validNum = preg_match('/^\d+$/', $fields['num']) === 1 && (int) $fields['num'] >= 1;
        $validUrls = $this->isHttpUrl($fields['main']) && $this->isHttpUrl($fields['url']);

        if ($rules === null || !$validNum || !$validUrls) {
            echo "采集失败,参数无效";
            return false;
        }

        // Start collecting.
        return $this->run($fields['url'], $fields['listPre'], $rules, $fields['main'], (int) $fields['num']);
    }

    /**
     * Collects the list pages from page $num down to page 1.
     *
     * For every page the links matched by $listPre are passed to show(), then the
     * list-page URL is echoed followed by an <hr>. When $num is below 1 a single
     * page (the one numbered $num) is still processed.
     *
     * @param string     $url     List URL prefix; the page number is appended to it.
     * @param string     $listPre Selector matching the article links on a list page.
     * @param array      $conPre  Content rules, each a [name, selector, type] triple.
     * @param string     $main    Site prefix for relative links; ignored when empty.
     * @param int|string $num     Highest page number to collect (cast to int).
     * @return mixed The link collection of the last page processed.
     */
    public function run($url, $listPre, $conPre, $main, $num)
    {
        if ($main) {
            $this->main = $main;
        }

        $page = (int) $num;
        do {
            $pageUrl = $url . $page;
            $urls = QueryList::get($pageUrl)->find($listPre)->map(function ($row) {
                return $row->href;
            });

            $this->show($urls, $conPre);

            // Progress marker; the URL is built from user input, so escape it.
            echo htmlspecialchars($pageUrl, ENT_QUOTES, 'UTF-8') . '<hr>';

            $page--;
        } while ($page >= 1);

        return $urls;
    }

    /**
     * Fetches every content page and extracts the configured fields.
     *
     * Echoes the escaped "title" field (empty when no such rule exists) followed
     * by <br> for each page. Empty links and malformed rules are skipped.
     *
     * @param iterable $urls   Links taken from a list page.
     * @param array    $conPre Content rules, each a [name, selector, type] triple.
     * @return array One associative array (field name => extracted value) per page.
     */
    public function show($urls, $conPre)
    {
        $data = [];

        foreach ($urls as $href) {
            if (!is_string($href) || $href === '') {
                continue;
            }

            // Fetch the content page.
            $ql = QueryList::get($this->resolveUrl($href));

            $row = [];
            foreach ($conPre as $rule) {
                if (!is_array($rule)
                    || !isset($rule[0], $rule[1], $rule[2])
                    || !in_array($rule[2], self::EXTRACTORS, true)
                ) {
                    continue;
                }
                $type = $rule[2];
                $row[$rule[0]] = $ql->find($rule[1])->$type();
            }

            // Scraped content is untrusted; escape it before echoing.
            echo htmlspecialchars((string) ($row['title'] ?? ''), ENT_QUOTES, 'UTF-8') . '<br>';

            $data[] = $row;
        }

        return $data;
    }

    /**
     * Parses "name:selector:type,name:selector:type" into rule triples.
     *
     * The name ends at the first colon and the type starts after the last one,
     * so the selector in between may itself contain colons (e.g. "li:first-child").
     * Rules are still separated by commas, so a selector cannot contain a comma.
     *
     * @param string $spec Raw rule string from the form.
     * @return array|null List of [name, selector, type], or null when the string
     *                    holds no rule or any rule is malformed.
     */
    private function parseRules($spec)
    {
        $rules = [];

        foreach (explode(',', $spec) as $rule) {
            $rule = trim($rule);
            if ($rule === '') {
                // Tolerate stray or trailing commas.
                continue;
            }

            $first = strpos($rule, ':');
            $last = strrpos($rule, ':');
            if ($first === false || $first === $last) {
                return null;
            }

            $name = trim(substr($rule, 0, $first));
            $selector = trim(substr($rule, $first + 1, $last - $first - 1));
            $type = trim(substr($rule, $last + 1));

            if ($name === '' || $selector === '' || !in_array($type, self::EXTRACTORS, true)) {
                return null;
            }

            $rules[] = [$name, $selector, $type];
        }

        return $rules === [] ? null : $rules;
    }

    /**
     * Tells whether the value starts with an http:// or https:// scheme.
     *
     * @param string $value Candidate URL.
     * @return bool
     */
    private function isHttpUrl($value)
    {
        return preg_match('#^https?://#i', $value) === 1;
    }

    /**
     * Turns a link from the list page into the URL to fetch: absolute http(s)
     * links are used as they are, anything else is appended to the site prefix.
     *
     * @param string $href Link value taken from the list page.
     * @return string
     */
    private function resolveUrl($href)
    {
        return $this->isHttpUrl($href) ? $href : $this->main . $href;
    }
}

$post = $_POST;
$r = new Collection;
$req = $r->main($post);