PHP 之 QueryList 网页采集框架

一、文档

http://www.querylist.cc/docs/guide/v3/overview

二、示例

//接口解释
QueryList::Query(采集的目标页面,采集规则[,区域选择器][,输出编码][,输入编码][,是否移除头部])
//采集规则
$rules = array(
'规则名' => array('jQuery选择器','要采集的属性'[,"标签过滤列表"][,"回调函数"]),
'规则名2' => array('jQuery选择器','要采集的属性'[,"标签过滤列表"][,"回调函数"]),
..........
[,"callback"=>"全局回调函数"]
);
//注:方括号括起来的参数可选

1、内容过滤

<?php
// Content-filtering demo: the optional third element of a rule is a tag filter list
// that controls which tags are removed from / kept in the collected value.
require 'phpQuery.php';
require 'QueryList.php';

use QL\QueryList;

$html =<<<STR
    <div id="demo">
        xxx
        <span class="tt">yyy</span>
        <span>zzz</span>
        <p>nnn</p>
    </div>
STR;

// Only the bare text "xxx" is wanted: drop <p> and <span> together with their content.
$data = QueryList::Query($html, ['content' => ["#demo", "text", "-p -span"]])->data;
var_dump($data);

// "html" mode: strip the <p> tag itself but keep the content inside it.
$data = QueryList::Query($html, ['content' => ["#demo", "html", "p"]])->data;
var_dump($data);

// "text" mode: collect plain text but keep the <p> tag.
$data = QueryList::Query($html, ['content' => ["#demo", "text", "p"]])->data;
var_dump($data);

// Remove elements with class "tt" entirely and strip the <p> tag while keeping its content.
// This needs "html" mode: in "text" mode a bare "p" filter keeps the tag (see the query above).
$data = QueryList::Query($html, ['content' => ["#demo", "html", "-.tt p"]])->data;
var_dump($data);

2、回调函数中传参数

<?php
// Nested-parsing demo: collect a list first, then re-parse one field of every
// collected item inside a getData() callback.
require 'phpQuery.php';
require 'QueryList.php';

use QL\QueryList;

$html =<<<STR
    <div id="demo">
        <ul>
            <li>
              <h3>xxx</h3>
              <div class="list">
                <div class="item">item1</div>
                <div class="item">item2</div>
              </div>
            </li>

             <li>
              <h3>xxx2</h3>
              <div class="list">
                <div class="item">item12</div>
                <div class="item">item22</div>
              </div>
            </li>

        </ul>
    </div>
STR;

// Get the list: one result row per "ul > li" range, holding the <h3> text.
$data = QueryList::Query($html, ['h3' => ['h3', 'text']], 'ul > li')->data;
// Show this intermediate result; $data is overwritten by the next query.
var_dump($data);

// Recursive multi-level DOM parsing: collect the raw HTML of ".list" for each row,
// then run a second query on that HTML inside the callback to extract the items.
$data = QueryList::Query($html, ['h3' => ['h3', 'text'], 'list' => ['.list', 'html']], 'ul > li')->getData(function ($item) {
    // Replace the raw ".list" HTML with the result of the nested query.
    $item['list'] = QueryList::Query($item['list'], ['item' => ['.item', 'text']])->data;
    return $item;
});
var_dump($data);

 

posted @ 2023-04-25 16:22  样子2018  阅读(99)  评论(0)  编辑  收藏  举报