PHP 之 QueryList 网页采集框架
一、文档
http://www.querylist.cc/docs/guide/v3/overview
二、示例
//接口解释
QueryList::Query(采集的目标页面,采集规则[,区域选择器][,输出编码][,输入编码][,是否移除头部])
//采集规则
$rules = array(
'规则名' => array('jQuery选择器','要采集的属性'[,"标签过滤列表"][,"回调函数"]),
'规则名2' => array('jQuery选择器','要采集的属性'[,"标签过滤列表"][,"回调函数"]),
..........
[,"callback"=>"全局回调函数"]
);
//注:方括号括起来的参数可选
1、内容过滤
<?php
/**
 * Example 1: content filtering with QueryList::Query().
 *
 * Every rule has the shape [selector, attribute, tag filter list].
 * As the four cases below illustrate:
 *   - an entry prefixed with "-" removes the matched element together with
 *     its content (the entry may be a selector such as ".tt");
 *   - a plain tag name combined with the "html" attribute strips the tag but
 *     keeps its inner content;
 *   - a plain tag name combined with the "text" attribute keeps that tag in
 *     the otherwise plain-text result.
 */
require 'phpQuery.php';
require 'QueryList.php';

use QL\QueryList;

// Sample markup shared by all four filtering cases.
$html = <<<STR
<div id="demo">
    xxx
    <span class="tt">yyy</span>
    <span>zzz</span>
    <p>nnn</p>
</div>
STR;

// Only the bare text "xxx" is wanted: drop <p> and <span> with their content.
$data = QueryList::Query($html, ['content' => ["#demo", "text", "-p -span"]])->data;
var_dump($data);

// Strip the <p> tag itself but keep the content inside it.
$data = QueryList::Query($html, ['content' => ["#demo", "html", "p"]])->data;
var_dump($data);

// Get plain text, but keep the <p> tag.
$data = QueryList::Query($html, ['content' => ["#demo", "text", "p"]])->data;
var_dump($data);

// Remove the element with class "tt" entirely and strip the <p> tag while
// keeping its content. Stripping a tag but keeping its content is the "html"
// form (see the second case); "text" would preserve the <p> tag instead.
$data = QueryList::Query($html, ['content' => ["#demo", "html", "-.tt p"]])->data;
var_dump($data);
2、在回调函数中递归解析多级 DOM
<?php
/**
 * Example 2: multi-level (recursive) DOM parsing inside a getData() callback.
 *
 * The outer query splits the page into "ul > li" regions and collects each
 * region's <h3> text plus the raw HTML of its ".list" container. The callback
 * passed to getData() then runs a second QueryList::Query() over that HTML
 * fragment to extract the individual ".item" texts.
 */
require 'phpQuery.php';
require 'QueryList.php';

use QL\QueryList;

// Sample markup: a list whose entries each contain a nested item list.
$html = <<<STR
<div id="demo">
    <ul>
        <li>
            <h3>xxx</h3>
            <div class="list">
                <div class="item">item1</div>
                <div class="item">item2</div>
            </div>
        </li>
        <li>
            <h3>xxx2</h3>
            <div class="list">
                <div class="item">item12</div>
                <div class="item">item22</div>
            </div>
        </li>
    </ul>
</div>
STR;

// Fetch the flat list: one row per <li>, containing only the <h3> text.
$data = QueryList::Query($html, ['h3' => ['h3', 'text']], 'ul > li')->data;
// Show this result before $data is reused below; otherwise it is discarded.
var_dump($data);

// Recursive multi-level parsing: grab each row's ".list" HTML, then re-parse
// that fragment inside the callback to turn it into a list of item texts.
$rules = [
    'h3'   => ['h3', 'text'],
    'list' => ['.list', 'html'],
];
$data = QueryList::Query($html, $rules, 'ul > li')->getData(function ($item) {
    // Replace the raw HTML fragment with the parsed ".item" rows.
    $item['list'] = QueryList::Query($item['list'], ['item' => ['.item', 'text']])->data;
    return $item;
});
var_dump($data);