snatch

把此代码复制到本地,更换 cookie 后即可抓取。
 // Page counter read and incremented by index() while it walks the list pages.
 // NOTE(review): index() stops once this counter reaches 100, so with an initial
 // value of 100 only page 100 is fetched — confirm the intended starting page.
 private $p = 100;   // the procuratorate site has at most 100 list pages

    //抓取列表
    /**
     * Crawl the paginated list pages and store every detail-page link found.
     *
     * Starting from the current value of $this->p, this fetches
     * /html/gj/jl/zjxflws/<p>.html (or the directory index when p is 0),
     * collects every `a href="..."` target that contains "/html/2" and inserts
     * it into the `jcy` table together with the page number. It sleeps five
     * seconds between pages, and prints "finish" and terminates the script
     * once page 100 has been processed.
     *
     * NOTE(review): the loop starts at whatever $this->p is initialised to; if
     * that is already 100, only that single page is fetched — confirm the
     * intended starting page.
     *
     * The Cookie header is a captured browser session and must be replaced
     * with a fresh one before running.
     *
     * @return void Never returns normally; the script exits after the last page.
     */
    public function index(){

        set_time_limit(0);

        // Highest list page to crawl.
        $lastPage = 100;

        // Address prefixes used to build a random value for the
        // X-FORWARDED-FOR / CLIENT-IP request headers.
        $ipPrefixes = array(
            '61.157',  // Sichuan
            '61.156',  // Shandong
            '182.97',  // Jiangxi
            '111.17',  // Qingdao
            '219.148', // Hebei / Liaoning
            '218.82',  // Shanghai
            '175.12',  // Hunan
            '221.220', // Beijing
            '123.125', // Beijing
            '14.16',
        );

        while (true) {

            // Page 0 is the directory index; every other page is "<n>.html".
            $html = $this->p > 0 ? $this->p.'.html' : '';

            $ip = $ipPrefixes[mt_rand(0, count($ipPrefixes) - 1)]
                . '.' . mt_rand(1, 254) . '.' . mt_rand(1, 254);

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, 'http://www.ajxxgk.jcy.gov.cn/html/gj/jl/zjxflws/'.$html);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_HEADER, false);
            // Let cURL decode compressed responses (the value must not carry quotes).
            curl_setopt($ch, CURLOPT_ENCODING, 'gzip, deflate');
            curl_setopt($ch, CURLOPT_HTTPHEADER, array(
                "X-FORWARDED-FOR: ".$ip,
                "CLIENT-IP: ".$ip,
                "Host: www.ajxxgk.jcy.gov.cn",
                "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
                "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
                "Accept-Encoding: gzip, deflate",
                "Cookie: __jsluid=41f16405b9462bf65ea12ec337f6884e; PHPSESSID=7svf9vl5bvrsu424ro01gkq4o6; Hm_lvt_2e64cf4f6ff9f8ccbe097650c83d719e=1531466157; Hm_lpvt_2e64cf4f6ff9f8ccbe097650c83d719e=1531469833; sYQDUGqqzHpid=page_0; sYQDUGqqzHtid=tab_0; __jsl_clearance=1531469779.42|0|ixCK%2F5i3LhsoOqHgzJOh%2BkgLS58%3D",
                "Connection: keep-alive",
                "Upgrade-Insecure-Requests: 1",
            ));
            $file_contents = curl_exec($ch);
            $curlError = curl_error($ch);
            curl_close($ch);

            // Start every page with an empty link list so that a failed or
            // link-less page can never re-insert the previous page's links.
            $links = array();
            if ($file_contents === false) {
                echo 'fetch failed for page '.$this->p.': '.$curlError."\n";
            } elseif (preg_match_all('/(a href="(.+?)")/', $file_contents, $match)) {
                $links = $match[2];
            }

            foreach ($links as $link) {
                // Only links whose path contains "/html/2" are stored.
                if (strpos($link, '/html/2') !== false) {
                    M('jcy')->add(array('url'=>$link,'page'=>$this->p));
                }
            }

            // ">=" rather than "==" so a counter that starts beyond the last
            // page cannot loop forever.
            if ($this->p >= $lastPage) {
                echo 'finish';
                exit;
            }
            $this->p++;
            sleep(5);
        }
    }


    //抓取详情
    /**
     * Fetch every collected detail page and store its title and paragraph text.
     *
     * Reads the distinct `url` values from the local `jcy` table, skips those
     * already present in the second `jcy` model (opened with an explicit
     * connection string), downloads each remaining page and saves:
     *   - title   : <title> text with the site suffix removed
     *   - txt     : all non-empty `<p style="...">` paragraphs joined by "_|_"
     *   - case_no : the third non-empty paragraph (null when there are fewer)
     *   - url     : the page path
     *
     * If a page yields no paragraphs at all (including a failed download), the
     * offending URL is printed and the script stops, so the session cookie can
     * be refreshed before continuing.
     *
     * NOTE(review): database credentials are hard-coded in the connection
     * string below; move them to configuration and rotate them.
     *
     * @return void
     */
    public function detail(){

        set_time_limit(0);

        $m = M('jcy');
        $list = $m->group('url')->getField('url',true);
        if (empty($list)) {
            // Nothing collected yet; avoid iterating over a non-array result.
            return;
        }

        $jcy = M('jcy','','mysql://jufa_slave:KYi2303mdyTdyh3@123.56.183.226:3312/jianchayuan');

        // Address prefixes used to build a random value for the
        // X-FORWARDED-FOR / CLIENT-IP request headers.
        $ipPrefixes = array(
            '61.157',  // Sichuan
            '61.156',  // Shandong
            '182.97',  // Jiangxi
            '111.17',  // Qingdao
            '219.148', // Hebei / Liaoning
            '218.82',  // Shanghai
            '175.12',  // Hunan
            '221.220', // Beijing
            '123.125', // Beijing
            '14.16',
        );

        foreach ($list as $url) {

            // Skip pages that were already stored.
            if ($jcy->where(array('url'=>$url))->find()) {
                continue;
            }

            $ip = $ipPrefixes[mt_rand(0, count($ipPrefixes) - 1)]
                . '.' . mt_rand(1, 254) . '.' . mt_rand(1, 254);

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, 'http://www.ajxxgk.jcy.gov.cn'.$url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_HEADER, false);
            // Let cURL decode compressed responses (the value must not carry quotes).
            curl_setopt($ch, CURLOPT_ENCODING, 'gzip, deflate');
            // The If-Modified-Since / If-None-Match headers of the captured
            // browser request are intentionally not sent: a conditional request
            // may be answered with a body-less 304, and the full page is needed.
            curl_setopt($ch, CURLOPT_HTTPHEADER, array(
                "X-FORWARDED-FOR: ".$ip,
                "CLIENT-IP: ".$ip,
                "Host: www.ajxxgk.jcy.gov.cn",
                "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
                "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
                "Accept-Encoding: gzip, deflate",
                "Referer: http://www.ajxxgk.jcy.gov.cn$url",
                "Cookie: __jsluid=41f16405b9462bf65ea12ec337f6884e; Hm_lvt_2e64cf4f6ff9f8ccbe097650c83d719e=1531466157,1531471714,1531529424; sYQDUGqqzHpid=page_0; sYQDUGqqzHtid=tab_0; Hm_lpvt_2e64cf4f6ff9f8ccbe097650c83d719e=1531555369; PHPSESSID=pqgshbduheq6ro8rj48suq97b6; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1531534004; Hm_lpvt_d7682ab43891c68a00de46e9ce5b76aa=1531534004; sYQDUGqqzHsearch_history=%u957F%u9AD8%u65B0%u68C0%u5211%u8BC9%u30142018%u301528%u53F7%7C%2C%u674E%u4E1C%u5219%u804C%u52A1%u4FB5%u5360%u6848%7C; __jsl_clearance=1531555363.037|0|ABiNEmcyF9rTF7U%2FlZUcdgq01Sk%3D",
                "Connection: keep-alive",
                "Upgrade-Insecure-Requests: 1",
            ));
            $file_contents = curl_exec($ch);
            curl_close($ch);
            if ($file_contents === false) {
                // Treat a transport failure like an empty page (handled below).
                $file_contents = '';
            }

            // Page title without the fixed site suffix.
            $title = '';
            if (preg_match('/(<title>(.+?)<\/title>)/', $file_contents, $match)) {
                $title = $match[2];
            }
            $title = str_replace('- 法律文书公开 - 人民检察院案件信息公开网','',$title);

            // Reset per page so that a page without paragraphs can never reuse
            // the previous page's content.
            $paragraphs = array();
            if (preg_match_all('/<p style="(.+?)">(.*?)<\/p>/is', $file_contents, $match)) {
                $paragraphs = $match[2];
            }

            if (empty($paragraphs)) {
                // Most likely the cookie expired or the request was blocked.
                echo $url;
                exit;
            }

            // Keep only paragraphs that still contain text after tag and
            // whitespace removal.
            $arr = array();
            foreach ($paragraphs as $paragraph) {
                $str = $this->trimall(strip_tags($paragraph));
                if (!empty($str)) {
                    $arr[] = $str;
                }
            }

            $add = array();
            $add['case_no'] = isset($arr[2]) ? $arr[2] : null;
            $add['title'] = $title;
            $add['txt'] = implode('_|_',$arr);
            $add['url'] = $url;

            $jcy->add($add);

            // Throttle requests.
            sleep(5);
        }
    }

    /**
     * Strip HTML tags and remove every whitespace character from a string.
     *
     * Removes ASCII space, tab, LF and CR, the `&nbsp;` / `&#xa0;` entities,
     * and also the literal no-break space (U+00A0) and ideographic space
     * (U+3000), which the entity-only list let through.
     * Assumes the input is UTF-8 encoded — TODO confirm for the scraped pages.
     *
     * @param string $str Raw text or HTML fragment.
     * @return string The text with tags and all whitespace removed.
     */
    function trimall($str){
        $text = strip_tags((string) $str);
        $whitespace = array(
            ' ',
            "\t",
            "\n",
            "\r",
            "\xC2\xA0",     // U+00A0 no-break space (UTF-8)
            "\xE3\x80\x80", // U+3000 ideographic space (UTF-8)
            '&nbsp;',
            '&#xa0;',
        );
        return str_replace($whitespace, '', $text);
    }
 

 

 

 

抓取官网

<?php
//https://www.fajuhe.com/version2/01.script/ws_parse/CurlJufaanli.php

include_once '../conf/mysql_conn.php';
include_once '../conf/Utils.php';
include_once '../ws_parse/function.wsp.php';
// Script bootstrap: declare the output charset, fix the timezone and report
// fatal errors only.
print '<meta http-equiv="Content-Type" content="text/html; charset=utf8">';
date_default_timezone_set('Asia/Shanghai');
error_reporting(E_ERROR);

// Placeholder keyword list; nothing below reads it.
$keyword = array('');

// Issue the search request 200 times in a row.
$i = 0;
while ($i < 200) {
    detail();
    $i++;
}

/**
 * Send one search request to the jufaanli search endpoint.
 *
 * Posts the fixed search form (page 1, keyword "莱姆顿") as an
 * application/x-www-form-urlencoded body together with a captured browser
 * session cookie, which must be refreshed before running.
 *
 * Fixes over the first version:
 *   - the form is encoded with http_build_query(), so the body matches the
 *     declared Content-Type (an array would be sent as multipart/form-data);
 *   - no hard-coded Content-Length, cURL computes it from the real body;
 *   - the cookie header no longer carries a doubled "Cookie: " prefix;
 *   - the execution time limit is lifted before the request, not after it;
 *   - the response is returned instead of being discarded.
 *
 * @return string|false Response body, or false when the transfer failed.
 */
function detail()
{
    // Must happen before the potentially slow request.
    set_time_limit(0);

    $url = "http://spider.jufaanli.com/home/search/searchJson";

    // NOTE(review): searchTime looks like a millisecond timestamp captured from
    // a browser request — confirm whether the server expects a current value.
    $curlPost = array(
        'page' => '1',
        'searchTime' => 1542337827703,
        'searchNum' => 1,
        'nowReason' => 20,
        'sortType' => 'caseWeight',
        'keyword' => '莱姆顿',
        'TypeKey' => '1:莱姆顿',
    );

    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER => false,
        // Let cURL decode compressed responses.
        CURLOPT_ENCODING => 'gzip, deflate',
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => http_build_query($curlPost),
        CURLOPT_HTTPHEADER => array(
            "Connection: keep-alive",
            "Accept: application/json, text/javascript, */*; q=0.01",
            "Origin: http://spider.jufaanli.com",
            "X-Requested-With: XMLHttpRequest",
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
            "Content-Type: application/x-www-form-urlencoded; charset=UTF-8",
            "Referer: http://spider.jufaanli.com/search2?TypeKey=1%3A%E8%8E%B1%E5%A7%86%E9%A1%BF",
            "Accept-Encoding: gzip, deflate",
            "Accept-Language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6",
            "Cookie: t=4ed3f2e39fada12f15a1ee1b265f9a0b; BJYSESSION=8r562mhj2livqgmlebog2abha1; Hm_lvt_7d935fee641e9bdd8fd6b28e9a2b62dc=1542266081,1542266094,1542266694,1542329761; is_remember=1; refer_url=http%3A%2F%2Fspider.jufaanli.com%2Fsearch2%3FTypeKey%3D1%3A%E8%8E%B1%E5%A7%86%E9%A1%BF; Hm_lpvt_7d935fee641e9bdd8fd6b28e9a2b62dc=1542337815; login_time=2018-11-16+11%3A10%3A18; tf=fb9461bb0083ee407534bbc4cc8b7b82",
        ),
    );

    $ch = curl_init();
    curl_setopt_array($ch, $options);

    $data = curl_exec($ch);

    // Release the cURL handle.
    curl_close($ch);

    return $data;
}

 

 

 

 

 

使用代理(https://www.cnblogs.com/burningc/p/8794584.html):

代理ip使用网站:http://www.89ip.cn/index_13.html

<?php
// Demo: fetch a URL through an HTTP proxy, then print the response body and
// the cURL error string (empty when the transfer succeeded).
$requestUrl="http://39.105.47.187/";

$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL            => $requestUrl,
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_CONNECTTIMEOUT => 10,
    // Overall limit so an unresponsive proxy cannot hang the script forever.
    CURLOPT_TIMEOUT        => 30,
    CURLOPT_PROXYTYPE      => CURLPROXY_HTTP,    // plain HTTP proxy
    CURLOPT_PROXY          => "221.210.120.153", // proxy server address
    CURLOPT_PROXYPORT      => 54402,             // proxy server port
    CURLOPT_PROXYAUTH      => CURLAUTH_BASIC,    // proxy authentication scheme
    // For an authenticated proxy also set:
    // CURLOPT_PROXYUSERPWD => "user:password",
));

$file_contents = curl_exec($ch);
// Read the error before the handle is released.
$a = curl_error($ch);
curl_close($ch);

echo "the result is ".($file_contents);
var_dump($a);

 

posted on 2018-10-07 17:11  ziyi_ang  阅读(1008)  评论(0)  编辑  收藏  举报

导航