百度收录链接抓取小程序

// Crawl Baidu "site:" result pages, resolve every redirect link found on them
// via getLists(), and store the collected output in silian.txt next to this script.
set_time_limit(0); // the crawl issues many HTTP requests, so lift the execution time limit
header("Content-type:text/html;charset=utf-8");

$updatePoint = date("Y-m-d");
$xmlDatas = '';

// Matches the quoted Baidu redirect links ("http://www.baidu.com/link?url=...")
// together with the single whitespace character that follows the closing quote.
// The pattern does not change between pages, so it is built once.
$pat = '|\\"http://www.baidu.com/link\?url=?([^>]*)\\"\s|U';

for ($i = 0; $i < 76; $i++) {
    // The "pn" query parameter is advanced in steps of 10 per requested page.
    $page = $i * 10;
    $conts = file_get_contents("http://www.baidu.com/s?wd=site%3Awww.xxxx.com%20%E4%B9%90%E5%A4%A9%E5%A0%82&pn={$page}&oq=site%3Awww.xxxx.com%20%E4%B9%90%E5%A4%A9%E5%A0%82&ie=utf-8&rsv_idx=1&rsv_pq=aff4775f00063733&rsv_t=ff065MbpZuOoe%2B%2BV4iOkvVuzeSXd1n2FRBQwnnwPHtpsy%2F7pPFaTfcrWm4M&f=8&rsv_bp=1&tn=baidu");

    if ($conts === false) {
        // The request failed: skip this page instead of handing a non-string
        // to getLists(), and keep whatever was collected so far.
        continue;
    }

    $xmlDatas .= getLists($pat, $conts, $updatePoint);
}

// file_put_contents() creates the file when it does not exist and truncates it
// otherwise, so no separate fopen()/fwrite() branch is needed.
$targetFile = dirname(__FILE__) . "/silian.txt";
if (file_put_contents($targetFile, $xmlDatas) === false) {
    trigger_error("Unable to write {$targetFile}", E_USER_WARNING);
}
    
    /**
     * Extract redirect links from a result page and resolve each of them to
     * the URL announced in the "Location" header of its HTTP response.
     *
     * @param string $pattern     PCRE pattern; each full match is expected to be a
     *                            double-quoted URL followed by one extra character.
     * @param string $contents    HTML of the result page to scan.
     * @param string $updatePoint Not used by this function; kept so existing callers
     *                            keep working.
     * @return string Resolved URLs, each terminated by "\r\n"; an empty string when
     *                nothing matched or nothing could be resolved.
     */
    function getLists($pattern, $contents, $updatePoint){
        // Return an empty result rather than exit(): terminating here would
        // discard everything the caller has collected before it is saved.
        if (!is_string($contents) || $contents === '') {
            return '';
        }
        if (!preg_match_all($pattern, $contents, $matches)) {
            return '';
        }

        $xmlData = '';
        // array_unique() drops links that appear more than once on the page.
        foreach (array_unique($matches[0]) as $value) {
            // Strip the leading quote, the single trailing character captured
            // after the closing quote, and finally the closing quote itself.
            $value = trim(substr(trim($value, '"'), 0, -1), '"');

            $target = resolveRedirectLocation($value);
            if ($target !== '') {
                $xmlData .= $target . "\r\n";
            }
        }

        return $xmlData;
    }

    /**
     * Request a URL over plain HTTP and return the value of its "Location"
     * response header, or an empty string when the request fails or the
     * response carries no such header.
     */
    function resolveRedirectLocation($url){
        $info = parse_url($url);
        if (!is_array($info) || empty($info['host'])) {
            return '';
        }

        $port = isset($info['port']) ? $info['port'] : 80;
        $requestTarget = isset($info['path']) ? $info['path'] : '/';
        if (isset($info['query'])) {
            $requestTarget .= '?' . $info['query'];
        }

        $fp = fsockopen($info['host'], $port, $errno, $errstr, 30);
        if ($fp === false) {
            // Connection failed; skip this link instead of writing to an invalid handle.
            return '';
        }
        // The fsockopen() timeout only covers connecting; bound the reads as well.
        stream_set_timeout($fp, 30);

        fputs($fp, "GET {$requestTarget} HTTP/1.0" . "\r\n");
        fputs($fp, "Host: {$info['host']}" . "\r\n");
        fputs($fp, "Connection: close" . "\r\n");
        fputs($fp, "\r\n");

        $location = '';
        // Read whole header lines (no length cap, so long URLs are not cut off).
        while (($line = fgets($fp)) !== false) {
            $line = rtrim($line, "\r\n");
            if ($line === '') {
                // A blank line ends the headers; the body is of no interest.
                break;
            }
            // Header names are case-insensitive; anchor at the line start so
            // headers such as "Content-Location" are not mistaken for it.
            if (stripos($line, 'Location:') === 0) {
                $location = trim(substr($line, strlen('Location:')));
            }
        }
        fclose($fp);

        return $location;
    }

 此抓取程序主要用于查询百度已收录的网址,并没有直接按关键词来查询收录情况。

posted @ 2016-06-20 14:01  haishashou  阅读(1072)  评论(0)  编辑  收藏  举报