php利用curl获取网页title内容

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
 * Usage example:
 *   $html  = curl_get_file_contents($url);
 *   $title = get_title_contents($html);
 *   var_dump($title);
 */
/**
 * Fetch the body of a URL with cURL, following HTTP redirects manually.
 *
 * Redirects (301, 302, 303, 307, 308) are followed up to three times per
 * call. The redirect target is taken from cURL itself
 * (CURLINFO_REDIRECT_URL), so relative Location headers, lowercase header
 * names (HTTP/2) and non-default ports are handled correctly. Only http and
 * https targets are followed.
 *
 * @param string $url     URL to fetch.
 * @param string $referer Value for the Referer request header; it is sent
 *                        again on every redirected request.
 *
 * @return string|false The response body on success. The body of the
 *                      redirect response itself when a redirect status
 *                      carries no usable target. False when the transfer
 *                      fails, the redirect limit is exceeded, or a redirect
 *                      points at a non-HTTP(S) scheme.
 */
function curl_get_file_contents($url,$referer='') {
    // Per-call redirect budget. A plain local replaces the former function-level
    // static counter, which was never reset and therefore leaked between calls.
    $max_redirects = 3;
    $redirect_codes = array(301, 302, 303, 307, 308);
    $useragent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";

    for ($redirects = 0; ; $redirects++) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        // Headers are no longer mixed into the output: the redirect target is
        // read through curl_getinfo(), so no manual header parsing is needed.
        curl_setopt($ch, CURLOPT_HEADER, false);
        // NOTE(review): certificate and host verification are disabled, as in
        // the original, so that sites with broken certificates still load.
        // This allows man-in-the-middle attacks; enable both options if the
        // fetched content is ever trusted.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_REFERER, $referer);
        // Bound the request so an unresponsive server cannot hang the caller.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $body = curl_exec($ch);
        $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        // Absolute URL of the redirect target, already resolved by cURL.
        $redirect_url = curl_getinfo($ch, CURLINFO_REDIRECT_URL);
        curl_close($ch);

        if ($body === false) {
            // Transfer failed (DNS error, timeout, connection refused, ...).
            return false;
        }
        if (!in_array($http_code, $redirect_codes, true)) {
            return $body;
        }
        if (!is_string($redirect_url) || $redirect_url === '') {
            // Redirect status without a usable Location header.
            return $body;
        }

        // The Location header is controlled by the remote server: never let it
        // steer the request to file://, gopher:// or other non-web schemes.
        $scheme = parse_url($redirect_url, PHP_URL_SCHEME);
        if (!is_string($scheme) || !in_array(strtolower($scheme), array('http', 'https'), true)) {
            return false;
        }

        if ($redirects >= $max_redirects) {
            // Too many hops; most likely a redirect loop.
            return false;
        }
        $url = $redirect_url;
    }
}
/**
 * Extract the <title> text of an HTML document and return it as UTF-8.
 *
 * The <head> section is located first; the character set is then read from
 * a <meta> tag inside it, accepting both forms:
 *   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
 *   <meta charset="gb2312">
 * When a non-UTF-8 charset is declared the title is converted to UTF-8
 * with iconv; if the conversion fails the unconverted text is kept.
 * Surrounding whitespace is trimmed and HTML entities are decoded.
 *
 * @param mixed $html HTML source. Non-string values (for example the false
 *                    returned by a failed fetch) are treated as having no
 *                    <head> section.
 *
 * @return string The decoded title, or a human-readable error message when
 *                the <head> section or the <title> element cannot be found.
 */
function get_title_contents($html){
    if (!is_string($html) || $html === '') {
        return "无法解析数据中的 <head> 区段";
    }

    // Locate the <head> section; \b keeps <header> from matching.
    if (!preg_match('/<head\b[^>]*>(.*)<\/head>/siU', $html, $htmlHeaders)) {
        return "无法解析数据中的 <head> 区段";
    }
    $head = $htmlHeaders[1];

    // Read the encoding declared by a <meta> tag, if any. Quotes around the
    // value are optional and may be single or double.
    $charset = "None";
    if (preg_match('/<meta[^>]*charset\s*=\s*["\']?([a-zA-Z0-9_-]+)/i', $head, $results)) {
        $charset = $results[1];
    }

    // Extract the title text; the "s" flag allows titles spanning several lines.
    if (!preg_match('/<title\b[^>]*>(.*)<\/title>/siU', $head, $htmlTitles)) {
        return "无法解析 <title> 的内容";
    }
    $title = $htmlTitles[1];

    // Convert the title to UTF-8 unless it is already declared as such.
    $is_utf8 = strcasecmp($charset, 'UTF-8') === 0 || strcasecmp($charset, 'UTF8') === 0;
    if ($charset !== "None" && !$is_utf8) {
        // iconv raises a warning/notice for unknown charsets or invalid byte
        // sequences; swallow it locally and fall back to the raw text.
        set_error_handler(function () {
            return true;
        });
        $converted = iconv($charset, "UTF-8", $title);
        restore_error_handler();
        if ($converted !== false) {
            $title = $converted;
        }
    }

    // Explicit flags and charset keep the result identical across PHP versions.
    return html_entity_decode(trim($title), ENT_QUOTES | ENT_HTML5, 'UTF-8');
}

  支持https,302跳转

posted on   防空洞123  阅读(1645)  评论(0)  编辑  收藏  举报

导航

< 2025年1月 >
29 30 31 1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31 1
2 3 4 5 6 7 8
点击右上角即可分享
微信分享提示