php利用curl获取网页title内容
/**
 * Usage example:
 *
 *   $html  = curl_get_file_contents($url);
 *   $title = get_title_contents($html);
 *   var_dump($title);
 */

/**
 * Resolve the value of a Location header against the URL that produced it.
 *
 * Handles absolute URLs, scheme-relative ("//host/path"), root-relative
 * ("/path"), query-only ("?a=b") and document-relative ("page.html") forms.
 * Only http/https targets are accepted so that a hostile server cannot
 * redirect the client to file://, gopher:// and similar schemes.
 *
 * @param string $base_url URL of the request that returned the redirect.
 * @param string $location Raw Location header value.
 *
 * @return string|false Absolute http(s) URL, or false when it cannot be resolved.
 */
function curl_resolve_redirect_url($base_url, $location)
{
    $location = trim($location);
    if ($location === '') {
        return false;
    }

    // Absolute URL: use it as sent (keeps port, query and credentials intact).
    if (preg_match('/^[a-z][a-z0-9+.-]*:/i', $location)) {
        return preg_match('/^https?:\/\//i', $location) ? $location : false;
    }

    $base = parse_url($base_url);
    if ($base === false || !isset($base['scheme'], $base['host'])) {
        return false;
    }
    if (!in_array(strtolower($base['scheme']), array('http', 'https'), true)) {
        return false;
    }

    // Scheme-relative: inherit only the scheme.
    if (substr($location, 0, 2) === '//') {
        return $base['scheme'] . ':' . $location;
    }

    $origin = $base['scheme'] . '://' . $base['host']
        . (isset($base['port']) ? ':' . $base['port'] : '');
    $path = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

    if ($location[0] === '/') {
        return $origin . $location;
    }
    if ($location[0] === '?') {
        return $origin . $path . $location;
    }

    // Document-relative: replace the last path segment of the base URL.
    $slash = strrpos($path, '/');
    $dir   = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

    return $origin . $dir . $location;
}

/**
 * Download a URL with cURL and return the response body.
 *
 * Works for http and https and follows up to three redirects
 * (301/302/303/307/308) by hand. The redirect budget is counted per call,
 * so earlier calls in the same process do not use it up.
 *
 * @param string $url     URL to fetch.
 * @param string $referer Optional Referer header; kept on redirected requests.
 *
 * @return string|false Response body. If a redirect response has no usable
 *                      Location header, the body of that redirect response is
 *                      returned. false is returned when cURL fails or when
 *                      more than three redirects are encountered.
 */
function curl_get_file_contents($url, $referer = '')
{
    $max_redirects  = 3;
    $redirect_codes = array(301, 302, 303, 307, 308);
    $useragent      = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";

    for ($redirects = 0; ; $redirects++) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, true);
        // NOTE(review): certificate and host verification are switched off, as
        // in the original snippet. This allows man-in-the-middle attacks;
        // enable both when the result is security relevant.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_REFERER, $referer);

        $response    = curl_exec($ch);
        $http_code   = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $header_size = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        curl_close($ch);

        if ($response === false) {
            return false;
        }

        // Split on the header size reported by cURL instead of the first blank
        // line: interim "100 Continue" or proxy CONNECT header blocks would
        // otherwise end up inside the body.
        $header = (string) substr($response, 0, $header_size);
        $body   = (string) substr($response, $header_size);

        if (!in_array($http_code, $redirect_codes, true)) {
            return $body;
        }

        // Header names are case-insensitive (HTTP/2 sends them in lowercase).
        if (!preg_match('/^Location:[ \t]*(.*?)[ \t]*\r?$/mi', $header, $matches)) {
            return $body;
        }

        $next_url = curl_resolve_redirect_url($url, $matches[1]);
        if ($next_url === false) {
            return $body;
        }

        if ($redirects >= $max_redirects) {
            return false;
        }

        $url = $next_url;
    }
}

/**
 * Extract the text of the <title> element from an HTML document.
 *
 * The title is looked up inside the <head> section, converted to UTF-8 when
 * a <meta> tag in the head declares another charset, decoded from HTML
 * entities and trimmed of surrounding whitespace.
 *
 * Recognised charset declarations include:
 *   <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
 *   <meta content="text/html; charset=gb2312" http-equiv="Content-Type">
 *   <meta charset="gb2312">
 *
 * @param string $html Raw HTML document.
 *
 * @return string The title in UTF-8, or a fixed (Chinese) message when the
 *                <head> section or the <title> element cannot be found.
 */
function get_title_contents($html)
{
    if (!is_string($html)
        || !preg_match('/<head\b[^>]*>(.*?)<\/head>/si', $html, $head_match)
    ) {
        return "无法解析数据中的 <head> 区段";
    }
    $head = $head_match[1];

    // Allow attributes on <title> and titles that span several lines.
    if (!preg_match('/<title\b[^>]*>(.*?)<\/title>/si', $head, $title_match)) {
        return "无法解析 <title> 的内容";
    }
    $title = $title_match[1];

    // Charset declared by a <meta> tag, with or without quotes, any letter case.
    $charset = "None";
    if (preg_match('/<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([a-zA-Z0-9_:.-]+)/i', $head, $charset_match)) {
        $charset = $charset_match[1];
    }

    if ($charset !== "None" && strcasecmp($charset, 'UTF-8') !== 0) {
        // iconv() raises a notice and returns false for unknown charsets or
        // invalid byte sequences; in that case keep the unconverted text
        // rather than losing the title.
        $converted = @iconv($charset, "UTF-8", $title);
        if ($converted !== false) {
            $title = $converted;
        }
    }

    // Explicit flags and charset so the result is the same on every PHP version.
    return trim(html_entity_decode($title, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}
支持https,302跳转
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步