PHP获取网站标题和图标

安装依赖

composer require guzzlehttp/guzzle:*

如果不想使用 guzzlehttp，也可以自己用 curl 实现请求，只要能获取到网页的 HTML 源码即可。

核心源码

<?php
namespace xfstu\http;
use GuzzleHttp\Client;
/**
 * Fetches a web page and extracts its <title> text and favicon.
 *
 * Typical use is the static entry point getInfo(), which downloads the page
 * once and returns the title together with the icon URL and (optionally) the
 * local path the icon was saved to.
 */
class titleFavicon
{
    /**
     * HTTP client, created on first use and then shared so that the page
     * request and the icon request reuse the same headers and cookie jar.
     *
     * @var Client|null
     */
    private $client;

    /**
     * Perform a GET request and return the response body.
     *
     * @param string $url Absolute http(s) URL.
     * @return string Response body.
     * @throws \GuzzleHttp\Exception\GuzzleException On transport errors or 4xx/5xx responses.
     */
    private function httpGet($url)
    {
        if ($this->client === null) {
            $this->client = new Client([
                'headers' => [
                    'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
                    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
                    'Accept-Language' => 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                    // "br" is deliberately not advertised: a server that answers with
                    // Brotli would hand back a body the HTTP handler may be unable to decode.
                    'Accept-Encoding' => 'gzip, deflate',
                    'Connection' => 'keep-alive',
                    'Upgrade-Insecure-Requests' => '1',
                    'Sec-Fetch-Dest' => 'document',
                    'Sec-Fetch-Mode' => 'navigate',
                    'Sec-Fetch-Site' => 'same-origin',
                    'Sec-Fetch-User' => '?1',
                    'Pragma' => 'no-cache',
                    'Cache-Control' => 'no-cache',
                ],
                'cookies' => true,
            ]);
        }

        $response = $this->client->get($url);

        // Casting the stream reads it from the beginning, whatever the cursor position.
        return (string) $response->getBody();
    }

    /**
     * Download a page and return its title.
     *
     * @param string $url Page URL.
     * @return string The title text, or the first 100 characters of the page
     *                when it has no <title> element.
     * @throws \GuzzleHttp\Exception\GuzzleException When the page cannot be fetched.
     */
    public function getTitle($url)
    {
        return $this->extractTitle($this->httpGet($url));
    }

    /**
     * Locate the favicon of a page and optionally save it to disk.
     *
     * @param string $url      Page URL.
     * @param int    $iconType 0: only resolve the icon URL; 1: also download and save the icon.
     * @param string $path     Directory the icon is saved into (file name is "<host>.ico").
     * @return array ['url' => string|null, 'path' => string|null]
     * @throws \GuzzleHttp\Exception\GuzzleException When the page cannot be fetched.
     */
    public function getFavicon($url, $iconType = 1, $path = './')
    {
        return $this->buildFaviconResult($url, $this->httpGet($url), $iconType, $path);
    }

    /**
     * Get the title and favicon of a web page with a single page download.
     *
     * @param string $url      Page URL.
     * @param int    $iconType Icon mode. 0: only resolve the icon URL, do not download; 1: download and save the icon.
     * @param string $path     Directory the icon is saved into.
     * @return array ['title' => string, 'url' => string|null, 'path' => string|null]
     * @throws \GuzzleHttp\Exception\GuzzleException When the page cannot be fetched.
     */
    public static function getInfo($url, $iconType = 1, $path = './')
    {
        $obj = new self();
        // Fetch once and feed the same HTML to both extractors.
        $html = $obj->httpGet($url);

        return array_merge(
            ['title' => $obj->extractTitle($html)],
            $obj->buildFaviconResult($url, $html, $iconType, $path)
        );
    }

    /**
     * Extract the text of the first <title> element from an HTML document.
     *
     * @param string $html Raw HTML.
     * @return string Decoded, whitespace-normalised title; the first 100
     *                characters of $html when no <title> element exists.
     */
    private function extractTitle($html)
    {
        // "s" lets the title span several lines; "\b" keeps tags that merely
        // start with "title" from matching.
        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $matches) === 1) {
            $title = html_entity_decode($matches[1], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            // An explicit ASCII class (not \s) so bytes of multi-byte characters are never touched.
            $title = preg_replace('/[ \t\r\n\f]+/', ' ', $title);

            return trim($title);
        }

        return mb_substr($html, 0, 100, 'utf-8');
    }

    /**
     * Resolve the favicon for already-downloaded HTML and optionally save it.
     *
     * @param string $url      Page URL the HTML came from.
     * @param string $html     Raw HTML of the page.
     * @param int    $iconType 0: resolve only; 1: resolve and save.
     * @param string $path     Target directory.
     * @return array ['url' => string|null, 'path' => string|null]
     */
    private function buildFaviconResult($url, $html, $iconType, $path)
    {
        $res = [
            'url' => null,
            'path' => null,
        ];

        $iconUrl = $this->resolveIconUrl($url, $this->findIconHref($html));
        if ($iconUrl === null) {
            return $res;
        }
        $res['url'] = $iconUrl;

        // Loose comparison on purpose: existing callers may pass "1" or true.
        if ($iconType == 1) {
            $res['path'] = $this->saveIcon($url, $iconUrl, $path);
        }

        return $res;
    }

    /**
     * Find the href of the first <link> whose rel contains the token "icon".
     *
     * Attribute order and quoting style do not matter, so
     * <link href=/a.ico rel=icon> and <link rel="shortcut icon" href="..."> both match.
     *
     * @param string $html Raw HTML.
     * @return string|null Raw (undecoded, unresolved) href, or null when none is declared.
     */
    private function findIconHref($html)
    {
        // Prefer the <head> section; documents that omit it are searched as a whole.
        $scope = preg_match('/<head\b[^>]*>(.*?)<\/head>/is', $html, $head) === 1 ? $head[1] : $html;

        if (!preg_match_all('/<link\b[^>]*>/i', $scope, $tags)) {
            return null;
        }

        foreach ($tags[0] as $tag) {
            $attributes = $this->parseAttributes($tag);
            if (!isset($attributes['rel'], $attributes['href']) || $attributes['href'] === '') {
                continue;
            }
            $relTokens = preg_split('/[ \t\r\n\f]+/', strtolower(trim($attributes['rel'])));
            if (in_array('icon', $relTokens, true)) {
                return $attributes['href'];
            }
        }

        return null;
    }

    /**
     * Parse name=value attributes out of a single HTML tag.
     *
     * @param string $tag Tag source, e.g. '<link rel="icon" href=/a.ico>'.
     * @return array Map of lower-cased attribute name => raw value (first occurrence wins).
     */
    private function parseAttributes($tag)
    {
        // The branch-reset group (?|...) stores the value in group 2 whether it
        // was double-quoted, single-quoted or unquoted.
        $pattern = '/([a-z_:][-a-z0-9_:.]*)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
        $attributes = [];

        if (preg_match_all($pattern, $tag, $sets, PREG_SET_ORDER)) {
            foreach ($sets as $set) {
                $name = strtolower($set[1]);
                if (!isset($attributes[$name])) {
                    $attributes[$name] = $set[2];
                }
            }
        }

        return $attributes;
    }

    /**
     * Turn an icon href into an absolute URL, relative to the page it came from.
     *
     * Falls back to "<origin>/favicon.ico" when the page declares no usable
     * icon (missing or empty href, data: URI or any other non-http scheme).
     *
     * @param string      $pageUrl Absolute URL of the page.
     * @param string|null $href    Raw href taken from the <link> tag, or null.
     * @return string|null Absolute icon URL, or null when $pageUrl has no scheme/host.
     */
    private function resolveIconUrl($pageUrl, $href)
    {
        $page = parse_url($pageUrl);
        if (!is_array($page) || empty($page['scheme']) || empty($page['host'])) {
            return null;
        }

        $origin = $page['scheme'] . '://' . $page['host'] . (isset($page['port']) ? ':' . $page['port'] : '');
        $default = $origin . '/favicon.ico';

        if ($href === null) {
            return $default;
        }
        // Attribute values are HTML-encoded (e.g. "&amp;" inside a query string).
        $href = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($href === '') {
            return $default;
        }

        $icon = parse_url($href);
        if (!is_array($icon)) {
            return $default;
        }

        if (isset($icon['scheme'])) {
            // Inline data: icons and other schemes cannot be expressed as a downloadable URL.
            $isHttp = in_array(strtolower($icon['scheme']), ['http', 'https'], true);
            if (!$isHttp || !isset($icon['host'])) {
                return $default;
            }
        }

        if (isset($icon['host'])) {
            // Absolute or protocol-relative ("//cdn.example.com/x.ico") reference.
            $scheme = isset($icon['scheme']) ? $icon['scheme'] : $page['scheme'];
            $prefix = $scheme . '://' . $icon['host'] . (isset($icon['port']) ? ':' . $icon['port'] : '');
            $iconPath = isset($icon['path']) ? $icon['path'] : '/';
        } else {
            $prefix = $origin;
            $iconPath = isset($icon['path']) ? $icon['path'] : '';
            if ($iconPath === '') {
                return $default;
            }
            if ($iconPath[0] !== '/') {
                // Document-relative reference: resolve against the page's directory.
                $pagePath = isset($page['path']) ? $page['path'] : '/';
                $directory = substr($pagePath, 0, (int) strrpos($pagePath, '/') + 1);
                $iconPath = $this->removeDotSegments($directory . $iconPath);
            }
        }

        $query = isset($icon['query']) ? '?' . $icon['query'] : '';

        return $prefix . $iconPath . $query;
    }

    /**
     * Collapse "." and ".." segments of an absolute URL path.
     *
     * @param string $path Path starting with "/".
     * @return string Normalised path.
     */
    private function removeDotSegments($path)
    {
        $kept = [];
        foreach (explode('/', $path) as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Never pop the leading empty segment that represents the root "/".
                if (count($kept) > 1) {
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }

        return implode('/', $kept);
    }

    /**
     * Save the icon as "<page host>.ico" inside $path, downloading it if needed.
     *
     * A file that already exists at the target location is reused and no
     * request is made. Download or write failures are reported as null
     * (best effort) instead of aborting the whole lookup.
     *
     * @param string $url     Page URL; its host names the file.
     * @param string $iconUrl Absolute icon URL.
     * @param string $path    Target directory; a missing trailing slash is added.
     * @return string|null Path of the saved (or previously saved) file, or null on failure.
     */
    private function saveIcon($url, $iconUrl, $path)
    {
        $directory = (string) $path;
        if ($directory !== '' && !in_array(substr($directory, -1), ['/', '\\'], true)) {
            $directory .= '/';
        }
        // Save the icon with the domain name as filename.
        $target = $directory . parse_url($url, PHP_URL_HOST) . '.ico';

        if (is_file($target)) {
            return $target;
        }

        try {
            $data = $this->httpGet($iconUrl);
        } catch (\GuzzleHttp\Exception\GuzzleException $e) {
            return null;
        }

        if ($data === '' || file_put_contents($target, $data) === false) {
            return null;
        }

        return $target;
    }
}

使用示例

// The class is declared in the xfstu\http namespace, so it is referenced by its
// fully-qualified name here (alternatively add `use xfstu\http\titleFavicon;`
// at the top of the calling file).
$res = \xfstu\http\titleFavicon::getInfo('https://www.baidu.com', 1, './');
// NOTE(review): dump() is not part of core PHP; this assumes a helper from a
// framework or symfony/var-dumper is loaded. Use var_dump($res) otherwise.
dump($res);
array(3) {
["title"] => string(27) "百度一下，你就知道"
["url"] => string(33) "https://www.baidu.com/favicon.ico"
["path"] => string(19) "./www.baidu.com.ico"
}

更新日志

2024-03-10 兼容不规范的 HTML 写法（href 属性值未加引号），例如：

<link rel="shortcut icon" type=image/x-icon href=https://www.ucharts.cn/v2/static/favicon.ico>

他妈的,你href加个双引号怎么了?

posted @   小枫同学  阅读(102)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具
点击右上角即可分享
微信分享提示