# Detect the page charset and convert $html to UTF-8.
# The <meta> tags in the document are checked first; if they declare no
# charset, the response headers are searched instead.
$meta = get_meta_charset($html);
$charset = preg_match("/charset=[^\w]?([-\w]+)/i", $meta, $temp) ? strtolower($temp[1]) : "";
if (empty($charset)) {
    # print_r(..., true) renders the header value as one string so the same
    # regex can be applied to it, whatever shape the value has.
    $header = print_r($html_array['header'], true);
    $charset = preg_match("/charset=[^\w]?([-\w]+)/i", $header, $temp) ? strtolower($temp[1]) : "";
}
echo $charset;
if ($charset !== 'utf-8' && !empty($charset)) {
    # The charset label comes from the remote page and may be unknown to
    # mbstring. In that case mb_convert_encoding() throws ValueError (PHP 8)
    # or returns false (PHP 7); keep the original bytes instead of crashing
    # or overwriting $html with false.
    try {
        $converted = mb_convert_encoding($html, 'utf-8', $charset);
    } catch (\ValueError $e) {
        $converted = false;
    }
    if (is_string($converted)) {
        $html = $converted;
    }
}
包含CHARSET的meta
# Tidy up the markup: the patterns are applied one after another, each on the
# result of the previous one.
$html = preg_replace(
    array(
        "/<\s+/is",       # whitespace directly after "<"  -> removed
        "/\s+>/is",       # whitespace directly before ">" -> removed
        "/\r\n|\r|\n/",   # carriage returns / line feeds  -> single space
    ),
    array("<", ">", ' '),
    $html
);
/**
 * Collect every <meta> tag of an HTML document that mentions "charset".
 *
 * Each matching tag is lower-cased and normalised (runs of whitespace become
 * one space, single quotes become double quotes, spaces around "=" and next
 * to attribute quotes are removed) so that a simple "charset=..." regex can
 * be applied to the result.
 *
 * @param string $html Raw HTML source.
 * @return string The normalised tags, each terminated by "\n"; an empty
 *                string when no <meta> tag mentions a charset.
 */
function get_meta_charset($html){
    if (!is_string($html) || $html === '') {
        return '';
    }

    # [^>]* (unlike ".") also spans line breaks, so a tag wrapped over several
    # lines is found; \b keeps unrelated tags such as <metadata> out.
    if (!preg_match_all('/<meta\b[^>]*>/i', $html, $matches)) {
        return '';
    }

    $meta = '';
    foreach ($matches[0] as $value) {
        $value = strtolower(trim($value));
        # Collapse every run of whitespace (spaces, tabs, line breaks) into a
        # single space so the space-based clean-up below always applies.
        $value = preg_replace('/\s+/', ' ', $value);
        $value = str_replace("'", '"', $value);
        # Drop spaces next to attribute quotes and around "=".
        $value = str_replace(array(' "', '=" '), array('"', '="'), $value);
        $value = str_replace(array('= ', ' ='), array('=', '='), $value);
        if (strpos($value, 'charset') !== false) {
            $meta .= $value . "\n";
        }
    }

    return $meta;
}
小说下载 https://files.cnblogs.com/files/yisuo/down-bqg.php.zip?t=1703168359&download=true