php中利用正则表达式获取网页中的数据

本文示例可以批量抓取网页上的数据,并存入自己的数据库中,仅供学习交流。
 
<html>
<head>
<title>meishichina.com extraction</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<?php
include("../db/conn.php");
// Script bootstrap: load the database connection, set the HTTP user agent used by
// file_get_contents(), and lift the execution time limit before scraping starts.
//
// For reference, the connection code that lives in conn.php is reproduced below
// (commented out here because conn.php is included above).
//start
//$mysql_server_name="localhost";
//$mysql_database="menudb";
//$mysql_username="root";
//$mysql_password="cnhope";
//$conn=mysql_connect($mysql_server_name, $mysql_username, $mysql_password);
//if (!$conn){die('Could not connect: ' . mysql_error());}
//mysql_query("SET NAMES utf8");
//mysql_select_db($mysql_database,$conn);
//end
 
ini_set("user_agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)");
// Lift the script execution time limit (0 means no limit) so a long scrape is not aborted.
set_time_limit(0);
// The original comment says "fetch the first five pages", but the loop counts 1..25.
// NOTE(review): the loop body is empty in this listing, so nothing is scraped; presumably it
// called extract_menulist() with a per-page URL that was removed before publishing - confirm.
for($i=1;$i<=25;$i++){
}
 
/**
 * Scrape one recipe-list page and import every recipe that is not stored yet.
 *
 * Each list entry yields a detail-page path, a recipe name and an ingredient
 * summary. A recipe whose name already exists in MenuInfo is reported and
 * skipped; otherwise its detail page is handed to extract_menudetail().
 *
 * @param string $url Absolute URL of the list page to scrape.
 * @return void
 */
function extract_menulist($url){
    global $conn;

    // Read the whole page into one string; bail out instead of matching against false.
    $content = file_get_contents($url);
    if ($content === false) {
        echo "Could not fetch list page: ".$url."<br>\n";
        return;
    }

    // Groups: 1 = detail page path, 2 = recipe name, 3 = ingredient summary.
    $reg = "/<div class=\"detail\">\s<h4><a href=\"(.*?)\"[^>]*>(.*?)<\/a><\/h4>\s<p class=\"tint\"><a href=\"[^\"]*\"[^>]*>[^<]*<\/a>[^<]*<\/p>\s<p>原料:(.*?)<\/p>/i";
    if (!preg_match_all($reg, $content, $result)) {
        return; // nothing matched (or the pattern failed): no entries to import
    }

    $total = count($result[1]);
    for ($i = 0; $i < $total; $i++) {
        $mID = date('U').$i;                               // menu id: unix timestamp + position on the page
        $mUrl = "http://XXXXXXXX.com/".$result[1][$i];     // detail page URL
        $mName = $result[2][$i];                           // recipe name
        $mMatLst = $result[3][$i];                         // ingredient summary

        // The name comes from a remote page, so it must be escaped before it is
        // embedded in the statement.
        $sql = "SELECT * FROM MenuInfo WHERE mName='".mysql_real_escape_string($mName, $conn)."'";
        $rows = mysql_query($sql, $conn);
        if ($rows === false) {
            // Same failure handling as the insert helpers in this file.
            echo $sql."<br>\n";
            die('Error: '. mysql_error());
        }
        $exists = (mysql_fetch_array($rows) !== false);
        // Free the result here: $rows is local, so it cannot be released at script end.
        mysql_free_result($rows);

        if ($exists) {
            echo $mName." already exists<br>\n";
        } else {
            // Not stored yet: scrape and store the detail page.
            extract_menudetail($mID, $mUrl, $mName, $mMatLst);
        }
    }
}
 
/**
 * Reduce a scraped HTML fragment to a single comma-separated line of text.
 *
 * Tags are stripped, three-space runs are removed, newlines become commas,
 * single quotes are doubled (the value is later placed inside a single-quoted
 * SQL literal by add_menuinfo()) and ", ," / ",," sequences are removed.
 *
 * @param string $html Raw HTML fragment captured from the page.
 * @return string Flattened text with single quotes doubled.
 */
function menu_clean_text($html){
    $text = preg_replace('/<[^>]*>/', '', $html);
    // NOTE(review): the three-space literal may originally have been an HTML entity
    // that was lost when this listing was published - confirm against the live page.
    $text = str_replace('   ', '', $text);
    $text = str_replace("\n", ',', $text);
    $text = str_replace("'", "''", $text);
    return preg_replace('/, ,|,,/', '', $text);
}

/**
 * Scrape one recipe detail page, download its step images and store everything.
 *
 * Extracts the basic attributes (taste, technique, time, difficulty), the
 * ingredient details, the description, the tips and the numbered steps. Each
 * step image is saved under ../pics/<mID>/ and recorded through add_menustep();
 * the recipe row itself is written last through add_menuinfo().
 *
 * @param string $mID     Menu id generated by the caller.
 * @param string $mUrl    Absolute URL of the detail page.
 * @param string $mName   Recipe name taken from the list page.
 * @param string $mMatLst Ingredient summary taken from the list page.
 * @return void
 */
function extract_menudetail($mID, $mUrl, $mName, $mMatLst){
    $content = file_get_contents($mUrl);
    if ($content === false) {
        // Skip instead of storing an all-empty recipe; a later run can retry it.
        echo "Could not fetch detail page: ".$mUrl."<br>\n";
        return;
    }

    // Basic attributes: taste, technique, required time, difficulty.
    $mTaste = "";
    $mTechn = "";
    $mTime = "";
    $mLevel = "";
    $reg = "/<li>菜品口味:<a href=\"[^\"]*\"[^>]*>(.*?)<\/a><\/li>\s<li>主要工艺:<a href=\"[^\"]*\"[^>]*>(.*?)<\/a><\/li>[\s\S]*<li>所需时间:<a href=\"[^\"]*\"[^>]*>(.*?)<\/a><\/li>\s<li>制作难度:<a href=\"[^\"]*\"[^>]*>(.*?)<\/a><\/li>/i";
    if (preg_match($reg, $content, $result)) {
        $mTaste = $result[1];
        $mTechn = $result[2];
        $mTime = $result[3];
        $mLevel = $result[4];
    }

    // Ingredient details.
    $mDtlMat = "";
    $reg = "/<h4>食材明细<\/h4>\s*(.*?)\s<\/div>/s";
    if (preg_match($reg, $content, $result)) {
        $mDtlMat = menu_clean_text($result[1]);
    }

    // Description. The pattern has a single capture group, so the captured text
    // lives in $result[1][n]; the previous $result[3][2] never existed and the
    // description was always stored empty.
    // NOTE(review): match index 2 (the third "infob" block) is kept from the
    // original code - confirm that this is the block holding the description.
    $mDesc = "";
    $reg = "/<div class=\"infob\"[^>]*>\s*(.*?)\s*<\/div>/s";
    if (preg_match_all($reg, $content, $result) && isset($result[1][2])) {
        $mDesc = menu_clean_text($result[1][2]);
    }

    // Tips.
    $mTips = "";
    $reg = "/<h4>小贴士<\/h4>\s*(.*?)\s<\/div>/s";
    if (preg_match($reg, $content, $result)) {
        $mTips = menu_clean_text($result[1]);
    }

    // Steps. Groups: 1 = step number, 2 = image URL, 3 = step text.
    $reg = "/<li[^>]*><div>([^<]*)<\/div><img src=\"([^\"]*)\"[^>]*>(.*?)<\/li>/i";
    if (preg_match_all($reg, $content, $result)) {
        $picpath = "../pics/".$mID;
        $stepCount = count($result[1]);
        for ($i = 0; $i < $stepCount; $i++) {
            // The step number is scraped text that ends up in a file name and in an
            // unquoted SQL value, so only plain digits are accepted; anything else
            // falls back to the step's position on the page.
            $rawIdx = trim($result[1][$i]);
            $mStpIdx = ctype_digit($rawIdx) ? (int) $rawIdx : $i + 1;

            // Step text: strip tags and double single quotes for the SQL literal.
            $mStpDtl = preg_replace('/<[^>]*>/', '', $result[3][$i]);
            $mStpDtl = str_replace("'", "''", $mStpDtl);
            $mStpPic = $result[2][$i];

            // Download the step picture unless it is already on disk.
            $picname = $mID."_".$mStpIdx.".jpg";
            $picfile = $picpath."/".$picname;
            if (!is_dir($picpath) && !mkdir($picpath, 0777, true)) {
                echo "Could not create picture directory: ".$picpath."<br>\n";
            } elseif (!file_exists($picfile)) {
                $picdata = file_get_contents($mStpPic);
                if ($picdata === false) {
                    // Do not leave an empty .jpg behind when the download fails.
                    echo "Could not download picture: ".$mStpPic."<br>\n";
                } else {
                    file_put_contents($picfile, $picdata);
                }
            }

            add_menustep($mID, $mStpIdx, $mStpDtl, $picname);
        }
    }

    // Store the recipe row itself.
    add_menuinfo($mID, $mUrl, $mName, $mMatLst, $mTaste, $mTechn, $mTime, $mLevel, $mDtlMat, $mDesc, $mTips);
}
 
/**
 * Insert one recipe row into MenuInfo.
 *
 * Prints "<name> added" on success; on failure prints the statement and stops
 * the script with the MySQL error message.
 *
 * @param string $mID     Menu id.
 * @param string $mUrl    Detail page URL.
 * @param string $mName   Recipe name.
 * @param string $mMatLst Ingredient summary.
 * @param string $mTaste  Taste.
 * @param string $mTechn  Main technique.
 * @param string $mTime   Required time.
 * @param string $mLevel  Difficulty.
 * @param string $mDtlMat Ingredient details (single quotes already doubled by the caller).
 * @param string $mDesc   Description (single quotes already doubled by the caller).
 * @param string $mTips   Tips (single quotes already doubled by the caller).
 * @return void
 */
function add_menuinfo($mID, $mUrl, $mName, $mMatLst, $mTaste, $mTechn, $mTime, $mLevel, $mDtlMat, $mDesc, $mTips){
    global $conn;

    // The statement used $mStyle although it is neither a parameter nor a global,
    // which raised an undefined-variable notice. Nothing in this file scrapes a
    // style, so the column is written explicitly as an empty string.
    $mStyle = '';

    // Fields that arrive as raw scraped text, in column order.
    $raw = array($mID, $mName, $mStyle, $mTaste, $mTechn, $mTime, $mLevel, $mMatLst, $mUrl);
    foreach ($raw as $k => $v) {
        $raw[$k] = mysql_real_escape_string((string) $v, $conn);
    }

    // extract_menudetail() passes these three with single quotes already doubled,
    // so only backslashes are escaped here to avoid escaping the quotes twice.
    // NOTE(review): assumes the server treats backslash as an escape character
    // (default sql_mode) and that no other caller passes un-doubled text - confirm.
    $pre = array($mDtlMat, $mDesc, $mTips);
    foreach ($pre as $k => $v) {
        $pre[$k] = str_replace('\\', '\\\\', (string) $v);
    }

    $sql = "INSERT INTO MenuInfo(mID,mName,mStyle,mTaste,mTechn,mTime,mLevel,mMatLst,mUrl,mDtlMat,mDesc,mTips) VALUES('"
        .implode("','", array_merge($raw, $pre))."')";

    if (mysql_query($sql, $conn)) {
        echo $mName." added<br>\n";
    } else {
        echo $sql."<br>\n";
        die('Error: '. mysql_error());
    }
}
 
 
/**
 * Insert one preparation step into MenuStep.
 *
 * On failure prints the statement and stops the script with the MySQL error
 * message, matching add_menuinfo().
 *
 * @param string     $mID     Menu id the step belongs to.
 * @param int|string $mStpIdx Step number; written as an unquoted numeric value.
 * @param string     $mStpDtl Step text (single quotes already doubled by the caller).
 * @param string     $mStpPic File name of the step picture.
 * @return void
 */
function add_menustep($mID, $mStpIdx, $mStpDtl, $mStpPic){
    global $conn;

    // mStpIdx is embedded without quotes, so it is forced to an integer; the other
    // raw values are escaped. extract_menudetail() already doubles the single
    // quotes in $mStpDtl, so only its backslashes are escaped here.
    // NOTE(review): assumes the server treats backslash as an escape character
    // (default sql_mode) - confirm.
    $sql = "INSERT INTO MenuStep( mID, mStpIdx, mStpDtl, mStpPic ) VALUES('"
        .mysql_real_escape_string((string) $mID, $conn)."' ,"
        .intval($mStpIdx)." ,'"
        .str_replace('\\', '\\\\', (string) $mStpDtl)."' ,'"
        .mysql_real_escape_string((string) $mStpPic, $conn)."')";

    if (!mysql_query($sql, $conn)) {
        echo $sql."<br>\n";
        // Report the error message, not the bare error number, as add_menuinfo() does.
        die('Error: '. mysql_error());
    }
}
 
// Script teardown. $rows is only ever assigned inside extract_menulist(), so at
// global scope it is normally undefined and freeing it unconditionally raised a
// warning; release it only when a live result resource really exists here.
if (isset($rows) && is_resource($rows)) {
    mysql_free_result($rows);
}
// Close the connection opened by conn.php.
mysql_close($conn);
?>
</body>
</html>
posted @ 2013-03-25 20:06  暂时菜鸟  Views(207)  Comments(0)  Edit  收藏  举报