今天看到一段代码,用于动态解析网页内容并输出为XML。虽然我不会PHP,但是思路可以借鉴。
下面的例子是从 http://www.oursci.org/news.htm 生成RSS Feed的PHP代码。效果可以看这个链接:
http://www.small-island.org/readnews/hackrss_.php/oursci.xml
其实很简单,就是把网页抓下来,把各个条目分出来,然后把各个部分分开。关键是先通过读网页总结一下格式的特征,用正则表达式表达出来就好了。很简单的。
PS:你还需要分析一下搜索引擎的调用参数,比如中文Google搜索的URL是像这样的:
http://www.google.com/search?q=要搜索的字符串&hl=zh-CN&ie=gb2312
PHP代码:
/**
 * Scrape http://www.oursci.org/news.htm into a simple RSS-like structure.
 *
 * The page is downloaded, the region starting at the
 * "<!-- 新闻部分主要内容 -->" marker is cut out, that region is split into
 * per-entry chunks on the entry's opening <td ...> tag, and each chunk is
 * matched against a regular expression that captures the link, title,
 * date and summary.
 *
 * The captured values are stored exactly as they appear in the page's HTML;
 * nothing is escaped here, so whoever prints them into XML has to escape them.
 *
 * On failure (download error, or the page layout no longer matches) a message
 * is written with error_log() and the feed is returned with an empty item
 * list. Nothing is echoed, so the caller can still send headers and emit
 * well-formed XML.
 *
 * @return stdClass Object with two array properties:
 *                  - channel: "title", "link" and "description" strings;
 *                  - items: list of arrays with the keys "link", "title",
 *                    "pubdate" and "description" (possibly empty).
 */
function oursci(){
    $source = "http://www.oursci.org/news.htm";

    // Create the container explicitly instead of relying on PHP turning an
    // undefined variable into an object on first property assignment.
    $rss = new stdClass();
    $rss->channel = array(
        "title" => "三思科学报道",
        "link" => $source,
        // The output code reads this key; reuse the title so it is never undefined.
        "description" => "三思科学报道",
    );
    $rss->items = array();

    $page = file_get_contents($source);
    if ($page === false) {
        error_log("oursci: could not download " . $source);
        return $rss;
    }

    // "#" is used as the regex delimiter because the patterns contain literal
    // "/" characters (closing tags), which would terminate a "/"-delimited
    // pattern early and make preg_match() fail.
    // NOTE(review): the marker literal only matches if this file is saved in
    // the same encoding as the downloaded page - confirm.
    if (!preg_match('#<!-- 新闻部分主要内容 -->(.*)<br><br>.</td>#ims', $page, $section)) {
        error_log("oursci: news section not found in " . $source);
        return $rss;
    }

    // The whole match (index 0) is split, as before. Chunk 0 is whatever
    // precedes the first entry cell, so iteration starts at 1.
    $entries = explode("<td align=center valign=top width=38%>", $section[0]);
    for ($i = 1; $i < count($entries); $i++) {
        // Captures: 1 = href (minus its first character), 2 = title,
        // 3 = date text, 4 = summary text.
        if (preg_match('#p3><a href=".(.*?)" class=v1.*?<b>(.*?)</b>.*?<i>(.*?)</i>.*?p1>(.*?)<.*?>#ims', $entries[$i], $fields)) {
            $rss->items[] = array(
                "link" => "http://www.oursci.org" . $fields[1],
                "title" => $fields[2],
                "pubdate" => $fields[3],
                "description" => $fields[4],
            );
        }
    }

    return $rss;
}
// Build the feed data and print it as an RSS 2.0 document.
$rss = oursci();

// Tolerate a feed object that lacks either property (e.g. when scraping failed).
$channel = (isset($rss->channel) && is_array($rss->channel)) ? $rss->channel : array();
$items = (isset($rss->items) && is_array($rss->items)) ? $rss->items : array();

// header() only works while nothing has been sent to the client yet.
if (!headers_sent()) {
    header("Content-Type: application/xml");
}

// Single-quoted literals: the double quotes inside the markup no longer end
// the PHP string, which they did in the double-quoted version.
echo '<?xml version="1.0" encoding="gb2312" ?> ';
echo '<rss version="2.0"> ';
echo '<channel> ';

// Channel metadata. Values are escaped with the charset declared in the XML
// prolog so that "<", ">" and "&" from the source cannot break the document.
foreach (array("title", "link", "description") as $field) {
    $value = isset($channel[$field]) ? $channel[$field] : "";
    echo "<" . $field . ">" . htmlspecialchars($value, ENT_QUOTES, "GB2312") . "</" . $field . "> ";
}

// One <item> per scraped entry; maps the array key to its RSS element name.
$elements = array(
    "title" => "title",
    "link" => "link",
    "pubdate" => "pubDate",
    "description" => "description",
);
foreach ($items as $item) {
    echo "<item>";
    foreach ($elements as $key => $tag) {
        $value = isset($item[$key]) ? $item[$key] : "";
        echo "<" . $tag . ">" . htmlspecialchars($value, ENT_QUOTES, "GB2312") . "</" . $tag . ">";
    }
    echo "</item>";
}

echo '</channel> ';
echo '</rss> ';
// NOTE(review): the statements from here down to the closing brace repeat the
// body of oursci() defined earlier in this file, but the `function oursci(){`
// header line is absent, so the final `}` has no matching opener. Presumably a
// copy/paste artifact of the post - confirm, then drop or re-wrap this
// fragment before running the file.

// Channel metadata for the feed.
$rss->channel["title"]="三思科学报道";
$rss->channel["link"]="http://www.oursci.org/news.htm";
// Download the news page.
$page=file_get_contents("http://www.oursci.org/news.htm");
// Cut out the region that starts at the marker comment.
if(preg_match('/<!-- 新闻部分主要内容 -->(.*)<br><br>.</td>/ims',$page , $headlines)){
// Split the whole match on the entry cell's opening tag.
$entries=explode("<td align=center valign=top width=38%>",$headlines[0]);
// Chunk 0 is the text before the first entry cell, so start at 1.
for($i=1;$i<count($entries);$i++){
// Captures: 1 = href (minus its first character), 2 = title, 3 = date, 4 = summary.
// $headlines is reused here and overwrites the outer match result.
if(preg_match('/p3><a href=".(.*?)" class=v1.*?<b>(.*?)</b>.*?<i>(.*?)</i>.*?p1>(.*?)<.*?>/ims',$entries[$i] , $headlines)){
$item["link"]="http://www.oursci.org".$headlines[1];
$item["title"]=$headlines[2];
$item["pubdate"]=$headlines[3];
$item["description"]=$headlines[4];
// Append the entry to the feed's item list.
$rss->items[]=$item;
}
}
}else{
// The news section was not found in the page.
echo "NO";
}
return $rss;
}
// Build the feed data and print it as an RSS 2.0 document.
$rss = oursci();

// Tolerate a feed object that lacks either property (e.g. when scraping failed).
$channel = (isset($rss->channel) && is_array($rss->channel)) ? $rss->channel : array();
$items = (isset($rss->items) && is_array($rss->items)) ? $rss->items : array();

// header() only works while nothing has been sent to the client yet.
if (!headers_sent()) {
    header("Content-Type: application/xml");
}

// Single-quoted literals: the double quotes inside the markup no longer end
// the PHP string, which they did in the double-quoted version.
echo '<?xml version="1.0" encoding="gb2312" ?> ';
echo '<rss version="2.0"> ';
echo '<channel> ';

// Channel metadata. Values are escaped with the charset declared in the XML
// prolog so that "<", ">" and "&" from the source cannot break the document.
foreach (array("title", "link", "description") as $field) {
    $value = isset($channel[$field]) ? $channel[$field] : "";
    echo "<" . $field . ">" . htmlspecialchars($value, ENT_QUOTES, "GB2312") . "</" . $field . "> ";
}

// One <item> per scraped entry; maps the array key to its RSS element name.
$elements = array(
    "title" => "title",
    "link" => "link",
    "pubdate" => "pubDate",
    "description" => "description",
);
foreach ($items as $item) {
    echo "<item>";
    foreach ($elements as $key => $tag) {
        $value = isset($item[$key]) ? $item[$key] : "";
        echo "<" . $tag . ">" . htmlspecialchars($value, ENT_QUOTES, "GB2312") . "</" . $tag . ">";
    }
    echo "</item>";
}

echo '</channel> ';
echo '</rss> ';