采集器

<meta http-equiv=Content-Type content="text/html;charset=gbk">
<script src="./js/jquery.js" type="text/javascript"></script>
<script src="./js/jquery.validate.js" type="text/javascript"></script>
<script src="./js/jquery.metadata.js" type="text/javascript"></script>
<script type="text/javascript">
</script>
<?php
// Scraper entry point: download the pre-sale certificate listing for one project,
// pull every certificate row out of it with a regex, then fetch the per-certificate
// sales-statistics page and dump its table rows.
//
// NOTE(review): the page header above declares charset=gbk and the project name in the
// query string is percent-encoded bytes, so the remote pages are presumably GBK. The
// Chinese literals in the regex below only match if this file is saved in the same
// encoding as the pages -- confirm.
$url="http://www.jy.com.cn/PreSellCert_List.do?project=%B3%A4%BD%AD%B9%FA%BC%CA";
$str=file_get_contents($url);
if($str===false){
    // file_get_contents() returns false on failure; stop instead of running the
    // regex against an empty document.
    exit('Failed to fetch '.$url);
}
// compress_html() strips line breaks and tabs, so the "." in the patterns below can
// span what used to be several source lines.
$str=compress_html($str);

// Listing pattern (ungreedy via /U). Capture groups:
//   1: pscid query value taken from the PreSellCert_Detail.do link
//   2: text of the font_12px span inside the parentheses of the link label
//   3: first "font_16px font_bold" figure, followed by the literal 套
//   4: text following the literal 批准日期: up to the end of that cell
//   5: second "font_16px font_bold" figure, followed by the literal 套
$regex='/<span class="font_bold font_blue font_14px"><a href="PreSellCert_Detail\.do\?pscid=(.*)">.*\(<span class="font_12px">(.*)<\/span>\)<\/a><\/span><\/td>'
.'.*<span class="font_16px font_bold">(.*)<\/span>套<\/td>.*批准日期:(.*)<\/td><\/tr>.*<span class="font_16px font_bold">(.*)<\/span>套<\/td>/U';

// Sales-statistics row pattern: one <tr> made of right-aligned cells (the first limited
// to 1-30 characters) closed by a single centre-aligned cell. It does not depend on the
// loop variable, so it is built once here instead of on every iteration.
$saleregex='/<tr><td align="right">(.{1,30})<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="right">(.*)<\/td><td align="center">(.*)<\/td><\/tr>/U';

if(preg_match_all($regex, $str, $matches, PREG_SET_ORDER)){
    foreach($matches as $val){
        // $val[1] is the pscid captured from the listing page.
        $saleurl="http://www.jy.com.cn/ifrm_PreSellCert_SaleStat.do?pscid=".$val[1];
        $salestr=file_get_contents($saleurl);
        if($salestr===false){
            // This certificate's statistics page could not be fetched; move on to the
            // next one rather than matching against a failed download.
            continue;
        }
        $salestr=compress_html($salestr);
        if(preg_match_all($saleregex, $salestr, $salematches, PREG_SET_ORDER)){
            // Debug dump: prints the rows of the first certificate that has any and
            // terminates the script, so later certificates are never processed.
            print_r($salematches);
            exit;
        }
    }
}

/**
 * Flatten an HTML string into a compact form that is easier to match with regexes.
 *
 * Line breaks and tabs are deleted outright, then a fixed, ordered list of regex
 * rewrites is applied to what remains.
 *
 * @param string $string Raw markup.
 * @return string|null The compacted markup; preg_replace() yields null on a regex error.
 */
function compress_html($string)
{
    // CRLF is listed before LF so a Windows line ending disappears as one unit.
    $flattened = str_replace(array("\r\n", "\n", "\t"), '', $string);

    // pattern => replacement, applied by preg_replace() in exactly this order.
    $rules = array(
        // Trim spaces around a space-free token sitting between ">" and "<".
        "/> *([^ ]*) *</" => ">\\1<",
        // Collapse every run of whitespace into a single space.
        "/[\s]+/" => " ",
        // Drop HTML comments (only those without a "!" in their body).
        "/<!--[^!]*-->/" => "",
        // Remove a space that directly follows a double quote.
        "/\" /" => "\"",
        // Remove a space that directly precedes a double quote.
        "/ \"/" => "\"",
        // Drop /* ... */ comments without an inner "*"; the regex delimiter here is "'".
        "'/\*[^*]*\*/'" => "",
    );

    return preg_replace(array_keys($rules), array_values($rules), $flattened);
}
?>

 

posted @ 2014-04-18 17:17  hechunhua  阅读(164)  评论(0编辑  收藏  举报