PHP抓取豆瓣读书爬虫代码
<?php
// Demo: http://asizu.sinaapp.com/reptile_douban.php
//
// Douban Books crawler. Each request fetches the listing page given in ?url=,
// stores every book found on it, and renders a form pre-filled with the URL of
// the next page. The inline script auto-submits that form, so the browser walks
// the listing page by page. Crawling stops as soon as a page yields no books
// (the next-page field is then left empty and nothing is auto-submitted).

header("Content-Type:text/html;charset=utf-8");

define("MYSQL_HOST", SAE_MYSQL_HOST_M);
define("MYSQL_NAME", "douban");
define("MYSQL_USER", SAE_MYSQL_USER);
define("MYSQL_PASSWORD", SAE_MYSQL_PASS);
define("MYSQL_PORT", SAE_MYSQL_PORT);

// Always defined so the template at the bottom never reads an unset variable.
$new_url = '';
$bookArray = array();

// Only accept a plain string; ?url[]=... would otherwise arrive as an array.
$action = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';

if ($action !== '' && isAllowedUrl($action)) {
    $data = getLink($action);

    if (is_string($data) && $data !== '') {
        // The listing lives between the "article" and "aside" containers,
        // one <dl> element per book.
        $data = str_substr('<div class="article">', '<div class="aside">', $data);
        $dataArray = explode('<dl>', $data);
        array_splice($dataArray, 0, 1); // drop the markup preceding the first <dl>

        foreach ($dataArray as $key => $item) {
            $book = array(
                'title'   => str_substr('class="title" target="_blank">', '</a>', $item),
                'rating'  => str_substr('<span class="rating_nums">', '</span>', $item),
                'book_id' => str_substr('/subject/', '/?from', $item),
            );

            // A chunk without a title is not a book entry; do not store it.
            if (trim($book['title']) === '') {
                continue;
            }

            $bookArray[$key] = $book;

            if (!selectBookSaveed($book['title'])) {
                insertMysql($book);
            }
        }

        // Advance only while pages still contain books; otherwise the
        // auto-submitting form would keep requesting empty pages forever.
        if (!empty($bookArray)) {
            $new_url = nextPageUrl($action);
        }
    }
}

/**
 * Tells whether a user-supplied URL may be fetched by the crawler.
 *
 * Only http/https URLs on douban.com (or one of its subdomains) are accepted,
 * so the script cannot be used to make the server request arbitrary hosts or
 * non-HTTP schemes.
 *
 * @param string $url URL taken from the request.
 * @return bool True when the URL is safe to pass to getLink().
 */
function isAllowedUrl($url)
{
    $parts = parse_url($url);
    if ($parts === false || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }

    $scheme = strtolower($parts['scheme']);
    if ($scheme !== 'http' && $scheme !== 'https') {
        return false;
    }

    $host = strtolower($parts['host']);

    return $host === 'douban.com' || substr($host, -11) === '.douban.com';
}

/**
 * Builds the URL of the next listing page.
 *
 * The number following "book?start=" is increased by $step and anything after
 * that number (further query parameters) is kept. When the marker is absent,
 * "book?start=<step>" is appended to the URL as given.
 *
 * @param string $url  URL of the page that was just crawled.
 * @param int    $step Number of books per listing page.
 * @return string URL of the following page.
 */
function nextPageUrl($url, $step = 15)
{
    $marker = 'book?start=';
    $position = strpos($url, $marker);
    if ($position === false) {
        return $url . $marker . (int) $step;
    }

    $head = substr($url, 0, $position + strlen($marker));
    $tail = (string) substr($url, $position + strlen($marker));
    $digitCount = strspn($tail, '0123456789');
    $current = (int) substr($tail, 0, $digitCount);

    return $head . ($current + (int) $step) . substr($tail, $digitCount);
}

/**
 * Fetches the document behind a link.
 *
 * @param string $url Address to download.
 * @return string|false Response body, or false when the download failed.
 */
function getLink($url)
{
    return getData($url);
}

/**
 * Downloads a URL with cURL using a desktop browser user agent.
 *
 * @param string $url Address to download.
 * @return string|false Response body, or false when the transfer failed.
 */
function getData($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11');
    // Never let cURL speak anything but HTTP(S) (e.g. file://, gopher://).
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    // Without timeouts a stalled remote host would hang the request.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $output = curl_exec($ch);
    curl_close($ch);

    return $output;
}

/**
 * Returns the shared database connection, opening it on first use.
 *
 * A failed attempt is remembered so that a broken database is reported once
 * per request instead of once per query.
 *
 * @return mysqli|null Open connection, or null when connecting failed.
 */
function getConnection()
{
    static $connection = null;
    static $attempted = false;

    if ($attempted) {
        return $connection;
    }
    $attempted = true;

    // PHP 8.1 made mysqli throw exceptions by default; this script checks
    // return values instead, so use the same mode on every PHP version.
    mysqli_report(MYSQLI_REPORT_OFF);

    $mysqli = new mysqli(
        SAE_MYSQL_HOST_M,
        SAE_MYSQL_USER,
        SAE_MYSQL_PASS,
        SAE_MYSQL_DB,
        SAE_MYSQL_PORT
    );

    if ($mysqli->connect_errno) {
        echo 'error';
        return null;
    }

    $connection = $mysqli;

    return $connection;
}

/**
 * Prepares a statement, binds every parameter as a string and executes it.
 *
 * @param string $sql    SQL text with "?" placeholders.
 * @param array  $params Values for the placeholders, in order.
 * @return mysqli_stmt|false Executed statement (the caller must close it),
 *                           or false on any failure.
 */
function executePrepared($sql, array $params)
{
    $mysqli = getConnection();
    if ($mysqli === null) {
        return false;
    }

    $stmt = $mysqli->prepare($sql);
    if ($stmt === false) {
        return false;
    }

    $params = array_values($params);
    if (!empty($params)) {
        // bind_param() takes its values by reference, so the argument list
        // has to be built from references into $params.
        $bindArgs = array(str_repeat('s', count($params)));
        foreach (array_keys($params) as $index) {
            $bindArgs[] = &$params[$index];
        }
        if (!call_user_func_array(array($stmt, 'bind_param'), $bindArgs)) {
            $stmt->close();
            return false;
        }
    }

    if (!$stmt->execute()) {
        $stmt->close();
        return false;
    }

    return $stmt;
}

/**
 * Stores one record in the `douban` table.
 *
 * Values are sent as bound parameters. Column names cannot be bound, so they
 * are only accepted when they are plain identifiers.
 *
 * @param array $dataArray Map of column name => value.
 * @return bool True when the row was inserted, false otherwise.
 */
function insertMysql($dataArray)
{
    if (!is_array($dataArray) || empty($dataArray)) {
        return false;
    }

    $columns = array();
    $values = array();
    foreach ($dataArray as $key => $item) {
        if (!preg_match('/\A[A-Za-z_][A-Za-z0-9_]*\z/', (string) $key)) {
            return false;
        }
        $columns[] = '`' . $key . '`';
        $values[] = (string) $item;
    }

    $placeholders = implode(',', array_fill(0, count($values), '?'));
    $sql = 'INSERT INTO douban (' . implode(',', $columns) . ') VALUES (' . $placeholders . ')';

    $stmt = executePrepared($sql, $values);
    if ($stmt === false) {
        return false;
    }
    $stmt->close();

    return true;
}

/**
 * Checks whether a book with the given title is already stored.
 *
 * @param string $title Book title as scraped from the page.
 * @return bool True when a matching row exists; false when none exists or
 *              the lookup could not be performed.
 */
function selectBookSaveed($title)
{
    $stmt = executePrepared('SELECT 1 FROM douban WHERE title = ? LIMIT 1', array((string) $title));
    if ($stmt === false) {
        return false;
    }

    $stmt->store_result();
    $exists = $stmt->num_rows > 0;
    $stmt->close();

    return $exists;
}

/**
 * Runs a raw SQL string on the shared connection.
 *
 * The SQL is executed verbatim, so it must never contain untrusted text; use
 * insertMysql() / selectBookSaveed() for scraped values.
 *
 * @param string $sql    SQL to execute.
 * @param string $method "select" to get the first row, "query" to get the raw
 *                       query result.
 * @return mixed For "select": the first row as an associative array, or null
 *               when there is none or the query failed. For "query": the
 *               mysqli query result (false on failure). Null for any other
 *               method, in which case nothing is executed.
 */
function mysqlOperation($sql, $method = "query")
{
    if ($method != "select" && $method != "query") {
        return null;
    }

    $mysqli = getConnection();
    if ($mysqli === null) {
        return $method == "select" ? null : false;
    }

    $result = $mysqli->query($sql);

    if ($method == "query") {
        return $result;
    }

    // A failed query yields false, which must not be passed to fetch_assoc().
    if (!($result instanceof mysqli_result)) {
        return null;
    }

    $row = $result->fetch_assoc();
    $result->free();

    return $row;
}

/**
 * Returns the text found between two delimiters.
 *
 * @param string $start Delimiter that precedes the wanted text.
 * @param string $end   Delimiter that follows the wanted text.
 * @param string $str   Text to search.
 * @return string Text between the first $start and the next $end; everything
 *                after $start when $end is missing; '' when $start is missing.
 */
function str_substr($start, $end, $str)
{
    $str = (string) $str;
    if ($start === '' || $end === '') {
        return '';
    }

    $temp = explode($start, $str, 2);
    if (!isset($temp[1])) {
        return '';
    }

    $content = explode($end, $temp[1], 2);

    return $content[0];
}
?>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<script src="http://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.4.4.min.js"></script>
<script>
$(document).ready(function(){
    if($("#input").val() != ""){
        $("#froms").submit();
    }
});
</script>
<form action="?" method="get" id="froms">
    <input id="input" value="<?php echo htmlspecialchars($new_url, ENT_QUOTES, 'UTF-8'); ?>" type="text" name="url">
    <input type="submit">
</form>