php实现采集(仅做参考)
<?php

namespace App\Http\Controllers\Caiji;

use Illuminate\Http\Request;
use App\Http\Controllers\Controller;
use Illuminate\Support\Facades\DB;

/**
 * Scrapes video listings and detail pages from a remote site and stores them
 * in the `vods` table.
 *
 * A flat text file (see STORE_FILE) records which detail URL belongs to which
 * database id, so that a URL that was already imported is updated instead of
 * being inserted a second time.
 */
class CollectionCotontroller extends Controller
{
    /** Directory holding the id/URL index file (relative to the working directory). */
    const STORE_DIR = 'AllIdData';

    /** Index file; every entry is written as "<id>@<url>$". */
    const STORE_FILE = 'AllIdData/GetId.txt';

    /** @var string Base URL of the site being scraped. */
    public $url;

    /** @var mixed Response body of the most recent film_get() call made without $returnCookie. */
    public $data;

    public function __construct()
    {
        // Crawls can take a long time; raise PHP's execution time limit.
        ini_set('max_execution_time', '1000000');

        // Site to collect from.
        $this->url = "http://33uudy.com";

        $this->ensureStore();
    }

    /**
     * Performs an HTTP GET with cURL.
     *
     * @param string   $url          Address to fetch; the base URL is used when empty.
     * @param string   $proxy        Proxy passed to CURLOPT_PROXY.
     * @param string   $cookie       Cookie header value to send, if any.
     * @param int|bool $returnCookie When truthy, response headers are requested and the
     *                               first Set-Cookie value is returned next to the body.
     *
     * @return string|array The response body (also stored in $this->data), or
     *                      ['cookie' => ..., 'content' => ...] when $returnCookie is truthy.
     *                      On a cURL failure the cURL error message is returned instead.
     */
    public function film_get($url = "", $proxy = "", $cookie = "", $returnCookie = 0)
    {
        if (!$url) {
            $url = $this->url;
        }

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_PROXY, $proxy);
        curl_setopt($curl, CURLOPT_URL, $url);
        // Present a browser User-Agent to the remote site.
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
        // Certificate and host verification are switched off, as in the original implementation.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
        // Follow redirects and set the Referer header automatically while doing so.
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
        if ($cookie) {
            curl_setopt($curl, CURLOPT_COOKIE, $cookie);
        }
        curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
        curl_setopt($curl, CURLOPT_TIMEOUT, 10);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

        $data = curl_exec($curl);
        if (curl_errno($curl)) {
            // Release the handle on the error path as well.
            $error = curl_error($curl);
            curl_close($curl);

            return $error;
        }
        curl_close($curl);

        if (!$returnCookie) {
            return $this->data = $data;
        }

        // Headers and body are separated by the first blank line.
        $parts = explode("\r\n\r\n", (string) $data, 2);
        $body = isset($parts[1]) ? $parts[1] : '';
        preg_match_all("/Set\-Cookie:([^;]*);/", $parts[0], $matches);

        return [
            // substr() drops the space that follows "Set-Cookie:".
            'cookie' => isset($matches[1][0]) ? substr($matches[1][0], 1) : '',
            'content' => $body,
        ];
    }

    /**
     * Determines how many listing pages the site has.
     *
     * The number is read from the href of the "尾页" (last page) link on the
     * home page.
     *
     * @return int The last page number, or 1 when it cannot be determined.
     */
    public function page()
    {
        $html = $this->film_get();

        if (!is_string($html)
            || !preg_match("/<a.*class=\"pagelink_a.*<\/a>/", $html, $pager)
            || !preg_match("/<a\b[^>]+\bhref=\"([^\"]*)\"[^>]*>尾页<\/a>/", $pager[0], $last)
            // Take the last run of digits in the href; this also works for single-digit pages.
            || !preg_match('/(\d+)\D*$/', $last[1], $number)
        ) {
            return 1;
        }

        $pages = (int) $number[1];

        return $pages > 0 ? $pages : 1;
    }

    /**
     * Collects every video found on a range of listing pages.
     *
     * @param int $set_max_page Last page to fetch; when falsy the value from page() is used.
     * @param int $set_min_page First page to fetch.
     *
     * @return array List of rows, one per video, in the shape produced by parseListing().
     */
    public function all_data($set_max_page, $set_min_page = 1)
    {
        $maxpage = $set_max_page ? $set_max_page : $this->page();

        $arr = [];
        for ($page = $set_min_page; $page <= $maxpage; $page++) {
            $max_url = $this->url . '/?m=vod-index-pg-' . $page . '.html';

            // Append, so that rows from one page never overwrite rows from another.
            foreach ($this->parseListing($this->film_get($max_url)) as $row) {
                $arr[] = $row;
            }

            // Pause after every fifth page (but not after the final one).
            if ($page % 5 == 0 && $page < $maxpage) {
                sleep(10);
            }
        }

        return $arr;
    }

    /**
     * Extracts the fields of a video detail page.
     *
     * @param string $url HTML of the detail page (despite the name, not an address).
     *
     * @return array Always contains 'vod_pic', 'score', 'vod_content', 'vod_addtime' and
     *               'Episodes' (list of sources, each a list of entries). The keys
     *               'vod_ename', 'vod_director', 'vod_actor', 'vod_type', 'vod_area',
     *               'vod_language' and 'vod_year' are present only when the matching
     *               <li> exists in the info box.
     */
    public function get_link_data($url)
    {
        $html = is_string($url) ? $url : '';
        $arr = [
            'vod_pic' => '',
            'score' => '',
            'vod_content' => ' ',
            'Episodes' => [],
        ];

        // Poster image.
        if (preg_match('/<img class=\"lazy.*?\/>/', $html, $img)
            && preg_match('/src=\"([^ \t]+)\"/', $img[0], $src)
        ) {
            $arr['vod_pic'] = $src[1];
        }

        // Score.
        if (preg_match('/<label.*?<\/label>/', $html, $score)) {
            $arr['score'] = strip_tags($score[0]);
        }

        // Description; a single space is stored when it is empty.
        if (preg_match('/<div class=\"vodplayinfo\"><!--介绍开始-->.*?<\/div>/ism', $html, $content)) {
            $text = strip_tags($content[0]);
            $arr['vod_content'] = $text ? $text : " ";
        }

        // Info box: the n-th <li> of the first <ul> maps to the n-th field below.
        $fields = [
            'vod_ename',
            'vod_director',
            'vod_actor',
            'vod_type',
            'vod_area',
            'vod_language',
            'vod_year',
        ];
        if (preg_match("/<div class=[\"|']vodinfobox.*<\/div>/ism", $html, $box)
            && preg_match('/<ul>.*?<\/ul>/ism', $box[0], $list)
            && preg_match_all('/<li>.*?<\/li>/', $list[0], $lines)
        ) {
            foreach ($fields as $index => $field) {
                if (isset($lines[0][$index])) {
                    // The first three characters of each line are skipped
                    // (presumably the field label; verify against the site markup).
                    $arr[$field] = mb_substr(strip_tags($lines[0][$index]), 3, null, 'UTF-8');
                }
            }
        }
        $arr['vod_addtime'] = time();

        // Play sources: every <ul> after the "来源" heading is one source.
        if (preg_match('/<h3>来源.*<\/h3>.*<ul>.*<\/ul>/ism', $html, $sources)
            && preg_match_all('/<ul>.*?<\/ul>/s', $sources[0], $groups)
        ) {
            foreach ($groups[0] as $g => $group) {
                preg_match_all('/<li.*?<\/li>/ism', $group, $entries);
                foreach ($entries[0] as $e => $entry) {
                    $arr['Episodes'][$g][$e] = strip_tags($entry);
                }
            }
        }

        return $arr;
    }

    /**
     * Maps a scraped category name onto the local category name.
     *
     * @param string $type Category text from the remote site.
     *
     * @return string The local category for the first matching keyword, or $type unchanged.
     */
    public function type_tf($type)
    {
        // Checked in order; the first keyword contained in $type wins.
        $categories = [
            '动漫' => '动漫',
            '动画片' => '动漫',
            '奇幻片' => '剧情片',
            '伦理' => '伦理片',
            '韩剧' => '日韩剧',
            '韩国剧' => '日韩剧',
            '其他剧' => '电视剧',
            '海外剧' => '欧美剧',
            '日剧' => '日韩剧',
            '日本剧' => '日韩剧',
            '台剧' => '港台剧',
            '台湾剧' => '港台剧',
            '港剧' => '港台剧',
            '香港剧' => '港台剧',
            '泰剧' => '电视剧',
            '泰国剧' => '电视剧',
            '视讯美女' => '福利片',
            '腿模写真' => '福利片',
        ];

        foreach ($categories as $keyword => $category) {
            if (strpos((string) $type, $keyword) !== false) {
                return $category;
            }
        }

        return $type;
    }

    /**
     * Returns the index letter for a title: the title's own first character when it
     * is an ASCII letter or digit, otherwise the pinyin initial derived from the GBK
     * code of its first character.
     *
     * @param string $str Title (UTF-8).
     *
     * @return string|int '' for empty input, an upper-case letter or a digit, or 0
     *                    when no letter can be determined.
     */
    public function getFirstCharter($str)
    {
        if (empty($str)) {
            return '';
        }

        $str = str_replace('・', '', (string) $str);
        if ($str === '') {
            return '';
        }

        $first = strtoupper($str[0]);
        $code = ord($first);
        if (($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57)) {
            // Upper-cased so Latin titles use the same A-Z alphabet as the pinyin branch.
            return $first;
        }

        // mb_convert_encoding() substitutes unmappable characters instead of failing,
        // so an unsupported character simply falls through to "unknown".
        $gbk = mb_convert_encoding(mb_substr($str, 0, 1, 'UTF-8'), 'GBK', 'UTF-8');
        if (strlen($gbk) < 2) {
            return 0;
        }

        $asc = ord($gbk[0]) * 256 + ord($gbk[1]) - 65536;
        if ($asc < -20319 || $asc > -10247) {
            return 0;
        }

        // Lowest code of each letter's range; the ranges are contiguous and ascending.
        $lowerBounds = [
            'A' => -20319, 'B' => -20283, 'C' => -19775, 'D' => -19218,
            'E' => -18710, 'F' => -18526, 'G' => -18239, 'H' => -17922,
            'J' => -17417, 'K' => -16474, 'L' => -16212, 'M' => -15640,
            'N' => -15165, 'O' => -14922, 'P' => -14914, 'Q' => -14630,
            'R' => -14149, 'S' => -14090, 'T' => -13318, 'W' => -12838,
            'X' => -12556, 'Y' => -11847, 'Z' => -11055,
        ];

        $letter = 0;
        foreach ($lowerBounds as $candidate => $lowest) {
            if ($asc < $lowest) {
                break;
            }
            $letter = $candidate;
        }

        return $letter;
    }

    /**
     * Crawls listing pages 1..$page and stores the result (active import).
     *
     * @param int $page Last listing page to crawl.
     *
     * @return void
     */
    public function insert_into($page = 1)
    {
        $this->set_to_db($this->all_data($page));
    }

    /**
     * Stores scraped rows, skipping duplicates (passive import).
     *
     * A row whose 'downurl' is already recorded in the index file only has its 'dd'
     * (play addresses) column updated; any other row is inserted and recorded.
     *
     * @param array $data Rows in the shape produced by parseListing().
     *
     * @return array Ids of the rows that were newly inserted.
     */
    public function set_to_db($data)
    {
        $array = [];
        if (!is_array($data)) {
            return $array;
        }

        $known = $this->updateLink();
        foreach ($data as $row) {
            // The index maps id => url, so the key found here is the database id.
            $id = array_search($row['downurl'], $known, true);

            if ($id !== false) {
                // Reuse the play addresses scraped with the row; fetch only if absent.
                $dd = isset($row['dd'])
                    ? $row['dd']
                    : $this->joinEpisodes($this->get_link_data($this->film_get($row['downurl'])));
                DB::table('vods')->where('id', "=", $id)
                    ->update(['dd' => $dd]);
                continue;
            }

            $getId = DB::table('vods')->insertGetId($row);
            $array[] = $getId;
            $this->getLastId($getId, $row['downurl']);
            // Remember it so that a repeat within the same batch is not inserted twice.
            $known[$getId] = $row['downurl'];
        }

        return $array;
    }

    /**
     * Records a newly inserted id and its detail URL in the index file.
     *
     * @param int|string $getId   Database id.
     * @param string     $downurl Detail page URL.
     *
     * @return string|null The index file contents after the append, or null when writing failed.
     */
    public function getLastId($getId, $downurl)
    {
        $this->ensureStore();

        $existing = (string) file_get_contents(self::STORE_FILE);
        $entry = $getId . "@" . $downurl . '$';

        if (file_put_contents(self::STORE_FILE, $entry, FILE_APPEND | LOCK_EX) === false) {
            return null;
        }

        return $existing . $entry;
    }

    /**
     * Reads the index file.
     *
     * @return array Map of database id => detail URL; empty when the file is missing or unreadable.
     */
    public function updateLink()
    {
        $geturllink = [];
        if (!is_file(self::STORE_FILE)) {
            return $geturllink;
        }

        $data = file_get_contents(self::STORE_FILE);
        if ($data === false) {
            return $geturllink;
        }

        foreach (explode('$', $data) as $entry) {
            // Limit 2 keeps any further "@" inside the URL intact.
            $pair = explode('@', $entry, 2);
            if (count($pair) === 2 && $pair[0] !== '') {
                $geturllink[$pair[0]] = $pair[1];
            }
        }

        return $geturllink;
    }

    /**
     * Searches the remote site for the posted keyword ($_POST['wd']) and stores the hits.
     *
     * @return array|string Ids of newly inserted rows, or "" when nothing was found.
     */
    public function searchNameAllDate()
    {
        $wd = (isset($_POST['wd']) && is_string($_POST['wd'])) ? $_POST['wd'] : "";

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->url . '/index.php?m=vod-search');
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        // Sent as an encoded string: with an array body, older PHP versions treat a
        // value starting with "@" as a local file to upload.
        curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query(['wd' => $wd]));
        $str = curl_exec($ch);
        curl_close($ch);

        $arr = $this->parseListing($str);
        if (!$arr) {
            return "";
        }

        return $this->set_to_db($arr);
    }

    /**
     * Removes an id from the index file.
     *
     * @param int|string $id Database id to remove.
     *
     * @return array ['status' => int, 'msg' => string]
     */
    public function delDate($id)
    {
        if (!$id) {
            return [
                "status" => 400,
                "msg" => "非法访问"
            ];
        }

        $links = $this->updateLink();
        unset($links[$id]);

        // Success is "the write did not fail": removing the last entry writes 0 bytes.
        if (!$this->writeLinks($links)) {
            return [
                "status" => 500,
                "msg" => "删除失败"
            ];
        }

        return [
            "status" => 200,
            "msg" => "删除成功"
        ];
    }

    /**
     * Puts an id and its detail URL back into the index file.
     *
     * @param int|string $id      Database id.
     * @param string     $downurl Value appended to "<base url>/?m=" to form the detail URL.
     *
     * @return array ['status' => int, 'msg' => string]
     */
    public function recoveryData($id, $downurl)
    {
        // Both values are required.
        if (!$id || !$downurl) {
            return [
                "status" => 400,
                "msg" => "非法访问"
            ];
        }

        // Keyed by id, so restoring the same id twice does not create a duplicate entry.
        $links = $this->updateLink();
        $links[$id] = $this->url . "/?m=" . $downurl;

        if (!$this->writeLinks($links)) {
            return [
                "status" => 500,
                "msg" => "恢复失败"
            ];
        }

        return [
            "status" => 200,
            "msg" => "恢复成功"
        ];
    }

    /**
     * Makes sure the index directory and file exist.
     */
    private function ensureStore()
    {
        if (!is_dir(self::STORE_DIR)) {
            mkdir(self::STORE_DIR, 0777, true);
        }
        if (!is_file(self::STORE_FILE)) {
            file_put_contents(self::STORE_FILE, '');
        }
    }

    /**
     * Rewrites the index file from an id => url map; returns true on success.
     */
    private function writeLinks(array $links)
    {
        $this->ensureStore();

        $contents = '';
        foreach ($links as $id => $link) {
            $contents .= $id . "@" . $link . '$';
        }

        return file_put_contents(self::STORE_FILE, $contents, LOCK_EX) !== false;
    }

    /**
     * Returns $source[$key] when it is set, otherwise $default.
     */
    private function pickField(array $source, $key, $default = '')
    {
        return isset($source[$key]) ? $source[$key] : $default;
    }

    /**
     * Builds the 'dd' value: each source is prefixed with "$" and its entries joined with "#".
     */
    private function joinEpisodes(array $detail)
    {
        $joined = '';
        if (isset($detail['Episodes']) && is_array($detail['Episodes'])) {
            foreach ($detail['Episodes'] as $entries) {
                $joined .= '$' . implode('#', $entries);
            }
        }

        return $joined;
    }

    /**
     * Turns the HTML of a listing or search-result page into rows for the `vods` table.
     * The detail page of every entry is fetched to fill in the remaining fields.
     *
     * @param mixed $html Page HTML; anything that is not a string yields no rows.
     *
     * @return array List of rows.
     */
    private function parseListing($html)
    {
        $rows = [];
        if (!is_string($html)
            || !preg_match_all("/<span class=[\"|']tt[\"|'].*<\/span>/i", $html, $spans)
        ) {
            return $rows;
        }

        foreach ($spans[0] as $span) {
            // Without a link there is no detail page to read.
            if (!preg_match("/href=\"([^\"]+)/", $span, $href)) {
                continue;
            }

            // Link text is "<name> <note>".
            $title = preg_match("/<a href=\"[^\"]*\"[^>]*>(.*)<\/a>/", $span, $anchor) ? $anchor[1] : '';
            $words = explode(' ', $title);
            $name = $words[0];
            $note = isset($words[1]) ? $words[1] : '';

            $link = $this->url . $href[1];
            $detail = $this->get_link_data($this->film_get($link));

            $year = $this->pickField($detail, 'vod_year');

            $rows[] = [
                // NOTE(review): the original computed this with substr(..., 3, 0), which is
                // always empty, so 0 has always been stored. The intended format of the
                // update time is not visible here; confirm before storing a real value.
                'last' => 0,
                'name' => $name,
                'letter' => $this->getFirstCharter($name),
                'note' => $note,
                // Serial state: the first number in the note, 0 when there is none.
                'state' => preg_match('/\d+/', $note, $digits) ? intval($digits[0]) : 0,
                'downurl' => $link,
                'pic' => $this->pickField($detail, 'vod_pic'),
                'subname' => $this->pickField($detail, 'vod_ename'),
                'director' => $this->pickField($detail, 'vod_director'),
                'actor' => $this->pickField($detail, 'vod_actor'),
                'type_name' => $this->type_tf(
                    isset($detail['vod_type']) ? explode(' ', $detail['vod_type'])[0] : '福利片'
                ),
                'area' => $this->pickField($detail, 'vod_area'),
                'lang' => $this->pickField($detail, 'vod_language'),
                'score' => $this->pickField($detail, 'score'),
                // "未知" (unknown) is stored as 1.
                'year' => $year === "未知" ? 1 : $year,
                'playfrom' => '',
                'des' => $this->pickField($detail, 'vod_content', ' '),
                'dd' => $this->joinEpisodes($detail),
            ];
        }

        return $rows;
    }
}