实验楼的php比赛题,网页数据提取。
实验楼的php比赛题,网页数据提取。
题目的地址:https://www.shiyanlou.com/contests/lou5/challenges
以下代码是题目的答案
<?php
header("Content-Type:text/html;charset=utf-8");

/**
 * Parses a saved course-list HTML page and stores the result in MySQL.
 *
 * For every course the crawler collects the name (div.course-name), the
 * description (div.course-desc), the type (Chinese text found in
 * div.course-footer, falling back to "免费") and a "true"/"false" flag that
 * tells whether the name is longer than 16 characters. saveData() writes one
 * row per course into the `course_data` table described at the end of this
 * file.
 */
class Crawler
{
    /** @var string HTML being parsed; narrowed to the <body> element by parseCourseBody(). */
    private $content = '';

    /** @var array Parsed columns, keyed by 'cnames', 'cdescs', 'ctypes' and 'nlongs'. */
    private $data = array();

    /** @var mysqli|null Connection opened by saveData(). */
    static private $mysql;

    public function __construct()
    {
        echo "开始爬取内容....";
    }

    /**
     * Reads the HTML file that will be parsed.
     *
     * @param string $file_path Path (or URL) handed to file_get_contents().
     */
    public function loadFile($file_path)
    {
        echo "正在加载文件";
        $html = file_get_contents($file_path);
        // file_get_contents() returns false on failure; keep $content a string
        // so the preg_* calls below always receive a valid subject.
        $this->content = ($html === false) ? '' : $html;
    }

    /**
     * Narrows the loaded document to its <body>...</body> element.
     * The content is left untouched when no body element is found.
     */
    public function parseCourseBody()
    {
        $regex = "/<body[^>]*?>(.*\s*?)<\/body>/is";
        // preg_match (not preg_match_all): $matches[0] is then the matched
        // string itself, so $content stays a string for the later parsers.
        if (preg_match($regex, $this->content, $matches)) {
            $this->content = $matches[0];
        }
    }

    /**
     * Runs the whole pipeline: parse every column, then store the rows.
     */
    public function parseContent()
    {
        echo "开始解析内容...<br/>";
        $this->parseCourseBody();
        $this->parseTitle();
        $this->parseDesc();
        $this->parseType();
        $this->titleIsLong();
        $this->saveData();
        echo "解析内容结束!<br/>";
    }

    /**
     * Inserts one row per parsed course into `course_data`.
     *
     * Values are bound through a prepared statement because they come from
     * scraped HTML and may contain quotes. Failures are reported with echo,
     * like the rest of the progress output, instead of aborting the script.
     */
    public function saveData()
    {
        echo "存入数据库...<br/>";

        $cnames = isset($this->data['cnames']) ? $this->data['cnames'] : array();
        $cdescs = isset($this->data['cdescs']) ? $this->data['cdescs'] : array();
        $ctypes = isset($this->data['ctypes']) ? $this->data['ctypes'] : array();
        $nlongs = isset($this->data['nlongs']) ? $this->data['nlongs'] : array();

        // Nothing was parsed: do not open a connection for zero inserts.
        if (count($cnames) === 0) {
            return;
        }

        if (!class_exists('mysqli')) {
            echo "mysqli 扩展不可用,无法存入数据库<br/>";
            return;
        }

        // Depending on the PHP version / mysqli_report() mode a failed
        // connection either throws or only sets connect_errno; handle both.
        try {
            $db = new mysqli("localhost", "root", "root", "databases");
        } catch (mysqli_sql_exception $e) {
            echo "数据库连接失败: " . $e->getMessage() . "<br/>";
            return;
        }
        if ($db->connect_errno) {
            echo "数据库连接失败: " . $db->connect_error . "<br/>";
            return;
        }
        self::$mysql = $db;

        try {
            $db->set_charset("utf8");
            $stmt = $db->prepare(
                "insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)"
            );
            if ($stmt === false) {
                echo "数据库操作失败: " . $db->error . "<br/>";
            } else {
                // bind_param() binds by reference: assigning the variables
                // inside the loop changes what execute() sends.
                $cname = $cdesc = $ctype = $nlong = '';
                $stmt->bind_param('ssss', $cname, $cdesc, $ctype, $nlong);
                foreach ($cnames as $key => $value) {
                    $cname = $value;
                    $cdesc = isset($cdescs[$key]) ? $cdescs[$key] : '';
                    $ctype = isset($ctypes[$key]) ? $ctypes[$key] : '';
                    // `nlong` is enum('true','false'); an empty string is not a valid value.
                    $nlong = isset($nlongs[$key]) ? $nlongs[$key] : 'false';
                    $stmt->execute();
                }
                $stmt->close();
            }
        } catch (mysqli_sql_exception $e) {
            echo "数据库操作失败: " . $e->getMessage() . "<br/>";
        }

        $db->close();
    }

    /**
     * Collects the inner HTML of every div.course-name into $data['cnames'].
     */
    public function parseTitle()
    {
        echo "解析课程标题...<br/>";
        $this->data['cnames'] = $this->extractDivs('course-name');
    }

    /**
     * Collects the inner HTML of every div.course-desc into $data['cdescs'].
     */
    public function parseDesc()
    {
        echo "解析课程简介...<br/>";
        $this->data['cdescs'] = $this->extractDivs('course-desc');
    }

    /**
     * Derives the course type from every div.course-footer into $data['ctypes'].
     *
     * The type is the concatenation of the CJK characters (U+4E00..U+9FA5)
     * found inside the footer; a footer without any is stored as "免费".
     */
    public function parseType()
    {
        echo "解析课程类型...<br/>";
        $ctypes = array();
        foreach ($this->extractDivs('course-footer') as $footer) {
            if (preg_match_all('/([\x{4e00}-\x{9fa5}])/u', $footer, $match)) {
                $ctypes[] = join("", $match[0]);
            } else {
                $ctypes[] = "免费";
            }
        }
        $this->data['ctypes'] = $ctypes;
    }

    /**
     * Flags each parsed course name as "true" when it is longer than 16
     * characters and "false" otherwise; the result goes to $data['nlongs'].
     */
    public function titleIsLong()
    {
        echo "判断课程名是否超长...<br/>";
        $cnames = isset($this->data['cnames']) ? $this->data['cnames'] : array();
        $nlongs = array();
        foreach ($cnames as $value) {
            // Count characters, not bytes; the encoding is stated explicitly
            // so the result does not depend on mbstring's configured default.
            $nlongs[] = mb_strlen($value, 'UTF-8') > 16 ? "true" : "false";
        }
        $this->data['nlongs'] = $nlongs;
    }

    /**
     * Returns the inner HTML of every <div class="$class" ...> in the content.
     *
     * The match is non-greedy and stops at the first closing </div>, so a div
     * that contains nested divs is cut at the first nested close tag.
     *
     * @param string $class Exact value of the class attribute to look for.
     * @return array List of inner-HTML strings in document order (possibly empty).
     */
    private function extractDivs($class)
    {
        $regex = '/<div class="' . preg_quote($class, '/') . '"[^>]*>(.*?)<\/div>/is';
        if (preg_match_all($regex, $this->content, $matches)) {
            // Group 1 is the text between the tags, whatever extra attributes
            // the opening tag carries.
            return $matches[1];
        }
        return array();
    }
}

$Crawler = new Crawler();
$Crawler->loadFile("test.html");
$Crawler->parseContent();

/**
Table structure

cname (varchar): the full course name
cdesc (varchar): the course description
ctype (varchar): the course type; one of 免费 (free), 会员 (member), 训练营 (bootcamp)
nlong (enum('true','false')): whether the course name is too long;
    'true' when the name is longer than 16 characters, otherwise 'false'

create table `course_data`(
  `id` int(11) not null auto_increment,
  `cname` varchar(255) default null,
  `cdesc` varchar(255) default null,
  `ctype` varchar(255) default null,
  `nlong` enum('true','false') default null,
  primary key (`id`)
)engine=InnoDB default charset=utf8;
*/
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!