实验楼的php比赛题,网页数据提取。

实验楼的php比赛题,网页数据提取。

题目的地址:https://www.shiyanlou.com/contests/lou5/challenges

以下代码是题目的答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
<?php
// Declare the response as UTF-8 HTML; the progress messages echoed below contain Chinese text and <br/> tags.
header("Content-Type:text/html;charset=utf-8");
/**
 * Extracts course records from a saved HTML page and stores them in MySQL.
 *
 * Typical use is loadFile() followed by parseContent(), which runs every
 * parse step and finally saveData(). Progress messages are echoed as HTML.
 */
class Crawler{
    /** @var string Raw HTML; reduced to the <body> contents by parseCourseBody(). */
    private $content = '';

    /** @var array Parsed columns, index-aligned per course. */
    private $data = [
        'cnames' => [],
        'cdescs' => [],
        'ctypes' => [],
        'nlongs' => [],
    ];

    /** @var mysqli|null Connection held only while saveData() is running. */
    static private $mysql;

    public function __construct(){
        echo "开始爬取内容....";
    }

    /**
     * Reads the HTML document to parse.
     *
     * @param string $file_path Path (or URL wrapper) accepted by file_get_contents().
     * @throws RuntimeException When the file cannot be read.
     */
    public function loadFile($file_path){
        echo "正在加载文件";
        $html = file_get_contents($file_path);
        if ($html === false) {
            throw new RuntimeException("Unable to read file: " . $file_path);
        }
        $this->content = $html;
    }

    /**
     * Narrows the loaded content to whatever sits inside <body>...</body>.
     * The content is left untouched when no <body> element is found.
     */
    public function parseCourseBody(){
        // preg_match (not preg_match_all) so $matches[1] is a string; storing
        // the match array here would break every later regex on the content.
        if (preg_match('/<body[^>]*>(.*)<\/body>/is', $this->content, $matches)) {
            $this->content = $matches[1];
        }
    }

    /**
     * Runs the full pipeline: body extraction, the field parsers, the
     * title-length check and the database insert.
     *
     * @throws RuntimeException Propagated from saveData() on database errors.
     */
    public function parseContent(){
        echo "开始解析内容...<br/>";
        $this->parseCourseBody();
        $this->parseTitle();
        $this->parseDesc();
        $this->parseType();
        $this->titleIsLong();
        $this->saveData();
        echo "解析内容结束!<br/>";
    }

    /**
     * Inserts one `course_data` row per parsed course name.
     *
     * Values are bound through a prepared statement because they come
     * straight from scraped HTML and may contain quotes.
     *
     * @throws RuntimeException When connecting, preparing or inserting fails.
     */
    public function saveData(){
        echo "存入数据库...<br/>";
        $link = mysqli_connect("localhost", "root", "root", "databases");
        if (!$link) {
            throw new RuntimeException("MySQL connection failed: " . mysqli_connect_error());
        }
        self::$mysql = $link;
        $stmt = null;
        try {
            mysqli_set_charset($link, "utf8");
            $stmt = mysqli_prepare(
                $link,
                "insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)"
            );
            if (!$stmt) {
                throw new RuntimeException("Prepare failed: " . mysqli_error($link));
            }

            // bind_param binds by reference, so the variables are declared
            // once and reassigned for every row.
            $cname = $cdesc = $ctype = $nlong = null;
            mysqli_stmt_bind_param($stmt, "ssss", $cname, $cdesc, $ctype, $nlong);

            $cdescs = $this->data['cdescs'];
            $ctypes = $this->data['ctypes'];
            $nlongs = $this->data['nlongs'];
            foreach ($this->data['cnames'] as $key => $name) {
                $cname = $name;
                // A column with fewer matches than there are names yields NULL
                // instead of an undefined-index notice.
                $cdesc = isset($cdescs[$key]) ? $cdescs[$key] : null;
                $ctype = isset($ctypes[$key]) ? $ctypes[$key] : null;
                $nlong = isset($nlongs[$key]) ? $nlongs[$key] : null;
                if (!mysqli_stmt_execute($stmt)) {
                    throw new RuntimeException("Insert failed: " . mysqli_stmt_error($stmt));
                }
            }
        } finally {
            if ($stmt) {
                mysqli_stmt_close($stmt);
            }
            mysqli_close($link);
            self::$mysql = null;
        }
    }

    /** Collects the contents of every `course-name` div into data['cnames']. */
    public function parseTitle(){
        echo "解析课程标题...<br/>";
        $this->data['cnames'] = $this->extractDivContents('course-name');
    }

    /** Collects the contents of every `course-desc` div into data['cdescs']. */
    public function parseDesc(){
        echo "解析课程简介...<br/>";
        $this->data['cdescs'] = $this->extractDivContents('course-desc');
    }

    /**
     * Derives the course type from every `course-footer` div: the Chinese
     * characters found in the footer joined together, or "免费" when the
     * footer contains none. Results go to data['ctypes'].
     */
    public function parseType(){
        echo "解析课程类型...<br/>";
        $ctypes = [];
        foreach ($this->extractDivContents('course-footer') as $footer) {
            if (preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $footer, $match)) {
                $ctypes[] = implode("", $match[0]);
            } else {
                $ctypes[] = "免费";
            }
        }
        $this->data['ctypes'] = $ctypes;
    }

    /**
     * Flags each course name as "true"/"false" (strings, matching the
     * `nlong` enum column) depending on whether it exceeds 16 characters.
     */
    public function titleIsLong(){
        echo "判断课程名是否超长...<br/>";
        $nlongs = [];
        foreach ($this->data['cnames'] as $name) {
            // Explicit encoding: the count must be in characters, not bytes,
            // regardless of the configured mbstring internal encoding.
            $nlongs[] = mb_strlen($name, 'UTF-8') > 16 ? "true" : "false";
        }
        $this->data['nlongs'] = $nlongs;
    }

    /**
     * Returns the trimmed inner HTML of every <div> whose class attribute is
     * exactly $class and comes first in the tag; other attributes may follow.
     *
     * @param string $class Value of the div's class attribute.
     * @return array List of inner-HTML strings, empty when nothing matches.
     */
    private function extractDivContents($class){
        // Capturing the inner part strips the tags even when the div carries
        // extra attributes, which a fixed-string str_replace would miss.
        $regex = '/<div class="' . preg_quote($class, '/') . '"[^>]*>(.*?)<\/div>/is';
        if (!preg_match_all($regex, $this->content, $matches)) {
            return [];
        }
        // Surrounding markup whitespace is not part of the value and would
        // otherwise count toward the title-length check.
        return array_map('trim', $matches[1]);
    }
}
// Script entry point: parse the local sample page and store the extracted courses.
$crawler = new Crawler();
$crawler->loadFile("test.html");
$crawler->parseContent();
 
/**
 表结构
cname(varchar):完整的课程名
cdesc(varchar):课程描述
ctype(varchar):课程类型,值为 免费,会员,训练营。
nlong(enum('true','false')):课程名是否过长,课程名称超过16字符的时候为 true,否则为 false
 
create table `course_data`(
    `id` int(11) not null auto_increment,
    `cname` varchar(255) default null,
    `cdesc` varchar(255) default null,
    `ctype` varchar(255) default null,
    `nlong` enum('true','false') default null,
    primary key (`id`)
)engine=InnoDB default charset=utf8;
*/

  

posted @ Yxh_blogs  阅读(422)  评论(0)  编辑  收藏  举报
编辑推荐:
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!
点击右上角即可分享
微信分享提示