Golang框架beego电影网爬虫小试牛刀
学习了一段时间Golang之后,又参考课程学习了如何用beego开发网站爬虫。项目的目录结构是:
采集的目标是豆瓣网电影,入口地址是:https://movie.douban.com/subject/1900841/?from=subject-page
数据结果
数据表结构
-- movie_info holds one row per movie; it is the table the Go model
-- MovieInfo (models package) is inserted into.
-- NOTE(review): movie_on_time and create_time default to the zero date
-- '0000-00-00 00:00:00'; confirm the target MySQL sql_mode accepts it.
-- NOTE(review): movie_on_time is a TIMESTAMP while the Go model stores the
-- scraped release date as a plain string; verify the scraped format inserts
-- cleanly.
CREATE TABLE `movie_info` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`movie_id` int(11) unsigned NOT NULL COMMENT '电影id',
`movie_name` varchar(100) DEFAULT NULL COMMENT '电影名称',
`movie_pic` varchar(200) DEFAULT NULL COMMENT '电影图片',
`movie_director` varchar(50) DEFAULT NULL COMMENT '电影导演',
`movie_writer` varchar(50) DEFAULT NULL COMMENT '电影编剧',
`movie_country` varchar(50) DEFAULT NULL COMMENT '电影产地',
`movie_language` varchar(50) DEFAULT NULL COMMENT '电影语言',
`movie_main_character` varchar(50) DEFAULT NULL COMMENT '电影主演',
`movie_type` varchar(50) DEFAULT NULL COMMENT '电影类型',
`movie_on_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '电影上映时间',
`movie_span` varchar(20) DEFAULT NULL COMMENT '电影时长',
`movie_grade` varchar(5) DEFAULT NULL COMMENT '电影评分',
`remark` varchar(500) DEFAULT '' COMMENT '备注',
`create_time` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '创建时间',
`modify_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
`status` tinyint(1) DEFAULT '1',
PRIMARY KEY (`id`),
-- Secondary indexes on the movie id and on both timestamps.
KEY `idx_movie_id` (`movie_id`),
KEY `idx_create_time` (`create_time`),
KEY `idx_modify_time` (`modify_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='电影信息表';
文件
app.conf文件,用来配置数据库
# Application settings (presumably consumed by beego itself — confirm against
# the beego configuration docs).
appname = mypro
httpport = 8080
runmode = dev
# MySQL connection settings; the models package reads these five keys via
# beego.AppConfig.String to build its data source name.
dbhost = 127.0.0.1
dbport = 3306
dbname = myblog
dbuser = root
# NOTE(review): plaintext credential checked into the config file.
dbpwd = root
路由文件router.go
//router.go文件
package routers
import (
"mypro/controllers"
"github.com/astaxie/beego"
)
func init() {
beego.Router("/", &controllers.MainController{})
beego.Router("/collection", &controllers.CollectionController{})
}
控制器下文件
package controllers
import (
"fmt"
"github.com/astaxie/beego"
"github.com/astaxie/beego/httplib"
"mypro/models"
"time"
)
// CollectionController is the controller whose Get method runs the movie
// crawler. It embeds beego.Controller and adds no fields of its own.
type CollectionController struct {
beego.Controller
}
// Get runs the Douban movie crawl synchronously inside the HTTP request.
//
// It seeds the Redis-backed URL queue with one entry page and then loops:
// pop a URL, skip it if it is already in the visited set, fetch the page,
// extract and store the movie details when the page has a movie title,
// enqueue the Douban movie links found on the page, and mark the URL as
// visited. The loop ends when the queue length reported by the models
// package is zero. Progress markers and inserted row ids are written to the
// response body.
//
// NOTE(review): the crawl only ends when the queue drains, so this handler
// can run for a very long time; consider moving it out of the request path.
func (c *CollectionController) Get() {
	// Set the Redis address before the first queue operation; the original
	// code pushed the seed URL before calling ConnectRedis.
	models.ConnectRedis("127.0.0.1:6379")

	// Entry point of the crawl.
	sUrl := "https://movie.douban.com/subject/1900841/?from=subject-page"
	models.PutinUrlQueue(sUrl)

	for {
		queueLength := models.GetQueueLength()
		c.Ctx.WriteString(fmt.Sprintf("---%v---", queueLength))
		if queueLength == 0 {
			break
		}

		// Take the next URL off the queue and drop it if already crawled.
		sUrl = models.PopfromQueue()
		if models.IsVisit(sUrl) {
			continue
		}

		req := httplib.Get(sUrl)
		// The User-Agent and Cookie headers are set to avoid a 403 from Douban.
		req.Header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
		req.Header("Cookie", `bid=gFP9qSgGTfA; __utma=30149280.1124851270.1482153600.1483055851.1483064193.8; __utmz=30149280.1482971588.4.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118221"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1483064193%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=5afcf5e5496eab22.1482413017.7.1483066280.1483057909.; __utma=223695111.1636117731.1482413017.1483055857.1483064193.7; __utmz=223695111.1483055857.6.5.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=BDC2DBEDF8958EC838F9D9394CC5D9A0|2cc6ef7952be8c2d5408cb7c8cce2684; ap=1; viewed="1006073"; gr_user_id=e5c932fc-2af6-4861-8a4f-5d696f34570b; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1483064193; __utmb=223695111.0.10.1483064193`)

		sMovieHtml, err := req.String()
		if err != nil {
			// One unreachable page must not abort the whole crawl. The URL
			// is not marked visited, so it can be retried if another page
			// links to it again.
			fmt.Printf("collection: fetch %q: %v\n", sUrl, err)
			time.Sleep(time.Second)
			continue
		}

		// A non-empty title means the page is a movie page.
		var movie models.MovieInfo
		movie.Movie_name = models.GetMovieName(sMovieHtml)
		if movie.Movie_name != "" {
			movie.Movie_director = models.GetMovieDirector(sMovieHtml)
			movie.Movie_main_character = models.GetMovieMainCharacters(sMovieHtml)
			movie.Movie_type = models.GetMovieGenre(sMovieHtml)
			movie.Movie_on_time = models.GetMovieOnTime(sMovieHtml)
			movie.Movie_grade = models.GetMovieGrade(sMovieHtml)
			movie.Movie_span = models.GetMovieRunningTime(sMovieHtml)

			id, err := models.AddMovieToDb(&movie)
			if err != nil {
				// The id is meaningless on failure, so only report the error.
				fmt.Printf("collection: insert movie %q: %v\n", movie.Movie_name, err)
			} else {
				c.Ctx.WriteString(fmt.Sprintf("%v", id))
			}
		}

		// Mark this page as visited before enqueuing its links so that a
		// link back to the page itself is filtered out below.
		models.AddToSet(sUrl)

		// Enqueue every movie link on the page that has not been crawled yet;
		// skipping known URLs keeps the queue from filling with duplicates.
		for _, link := range models.GetMovieUrls(sMovieHtml) {
			if !models.IsVisit(link) {
				models.PutinUrlQueue(link)
			}
		}

		time.Sleep(time.Second) // throttle requests
	}

	c.Ctx.WriteString("爬虫执行结束")
}
models目录下文件
package models
import (
"github.com/astaxie/beego"
"regexp" //正则包
"strings"
//"strings"
"github.com/astaxie/beego/orm"
_ "github.com/go-sql-driver/mysql"
)
var (
// db is the package-level ORM handle. It is assigned in init and used by
// AddMovieToDb.
db orm.Ormer
)
// MovieInfo is the ORM model for one crawled movie. It is registered with
// the beego ORM in init and inserted by AddMovieToDb.
//
// NOTE(review): the field names appear to mirror the columns of the
// movie_info table; the name-to-column mapping is done by the ORM, so do not
// rename fields without checking it.
type MovieInfo struct {
Id int64
Movie_id int64
// Movie_name is the title returned by GetMovieName.
Movie_name string
Movie_pic string
// Movie_director is the value returned by GetMovieDirector.
Movie_director string
Movie_writer string
Movie_country string
Movie_language string
// Movie_main_character is the "/"-separated cast from GetMovieMainCharacters.
Movie_main_character string
// Movie_type is the "/"-separated genre list from GetMovieGenre.
Movie_type string
// Movie_on_time is the release date text returned by GetMovieOnTime.
Movie_on_time string
// Movie_span is the running time text returned by GetMovieRunningTime.
Movie_span string
// Movie_grade is the rating text returned by GetMovieGrade.
Movie_grade string
Create_time string
}
// init wires the package to MySQL: it reads the connection settings
// (dbhost, dbport, dbname, dbuser, dbpwd) from the beego application config,
// registers them as the ORM's "default" database, registers the MovieInfo
// model and stores a fresh Ormer in the package-level db variable.
//
// It panics if the database cannot be registered, because every function
// that uses db would be unusable afterwards.
func init() {
	orm.Debug = true // debug mode: the ORM prints the SQL statements it runs

	dbhost := beego.AppConfig.String("dbhost")
	dbport := beego.AppConfig.String("dbport")
	dbname := beego.AppConfig.String("dbname")
	dbuser := beego.AppConfig.String("dbuser")
	dbpwd := beego.AppConfig.String("dbpwd")

	// Data source name in the form user:password@tcp(host:port)/dbname?charset=utf8.
	dsn := dbuser + ":" + dbpwd + "@tcp(" + dbhost + ":" + dbport + ")/" + dbname + "?charset=utf8"

	// The registration error used to be discarded, which hid the real cause
	// of a bad configuration; surface it here instead.
	if err := orm.RegisterDataBase("default", "mysql", dsn); err != nil {
		panic("models: register database: " + err.Error())
	}
	orm.RegisterModel(new(MovieInfo))
	db = orm.NewOrm()
}
// movieNameRe captures the text of the <span property="v:itemreviewed">
// element. It is compiled once at package scope instead of on every call.
var movieNameRe = regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)

// GetMovieName returns the movie title found in html, taken from the first
// <span property="v:itemreviewed"> element. It returns "" when html is empty
// or contains no such element.
func GetMovieName(html string) string {
	m := movieNameRe.FindStringSubmatch(html)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieDirectorRe captures the text of an <a ... rel="v:directedBy"> link.
// The capture is non-greedy so it stops at the link's own closing tag, which
// matches the other extractors in this package.
var movieDirectorRe = regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)

// GetMovieDirector returns the name in the first <a rel="v:directedBy"> link
// found in html. It returns "" when html is empty or has no such link.
//
// The previous implementation indexed the match list without checking that
// it was non-empty and therefore panicked on pages without a director; it
// also used a greedy capture that swallowed any further links on the same
// line.
func GetMovieDirector(html string) string {
	m := movieDirectorRe.FindStringSubmatch(html)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieStarringRe captures the text of each <a ... rel="v:starring"> link.
// It is compiled once at package scope instead of on every call.
var movieStarringRe = regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)

// GetMovieMainCharacters returns the names from every <a rel="v:starring">
// link in html, joined with "/" in document order. Leading and trailing "/"
// characters are trimmed from the result. It returns "" when html is empty
// or has no such link.
func GetMovieMainCharacters(html string) string {
	matches := movieStarringRe.FindAllStringSubmatch(html, -1)
	names := make([]string, 0, len(matches))
	for _, m := range matches {
		names = append(names, m[1])
	}
	return strings.Trim(strings.Join(names, "/"), "/")
}
// movieGenreRe captures the text of each <span ... property="v:genre">
// element. It is compiled once at package scope instead of on every call.
var movieGenreRe = regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)

// GetMovieGenre returns the text of every <span property="v:genre"> element
// in html, joined with "/" in document order. Leading and trailing "/"
// characters are trimmed from the result. It returns "" when html is empty
// or has no such element.
func GetMovieGenre(html string) string {
	matches := movieGenreRe.FindAllStringSubmatch(html, -1)
	genres := make([]string, 0, len(matches))
	for _, m := range matches {
		genres = append(genres, m[1])
	}
	return strings.Trim(strings.Join(genres, "/"), "/")
}
// movieReleaseDateRe captures the text of a
// <span ... property="v:initialReleaseDate" ...> element. It is compiled
// once at package scope instead of on every call.
var movieReleaseDateRe = regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)

// GetMovieOnTime returns the text of the first
// <span property="v:initialReleaseDate"> element in html, exactly as it
// appears on the page. It returns "" when html is empty or has no such
// element.
func GetMovieOnTime(html string) string {
	m := movieReleaseDateRe.FindStringSubmatch(html)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieGradeRe captures the text of a <strong ... property="v:average">
// element. It is compiled once at package scope instead of on every call.
var movieGradeRe = regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)

// GetMovieGrade returns the rating text of the first
// <strong property="v:average"> element in html. It returns "" when html is
// empty or has no such element.
func GetMovieGrade(html string) string {
	m := movieGradeRe.FindStringSubmatch(html)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieRuntimeRe captures the text of a
// <span ... property="v:runtime" ...> element. It is compiled once at
// package scope instead of on every call.
var movieRuntimeRe = regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)

// GetMovieRunningTime returns the running-time text of the first
// <span property="v:runtime"> element in html. It returns "" when html is
// empty or has no such element.
func GetMovieRunningTime(html string) string {
	m := movieRuntimeRe.FindStringSubmatch(html)
	if m == nil {
		return ""
	}
	return m[1]
}
// AddMovieToDb inserts movie_info through the package-level ORM handle and
// returns whatever the ORM's Insert call reports: the id and the error.
func AddMovieToDb(movie_info *MovieInfo) (int64, error) {
	return db.Insert(movie_info)
}
// movieURLRe captures the href of every <a> tag that points at
// https://movie.douban.com/. The dots in the host name are escaped so that
// only that exact host matches; unescaped, "." matched any character and let
// look-alike hosts through.
var movieURLRe = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns the href values of all links in html that point to
// https://movie.douban.com/, in document order. Duplicates are kept. It
// returns a nil slice when there are no such links.
func GetMovieUrls(html string) []string {
	matches := movieURLRe.FindAllStringSubmatch(html, -1)
	if len(matches) == 0 {
		return nil
	}
	urls := make([]string, 0, len(matches))
	for _, m := range matches {
		urls = append(urls, m[1])
	}
	return urls
}
redis.go文件
package models
import (
"github.com/astaxie/goredis"
)
var (
// RediCclient is the package-wide goredis client used by every queue and
// visited-set helper in this file. Its Addr field is set by ConnectRedis.
RediCclient goredis.Client
)
const (
URL_QUEUE = "url_queue" // Redis key of the list used as the URL queue
URL_VISIT_SET = "url_visit_set" // Redis key of the set recording URLs already visited
)
// ConnectRedis stores addr as the address of the shared Redis client. It
// only assigns the field; nothing in this function contacts the server.
func ConnectRedis(addr string) {
RediCclient.Addr = addr
}
// PutinUrlQueue pushes url onto the URL_QUEUE list with LPUSH. Whatever the
// client call returns is discarded.
func PutinUrlQueue(url string) {
	payload := []byte(url)
	RediCclient.Lpush(URL_QUEUE, payload)
}
// GetQueueLength returns the length of the URL_QUEUE list as reported by
// LLEN, or 0 when the Redis call fails.
func GetQueueLength() int {
	if n, err := RediCclient.Llen(URL_QUEUE); err == nil {
		return n
	}
	return 0
}
// PopfromQueue removes one entry from the URL_QUEUE list with RPOP and
// returns it as a string. It panics if the Redis call returns an error.
func PopfromQueue() string {
	raw, popErr := RediCclient.Rpop(URL_QUEUE)
	if popErr == nil {
		return string(raw)
	}
	panic(popErr)
}
// AddToSet adds url to the URL_VISIT_SET set with SADD, marking it as
// visited. Whatever the client call returns is discarded.
func AddToSet(url string) {
	member := []byte(url)
	RediCclient.Sadd(URL_VISIT_SET, member)
}
// IsVisit reports whether url is a member of the URL_VISIT_SET set. A failed
// Redis call is reported as false, i.e. the URL is treated as not visited.
func IsVisit(url string) bool {
	member, err := RediCclient.Sismember(URL_VISIT_SET, []byte(url))
	return err == nil && member
}
分类:
Golang Beego框架
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· Docker 太简单,K8s 太复杂?w7panel 让容器管理更轻松!