Python3爬取影片入库

Python3爬取影片入库

 

1服务器说明

[root@openshift maoyan]# cat /etc/redhat-release

CentOS Linux release 7.4.1708 (Core)

[root@openshift maoyan]# python -V

Python 3.6.3 :: Anaconda, Inc.

 

2爬取电影入库

首页页面地址分析

 

子页面数据获取:从每个影片的子页面提取 name、movie_cate、release_date、movie_img 四个字段的数据并写入数据库。

 

 

3mysql数据库连接

import pymysql

pymysql.install_as_MySQLdb()

 

class Sql(object):
    """Namespace for the MySQL connection, exposed as the class attribute ``conn``.

    The connection is opened when this class body is executed, not when the
    class is instantiated.
    """

    # Connects to the "movies" database on the local MySQL server (port 3306)
    # as root, using the utf8 connection charset.
    # NOTE(review): the credentials are hard-coded in the source.
    conn = pymysql.connect(host="127.0.0.1", port=3306,
                           user='root', passwd='123456',
                           db="movies", charset="utf8")

  

4源代码编写

[root@openshift maoyan]# cat maoyan2.py

# coding:utf-8

import requests,os,sys,django

from bs4 import BeautifulSoup

import re,urllib

import pymysql

pymysql.install_as_MySQLdb()

import datetime

 

# HTTP request headers for maoyan.com.  The User-Agent value identifies the
# client as desktop Chrome 55 on Windows 10.
# Long values are split into adjacent string literals, which Python joins at
# compile time, so the header values are unchanged.
headers = {
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,*/*;q=0.8'),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'maoyan.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/55.0.2883.87 Safari/537.36'),
}

class Sql(object):
    """Crawl the Maoyan film listing and insert one row per film into MySQL.

    All of the work runs in the class body, i.e. once, when this ``class``
    statement is executed: the listing page at ``url`` is fetched, every
    linked film page is parsed for four fields (name, category, release date,
    poster image URL), and a row is inserted into ``userscore_movie``.
    Every name bound in the body (``conn``, ``url``, ``soup``, ...) ends up as
    a class attribute.
    """

    # Connection to the "movies" database on the local MySQL server.
    # NOTE(review): the credentials are hard-coded in the source.
    conn = pymysql.connect(
        host="127.0.0.1",
        port=3306,
        user='root',
        passwd='123456',
        db="movies",
        charset="utf8"
    )

    # The values are passed to execute() separately from the statement so the
    # driver escapes them.  Formatting scraped text straight into the SQL
    # string breaks on any title containing a quote and allows SQL injection.
    insert_sql = (
        "insert into userscore_movie"
        "(name,movie_cate,viewed,created,release_date,movie_img) "
        "VALUES(%s,%s,%s,%s,%s,%s)"
    )

    url = 'http://maoyan.com/films?showType=3'
    # Alternative start URL with an offset parameter, kept for reference:
    #url = 'https://maoyan.com/films?showType=3&offset=30'

    # A timeout keeps an unresponsive server from blocking the script forever.
    wbdata = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(wbdata.content, 'html5lib')
    movie_list = soup.select('div.movie-item > a')

    for movie in movie_list:
        # The listing links are site-relative, so prefix the host.
        m_url = 'http://maoyan.com' + movie.get('href')
        m_data = requests.get(m_url, headers=headers, timeout=10)
        m_soup = BeautifulSoup(m_data.content, 'html5lib')

        name = m_soup.select_one('div.movie-brief-container > h3.name').get_text()
        # Look the <li> items up once; the category and the release date are
        # both read from this list.
        info_items = m_soup.select("div.movie-brief-container > ul > li")
        movie_cate = info_items[0].get_text()
        # Only the first 10 characters are kept -- presumably a YYYY-MM-DD
        # date; confirm against the page markup.
        release_date = info_items[2].get_text()[0:10]
        movie_img = m_soup.select_one('div.avatar-shadow > img').get('src')

        created = datetime.datetime.now()
        # Initial value stored in the "viewed" column.
        viewd = 1

        cur = conn.cursor()
        try:
            cur.execute(
                insert_sql,
                (name, movie_cate, viewd, created, release_date, movie_img),
            )
            print('正在爬取电影: '+name)
        finally:
            # Release the cursor even when the insert fails.
            cur.close()
        # Commit after every film so rows already inserted survive a later failure.
        conn.commit()


# The crawl has already run while the class body above was executed;
# instantiating the class performs no additional work.
Sql()

 

5执行脚本,爬取数据过程

 

 

6数据库查看

 

至此,完成了Python3爬取影片入库过程。

posted @ 2019-02-18 11:08  wang_wei123  阅读(217)  评论(0)  编辑  收藏  举报