Data Analysis

A simple analysis example using Maoyan movie data

# -*- coding: utf-8 -*-
"""
Created on Sat Apr 14 14:45:49 2018

@author: Administrator
"""

'''
Crawl the TOP100 movie data from the Maoyan board:
    http://maoyan.com/board/4?offset=0
    http://maoyan.com/board/4?offset=10
    http://maoyan.com/board/4?offset=20
    ...
    http://maoyan.com/board/4?offset=90
'''
import requests
import json
import random
import re
import time
from multiprocessing import Pool
import functools
import myPymysql
import logging
import matplotlib.pyplot as plt

# Get a logger instance
logger = logging.getLogger("maoyan")
# Set the logger's output format
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
# File log (a console handler could be added the same way)
file_handler = logging.FileHandler("maoyan.txt")
file_handler.setFormatter(formatter)

# Set the default logging level
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
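# With this format, each record written to maoyan.txt will look roughly like:
#   2018-04-14 15:02:11,042 ERROR insert failed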

def get_one_page(url):
    """
    Send an HTTP GET request and return the response body, or None on failure
    """
    ua_headers = {"User-Agent":"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
    try:
        response = requests.get(url, headers=ua_headers, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:  # OK
        return response.text
    return None

def write_to_file(item):
    """
    Append one scraped record to a local file as a JSON line
    """
    with open("猫眼电影4.txt", 'a', encoding='utf-8') as f:
        # json encode: dict -> JSON string
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

def write_to_sql(item):
    """
    Insert one scraped record into the database
    """
    dbhelper = myPymysql.DBHelper()
    title_data = item['title']
    actor_data = item['actor']
    time_data = item['time']
    sql = "INSERT INTO testdb.maoyan(title,actor,time) VALUES (%s,%s,%s);"
    params = (title_data, actor_data, time_data)
    result = dbhelper.execute(sql, params)
    if result:
        print("insert succeeded")
    else:
        logger.error("execute: " + sql)
        logger.error("params: %s", params)
        logger.error("insert failed")
        print("insert failed")

def parse_one_page(html):
    """
    Extract the fields we actually want to store from the fetched HTML page:
    movie title, starring actors, release time
    """
    pattern = re.compile(r'<p class="name">.*?title="([\s\S]*?)"[\s\S]*?<p class="star">([\s\S]*?)</p>[\s\S]*?<p class="releasetime">([\s\S]*?)</p>')
    items = re.findall(pattern, html)
    
    # yield preserves the function's execution state between returns,
    # so records are produced lazily, one at a time
    for item in items:
        yield {
                'title': item[0].strip(),
                'actor': item[1].strip(),
                'time': item[2].strip()
        }
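
# A quick way to sanity-check the pattern above is to run it on a synthetic
# snippet modeled on the board page's markup (the real Maoyan HTML carries
# more tags and attributes, but the three captured groups are the same):
def demo_parse_one_page():
    sample = ('<p class="name"><a href="/films/1203" title="霸王别姬">霸王别姬</a></p>'
              '<p class="star">主演:张国荣,张丰毅,巩俐</p>'
              '<p class="releasetime">上映时间:1993-01-01(中国香港)</p>')
    for movie in parse_one_page(sample):
        print(movie)
        # -> {'title': '霸王别姬', 'actor': '主演:张国荣,张丰毅,巩俐',
        #     'time': '上映时间:1993-01-01(中国香港)'}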
    

def analysisCountry():
    # Query the number of movies per country from the database table for analysis
    dbhelper = myPymysql.DBHelper()
    # fetchCount returns a single row, e.g. (100,)
    Total = dbhelper.fetchCount("SELECT count(*) FROM `testdb`.`maoyan`;")
    Am = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%美国%";')
    Ch = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%中国%";')
    Jp = dbhelper.fetchCount('SELECT count(*) FROM `testdb`.`maoyan` WHERE time like "%日本%";')
    Other = Total[0] - Am[0] - Ch[0] - Jp[0]
    sizes = Am[0], Ch[0], Jp[0], Other
    labels = 'America', 'China', 'Japan', 'Others'
    colors = 'Blue', 'Red', 'Yellow', 'Green'
    explode = 0, 0, 0, 0
    # Draw the statistics as a pie chart
    plt.pie(sizes, explode=explode, labels=labels,
        colors=colors, autopct="%1.1f%%", shadow=True)
    plt.show()


def CrawlMovieInfo(lock, offset):
    """
    Scrape each movie's title, starring actors, and release time
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    # Fetch the current page
    html = get_one_page(url)
    if html is None:
        logger.error("failed to fetch: " + url)
        return
    
    # parse_one_page is a generator, so records arrive one at a time;
    # the lock serializes the database writes across worker processes
    for item in parse_one_page(html):
        lock.acquire()
        #write_to_file(item)
        write_to_sql(item)
        lock.release()
        
    # After finishing a page, wait a random 1-3 seconds before the next fetch
    #time.sleep(random.randint(1,3))
    

if __name__ == "__main__":
    analysisCountry()
    # Crawl the 10 pages; each page is an independent entry point.
    # (A small standalone demo of this lock pattern follows after this block.)
    # from multiprocessing import Manager
    # #from multiprocessing import Lock  # this Lock cannot be used with a process pool
    
    # # A lock shared across pool workers must come from a Manager
    # manager = Manager()
    # lock = manager.Lock()
    
    # # Wrap the function with functools.partial so the lock is passed into the
    # # pool; the workers then share one lock that controls the execution flow
    # partial_CrawlMovieInfo = functools.partial(CrawlMovieInfo, lock)
    # pool = Pool()
    # pool.map(partial_CrawlMovieInfo, [i*10 for i in range(10)])
    # pool.close()
    # pool.join()
    
#    for i in range(10):
#        CrawlMovieInfo(i*10)  # offset -> 0,10,20,...,90 (single-process version; would now also need a lock argument)
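
# --- Standalone demo of the Manager-lock + functools.partial pattern shown
# --- (commented out) above. Illustrative only: the worker and its arguments
# --- are hypothetical, and nothing in the crawler calls these functions.
def _locked_worker(lock, n):
    # "with lock" serializes the critical section across the pool's workers
    with lock:
        print("worker", n)

def demo_pool_with_lock():
    from multiprocessing import Manager
    manager = Manager()
    lock = manager.Lock()  # a proxy lock, safe to pass into a Pool
    pool = Pool(2)
    # partial() binds the lock as the first argument, just like CrawlMovieInfo
    pool.map(functools.partial(_locked_worker, lock), range(4))
    pool.close()
    pool.join()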

logger.removeHandler(file_handler)