数据分析

基本统计:

from pandas import read_csv;

# Load the score table from a hard-coded absolute Windows path.
data = read_csv('D:\\PA\\8.1\\data.csv')

# One-call summary of the score column.
data.score.describe()

# Number of entries in the column; `size` is an attribute, not a method.
data.score.size

# Individual statistics. Each of these is a method and has to be called:
# without the parentheses the expression only evaluates to the bound
# method object and no statistic is computed.
data.score.max()

data.score.min()

data.score.sum()

data.score.mean()

data.score.var()

data.score.std()
View Code

分组分析:

import numpy;
from pandas import read_csv;

# Load the score table and derive a second numeric column so that the
# multi-column aggregation below has two columns to work on.
data = read_csv('C:\\PA\\8.2\\data.csv')
data['score2'] = data['score'] * 2

# (output column name, aggregation) pairs for the per-class statistics.
# A renaming dict such as {'总分': numpy.sum} passed to a single-column
# groupby is rejected by current pandas ("nested renamer is not
# supported"); a list of (name, func) pairs gives the same renamed columns.
# String names select the pandas groupby implementations explicitly.
class_score_stats = [
    ('总分', 'sum'),
    ('人数', 'size'),
    ('平均值', 'mean'),
    ('方差', 'var'),
    ('标准差', 'std'),
]

# Per-class statistics of `score`, displayed when run interactively.
data.groupby(by=['class'])['score'].agg(class_score_stats)

# Several aggregations over several columns, grouped by class and name.
data.groupby(by=['class', 'name'])[['score', 'score2']].agg(['size', 'sum'])

# Same per-class aggregation, kept this time so it can be inspected.
result = data.groupby(by=['class'])['score'].agg(class_score_stats)

result.index
result.columns
result['平均值']

# Two group keys and two aggregated columns: both the row index and the
# column index of the result are hierarchical.
result2 = data.groupby(by=['class', 'name'])[['score', 'score2']].agg(
    ['size', 'sum']
)

result2.index
result2.columns
result2['score']
result2['score']['sum']

# Turn the group keys back into ordinary columns.
result.reset_index()
result2.reset_index()
View Code

分布分析:

import numpy;
import pandas;
from pandas import read_csv;

data = read_csv('D:\\PA\\8.3\\data.csv')

# Bin edges for the age bands. pandas.cut uses right-closed intervals by
# default, so the outer edges are pushed one unit past the observed
# minimum and maximum to keep every age inside a bin.
# Series.min()/max() skip missing values, unlike the builtin min()/max().
# NOTE(review): the edges must be increasing, which assumes the data
# contains ages below 20 and above 40 -- confirm against the dataset.
bins = [data.年龄.min() - 1, 20, 30, 40, data.年龄.max() + 1]
labels = ['20岁以及以下', '21岁到30岁', '31岁到40岁', '41岁以上']

年龄分层 = pandas.cut(data.年龄, bins, labels=labels)

data['年龄分层'] = 年龄分层

# Head count per age band. A renaming dict passed to a single-column
# groupby is rejected by current pandas ("nested renamer is not
# supported"); a (name, func) pair produces the same '人数' column.
data.groupby(by=['年龄分层'])['年龄'].agg([('人数', 'size')])
View Code 

交叉分析:

import numpy;
import pandas;
from pandas import read_csv;

df = read_csv('D:\\PA\\8.4\\data.csv')

# Bin edges for the age bands. pandas.cut uses right-closed intervals by
# default, so the outer edges are pushed one unit past the observed
# minimum and maximum to keep every age inside a bin.
# Series.min()/max() skip missing values, unlike the builtin min()/max().
bins = [df.年龄.min() - 1, 20, 30, 40, df.年龄.max() + 1]
labels = ['20岁以及以下', '21岁到30岁', '31岁到40岁', '41岁以上']

年龄分层 = pandas.cut(df.年龄, bins, labels=labels)
df['年龄分层'] = 年龄分层

# Cross-tabulate age band (rows) against gender (columns): count and mean
# of age in each cell.
r1 = df.pivot_table(
    values=['年龄'],
    index=['年龄分层'],
    columns=['性别'],
    aggfunc=[numpy.size, numpy.mean],
)

# Same layout, holding the standard deviation of age in each cell.
r2 = df.pivot_table(
    values=['年龄'],
    index=['年龄分层'],
    columns=['性别'],
    aggfunc=[numpy.std],
)

# Both tables share the age-band index, so they can be joined side by side.
r1.join(r2)
View Code

结构分析:

import numpy;
from pandas import read_csv;

data = read_csv('C:\\PA\\8.5\\data.csv')

# Total monthly spend with one row per province and one column per brand.
data_pt = data.pivot_table(
    values=['月消费(元)'],
    index=['省份'],
    columns=['通信品牌'],
    aggfunc=[numpy.sum],
)

# Default axis: totals down each column.
data_pt.sum()

# axis=0 is the default spelled out, so this repeats the totals above.
data_pt.sum(axis=0)

# axis=1: totals across each row.
data_pt.sum(axis=1)

# Divide every row by its own total: each brand's share within a province.
row_totals = data_pt.sum(axis=1)
data_pt.div(row_totals, axis=0)

# Divide every column by its own total: each province's share of a brand.
column_totals = data_pt.sum(axis=0)
data_pt.div(column_totals, axis=1)
View Code

相关分析:

# -*- coding: utf-8 -*-
from pandas import read_csv;
data = read_csv('D:\\PA\\8.6\\data.csv')

# Correlation between two individual columns.
population = data['人口']
illiteracy_rate = data['文盲率']
population.corr(illiteracy_rate)

# Pairwise correlation across several columns at once.
# Several columns are selected with:
#     data.loc[:, ['col1', 'col2', ..., 'coln']]
selected_columns = ['超市购物率', '网上购物率', '文盲率', '人口']
data.loc[:, selected_columns].corr()
View Code

 

posted @ 2018-06-06 17:44  秋雨秋雨秋雨  阅读(152)  评论(0)  编辑  收藏  举报