R 语言描述性分析

1. 数字特征

随机构造一个正态样本,研究它的数字特征

# 构造数据,随机生成一个正态分布
X <- rnorm(10,0,1)

1.1 样本均值

mean(X)

> mean(X)
[1] -0.0586382

加入截尾参数 trim,该参数默认为0

mean(X, trim = 0.1) # 去掉 X 向量排序后的 两端数据的 10%,

> mean(X, trim = 0.1) # 去掉 X 向量排序后的 两端数据的 10%,
[1] -0.1577639

# 等价于
mean(sort(X)[2:9])

> mean(sort(X)[2:9])
[1] -0.1577639

如果数据有缺失值(NA 或 NaN),可以将 na.rm 参数设为 TRUE,在计算前剔除缺失值;该参数默认为 FALSE,例如

Y <- X
Y[3] <- NaN
Y

> Y
 [1]  0.439891287 -0.004540984          NaN  0.075344112 -0.103447416        
 [6]  0.145027671 -1.860503334 -1.443864860  0.658675667 -1.029196428 

mean(Y, na.rm = T)

> mean(Y, na.rm = T)
[1] -0.3469571

Y[3] <- 0
sum(Y)/(length(Y)-1)

> sum(Y)/(length(Y)-1)
[1] -0.3469571
# 因此它的作用是直接消除缺失项,同时项数减1

1.2 样本方差

var(X)

> var(X)
[1] 1.518705

也可以用var()计算两样本的协方差

Y <- rnorm(10,0,1)
var(X,Y)

> var(X,Y)
[1] -0.5049783

# 等价于
cov(X,Y)

> cov(X,Y)
[1] -0.5049783

与 mean() 一样,也有 na.rm 参数

Y[3] <- NaN
var(Y, na.rm = T)

> var(Y, na.rm = T)
[1] 0.9002631

1.3 标准差

sd(X)

> sd(X)
[1] 1.232357

Y <- rnorm(10,0,1)
Y[3] <- NaN
sd(Y, na.rm = T)

> sd(Y, na.rm = T)
[1] 0.8890223

1.4 中位数

median(X)

> median(X)
[1] 0.03540156

median(Y, na.rm = T)

> median(Y, na.rm = T)
[1] 0.4815701

1.5 分位数

quantile(X)

> quantile(X)
         0%         25%         50%         75%        100%
-1.86050333 -0.79775917  0.03540156  0.36617538  2.53623226
# 0.25 分位数
quantile(X, probs = 0.25)

> quantile(X, probs = 0.25)
       25%
-0.7977592

# 0.75 分位数
quantile(X, probs = 0.75)

> quantile(X, probs = 0.75)
      75% 
0.3661754

# na.rm 用法
quantile(Y, na.rm = T)

> quantile(Y, na.rm = T)
        0%        25%        50%        75%       100% 
-1.3108392 -0.1681030  0.4815701  0.6309903  1.6807826

# names 用法
quantile(X, probs = 0.25, names = F)

> quantile(X, probs = 0.25, names = F)
[1] -0.7977592

1.6 极差

# 最大值
max(X)

> max(X)
[1] 2.536232

# 最小值
min(X)

> min(X)
[1] -1.860503

# 极差
max(X) - min(X)

> max(X) - min(X)
[1] 4.396736

# range 函数
range(X) # 返回最小值和最大值构成的向量

> range(X) # 返回最小值和最大值构成的向量
[1] -1.860503  2.536232

range(X)[2] - range(X)[1]

> range(X)[2] - range(X)[1]
[1] 4.396736

# 四分位极差(半极差)
quantile(X, 3/4) - quantile(X, 1/4)

> quantile(X, 3/4) - quantile(X, 1/4)
     75%
1.163935

1.7 自定义数字特征函数

# Summarise a numeric sample with common descriptive statistics.
#
# Args:
#   x:     numeric vector holding the sample.
#   na.rm: if TRUE, missing values (NA/NaN) are dropped from x before any
#          statistic is computed, so n counts only the remaining values.
#          Defaults to FALSE, which leaves x untouched.
#
# Returns:
#   A one-row data.frame with the columns
#     n      - number of values
#     max    - largest value
#     min    - smallest value
#     R      - range, max - min
#     R1     - interquartile range, 0.75 quantile minus 0.25 quantile
#     mean, median, var, sd - as computed by the base functions of those names
describe <- function(x, na.rm = FALSE){
    if (na.rm) {
        x <- x[!is.na(x)]
    }

    # `names = FALSE` is spelled out in full: the abbreviated `name =` only
    # worked through partial argument matching, and `F` can be reassigned.
    quartiles <- quantile(x, probs = c(1/4, 3/4), names = FALSE)
    R1 <- quartiles[2] - quartiles[1]

    data.frame(
        n = length(x),
        max = max(x),
        min = min(x),
        R = max(x) - min(x),
        R1 = R1,
        mean = mean(x),
        median = median(x),
        var = var(x),
        sd = sd(x)
    )
}
describe(X)

> describe(X)
   n      max       min        R       R1       mean     median      var
1 10 2.536232 -1.860503 4.396736 1.163935 -0.0586382 0.03540156 1.518705
        sd
1 1.232357

2 常用的分布

2.1 正态分布

2.1.1 概率密度函数 dnorm(x, mean = 0, sd = 1, log = FALSE)

dnorm(1, 0, 1)

> dnorm(1, 0, 1)
[1] 0.2419707

绘制正态分布概率密度函数图(标准正态分布及另外两组参数)

# Plot the densities of three normal distributions on one set of axes:
# N(0, 1) in red, N(0, 0.5^2) in blue and N(2, 0.5^2) in green.
x <- seq(-5, 5, length.out = 100)
y1 <- dnorm(x, 0, 1)
y2 <- dnorm(x, 0, 0.5)
y3 <- dnorm(x, 2, 0.5)
# The first curve sets up the axes; the others are added with lines() so the
# axes and labels are drawn once instead of being overprinted by repeated
# par(new = TRUE) / plot() calls.
plot(x, y1, type = 'l', col = 'red', xlab = 'x', ylab = 'y', ylim = c(0, 1))
lines(x, y2, col = 'blue')
lines(x, y3, col = 'green')
legend("topright",                                              # legend position: top-right corner
    legend = c("mu=0,sig=1", "mu=0,sig=0.5", "mu=2,sig=0.5"),   # legend labels, one per curve
    col = c("red", "blue", "green"),                            # key colours, in label order
    lty = 1, lwd = 2)                                           # line type and width of the keys

2.1.2 分布函数 pnorm(q, mean = 0, sd = 1, lower.tail = T)

pnorm(1, 0, 1)

> pnorm(1, 0, 1)
[1] 0.8413447

lower.tail 参数,默认为 TRUE,即计算下尾概率 \(F(x)=P(X \leq x)\);若改为 FALSE,则计算上尾概率 \(P(X>x)=1-F(x)\)

pnorm(1, 0, 1, lower.tail = FALSE)

> pnorm(1, 0, 1, lower.tail = FALSE)
[1] 0.1586553

分布函数图像

Fx <- pnorm(x, 0, 1)
plot(x, Fx, type = 'l')

2.1.3 分位函数(默认下分位点)qnorm(p, mean = 0, sd = 1, lower.tail = T)

qnorm(0.3, 0, 1)

> qnorm(0.3, 0, 1)
[1] -0.5244005

q <- qnorm(0.3, 0, 1)
pnorm(q, 0, 1)

> pnorm(q, 0, 1)
[1] 0.3

1 - pnorm(q, 0, 1, lower.tail = F)

> 1 - pnorm(q, 0, 1, lower.tail = F)
[1] 0.3

## 也可以计算上分位点
qnorm(0.3, 0, 1, lower.tail = F)

> qnorm(0.3, 0, 1, lower.tail = F)
[1] 0.5244005

2.2 \(\chi^2\) 卡方分布

2.2.1 概率密度函数 dchisq(x, df, ncp = 0)

dchisq(1, df = 5)

> dchisq(1, df = 5)
[1] 0.08065691

绘制密度函数图像

# Plot chi-square densities for 1, 4 and 10 degrees of freedom.
# The curves are computed with dchisq(); the earlier version called dt(),
# which plotted Student's t densities in the chi-square section.
x <- seq(0, 20, length.out = 100)
# Note: the df = 1 density is unbounded at x = 0 (dchisq(0, df = 1) is Inf);
# the fixed ylim below limits the displayed range to [0, 0.5].
y1 <- dchisq(x, df = 1)
y2 <- dchisq(x, df = 4)
y3 <- dchisq(x, df = 10)
# The first curve sets up the axes; the others are added with lines() so the
# axes are drawn only once.
plot(x, y1, type = 'l', col = 'red', xlab = 'x', ylab = 'y', ylim = c(0, 0.5))
lines(x, y2, col = 'blue')
lines(x, y3, col = 'green')
legend("topright",                          # legend position: top-right corner
    legend = c("df=1", "df=4", "df=10"),    # legend labels, one per curve
    col = c("red", "blue", "green"),        # key colours, in label order
    lty = 1,                                # line type of the keys
    lwd = 2                                 # line width of the keys
    )

2.2.2 分布函数 pchisq(q, df, ncp = 0, lower.tail = T)

pchisq(1, 5)

> pchisq(1, 5)
[1] 0.03743423

lower.tail 参数,默认为 TRUE,即计算下尾概率 \(F(x)=P(X\leq x)\);若改为 FALSE,则计算上尾概率 \(P(X>x)=1-F(x)\)

pchisq(1, 5, lower.tail = FALSE)

> pchisq(1, 5, lower.tail = FALSE)
[1] 0.9625658

分布函数图像

Fx <- pchisq(x, 5) 
plot(x, Fx, type = 'l')

2.2.3 分位函数 qchisq(p, df, ncp = 0, lower.tail = T)

qchisq(0.3, 5)

> qchisq(0.3, 5)
[1] 2.999908

q <- qchisq(0.3, 5)
pchisq(q, 5)

> pchisq(q, 5)
[1] 0.3

1 - pchisq(q, 5, lower.tail = F)

> 1 - pchisq(q, 5, lower.tail = F)
[1] 0.3

## 也可以计算上分位点
qchisq(0.3, 5, lower.tail = F)

> qchisq(0.3, 5, lower.tail = F)
[1] 6.06443

2.3 t分布

2.3.1 概率密度函数 dt(x, df, ncp = 0)

dt(1, df = 5)

> dt(1, df = 5)
[1] 0.2196798

绘制密度函数图像

# Plot Student's t densities for 1, 4 and 10 degrees of freedom, told apart
# by line type (solid, dashed, two-dash) rather than by colour.
x <- seq(-5, 5, length.out = 100)
y1 <- dt(x, df = 1)
y2 <- dt(x, df = 4)
y3 <- dt(x, df = 10)
# The first curve sets up the axes; the others are added with lines() so the
# axes and labels are drawn once instead of being overprinted by repeated
# par(new = TRUE) / plot() calls.
plot(x, y1, type = 'l', lty = 1, xlab = 'x', ylab = 'y', ylim = c(0, 0.5))
lines(x, y2, lty = 2)
lines(x, y3, lty = 6)
legend("topright",                          # legend position: top-right corner
    legend = c("df=1", "df=4", "df=10"),    # legend labels, one per curve
    # col=c("red","blue","green"),          # legend colours (unused: curves differ by line type)
    lty = c(1, 2, 6),                       # line types of the keys, in label order
    lwd = 2                                 # line width of the keys
    )

2.3.2 分布函数 pt(q, df, ncp = 0, lower.tail = T)

pt(1, 5)

> pt(1, 5)
[1] 0.8183913

lower.tail 参数,默认为 TRUE,即计算下尾概率 \(F(x)=P(X\leq x)\);若改为 FALSE,则计算上尾概率 \(P(X>x)=1-F(x)\).

pt(1, 5, lower.tail = FALSE)

> pt(1, 5, lower.tail = FALSE)
[1] 0.1816087

分布函数图像

Fx <- pt(x, 5)  
plot(x, Fx, type = 'l')

2.3.3 分位函数 qt(p, df, ncp = 0, lower.tail = T)

qt(0.3, 5)

> qt(0.3, 5)
[1] -0.5594296

q <- qt(0.3, 5)
pt(q, 5)

> pt(q, 5)
[1] 0.3

1 - pt(q, 5, lower.tail = F)

> 1 - pt(q, 5, lower.tail = F)
[1] 0.3

## 也可以计算上分位点
qt(0.3, 5, lower.tail = F)

> qt(0.3, 5, lower.tail = F)
[1] 0.5594296

2.4 F 分布

2.4.1 概率密度函数 df(x, df1, df2, ncp = 0)

df(1, df1 = 5, df2 = 1)

> df(1, df1 = 5, df2 = 1)
[1] 0.2196798

绘制密度函数图像

# Plot F densities with numerator df 4 and denominator df 1, 4 and 4000,
# told apart by line type (solid, dashed, two-dash).
x <- seq(0, 5, length.out = 100)
y1 <- df(x, df1 = 4, df2 = 1)
y2 <- df(x, df1 = 4, df2 = 4)
y3 <- df(x, df1 = 4, df2 = 4000)
# The first curve sets up the axes; the others are added with lines() so the
# axes and labels are drawn once instead of being overprinted by repeated
# par(new = TRUE) / plot() calls.
plot(x, y1, type = 'l', lty = 1, lwd = 3, xlab = 'x', ylab = 'y', ylim = c(0, 0.8))
lines(x, y2, lty = 2, lwd = 3)
lines(x, y3, lty = 6, lwd = 3)
legend("topright",                                  # legend position: top-right corner
    legend = c("F(4,1)", "F(4,4)", "F(4,4000)"),    # legend labels, one per curve
    # col=c("red","blue","green"),                  # legend colours (unused: curves differ by line type)
    lty = c(1, 2, 6),                               # line types of the keys, in label order
    lwd = 3                                         # line width of the keys, same as the curves
    )

2.4.2 分布函数 pf(q, df1, df2, ncp = 0, lower.tail = T)

pf(1, 5, 5)

> pf(1, 5, 5)
[1] 0.5

lower.tail 参数,默认为 TRUE,即计算下尾概率 \(F(x)=P(X\leq x)\);若改为 FALSE,则计算上尾概率 \(P(X>x)=1-F(x)\).

pf(1, 5, 5, lower.tail = FALSE)

> pf(1, 5, 5, lower.tail = FALSE)
[1] 0.5

分布函数图像

Fx <- pf(x, 4, 4) 
plot(x, Fx, type = 'l')

2.4.3 分位函数 qf(p, df1, df2, ncp = 0, lower.tail = T)

qf(0.3, 5, 5)

> qf(0.3, 5, 5)
[1] 0.6093936

q <- qf(0.3, 5, 5)
pf(q, 5, 5)

> pf(q, 5, 5)
[1] 0.3

1 - pf(q, 5, 5, lower.tail = F)

> 1 - pf(q, 5, 5, lower.tail = F)
[1] 0.3

## 也可以计算上分位点
qf(0.3, 5, 5, lower.tail = F)

> qf(0.3, 5, 5, lower.tail = F)
[1] 1.640976

3 数据的图形描述

3.1 直方图

# 考试成绩数据
# Exam score data (29 scores)
x <- c(25, 45, 50, 54, 55,
       61, 64, 68, 72, 75,
       78, 79, 81, 83, 84,
       84, 84, 85, 86, 86,
       86, 87, 89, 89, 90,
       91, 91, 92, 100)
# Frequency histogram; labels = TRUE (spelled out rather than the
# reassignable shorthand T) prints each bar's count on top of the bar.
hist(x, col = 'lightblue', border = 'red', labels = TRUE, ylim = c(0, 14.5))
# Density-scaled histogram (freq = FALSE) with shaded bars; the returned
# object is kept so the bin midpoints and counts can be reused below.
r <- hist(x, freq = FALSE, density = 10, angle = 15 + 30 * 1:6)
# Write each bin's count just above the x axis, centred on the bin midpoint.
text(r$mids, 0, r$counts, adj = c(.5, -.5), cex = 1.2)
# Blue: kernel density estimate of the data.
lines(density(x), col = 'blue', lwd = 2)
# Red: normal density with the sample mean and standard deviation.
y <- seq(from = 20, to = 100, by = 1)
lines(y, dnorm(y, mean(x), sd(x)), col = 'red', lwd = 2)

图中红线为正态分布曲线,蓝线为数据的密度曲线,从图像可以看出考试成绩数据不服从正态分布.

3.2 QQ图

qqnorm(x)
qqline(x)

从QQ图可以看出,散点与直线的偏差过多,因此认为考试成绩数据不服从正态分布.

posted @ 2022-09-29 17:53  只会加减乘除  阅读(325)  评论(0编辑  收藏  举报