R语言学习笔记（五）

总结下第七章的统计分析方法，里面涉及到了很多统计专业概念。

Summary 函数
> myvars<-c("mpg","hp","wt")
> summary(mtcars[myvars])
mpg hp wt 
Min. :10.40 Min. : 52.0 Min. :1.513 
1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 
Median :19.20 Median :123.0 Median :3.325 
Mean :20.09 Mean :146.7 Mean :3.217 
3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 
Max. :33.90 Max. :335.0 Max. :5.424

 

sapply 函数，针对某一数据源的某一数据列应用特点的统计函数
mystats<-function(x,na.omit=FALSE){
+ if(na.omit){x<-x[!is.na(x)]}
+ m<-mean(x)
+ n<-length(x)
+ s<-sd(x)
+ skew<-sum((x-m)^3/s^3)/n
+ kurt<-sum((x-m)^4/s^4)/n-3
+ return(c(n=n,mean=m,stdev=s,skew=skew,kurtosis=kurt))}

> myvars<-c("mpg","hp","wt")
> sapply(mtcars[myvars],mystats)
mpg hp wt
n 32.000000 32.0000000 32.00000000
mean 20.090625 146.6875000 3.21725000
stdev 6.026948 68.5628685 0.97845744
skew 0.610655 0.7260237 0.42314646
kurtosis -0.372766 -0.1355511 -0.02271075

 

#hmisc describe 包 - 生成详细的统计概括
> library(Hmisc)

> myvars<-c("mpg","hp","wt")
> describe(mtcars[myvars])
mtcars[myvars]


3 Variables 32 Observations
-------------------------------------------------------------------------------------
mpg 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 25 0.999 20.09 6.796 12.00 14.34 15.43 
.50 .75 .90 .95 
19.20 22.80 30.09 31.30


lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
-------------------------------------------------------------------------------------
hp 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 22 0.997 146.7 77.04 63.65 66.00 96.50 
.50 .75 .90 .95 
123.00 180.00 243.50 253.55


lowest : 52 62 65 66 91, highest: 215 230 245 264 335
-------------------------------------------------------------------------------------
wt 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 29 0.999 3.217 1.089 1.736 1.956 2.581 
.50 .75 .90 .95 
3.325 3.610 4.048 5.293


lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
-------------------------------------------------------------------------------------
>

 


#pastecs desc - 和上面的describe函数功能类似

> install.packages("pastecs")

> library(pastecs)

> myvars<-c("mpg","hp","wt")
> stat.desc(mtcars[myvars])
mpg hp wt
nbr.val 32.0000000 32.0000000 32.0000000
nbr.null 0.0000000 0.0000000 0.0000000
nbr.na 0.0000000 0.0000000 0.0000000
min 10.4000000 52.0000000 1.5130000
max 33.9000000 335.0000000 5.4240000
range 23.5000000 283.0000000 3.9110000
sum 642.9000000 4694.0000000 102.9520000
median 19.2000000 123.0000000 3.3250000
mean 20.0906250 146.6875000 3.2172500
SE.mean 1.0654240 12.1203173 0.1729685
CI.mean.0.95 2.1729465 24.7195501 0.3527715
var 36.3241028 4700.8669355 0.9573790
std.dev 6.0269481 68.5628685 0.9784574
coef.var 0.2999881 0.4674077 0.3041285
> 
> 
>


> #psych describe - 描述函数

> install.packages("psych")

> library(psych)

> describe(mtcars[myvars])
vars n mean sd median trimmed mad min max range skew kurtosis
mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37
hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14
wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02
se
mpg 1.07
hp 12.12
wt 0.17
>

> #doBy summaryBy -分组统计函数，第一个输入参数是一个公式：var1+var2+var3~group1+group2(~左边的变量是待分析变量，右边的是待分组变量)

> install.packages("doBy")
> library(doBy)
> summaryBy(mpg+hp+wt~am,data=mtcars,FUN=mystats)
am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev
1 0 19 17.14737 3.833966 0.01395038 -0.8031783 19 160.2632 53.90820
2 1 13 24.39231 6.166504 0.05256118 -1.4553520 13 126.8462 84.06232
hp.skew hp.kurtosis wt.n wt.mean wt.stdev wt.skew wt.kurtosis
1 -0.01422519 -1.2096973 19 3.768895 0.7774001 0.9759294 0.1415676
2 1.35988586 0.5634635 13 2.411000 0.6169816 0.2103128 -1.1737358
>

#调用同名函数 - 和编程中的命名空间概念类似
> Hmisc::describe(mtcars[myvars])
mtcars[myvars]


3 Variables 32 Observations
-------------------------------------------------------------------------------------
mpg 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 25 0.999 20.09 6.796 12.00 14.34 15.43 
.50 .75 .90 .95 
19.20 22.80 30.09 31.30


lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
-------------------------------------------------------------------------------------
hp 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 22 0.997 146.7 77.04 63.65 66.00 96.50 
.50 .75 .90 .95 
123.00 180.00 243.50 253.55


lowest : 52 62 65 66 91, highest: 215 230 245 264 335
-------------------------------------------------------------------------------------
wt 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 29 0.999 3.217 1.089 1.736 1.956 2.581 
.50 .75 .90 .95 
3.325 3.610 4.048 5.293


lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
-------------------------------------------------------------------------------------
> 
> 
>

#分组 - 两个分组函数：aggregate和by
> mtcars$am
[1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1
> help("aggregate")
> 
> 
> aggregate(mtcars[myvars],by=list(am=mtcars$am),mean)
am mpg hp wt
1 0 17.14737 160.2632 3.768895
2 1 24.39231 126.8462 2.411000
> aggregate(mtcars[myvars],by=list(am=mtcars$am),sd)
am mpg hp wt
1 0 3.833966 53.90820 0.7774001
2 1 6.166504 84.06232 0.6169816
> 
> 
> dstats<-function(x)sapply(x,mystats)
> by(mtcars[myvars],mtcars$am,dstats)
mtcars$am: 0
mpg hp wt
n 19.00000000 19.00000000 19.0000000
mean 17.14736842 160.26315789 3.7688947
stdev 3.83396639 53.90819573 0.7774001
skew 0.01395038 -0.01422519 0.9759294
kurtosis -0.80317826 -1.20969733 0.1415676
--------------------------------------------------------------- 
mtcars$am: 1
mpg hp wt
n 13.00000000 13.0000000 13.0000000
mean 24.39230769 126.8461538 2.4110000
stdev 6.16650381 84.0623243 0.6169816
skew 0.05256118 1.3598859 0.2103128
kurtosis -1.45535200 0.5634635 -1.1737358
>

 

> #频数表 - 这东西和Excel中的Pivit Table 类似
> #table, xtabs, prop.table, margin.table, addmargins, ftable
> library(vcd)
载入需要的程辑包：grid
Warning message:
程辑包‘vcd’是用R版本3.3.3 来建造的 
> attach(Arthritis)
> mytable<-table(Improved)
> mytable
Improved
None Some Marked 
42 14 28 
> 
> #百分比形式

> prop.table(mytable)
Improved
None Some Marked 
0.5000000 0.1666667 0.3333333 
> 
> #二维列联表
> mytable<-xtabs(~Treatment+Improved, data=Arthritis)
> mytable
Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21
> 
> 
> 
> #cross table
> install.packages("gmodels")
> library(gmodels)

> CrossTable(Arthritis$Treatment,Arthritis$Improved)



Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|



Total Observations in Table: 84



| Arthritis$Improved 
Arthritis$Treatment | None | Some | Marked | Row Total | 
--------------------|-----------|-----------|-----------|-----------|
Placebo | 29 | 7 | 7 | 43 | 
| 2.616 | 0.004 | 3.752 | | 
| 0.674 | 0.163 | 0.163 | 0.512 | 
| 0.690 | 0.500 | 0.250 | | 
| 0.345 | 0.083 | 0.083 | | 
--------------------|-----------|-----------|-----------|-----------|
Treated | 13 | 7 | 21 | 41 | 
| 2.744 | 0.004 | 3.935 | | 
| 0.317 | 0.171 | 0.512 | 0.488 | 
| 0.310 | 0.500 | 0.750 | | 
| 0.155 | 0.083 | 0.250 | | 
--------------------|-----------|-----------|-----------|-----------|
Column Total | 42 | 14 | 28 | 84 | 
| 0.500 | 0.167 | 0.333 | | 
--------------------|-----------|-----------|-----------|-----------|



> 
> 
> #多维列联表
> mytable<-xtab(~Treatment+Sex+Improved,data=Arthritis) #这个公式和之前的类型，左边的是需要分析的变量，右边是需要分组的变量
Error: could not find function "xtab"
> mytable<-xtabs(~Treatment+Sex+Improved,data=Arthritis)
> mytable
, , Improved = None


Sex
Treatment Female Male
Placebo 19 10
Treated 6 7


, , Improved = Some


Sex
Treatment Female Male
Placebo 7 0
Treated 5 2


, , Improved = Marked


Sex
Treatment Female Male
Placebo 6 1
Treated 16 5


> ftable(mytable)
Improved None Some Marked
Treatment Sex 
Placebo Female 19 7 6
Male 10 0 1
Treated Female 6 5 16
Male 7 2 5
> 
> margin.table(mytable,1)
Treatment
Placebo Treated 
43 41 
> margin.table(mytable,2)
Sex
Female Male 
59 25 
> margin.table(mytable,3)
Improved
None Some Marked 
42 14 28 
> margin.table(mytable,c(1,3))
Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21
> ftable(prop.table(mytable,c(1,2)))
Improved None Some Marked
Treatment Sex 
Placebo Female 0.59375000 0.21875000 0.18750000
Male 0.90909091 0.00000000 0.09090909
Treated Female 0.22222222 0.18518519 0.59259259
Male 0.50000000 0.14285714 0.35714286
> 
> 
> ftable(addmargins(prop.table(mytable,c(1,2)),3))
Improved None Some Marked Sum
Treatment Sex 
Placebo Female 0.59375000 0.21875000 0.18750000 1.00000000
Male 0.90909091 0.00000000 0.09090909 1.00000000
Treated Female 0.22222222 0.18518519 0.59259259 1.00000000
Male 0.50000000 0.14285714 0.35714286 1.00000000
> help(addmargins)
> #addmargins 求和
> ftable(addmargins(prop.table(mytable,c(1,2)),3))*100
Improved None Some Marked Sum
Treatment Sex 
Placebo Female 59.375000 21.875000 18.750000 100.000000
Male 90.909091 0.000000 9.090909 100.000000
Treated Female 22.222222 18.518519 59.259259 100.000000
Male 50.000000 14.285714 35.714286 100.000000

#独立性检验 - 检测变量之间是否有关联
> 
> 
> #卡方独立性检验
> library(vcd)
> mytable<-xtabs(~Treatment+Improved,data=Arthritis)
> chisq.test(mytable)


Pearson's Chi-squared test


data: mytable
X-squared = 13.055, df = 2, p-value = 0.001463


> 
> 
> mytable<-xtabs(~Improved+Sex,dat=Arthritis)
> chisq.test(mytable)


Pearson's Chi-squared test


data: mytable
X-squared = 4.8407, df = 2, p-value = 0.08889


Warning message:
In chisq.test(mytable) : Chi-squared近似算法有可能不准
> 
> 
> 
> 
> 
> #fisher
> mytable<-xtabs(~Treatment+Improved,data=Arthritis)
> fisher.test(mytable)


Fisher's Exact Test for Count Data


data: mytable
p-value = 0.001393
alternative hypothesis: two.sided


> 
> 
> #mantelhaen
> mytable<-xtabs(~Treatment+Improved+Sex,data=Arthritis)
> mantelhaen.test(mytable)


Cochran-Mantel-Haenszel test


data: mytable
Cochran-Mantel-Haenszel M^2 = 14.632, df = 2, p-value = 0.0006647


> #associate
> library(vcd)
> mytable<-xtabs(~Treatment+Improved,data=Arthritis)
> associate(mytable)
Error: could not find function "associate"
> associates(mytable)
Error: could not find function "associates"
> assocstats(mytable)
X^2 df P(> X^2)
Likelihood Ratio 13.530 2 0.0011536
Pearson 13.055 2 0.0014626


Phi-Coefficient : NA 
Contingency Coeff.: 0.367 
Cramer's V : 0.394 
> 
> 
>

#相关 批量检测变量的相关性，比较有用，用来推导变量之间的依赖关系
> states<-state.x77[,1:6]
> cov(states)
Population Income Illiteracy Life Exp Murder
Population 19931683.7588 571229.7796 292.8679592 -407.8424612 5663.523714
Income 571229.7796 377573.3061 -163.7020408 280.6631837 -521.894286
Illiteracy 292.8680 -163.7020 0.3715306 -0.4815122 1.581776
Life Exp -407.8425 280.6632 -0.4815122 1.8020204 -3.869480
Murder 5663.5237 -521.8943 1.5817755 -3.8694804 13.627465
HS Grad -3551.5096 3076.7690 -3.2354694 6.3126849 -14.549616
HS Grad
Population -3551.509551
Income 3076.768980
Illiteracy -3.235469
Life Exp 6.312685
Murder -14.549616
HS Grad 65.237894
> 
> cor(states)
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.00000000 0.2082276 0.1076224 -0.06805195 0.3436428 -0.09848975
Income 0.20822756 1.0000000 -0.4370752 0.34025534 -0.2300776 0.61993232
Illiteracy 0.10762237 -0.4370752 1.0000000 -0.58847793 0.7029752 -0.65718861
Life Exp -0.06805195 0.3402553 -0.5884779 1.00000000 -0.7808458 0.58221620
Murder 0.34364275 -0.2300776 0.7029752 -0.78084575 1.0000000 -0.48797102
HS Grad -0.09848975 0.6199323 -0.6571886 0.58221620 -0.4879710 1.00000000
> 
> cor(states,method="spearman")
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.0000000 0.1246098 0.3130496 -0.1040171 0.3457401 -0.3833649
Income 0.1246098 1.0000000 -0.3145948 0.3241050 -0.2174623 0.5104809
Illiteracy 0.3130496 -0.3145948 1.0000000 -0.5553735 0.6723592 -0.6545396
Life Exp -0.1040171 0.3241050 -0.5553735 1.0000000 -0.7802406 0.5239410
Murder 0.3457401 -0.2174623 0.6723592 -0.7802406 1.0000000 -0.4367330
HS Grad -0.3833649 0.5104809 -0.6545396 0.5239410 -0.4367330 1.0000000
> 
> 
> x<- states[,c("Population","InCome","HS Grad")]
Error in states[, c("Population", "InCome", "HS Grad")] : 
subscript out of bounds
> x<- states[,c("Population","InCome","Illiteracy", "HS Grad")]
Error in states[, c("Population", "InCome", "Illiteracy", "HS Grad")] : 
subscript out of bounds
> head(states)
Population Income Illiteracy Life Exp Murder HS Grad
Alabama 3615 3624 2.1 69.05 15.1 41.3
Alaska 365 6315 1.5 69.31 11.3 66.7
Arizona 2212 4530 1.8 70.55 7.8 58.1
Arkansas 2110 3378 1.9 70.66 10.1 39.9
California 21198 5114 1.1 71.71 10.3 62.6
Colorado 2541 4884 0.7 72.06 6.8 63.9
> x<-states[,1]
> x
Alabama Alaska Arizona Arkansas California 
3615 365 2212 2110 21198 
Colorado Connecticut Delaware Florida Georgia 
2541 3100 579 8277 4931 
Hawaii Idaho Illinois Indiana Iowa 
868 813 11197 5313 2861 
Kansas Kentucky Louisiana Maine Maryland 
2280 3387 3806 1058 4122 
Massachusetts Michigan Minnesota Mississippi Missouri 
5814 9111 3921 2341 4767 
Montana Nebraska Nevada New Hampshire New Jersey 
746 1544 590 812 7333 
New Mexico New York North Carolina North Dakota Ohio 
1144 18076 5441 637 10735 
Oklahoma Oregon Pennsylvania Rhode Island South Carolina 
2715 2284 11860 931 2816 
South Dakota Tennessee Texas Utah Vermont 
681 4173 12237 1203 472 
Virginia Washington West Virginia Wisconsin Wyoming 
4981 3559 1799 4589 376 
> x<-states[,c("Population","Income","Illiteracy","HS Grad")]
> y<-states[,c("Life Exp","Murder")]
> cor(x,y)
Life Exp Murder
Population -0.06805195 0.3436428
Income 0.34025534 -0.2300776
Illiteracy -0.58847793 0.7029752
HS Grad 0.58221620 -0.4879710
> 
>

 


> 
> #偏相关
> install.packages("ggm")


程辑包‘ggm’是用R版本3.3.3 来建造的 
> colnames(states)
[1] "Population" "Income" "Illiteracy" "Life Exp" "Murder" "HS Grad" 
> pcor(c(1,5,2,3,6),cov(states))
[1] 0.3462724
> 
> 
> #相关性的显著性验证
> cor.test(states[,3],states[,5])


Pearson's product-moment correlation


data: states[, 3] and states[, 5]
t = 6.8479, df = 48, p-value = 1.258e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5279280 0.8207295
sample estimates:
cor 
0.7029752


> 
> 
> #多变量验证
> library(psych)
> corr.test(states,use="complete")
Call:corr.test(x = states, use = "complete")
Correlation matrix 
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.00 0.21 0.11 -0.07 0.34 -0.10
Income 0.21 1.00 -0.44 0.34 -0.23 0.62
Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66
Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58
Murder 0.34 -0.23 0.70 -0.78 1.00 -0.49
HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00
Sample Size 
[1] 50
Probability values (Entries above the diagonal are adjusted for multiple tests.) 
Population Income Illiteracy Life Exp Murder HS Grad
Population 0.00 0.59 1.00 1.0 0.10 1
Income 0.15 0.00 0.01 0.1 0.54 0
Illiteracy 0.46 0.00 0.00 0.0 0.00 0
Life Exp 0.64 0.02 0.00 0.0 0.00 0
Murder 0.01 0.11 0.00 0.0 0.00 0
HS Grad 0.50 0.00 0.00 0.0 0.00 0


To see confidence intervals of the correlations, print with the short=FALSE option
> 
> 
> 
> 
> #T检验
> #测试在美国南部和北部犯罪后被关进监狱的概率
> library(MASS)
> t.test(Prob~So, data=UScrime)


Welch Two Sample t-test


data: Prob by So
t = -3.8954, df = 24.925, p-value = 0.0006506
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.03852569 -0.01187439
sample estimates:
mean in group 0 mean in group 1 
0.03851265 0.06371269


> 
> library(MASS)
> sapply(UCcrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))})
Error in lapply(X = X, FUN = FUN, ...) : object 'UCcrime' not found
> sapply(UScrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))})
Show Traceback

Rerun with Debug
Error in is.data.frame(x) : object 'X' not found > attach(UScrime)
The following object is masked _by_ .GlobalEnv:


y


> sapply(UScrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))})
Show Traceback

Rerun with Debug
Error in is.data.frame(x) : object 'X' not found > head(UScrime)
M So Ed Po1 Po2 LF M.F Pop NW U1 U2 GDP Ineq Prob Time y
1 151 1 91 58 56 510 950 33 301 108 41 394 261 0.084602 26.2011 791
2 143 0 113 103 95 583 1012 13 102 96 36 557 194 0.029599 25.2999 1635
3 142 1 89 45 44 533 969 18 219 94 33 318 250 0.083401 24.3006 578
4 136 0 121 149 141 577 994 157 80 102 39 673 167 0.015801 29.9012 1969
5 141 0 121 109 101 591 985 18 30 91 20 578 174 0.041399 21.2998 1234
6 121 0 110 118 115 547 964 25 44 84 29 689 126 0.034201 20.9995 682
> sapply(UScrime[c("U1","U2")], function(x)(c(mean=mean(x),sd=sd(X)))
+ )
Show Traceback

Rerun with Debug
Error in is.data.frame(x) : object 'X' not found > 
> 
> library(MASS)
> sapply(UScrime[c("U1","U2")],function(x)(c(mean=mean(x),sd=sd(x))))
U1 U2
mean 95.46809 33.97872
sd 18.02878 8.44545
> 
> 
> with(UScrime,t.test(U1,U2,paired=TRUE))


Paired t-test


data: U1 and U2
t = 32.407, df = 46, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
57.67003 65.30870
sample estimates:
mean of the differences 
61.48936


> 
> 
> 
> #两组差异的非参数检验
> states<-data.frame(state.region,state.x77)
> kruskal.test(Illiteracy~state.region,data=states)


Kruskal-Wallis rank sum test


data: Illiteracy by state.region
Kruskal-Wallis chi-squared = 22.672, df = 3, p-value = 4.726e-05
posted @ 2017-09-25 16:05 aifans2019 阅读(3632) 评论(0) 编辑收藏举报
刷新页面返回顶部
AI Fans

Data Science门徒

R语言学习笔记（五）

公告