R语言学习笔记(三)
第四,五章学习总结,这两章主要讲解了R语言的系统函数,例如怎么把缺失值剔除,数据类型的识别和转换,数据集的合并,怎么使用SQL库,函数以及R程序的顺序,选择和循环的实现方式。 缺失值 > manager <- c(1,2,3,4,5) > date<-c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09") > country<-c("US","US","UK","UK","UK") > gender<-c("M","F","F","M","F") > age<-c(32,45,25,39,99) > q1<-(5,3,3,3,2) > q2<-c(4,5,5,3,2) > q3<-c(,2,5,4,1) > q3<-c(5,2,5,4,1) > q4<-c(5,5,5,NA,2) > q5<-c(5,5,2,NA,1) leadership<-data.frame(manager,date,country,gender,age,q1,q2,q3,q4,q5,stringsAsFactors=FALSE) > leadership manager date country gender age q1 q2 q3 q4 q5 1 1 10/24/08 US M 32 5 4 5 5 5 2 2 10/28/08 US F 45 3 5 2 5 5 3 3 10/1/08 UK F 25 3 5 5 5 2 4 4 10/12/08 UK M 39 3 3 4 NA NA 5 5 5/1/09 UK F 99 2 2 1 2 1 运算符号 > #+,-,*,/ > #^ ** 求幂 > # %% 求余 > # %/% 求除数 逻辑运算符 > # <, <=,>, >=, ==, !=, !x, x|y, x&y, isTRUE(x) 矩阵计算 > mydata<-data.frame(x1=c(2,2,6,4),x2=c(3,4,2,8)) > mydata$sumx NULL > mydata$sumx<-mydata$x1+mydata$x2 > mydata$meanx<-(mydata$x1+mydata$x2)/2 > mydata$sumx [1] 5 6 8 12 > mydata$meanx [1] 2.5 3.0 4.0 6.0 扩展数据集 mydata<-transform(mydata,sumx=x1+x2,meanx=(x1+x2)/2) > mydata x1 x2 sumx meanx 1 2 3 5 2.5 2 2 4 6 3.0 3 6 2 8 4.0 4 4 8 12 6.0 列选择 > leadership$age[leadership$age ==99]<-NA > leadership$agecat NULL > leadership$agecat[leadership$age >75] <- "Elder" > leadership$agecat[leadership$age >=55 & leadership$age<=75] <- "Middle Aged" > leadership$agecat[leadership$age <55] <- "Young" > leadership <- within(leadership,{agecat <- NA}) > > leadership manager date country gender age q1 q2 q3 q4 q5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 NA 2 2 10/28/08 US F 45 3 5 2 5 5 NA 3 3 10/1/08 UK F 25 3 5 5 5 2 NA 4 4 10/12/08 UK M 39 3 3 4 NA NA NA 5 5 5/1/09 UK F NA 2 2 1 2 1 NA > leadership$agecat[leadership$age >75] <- "Elder" > leadership$agecat[leadership$age >=55 & leadership$age<=75] <- "Middle Aged" > leadership$agecat[leadership$age <55] <- "Young" > leadership manager date country gender age q1 q2 q3 q4 q5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> > leadership<-within(leadership,{}) > > leadership<-within(leadership,{}) > leadership<-within(leadership,{}) > leadership<-within(leadership,{}) > leadership<-within(leadership,{}) > leadership<-within(leadership,{ + agecat<-NA + agecat[age>75]<-"Elder" + agecat[age>=55 & age<=75] <-"Middle Aged" + agecat[age<55]<-"Young"}) > leadership manager date country gender age q1 q2 q3 q4 q5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> 重命名列 > fix(leadership) names(leadership)[2] <- "testDate" > leadership managerID testDate country gender age q1 q2 q3 q4 q5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> > names(leadership)[6:10] <- c("item1","item2","item3","item4","item5") > leadership managerID testDate country gender age item1 item2 item3 item4 item5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> 判断空值 is.na(leadership) managerID testDate country gender age item1 item2 item3 item4 item5 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 4 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE 5 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE agecat 1 FALSE 2 FALSE 3 FALSE 4 FALSE 5 TRUE 排除缺失值 > x<-c(1,2,NA,3) > y<-x[1]+x[2]+x[3]+x[4] > z<-sum(x) > z [1] NA > > y [1] NA > k<-sum(x,na.rm=TRUE) > k [1] 6 > > leadership managerID testDate country gender age item1 item2 item3 item4 item5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> > newda<-na.omit(leadership) > newda managerID testDate country gender age item1 item2 item3 item4 item5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young > 日期转换 > #%d 数字日期(1~31) > #%a 缩写星期名 > #%A 非缩写星期名 > #%m 月份(00~12) > #%b 缩写月份 e.g. Jan > #%B 非缩写月份 e.g. January > #%y 两位数的年份 07 > #%Y 四位数的年份 2007 > > mydates<-as.Date(c("2007-06-22","2004-02-13")) > mydates [1] "2007-06-22" "2004-02-13" > strDates<-c("01/05/1965","08/16/1975") > dates<-as.Date(strDates,"%m/%d/%Y") > dates [1] "1965-01-05" "1975-08-16" > > myFormat<-"%m/%d/%y" > leadership$date<-as.Date(leadership$date,myformat) > leadership managerID testDate country gender age item1 item2 item3 item4 item5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> > fix(leadership) > leadership managerID Date country gender age item1 item2 item3 item4 item5 agecat 1 1 10/24/08 US M 32 5 4 5 5 5 Young 2 2 10/28/08 US F 45 3 5 2 5 5 Young 3 3 10/1/08 UK F 25 3 5 5 5 2 Young 4 4 10/12/08 UK M 39 3 3 4 NA NA Young 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA> > leadership$date<-as.Date(leadership$date,myformat) > leadership$Date<-as.Date(leadership$Date,myFormat) > leadership managerID Date country gender age item1 item2 item3 item4 item5 1 1 2008-10-24 US M 32 5 4 5 5 5 2 2 2008-10-28 US F 45 3 5 2 5 5 3 3 2008-10-01 UK F 25 3 5 5 5 2 4 4 2008-10-12 UK M 39 3 3 4 NA NA 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > Sys.Date() [1] "2017-04-13" > date() [1] "Thu Apr 13 22:19:48 2017" > > > today<-Sys.Date() > format(today,format="%B %d %Y") [1] "四月 13 2017" > > startDate<-as.Date("2004-02-12") > endDate<-as.Date("2011-01-22") > days<-endDate-startDate > days Time difference of 2536 days > > > today<-Sys.Date() > dob<-as.Date("1987-01-03") 日期数据运算 > difftime(today,dob,units="days") Time difference of 11058 days > difftime(today,dob,units="weeks") Time difference of 1579.714 weeks 日期转字符 > strDate<-as.character(Sys.Date()) > strDate [1] "2017-04-13" > 类型转换 > #is.numeric() as.numeric() > #is.character() as.character() > #is.vector() as.vector() > #is.matrix() as.matrix() > #is.data.frame() as.data.frame() > #is.factor() as.factor() > #is.logical() as.logical() > > > > is.numeric(1) [1] TRUE > is.numeric("1") [1] FALSE > is.vector("1") [1] TRUE > is.vector(c(1,2,3)) [1] TRUE > is.factor("a") [1] FALSE > is.logical(TRUE) [1] TRUE > is.logical("TRUE") [1] FALSE 数据排序 > newdata<-leadership[order(leadership$age),] > leadership managerID Date country gender age item1 item2 item3 item4 item5 1 1 2008-10-24 US M 32 5 4 5 5 5 2 2 2008-10-28 US F 45 3 5 2 5 5 3 3 2008-10-01 UK F 25 3 5 5 5 2 4 4 2008-10-12 UK M 39 3 3 4 NA NA 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > newdata managerID Date country gender age item1 item2 item3 item4 item5 3 3 2008-10-01 UK F 25 3 5 5 5 2 1 1 2008-10-24 US M 32 5 4 5 5 5 4 4 2008-10-12 UK M 39 3 3 4 NA NA 2 2 2008-10-28 US F 45 3 5 2 5 5 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 3 Young 1 Young 4 Young 2 Young 5 <NA> > newdata<-leadership[order(leadership$age,decreasing=TRUE),] #这里有个逗号 > newdata managerID Date country gender age item1 item2 item3 item4 item5 2 2 2008-10-28 US F 45 3 5 2 5 5 4 4 2008-10-12 UK M 39 3 3 4 NA NA 1 1 2008-10-24 US M 32 5 4 5 5 5 3 3 2008-10-01 UK F 25 3 5 5 5 2 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 2 Young 4 Young 1 Young 3 Young 5 <NA> > 数据集合并 > total<-merge(dataframeA,dataframeB,by="ID") > #横向合并 merge() > #纵向合并 rbind() 排除选择 > myvars<-names(leadership) %in% c("q3","q4") > newdata<-leadership[!myvars] > newdata managerID Date country gender age item1 item2 item3 item4 item5 1 1 2008-10-24 US M 32 5 4 5 5 5 2 2 2008-10-28 US F 45 3 5 2 5 5 3 3 2008-10-01 UK F 25 3 5 5 5 2 4 4 2008-10-12 UK M 39 3 3 4 NA NA 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> > myvars [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE > myvars<-names(leadership) %in% c("item3","item4") > newdata<-leadership[!myvars] > newdata managerID Date country gender age item1 item2 item5 agecat 1 1 2008-10-24 US M 32 5 4 5 Young 2 2 2008-10-28 US F 45 3 5 5 Young 3 3 2008-10-01 UK F 25 3 5 2 Young 4 4 2008-10-12 UK M 39 3 3 NA Young 5 5 2009-05-01 UK F NA 2 2 1 <NA> > newdata managerID Date country gender age item1 item2 item5 agecat 1 1 2008-10-24 US M 32 5 4 5 Young 2 2 2008-10-28 US F 45 3 5 5 Young 3 3 2008-10-01 UK F 25 3 5 2 Young 4 4 2008-10-12 UK M 39 3 3 NA Young 5 5 2009-05-01 UK F NA 2 2 1 <NA> > newdata managerID Date country gender age item1 item2 item5 1 1 2008-10-24 US M 32 5 4 5 2 2 2008-10-28 US F 45 3 5 5 3 3 2008-10-01 UK F 25 3 5 2 4 4 2008-10-12 UK M 39 3 3 NA 5 5 2009-05-01 UK F NA 2 2 1 > > subset -- 类似SQL中的Select > newdata<-subset(leadership,age>=35 |age<24,select=c(item1,item2,item3,item4)) > newdata item1 item2 item3 item4 2 3 5 2 5 4 3 3 4 NA > newdata<-subset(leadership,age>=35 |age<24,select=gender:item4) > newdata gender age item1 item2 item3 item4 2 F 45 3 5 2 5 4 M 39 3 3 4 NA > 数据抽样 > mysample<-leadership[sample(1:nrow(leadership),3,replace=FALSE),] > mysample managerID Date country gender age item1 item2 item3 item4 item5 1 1 2008-10-24 US M 32 5 4 5 5 5 2 2 2008-10-28 US F 45 3 5 2 5 5 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 1 Young 2 Young 5 <NA> > leadership managerID Date country gender age item1 item2 item3 item4 item5 1 1 2008-10-24 US M 32 5 4 5 5 5 2 2 2008-10-28 US F 45 3 5 2 5 5 3 3 2008-10-01 UK F 25 3 5 5 5 2 4 4 2008-10-12 UK M 39 3 3 4 NA NA 5 5 2009-05-01 UK F NA 2 2 1 2 1 agecat 1 Young 2 Young 3 Young 4 Young 5 <NA> 使用SQL > install.packages("sqldf") library("sqldf") dataset<-sqldf("select * from leadership") Loading required package: tcltk Warning message: Quoted identifiers should have class SQL, use DBI::SQL() if the caller performs the quoting. > dataset<-sqldf("select * from leadership",row.names=TRUE) > dataset managerID Date country gender age item1 item2 item3 item4 item5 agecat 1 1 2008-10-24 US M 32 5 4 5 5 5 Young 2 2 2008-10-28 US F 45 3 5 2 5 5 Young 3 3 2008-10-01 UK F 25 3 5 5 5 2 Young 4 4 2008-10-12 UK M 39 3 3 4 NA NA Young 5 5 2009-05-01 UK F NA 2 2 1 2 1 <NA> > dataset<-sqldf("select managerID,Date,country,item1+item2+item3+item4 from leadership",row.names=TRUE) > dataset managerID Date country item1+item2+item3+item4 1 1 2008-10-24 US 19 2 2 2008-10-28 US 15 3 3 2008-10-01 UK 18 4 4 2008-10-12 UK NA 5 5 2009-05-01 UK 7 > dataset<-sqldf("select managerID,Date,country,item1+item2+item3+item4 'ab' from leadership",row.names=TRUE) > dataset managerID Date country ab 1 1 2008-10-24 US 19 2 2 2008-10-28 US 15 3 3 2008-10-01 UK 18 4 4 2008-10-12 UK NA 5 5 2009-05-01 UK 7 > dataset<-sqldf("select managerID,Date,country,item1+item2+item3+item4 'composed field' from leadership",row.names=TRUE) > dataset managerID Date country composed field 1 1 2008-10-24 US 19 2 2 2008-10-28 US 15 3 3 2008-10-01 UK 18 4 4 2008-10-12 UK NA 5 5 2009-05-01 UK 7 服从正态分布的伪函数 --用来生成测试数据比较好 > runif(5) [1] 0.8085362 0.4827602 0.4272967 0.8801272 0.2206980 > set.seed(1234) > runif(5) [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154 > set.seed(1234) > runif(5) [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154 > runif(6) [1] 0.640310605 0.009495756 0.232550506 0.666083758 0.514251141 0.693591292 多元正态分布函数 > library(MASS) > options(digits=3) > set.seed(1234) > mean<-c(230.7, 146,7, 3.6) > sigma<-matrix(c(15360.8,6721.2, -47.1, 6721.2, 4700.9, -16.5,-47.1,-16.5,0.3),nrow=3,ncol=3) > sigma [,1] [,2] [,3] [1,] 15360.8 6721.2 -47.1 [2,] 6721.2 4700.9 -16.5 [3,] -47.1 -16.5 0.3 矩阵操作 > a<-5 > sqrt(a) [1] 2.24 > b<-c(1.243,5.654,2.99) > round(b) [1] 1 6 3 > c<-matrix(runif(12),nrow=3) > c [,1] [,2] [,3] [,4] [1,] 0.694 0.923 0.286 0.232 [2,] 0.545 0.292 0.267 0.317 [3,] 0.283 0.837 0.187 0.303 > log(c) [,1] [,2] [,3] [,4] [1,] -0.366 -0.0797 -1.25 -1.46 [2,] -0.607 -1.2299 -1.32 -1.15 [3,] -1.263 -0.1776 -1.68 -1.20 > > #将运算运用到单列或单列 > apply(c,1,mean) [1] 0.534 0.355 0.402 > apply(c,2,mean) [1] 0.507 0.684 0.247 0.284 统计学生成绩排名 -- 下面的代码是一个比较学生排名的例子,学生的排名需要根据三门不同的学科来决定,但是不同的学科没有可比性,所有需要先将不同的学科转化为具有可比性的系数,然后在汇总系数并进行排序。 > options(digits=3) > Student<-c("John Davis","Angela Williams", "Bullwinkle Moose", "David Jones", "Janice Makhammer", "Cheryl Cushing", "reuven Ytzrhak", "Greg Knox","Joel England","Mary Rayburn") > Math<-c(502,600,412,358,495,512,410,625,573,522) > Science<-c(95,99,80,82,75,85,80,95,89,86) > English<-c(25,22,18,15,20,28,15,30,27,18) > roster<-data.frame(Student,Math,Science,English,stringAsFactors=FALSE) > z<-scale(roster[,2:4]) -- 这个函数将具体的分数转化为具有可比性的系数 > roster Student Math Science English stringAsFactors 1 John Davis 502 95 25 FALSE 2 Angela Williams 600 99 22 FALSE 3 Bullwinkle Moose 412 80 18 FALSE 4 David Jones 358 82 15 FALSE 5 Janice Makhammer 495 75 20 FALSE 6 Cheryl Cushing 512 85 28 FALSE 7 reuven Ytzrhak 410 80 15 FALSE 8 Greg Knox 625 95 30 FALSE 9 Joel England 573 89 27 FALSE 10 Mary Rayburn 522 86 18 FALSE > z Math Science English [1,] 0.0127 1.078 0.5869 [2,] 1.1434 1.591 0.0367 [3,] -1.0257 -0.847 -0.6969 [4,] -1.6487 -0.590 -1.2471 [5,] -0.0681 -1.489 -0.3301 [6,] 0.1281 -0.205 1.1370 [7,] -1.0488 -0.847 -1.2471 [8,] 1.4318 1.078 1.5038 [9,] 0.8319 0.308 0.9536 [10,] 0.2434 -0.077 -0.6969 attr(,"scaled:center") Math Science English 500.9 86.6 21.8 attr(,"scaled:scale") Math Science English 86.67 7.79 5.45 > help(scale) > roster[1,] Student Math Science English stringAsFactors 1 John Davis 502 95 25 FALSE > roster[1,2:4] Math Science English 1 502 95 25 > score<-apply(z,1,mean) # 1是按行,2是按列, > score [1] 0.559 0.924 -0.857 -1.162 -0.629 0.353 -1.048 1.338 0.698 -0.177 > roster<-cbind(roster,score) > roster Student Math Science English stringAsFactors score 1 John Davis 502 95 25 FALSE 0.559 2 Angela Williams 600 99 22 FALSE 0.924 3 Bullwinkle Moose 412 80 18 FALSE -0.857 4 David Jones 358 82 15 FALSE -1.162 5 Janice Makhammer 495 75 20 FALSE -0.629 6 Cheryl Cushing 512 85 28 FALSE 0.353 7 reuven Ytzrhak 410 80 15 FALSE -1.048 8 Greg Knox 625 95 30 FALSE 1.338 9 Joel England 573 89 27 FALSE 0.698 10 Mary Rayburn 522 86 18 FALSE -0.177 > y<-quantile(score, c(.8,.6,.4,.2) + ) > y 80% 60% 40% 20% 0.743 0.436 -0.358 -0.895 > roster$grade[score>y[1]]<-"A" > roster$grade[score<y[1] & score>=y[2]]<-"B" > roster$grade[score<y[2] & score>=y[3]]<="C" [1] NA NA > roster$grade[score<y[2] & score>=y[3]]<-"C" > roster$grade[score<y[3] & score>=y[4]]<-"D" > roster$grade[score<y[4]]<-"E" > roster Student Math Science English stringAsFactors score grade 1 John Davis 502 95 25 FALSE 0.559 B 2 Angela Williams 600 99 22 FALSE 0.924 A 3 Bullwinkle Moose 412 80 18 FALSE -0.857 D 4 David Jones 358 82 15 FALSE -1.162 E 5 Janice Makhammer 495 75 20 FALSE -0.629 D 6 Cheryl Cushing 512 85 28 FALSE 0.353 C 7 reuven Ytzrhak 410 80 15 FALSE -1.048 E 8 Greg Knox 625 95 30 FALSE 1.338 A 9 Joel England 573 89 27 FALSE 0.698 B 10 Mary Rayburn 522 86 18 FALSE -0.177 C > roster$grade[score<y[4]]<-"F" > roster Student Math Science English stringAsFactors score grade 1 John Davis 502 95 25 FALSE 0.559 B 2 Angela Williams 600 99 22 FALSE 0.924 A 3 Bullwinkle Moose 412 80 18 FALSE -0.857 D 4 David Jones 358 82 15 FALSE -1.162 F 5 Janice Makhammer 495 75 20 FALSE -0.629 D 6 Cheryl Cushing 512 85 28 FALSE 0.353 C 7 reuven Ytzrhak 410 80 15 FALSE -1.048 F 8 Greg Knox 625 95 30 FALSE 1.338 A 9 Joel England 573 89 27 FALSE 0.698 B 10 Mary Rayburn 522 86 18 FALSE -0.177 C > > > roster$Student [1] John Davis Angela Williams Bullwinkle Moose David Jones Janice Makhammer [6] Cheryl Cushing reuven Ytzrhak Greg Knox Joel England Mary Rayburn 10 Levels: Angela Williams Bullwinkle Moose Cheryl Cushing David Jones ... reuven Ytzrhak > options(digits=2) > roster Student Math Science English stringAsFactors score grade 1 John Davis 502 95 25 FALSE 0.56 B 2 Angela Williams 600 99 22 FALSE 0.92 A 3 Bullwinkle Moose 412 80 18 FALSE -0.86 D 4 David Jones 358 82 15 FALSE -1.16 F 5 Janice Makhammer 495 75 20 FALSE -0.63 D 6 Cheryl Cushing 512 85 28 FALSE 0.35 C 7 reuven Ytzrhak 410 80 15 FALSE -1.05 F 8 Greg Knox 625 95 30 FALSE 1.34 A 9 Joel England 573 89 27 FALSE 0.70 B 10 Mary Rayburn 522 86 18 FALSE -0.18 C > roster[order(grade),] R语言控制流 > #for 循环 > for(i in 1:10) print("hello") [1] "hello" [1] "hello" [1] "hello" [1] "hello" [1] "hello" [1] "hello" [1] "hello" [1] "hello" [1] "hello" [1] "hello" > for(i in 1:10) print(as.character(i)) [1] "1" [1] "2" [1] "3" [1] "4" [1] "5" [1] "6" [1] "7" [1] "8" [1] "9" [1] "10" > for(i in 1:10) print(as.character(i))> > > roster[order(roster$grade),] Student Math Science English stringAsFactors score grade 2 Angela Williams 600 99 22 FALSE 0.92 A 8 Greg Knox 625 95 30 FALSE 1.34 A 1 John Davis 502 95 25 FALSE 0.56 B 9 Joel England 573 89 27 FALSE 0.70 B 6 Cheryl Cushing 512 85 28 FALSE 0.35 C 10 Mary Rayburn 522 86 18 FALSE -0.18 C 3 Bullwinkle Moose 412 80 18 FALSE -0.86 D 5 Janice Makhammer 495 75 20 FALSE -0.63 D 4 David Jones 358 82 15 FALSE -1.16 F 7 reuven Ytzrhak 410 80 15 FALSE -1.05 F > > > #while 循环 > i<-10 > while(i>0){ + print("Hello"); + i<-i-1; + } [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" [1] "Hello" > > > #条件 > if(is.character(roster$grade)) roster$grade<-as.factor(roster$grade) > if(!is.factor(roster$grade)) grade<- as.factor(grade) else print("Grade already is a factor") [1] "Grade already is a factor" > > ifelse(roster$score>0.5,print("Passed"),print("Failed")) [1] "Passed" [1] "Failed" [1] "Passed" "Passed" "Failed" "Failed" "Failed" "Failed" "Failed" "Passed" "Passed" "Failed" > > > feelings<-c("sad","afraid") > for (i in feelings) + print( switch(i, + happy="I am glad you are happy", + afraid="There is nothing to fear", + sad="Cheer up", + angry="Calm down now") + ) [1] "Cheer up" [1] "There is nothing to fear" 自定义函数 > > > mystats<-function(x,parametric=TRUE,print=FALSE){ #为形参设置默认值 + if(parametric){ + center<-mean(x);spreed<-sd(x) + }else{ + center<-median(x);spread<-mad(x)} + if(print $ parametric){ + cat("Mean=", center, "\n", "SD=", spread, "\n") + }else if(print & !parametric){ + cat("Median=",center,"\n","MAD=",spread,"\n") + } + result<-list(center=center,spread=spread) + return(result) + } > mystats function(x,parametric=TRUE,print=FALSE){ if(parametric){ center<-mean(x);spreed<-sd(x) }else{ center<-median(x);spread<-mad(x)} if(print $ parametric){ cat("Mean=", center, "\n", "SD=", spread, "\n") }else if(print & !parametric){ cat("Median=",center,"\n","MAD=",spread,"\n") } result<-list(center=center,spread=spread) return(result) } > set.seed(1234) > x<-rnorm(500) > mystats> > > function(x,parametric=TRUE,print=FALSE){ + if(parametric){ + center<-mean(x);spreed<-sd(x) + }else{ + center<-median(x);spread<-mad(x)} + if(print $ parametric){ + cat("Mean=", center, "\n", "SD=", spread, "\n") + }else if(print & !parametric){ + cat("Median=",center,"\n","MAD=",spread,"\n") + } + result<-list(center=center,spread=spread) + return(result) + } function(x,parametric=TRUE,print=FALSE){ if(parametric){ center<-mean(x);spreed<-sd(x) }else{ center<-median(x);spread<-mad(x)} if(print $ parametric){ cat("Mean=", center, "\n", "SD=", spread, "\n") }else if(print & !parametric){ cat("Median=",center,"\n","MAD=",spread,"\n") } result<-list(center=center,spread=spread) return(result) } > > mystats<-function(x,parametric=TRUE,print=FALSE){ + if(parametric){ + center<-mean(x);spreed<-sd(x) + }else{ + center<-median(x);spread<-mad(x)} + if(print & parametric){ + cat("Mean=", center, "\n", "SD=", spread, "\n") + }else if(print & !parametric){ + cat("Median=",center,"\n","MAD=",spread,"\n") + } + result<-list(center=center,spread=spread) + return(result) + } > mystats function(x,parametric=TRUE,print=FALSE){ if(parametric){ center<-mean(x);spreed<-sd(x) }else{ center<-median(x);spread<-mad(x)} if(print & parametric){ cat("Mean=", center, "\n", "SD=", spread, "\n") }else if(print & !parametric){ cat("Median=",center,"\n","MAD=",spread,"\n") } result<-list(center=center,spread=spread) return(result) } > mystats<-function(x,parametric=TRUE,print=FALSE){ + if(parametric){ + center<-mean(x);spread<-sd(x) + }else{ + center<-median(x);spread<-mad(x)} + if(print & parametric){ + cat("Mean=", center, "\n", "SD=", spread, "\n") + }else if(print & !parametric){ + cat("Median=",center,"\n","MAD=",spread,"\n") + } + result<-list(center=center,spread=spread) + return(result) + } > mystats(x) $center [1] 0.0018 $spread [1] 1 > x 转置 - 行转列,列转行 > cars<-mtcars[1:5,1:4] > cars mpg cyl disp hp Mazda RX4 21 6 160 110 Mazda RX4 Wag 21 6 160 110 Datsun 710 23 4 108 93 Hornet 4 Drive 21 6 258 110 Hornet Sportabout 19 8 360 175 > t(cars) Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout mpg 21 21 23 21 19 cyl 6 6 4 6 8 disp 160 160 108 258 360 hp 110 110 93 110 175 > cars mpg cyl disp hp Mazda RX4 21 6 160 110 Mazda RX4 Wag 21 6 160 110 Datsun 710 23 4 108 93 Hornet 4 Drive 21 6 258 110 Hornet Sportabout 19 8 360 175 > cars2<-t(cars) > cars2 Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout mpg 21 21 23 21 19 cyl 6 6 4 6 8 disp 160 160 108 258 360 hp 110 110 93 110 175 aggregate -- 分组计算 aggdata<-aggregate(mtcars,by=list(cyl,gear),FUN=mean,na.rm=TRUE) > aggdata Group.1 Group.2 mpg cyl disp hp drat wt qsec vs am gear carb 1 4 3 21.5 4 120 97 3.70 2.46 20.0 1.0 0.00 3 1.00 2 6 3 19.8 6 242 108 2.92 3.34 19.8 1.0 0.00 3 1.00 3 8 3 15.1 8 358 194 3.12 4.10 17.1 0.0 0.00 3 3.08 4 4 4 26.9 4 103 76 4.11 2.38 19.6 1.0 0.75 4 1.50 5 6 4 19.8 6 164 116 3.91 3.09 17.7 0.5 0.50 4 4.00 6 4 5 28.2 4 108 102 4.10 1.83 16.8 0.5 1.00 5 2.00 7 6 5 19.7 6 145 175 3.62 2.77 15.5 0.0 1.00 5 6.00 8 8 5 15.4 8 326 300 3.88 3.37 14.6 0.0 1.00 5 6.00 融合 & 重铸 -- 融合:将包含多个变量的数据打散为只具有唯一变量的数据集合,重铸:将打散后的数据安装要求组合为包含多个变量的数据集合 #融合 install.packages("reshape2") library(reshape2) > ID<-c("1","1","2","2") > Time<-c("1","2","1","2") > X1<-c("5","3","6","2") > X2<-c("6","5","1","4") > mydata<-data.frame(ID,Time,X1,X2) > mydata ID Time X1 X2 1 1 1 5 6 2 1 2 3 5 3 2 1 6 1 4 2 2 2 4 > md<-melt(mydata,id=c("ID","Time")) Warning message: attributes are not identical across measure variables; they will be dropped > md ID Time variable value 1 1 1 X1 5 2 1 2 X1 3 3 2 1 X1 6 4 2 2 X1 2 5 1 1 X2 6 6 1 2 X2 5 7 2 1 X2 1 8 2 2 X2 4 > md2<-melt(mydata,id=c("ID")) Warning message: attributes are not identical across measure variables; they will be dropped > md2 ID variable value 1 1 Time 1 2 1 Time 2 3 2 Time 1 4 2 Time 2 5 1 X1 5 6 1 X1 3 7 2 X1 6 8 2 X1 2 9 1 X2 6 10 1 X2 5 11 2 X2 1 12 2 X2 4 > > #重铸 > dcast(mydata,ID+Time~variable) Using X2 as value column: use value.var to override. Error in eval(expr, envir, enclos) : 找不到对象'variable' > dcast(md,ID+Time~variable) ID Time X1 X2 1 1 1 5 6 2 1 2 3 5 3 2 1 6 1 4 2 2 2 4 > dcast(md,ID~variable) Aggregation function missing: defaulting to length ID X1 X2 1 1 2 2 2 2 2 2 > dcast(md,ID~variable+time) Error in unique.default(x) : unique()只适用于矢量 > dcast(md,ID~variable+Time) ID X1_1 X1_2 X2_1 X2_2 1 1 5 3 6 5 2 2 6 2 1 4