R语言语法基础二
R语言语法基础二
重塑数据
增加行和列
# Build three parallel vectors, one per address field.
# NOTE: 06161 is a numeric literal, so its leading zero is dropped and the
# value is stored as 6161; quote the ZIP codes if the zero must be kept.
city <- c("Tampa", "Seattle", "Hartford", "Denver")
state <- c("FL", "WA", "CT", "CO")
zipcode <- c(33602, 98104, 06161, 80294)

# Bind the vectors column-wise. cbind() on plain vectors yields a character
# matrix (not a data frame): the numeric ZIP codes are coerced to strings.
address1 <- cbind(city, state, zipcode)
print(address1)

# Build a second batch of addresses directly as a data frame.
address2 <- data.frame(
  city = c("Lowry", "Charlotte"),
  state = c("CO", "FL"),
  zipcode = c("80230", "33949")
)
print(address2)

# Stack the two objects: rbind() combines by rows, cbind() combines by columns.
address3 <- rbind(address1, address2)
print(address3)
city state zipcode
[1,] "Tampa" "FL" "33602"
[2,] "Seattle" "WA" "98104"
[3,] "Hartford" "CT" "6161"
[4,] "Denver" "CO" "80294"
city state zipcode
1 Lowry CO 80230
2 Charlotte FL 33949
city state zipcode
1 Tampa FL 33602
2 Seattle WA 98104
3 Hartford CT 6161
4 Denver CO 80294
5 Lowry CO 80230
6 Charlotte FL 33949
合并两个dataFrame
# Join two data frames, x and y, using bp, bmi and type together as the key.
# `by` names the key columns for both sides at once, since they are identical.
# The MASS:: prefix can be dropped once library(MASS) has been called.
merged.Pima <- merge(
  x = MASS::Pima.te,
  y = MASS::Pima.tr,
  by = c("bp", "bmi", "type")
)

# Row and column counts of each input, then of the merged result.
nrow(MASS::Pima.te)
ncol(MASS::Pima.te)
nrow(MASS::Pima.tr)
ncol(MASS::Pima.tr)
nrow(merged.Pima)
ncol(merged.Pima)
[1] 332
[1] 8
[1] 200
[1] 8
[1] 10
[1] 13 # 8 + 8 - 3 = 13
分片
MASS::ships[1:5,c("type","year")]
type year
1 A 60
2 A 60
3 A 65
4 A 65
5 A 70
# head(ships)和tail(ships)查看前后6条
函数
内置函数举例
seq(5, 9, by = 0.4) #默认by为1
mean(1:5)
sum(1:5)
[1] 5.0 5.4 5.8 6.2 6.6 7.0 7.4 7.8 8.2 8.6 9.0
[1] 3
[1] 15
自定义函数
# Print the sum of `a` and `b`, then give back their product.
# `a` defaults to 2; `b` has no default and must be supplied by the caller.
# A function yields exactly one value.
myfunc <- function(a = 2, b) {
  total <- a + b
  print(total)
  a * b
}

# Only `b` is passed, so `a` falls back to its default value.
product <- myfunc(b = 3)
print(product)
[1] 5
[1] 6
字符串
在R语言中,不区分单引号和双引号,但要求成对出现
# String concatenation
a <- "Hello"
b <- "How"
c <- "are you?"
# Non-numeric objects cannot be combined with arithmetic operators,
# so strings are joined with paste().
# `sep` is the separator placed between the pieces (a single space by default).
paste(a, b, c, sep = "-")
[1] "Hello-How-are you?"
# Formatted output
format(23.123456, digits = 5) # digits: show 5 significant digits
format(3.14159, nsmall = 8) # nsmall: at least 8 digits after the decimal point
format(23.123456, scientific = TRUE) # scientific notation
format(23.123456, width = 10, justify = "right") # field width 10, right-justified
[1] "23.123"
[1] "3.14159000"
[1] "2.312346e+01"
[1] "  23.12346"
#统计字数
nchar("hello world")
[1] 11
#大小写
toupper("Hello World!")
tolower("Hello World!")
#字符串截取
substring("Extract", 5, 7)
[1] "act"
向量
向量是R语言中最基本的原子性数据对象,内部数据类型相同。即使只有一个值,也当做长度为1的向量
#索引
t = c("Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat")
t[c(1, 3, 5)]
[1] "Sun" "Tue" "Thurs"
#排序
t = c("Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat")
sort(t, decreasing = TRUE) #默认是递增
列表
列表是比向量还要高级的数据对象,可以包含不同类型的元素,如数字、字符串、向量、其他列表等,使用list函数创建
创建
# Create a list object.
# The call below passes six values, so the resulting list has six elements.
list("Red", "Green", 21:25, TRUE, 51.23, 119.1)
[[1]]
[1] "Red"
[[2]]
[1] "Green"
[[3]]
[1] 21 22 23 24 25
[[4]]
[1] TRUE
[[5]]
[1] 51.23
[[6]]
[1] 119.1
索引
# List elements can be named.
# Start with an unnamed list: a character vector, a matrix and a nested list.
list_data <- list(
  c("Jan", "Feb", "Mar"),
  matrix(1:6, nrow = 2),
  list("green", 12.3)
)

# Attach the names with names().
names(list_data) <- c("item1", "item2", "item3")
print(list_data)

# Access by position: single brackets return a one-element sub-list.
print(list_data[1])

# Access by name: extract item3, then keep its first element (still a list).
print(list_data[["item3"]][1])
添加&删除
# Append an element named "new" to the list.
list_data$new <- c("A", "B", "C")
print(list_data)
# To remove an element, assign NULL to it.
合并&把list转化成vector
#合并
l1 = list(1,2,3)
l2 = list("Sun","Mon","Tue")
c(l1,l2)
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
[[4]]
[1] "Sun"
[[5]]
[1] "Mon"
[[6]]
[1] "Tue"
#list转为向量
r = unlist(l1)
r
[1] 1 2 3
矩阵
矩阵是其中元素以二维矩形布局布置的R对象。 它们包含相同原子类型的元素。
创建
# Build a 4-row matrix from a vector. byrow = FALSE (the default) fills the
# values column by column.
# dimnames supplies the row labels and the column labels, in that order.
M <- matrix(
  1:12,
  nrow = 4,
  byrow = FALSE,
  dimnames = list(
    paste0("row", 1:4),
    paste0("col", 1:3)
  )
)
M
col1 col2 col3
row1 1 5 9
row2 2 6 10
row3 3 7 11
row4 4 8 12
索引
M[c("row1","row3"),1]
# 行列索引,跟MATLAB一样,可以使用符号索引或者数字索引
row1 row3
1 3
基本运算
# Two 2-row matrices built from six values each (filled column by column).
m1 <- matrix(c(3, 9, -1, 4, 2, -6), nrow = 2)
m2 <- matrix(c(5, 2, 0, 9, 3, 4), nrow = 2)
print(m1)
print(m2)

# The plain arithmetic operators (+, -, *, /) work element by element.
print(m1 * m2)

# %*% is the matrix product and t() is the transpose, so this is 2x3 times 3x2.
print(m1 %*% t(m2))
[,1] [,2] [,3]
[1,] 3 -1 2
[2,] 9 4 -6
[,1] [,2] [,3]
[1,] 5 0 3
[2,] 2 9 4
[,1] [,2] [,3]
[1,] 15 0 6
[2,] 18 36 -24
[,1] [,2]
[1,] 21 5
[2,] 27 30
数组
数组可以储存2维以上的R数据对象
创建
# Source values: 3 + 6 = 9 numbers in total.
v1 <- c(5, 9, 3)
v2 <- 10:15

# Labels for the three dimensions.
column.names <- c("COL1", "COL2", "COL3")
row.names <- c("ROW1", "ROW2", "ROW3")
matrix.names <- c("Matrix1", "Matrix2")

# A 3 x 3 x 2 array holds 18 cells, so the 9 supplied values are recycled
# and both slices end up with the same contents.
result <- array(
  c(v1, v2),
  dim = c(3, 3, 2),
  dimnames = list(row.names, column.names, matrix.names)
)
print(result)
, , Matrix1
COL1 COL2 COL3
ROW1 5 10 13
ROW2 9 11 14
ROW3 3 12 15
, , Matrix2
COL1 COL2 COL3
ROW1 5 10 13
ROW2 9 11 14
ROW3 3 12 15
索引
result[c("ROW1","ROW2"),c(2,3),"Matrix2"]
COL2 COL3
ROW1 10 13
ROW2 11 14
apply
# Reshape the combined values into a 2 x 3 array.
v3 <- array(c(v1, v2), dim = c(2, 3))
print(v3)
# apply() takes the array, then the margin to work along (2 means columns
# here), then the function to run, which may also be user-defined.
apply(X = v3, MARGIN = 2, FUN = sum)
[,1] [,2] [,3]
[1,] 5 3 11
[2,] 9 10 12
[1] 14 13 23
因子
因子是用于对数据进行分类并将其存储为级别的数据对象
是一种离散的数据类型
# Compass regions recorded as plain strings.
data <- c(
  "East", "West", "East", "North", "North",
  "East", "West", "West", "West", "East", "North"
)
# factor() turns the strings into categories stored as levels.
f <- factor(data)
f
[1] East West East North North East West West West East North
Levels: East North West
数据帧
统计中最重要的数据集合类型,类似于表格的形式
创建
# Build the employee data frame: one row per employee, one argument per column.
# start_date is converted to Date so it is stored as dates rather than text.
# stringsAsFactors = FALSE keeps `name` as character instead of a factor.
data <- data.frame(
  id = 1:5,
  name = c("Rick", "Dan", "Michelle", "Ryan", "Gary"),
  salary = c(623.3, 515.2, 611.0, 729.0, 843.25),
  start_date = as.Date(c(
    "2012-01-01", "2013-09-23",
    "2014-11-15", "2014-05-11",
    "2015-03-27"
  )),
  stringsAsFactors = FALSE
)
data
id name salary start_date
1 1 Rick 623.30 2012-01-01
2 2 Dan 515.20 2013-09-23
3 3 Michelle 611.00 2014-11-15
4 4 Ryan 729.00 2014-05-11
5 5 Gary 843.25 2015-03-27
str(data) #structure查看数据帧结构
'data.frame': 5 obs. of 4 variables:
$ id : int 1 2 3 4 5
$ name : chr "Rick" "Dan" "Michelle" "Ryan" ...
$ salary : num 623 515 611 729 843
$ start_date: Date, format: "2012-01-01" "2013-09-23" ...
查看摘要信息
summary(data)
id name salary start_date
Min. :1 Length:5 Min. :515.2 Min. :2012-01-01
1st Qu.:2 Class :character 1st Qu.:611.0 1st Qu.:2013-09-23
Median :3 Mode :character Median :623.3 Median :2014-05-11
Mean :3 Mean :664.4 Mean :2014-01-14
3rd Qu.:4 3rd Qu.:729.0 3rd Qu.:2014-11-15
Max. :5 Max. :843.2 Max. :2015-03-27
索引
data[1:3,c("name","salary")]
name salary
1 Rick 623.3
2 Dan 515.2
3 Michelle 611.0
扩展数据帧
# Add a column: assign one value per existing row under the new name.
data[["dept"]] <- c("IT", "Operations", "IT", "HR", "Finance")
data
id name salary start_date dept
1 1 Rick 623.30 2012-01-01 IT
2 2 Dan 515.20 2013-09-23 Operations
3 3 Michelle 611.00 2014-11-15 IT
4 4 Ryan 729.00 2014-05-11 HR
5 5 Gary 843.25 2015-03-27 Finance
# Add rows: put the new employees in a data frame with the same columns.
emp.newdata <- data.frame(
  id = 6:8,
  name = c("Rasmi", "Pranab", "Tusar"),
  salary = c(578.0, 722.5, 632.8),
  start_date = as.Date(c("2013-05-21", "2013-07-30", "2014-06-17")),
  dept = c("IT", "Operations", "Fianance"),
  stringsAsFactors = FALSE
)

# Show both inputs, then the result of stacking them row-wise.
data
emp.newdata
rbind(data, emp.newdata)
id name salary start_date dept
1 1 Rick 623.30 2012-01-01 IT
2 2 Dan 515.20 2013-09-23 Operations
3 3 Michelle 611.00 2014-11-15 IT
4 4 Ryan 729.00 2014-05-11 HR
5 5 Gary 843.25 2015-03-27 Finance
6 6 Rasmi 578.00 2013-05-21 IT
7 7 Pranab 722.50 2013-07-30 Operations
8 8 Tusar 632.80 2014-06-17 Fianance