R语言--字符串操作
字符串操作一般包括分割、拼接、替换、提取等等
拆分
strsplit
strsplit默认输出格式为列表
strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)
- x:字符串向量,向量中的每个字符串元素都会被分割
- split:用作分隔符的字符向量,即在匹配到该字串的位置进行拆分;该参数默认按正则表达式匹配;若设置fixed = TRUE,则按普通文本逐字精确匹配(不做正则解释,优先级高于perl)。用普通文本来匹配的运算速度要快些。
x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
strsplit(x, "e")
运行结果:
$`as` $qu [[3]] [[4]] [[5]]
[1] "asf" "f" [1] "qw" "rty" [1] "yuiop[" [1] "b" [1] "stuff.blah.y" "ch"
str_split
stringr包中的str_split函数与标准库中的strsplit一样
str_split(string, pattern, n = Inf, simplify = FALSE)
- string:字符串向量,向量中的每个字符串元素都会被分割
- pattern:用作分隔符的匹配模式(字符串向量),即在匹配到该模式的位置进行拆分;默认按正则表达式匹配
library(stringr)
fruits <- c( "apples and oranges and pears and bananas","pineapples and mangos and guavas")
str_split(fruits, " and ")
运行结果:
[[1]] [[2]]
[1] "apples" "oranges" "pears" "bananas" [1] "pineapples" "mangos" "guavas"
拼接
paste和paste0
paste和paste0之间的区别在于默认分隔符:paste默认sep = " "(拼接的字符之间带一个空格),paste0相当于sep = ""(拼接的字符之间没有任何分隔符)
paste (..., sep = " ", collapse = NULL)
paste0(..., collapse = NULL)
- ...:一个或者多个R对象,该对象需转换为字符向量.如果是字符串,则所有字符串拼接在一起,如果是字符串向量,则匹配。具体看实例
- sep:分隔符,即拼接时插入到各个对象之间的字符串
paste0(1:12, c("st", "nd", "rd", rep("th", 9)))
# 结果
[1] "1st" "2nd" "3rd" "4th" "5th" "6th" "7th" "8th" "9th" "10th" "11th" "12th"
paste(1:12, c("st", "nd", "rd", rep("th", 9)))
# 结果
[1] "1 st" "2 nd" "3 rd" "4 th" "5 th" "6 th" "7 th" "8 th" "9 th" "10 th" "11 th" "12 th"
paste(1:12, c("st", "nd"))
# 结果
[1] "1 st" "2 nd" "3 st" "4 nd" "5 st" "6 nd" "7 st" "8 nd" "9 st" "10 nd" "11 st" "12 nd"
paste0(1:12, c("st", "nd"))
# 结果
[1] "1st" "2nd" "3st" "4nd" "5st" "6nd" "7st" "8nd" "9st" "10nd" "11st" "12nd"
paste("I","love","you")
# 结果
[1] "I love you"
paste0("I","love","you")
# 结果
[1] "Iloveyou"
str_c
str_c(..., sep = "", collapse = NULL)
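str_c和paste0函数基本一样(默认sep = ""),但有两点区别:(1) 参数中含有NA时,str_c返回NA,而paste0会把NA转成字符串"NA";(2) 自stringr 1.5.0起,str_c采用tidyverse的循环补齐规则,只有长度为1的向量才会被循环,因此下面长度为12与长度为2的向量拼接的例子只能在旧版本stringr中运行,新版本会报错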
str_c(1:12, c("st", "nd", "rd", rep("th", 9)))
# 结果
[1] "1st" "2nd" "3rd" "4th" "5th" "6th" "7th" "8th" "9th" "10th" "11th" "12th"
str_c(1:12, c("st", "nd"))
# 结果
[1] "1st" "2nd" "3st" "4nd" "5st" "6nd" "7st" "8nd" "9st" "10nd" "11st" "12nd"
str_c("I","love","you")
# 结果
[1] "Iloveyou"
替换
chartr
chartr(old, new, x)
- x:字符串向量
- old:需要被替换的字符集合,其长度不能长于new。替换是按位置逐字符对应进行的:old中第i个字符会被替换为new中第i个字符;也就是说只会更改对应位置上的字符,而不会改变字符串的长度
- new:替换的字符/字符串
chartr(old = "a",new = "c",c("a123","a15","a23"))
# 结果
[1] "c123" "c15" "c23"
chartr(old = "a12345",new = "c6789101456",c("a123","a15","a23"))
# 结果
[1] "c678" "c61" "c78" # 拿a15说明,a在old中下标为1,便替换为new[1]。1在old中下标为2,所以替换为new[2]。5在old中下标为6,所以替换为new[6],所以最后a15替换为c61。
chartr(old = "a1",new = "c4",c("a123","a15","a23"))
# 结果
[1] "c423" "c45" "c23"
sub
sub可以替换字符串,但是sub()函数不会对原字符串进行操作。所以需要创建一个变量来储存该操作后的字符串。另外,sub函数只会替换匹配到的第一个
sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
- pattern:包含正则表达式的字符串
- replacement:与pattern匹配的部分进行替换的值
- x:字符串向量或者转化为字符的R对象
str <- "Now is the time "
sub(" +$", " 12:00", str) #正则表达式,即str尾部的空格替换为12:00
# 结果
"Now is the time 12:00"
# 此时我们只是调用了sub函数,却没有保存这个结果。而且该函数不会对原字符串进行操作。
print(str)
"Now is the time "
sub("Now","what",str)
# 结果
[1] "what is the time "
sub(pattern = "nd",replacement = "ND",c("andbndcnd","sndendfund"))
# 结果,字符串元素中有很多"nd",但是只会替换第一个"nd"。
[1] "aNDbndcnd" "sNDendfund"
gsub
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gsub()函数和sub用法一样,不过,gsub()函数可以替换所有匹配字符
gsub(pattern = "nd",replacement = "ND",c("andbndcnd","sndendfund"))
# 结果
[1] "aNDbNDcND" "sNDeNDfuND"
substr和substring
这两个函数可以提取、替换字符串。而且是对原字符串进行操作
substr(x, start, stop) <- value
substring(text, first, last = 1000000L) <- value
- x, text:字符串向量
- start, first:整型,替换字符的起始下标
- stop:整型,替换字符的结束下标
- last:整型,替换字符的结束下标(默认1000000L,即一直到字符串末尾)
- value:替换的字符,如果需要的话(与待替换向量长度不同),自动循环补齐
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
substr(shopping_list,1,3) <- "AAA"
# 结果
[1] "AAAles x4" "AAA of flour" "AAA of sugar" "AAAk x2"
substring(shopping_list,1) <- "AAA" # 注意:substr的stop参数没有默认值,省略stop时必须使用substring
# 结果
[1] "AAAles x4" "AAA of flour" "AAA of sugar" "AAAk x2"
substr(shopping_list,1,20) <- "yesterday once more"
# 结果
[1] "yesterday" "yesterday on" "yesterday on" "yesterd"
substring(shopping_list,1) <- "yesterday once more"
# 结果
[1] "yesterday" "yesterday on" "yesterday on" "yesterd"
str_replace和str_replace_all
第三方包中的str_replace和str_replace_all
str_replace(string, pattern, replacement) # 和sub一样,只替换第一个匹配字符
str_replace_all(string, pattern, replacement) # 和gsub一样,替换所有匹配字符
fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, "[aeiou]", "-") #正则表达式,即对字符串中的小写字母a或e或i或o或u,替换为-
# 结果
[1] "-ne apple" "tw- pears" "thr-e bananas"
str_replace_all(fruits, "[aeiou]", "-")
# 结果
[1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
str_sub
第三方包stringr
str_sub(string, start = 1L, end = -1L, omit_na = FALSE) <- value
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_sub(shopping_list,1,3) <- "AAA"
# 结果
[1] "AAAles x4" "AAA of flour" "AAA of sugar" "AAAk x2"
str_sub(shopping_list,1) <- "AAA"
# 结果
[1] "AAA" "AAA" "AAA" "AAA"
提取
substr 和substring
substr(x, start, stop)
substring(text, first, last = 1000000L)
substr("abcdef", 2, 4)
# 结果
"bcd"
substring("abcdef", 1:6, 1:6) # 注意:substr的结果长度与x相同,substr("abcdef", 1:6, 1:6)只会返回"a";逐字符提取需用substring
# 结果
[1] "a" "b" "c" "d" "e" "f"
str_extract 和str_extract_all
第三方包stringr
str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_extract(shopping_list, "[a-z]+")
# 结果
[1] "apples" "bag" "bag" "milk"
str_extract_all(shopping_list, "[a-z]+")
# 结果
[[1]] [[2]] [[3]] [[4]]
[1] "apples" "x" [1] "bag" "of" "flour" [1] "bag" "of" "sugar" [1] "milk" "x"
str_sub
第三方包stringr
str_sub(string, start = 1L, end = -1L)
str_sub(shopping_list,1,5)
# 结果
[1] "apple" "bag o" "bag o" "milk "
测定字符串长度
nchar
nchar(x, type = "chars", allowNA = FALSE, keepNA = NA) #以字符串为向量,返回向量元素--字符串的长度组成的向量
nzchar(x, keepNA = FALSE) #快速判定字符串向量元素是否为非空值
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
nchar(shopping_list)
# 结果
[1] 9 12 12 7
nzchar(shopping_list)
# 结果
[1] TRUE TRUE TRUE TRUE
str_count
str_count(string, pattern = "")
str_count不仅可以测定元素长度(pattern = ""时按字符计数),还可以统计pattern在每个字符串中匹配到的次数
str_count(shopping_list)
# 结果
[1] 9 12 12 7
str_count(shopping_list, "a")
# 结果,如果不包含则返回0
[1] 1 1 2 0
str_length
第三方包stringr
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_length(string)
str_length(shopping_list)
# 结果
[1] 9 12 12 7
字符串匹配
grep
grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
fixed = FALSE, useBytes = FALSE, invert = FALSE)
- pattern: 包含一个正则表达式的字符串(或者,当fixed = TRUE时,为按字面精确匹配的普通字符串)
- x: 一个待匹配的字符串向量,或者是一个可强制转换为字符串的R对象
- value:当value = FALSE时,函数返回匹配值的下标。当value = TRUE,函数返回匹配值
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
grep("apple",shopping_list)
# 结果
[1] 1
grep("apple",shopping_list,value = T)
# 结果
[1] "apples x4"
grepl
grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
grepl和grep的用法差不多,只是grepl返回的是逻辑变量TRUE或FALSE
grepl("apple",shopping_list)
# 结果
[1] TRUE FALSE FALSE FALSE
str_subset
str_subset(string, pattern, negate = FALSE)
- string: 待匹配的字符串向量
- pattern: 一个包含正则表达式的字符串
- negate: 当negate = FALSE,函数返回匹配值。当negate = TRUE,函数返回与pattern不匹配的字符串
fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(fruit, "a") #匹配所有含有a的字符串
# 结果
[1] "apple" "banana" "pear" "pinapple"
str_subset(fruit, "^p", negate = TRUE) # 返回所有不以p开头的字符串
# 结果
[1] "apple" "banana"
str_which
str_which(string, pattern, negate = FALSE)
str_which(fruit, "a")
# 结果
[1] 1 2 3 4
排序
str_sort
str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "en",
numeric = FALSE, ...)
- x:待排序的字符串向量
- decreasing:布尔值,默认FALSE,表示从低到高排序。如果为TRUE,表示从高到低排序
- na_last:NA 应该排在什么位置,TRUE表示放在末端,FALSE表示放在开头,NA表示直接丢弃NA值。
- numeric:如果为TRUE,则字符串中的数字按数值大小排序,而不是按字符逐位排序
x <- c("100a10", "100a5", "2b", "2a")
str_sort(x)
# 结果
[1] "100a10" "100a5" "2a" "2b"
str_sort(x, numeric = TRUE)
# 结果
[1] "2a" "2b" "100a5" "100a10"