R&SQL合并记录并去重
====================================================================================
data:
have <- data.frame(ID=paste0("id", c(101, 102, 102, 103, 103, 104, 105, 105)),
info1=c("one", "twoA", "twoB", "threeA", "threeB", "four", "five", "five"),
stringsAsFactors=FALSE)
want <- data.frame(ID=paste0("id", c(101:105)),
info1=c("one", "twoA; twoB", "threeA; threeB", "four", "five"),
stringsAsFactors=FALSE)
====================================================================================
MySQL
SELECT user_id, COUNT(id) AS freq, GROUP_CONCAT(DISTINCT ip SEPARATOR ",") AS ips FROM log_table GROUP BY user_id ORDER BY freq ASC;
====================================================================================
R&SQL
library(sqldf)
sqldf("
SELECT ID,
GROUP_CONCAT(distinct info1) as info1
FROM have
GROUP BY ID")
注:此种情况没能实现更改间隔符,默认为逗号。
====================================================================================
R
--------------------------------------------------------------------------------------------------------------------------------------
require(dplyr)
METHOD1: have %>% group_by(ID) %>% summarise_each(funs(toString(sort(unique(info1)))))
METHOD2: have %>% group_by(ID) %>% summarise(name = toString(sort(unique(info1))))
--------------------------------------------------------------------------------------------------------------------------------------
require(data.table)
METHOD1: setDT(have)[, .(info1 = toString(sort(unique(info1)))), by = ID]
METHOD2: setDT(have)[ , .(info1 = paste(unique(info1), collapse = ",")),by = ID]
注:paste函数作为其他函数的参数时,不能。例如
aggregate(have[,2], by=list(have$ID), paste(unique(info1)), collapse=";") ×
aggregate(have[,2], by=list(have$ID), paste, collapse=";") √
setDT(have)[, lapply(.SD, paste(unique(info1)), collapse = "; "), by = ID] ×
setDT(have)[, lapply(.SD, paste, collapse = "; "), by = ID] √
--------------------------------------------------------------------------------------------------------------------------------------
aggregate(data=have,info1~ID,FUN = function(t) sort(unique(t)))