Fastest Way to Add New Variables to a Large Data.Frame
# Benchmark five ways of adding derived columns to a large data.frame.
#
# Every candidate adds the same two columns to `hflights`:
#   wday  - "weekend" when DayOfWeek is 6 or 7, otherwise "weekday"
#   delay - ArrDelay + DepDelay

pkgs <- c(
  "hflights", "doParallel", "foreach", "dplyr", "rbenchmark", "data.table"
)
# library() stops on a missing package; require() would only return FALSE and
# let the script fail later with a less obvious error.
for (pkg in pkgs) {
  library(pkg, character.only = TRUE)
}

data(hflights)

# Register the parallel backend once, outside the timed expressions, so that
# backend setup is not repeated and timed in every replication.
registerDoParallel(cores = 2)

# Rank by elapsed time: the parallel candidate reports part of its CPU time
# under user.child (see the recorded results below), so user.self alone
# understates its cost.
benchmark(
  replications = 10, order = "elapsed", relative = "elapsed",
  transform = {
    ### THE GENERIC FUNCTION MODIFYING THE DATA.FRAME, SIMILAR TO DATA.FRAME() ###
    transform(
      hflights,
      wday = ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday"),
      delay = ArrDelay + DepDelay
    )
  },
  within = {
    ### EVALUATE THE EXPRESSION WITHIN THE LOCAL ENVIRONMENT ###
    within(hflights, {
      wday <- ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday")
      delay <- ArrDelay + DepDelay
    })
  },
  mutate = {
    ### THE SPECIFIC FUNCTION IN DPLYR PACKAGE TO ADD VARIABLES ###
    mutate(
      hflights,
      wday = ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday"),
      delay = ArrDelay + DepDelay
    )
  },
  foreach = {
    ### SPLIT AND THEN COMBINE IN PARALLEL ###
    v <- c(names(hflights), "wday", "delay")
    f <- expression(
      ifelse(hflights$DayOfWeek %in% c(6, 7), "weekend", "weekday"),
      hflights$ArrDelay + hflights$DepDelay
    )
    # Each expression is evaluated under %dopar%; the results are combined
    # onto hflights (the .init value) via mutate(), and the final column
    # names are then set explicitly from `v`.
    df <- foreach(fn = iter(f), .combine = mutate, .init = hflights) %dopar% {
      eval(fn)
    }
    names(df) <- v
  },
  data.table = {
    ### DATA.TABLE ###
    # Inside `[.data.table` the columns are in scope by name, so there is no
    # need to reach back into the original data.frame with hflights$...
    data.table(hflights)[
      ,
      c("wday", "delay") := list(
        ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday"),
        ArrDelay + DepDelay
      )
    ]
  }
)

# Release any worker cluster that registerDoParallel() created implicitly.
stopImplicitCluster()

# Results recorded with the ORIGINAL settings (order and relative taken from
# user.self, backend registered inside the timed expression). Note that
# foreach is listed first despite having the largest elapsed time of the
# first four rows, because most of its CPU time is in user.child:
#
#         test replications elapsed relative user.self sys.self user.child
# 4    foreach           10   1.442    1.000     0.240    0.144      0.848
# 2     within           10   0.667    2.783     0.668    0.000      0.000
# 3     mutate           10   0.679    2.833     0.680    0.000      0.000
# 5 data.table           10   0.955    3.983     0.956    0.000      0.000
# 1  transform           10   1.732    7.200     1.728    0.000      0.000
----------------------------------------------------------------------------------
数据和特征决定了效果上限,模型和算法决定了逼近这个上限的程度
----------------------------------------------------------------------------------