# Point R at the Spark installation and load SparkR from it.
Sys.setenv(SPARK_HOME = "/usr/spark")
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
library(SparkR)
# Connect to the standalone Spark master.
sc <- sparkR.init(master = "spark://Master.Hadoop:7077")
# read.df() needs a SQL context; this line was missing from the original notes.
sqlContext <- sparkRSQL.init(sc)
# Read a JSON file (from HDFS) into a SparkR DataFrame.
people <- read.df(sqlContext, "/people.json", "json")
Read a CSV file (requires the spark-csv package):
https://github.com/databricks/spark-csv
in shell:
sparkR --packages com.databricks:spark-csv_2.10:1.0.3
# Read CSV data from HDFS via the spark-csv data source, inferring column
# types from the data. (Original line used a C-style `//` comment, which is
# a syntax error in R — R comments start with `#`.)
df <- read.df(sqlContext, "/test.csv", source = "com.databricks.spark.csv", inferSchema = "true")
in RStudio:
# RStudio setup: point R at the Spark installation and load SparkR from it.
Sys.setenv(SPARK_HOME="/usr/spark")
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
library(SparkR)
# Request the spark-csv package for the SparkR shell.
# NOTE(review): this is set before sparkR.init() below — presumably it must
# be, so the package is passed to spark-submit when the JVM starts; confirm.
Sys.setenv('SPARKR_SUBMIT_ARGS'='"--packages" "com.databricks:spark-csv_2.10:1.3.0" "sparkr-shell"')
# Connect to the standalone Spark master, then create the SQL context
# that read.df()/write.df() require.
sc<-sparkR.init(master="spark://Master.Hadoop:7077")
sqlContext <- sparkRSQL.init(sc)
# Read a CSV file (HDFS path) via the spark-csv data source,
# inferring column types from the data.
df <- read.df(sqlContext, "/test.csv", source = "com.databricks.spark.csv", inferSchema = "true")
# Write the DataFrame back out as CSV; "overwrite" replaces existing output.
write.df(df, "newcars.csv", "com.databricks.spark.csv", "overwrite")
Using an R script file:
./sparkR --packages com.databricks:spark-csv_2.10:1.0.3
*.R (sometimes this does not work)
SparkR can read all files in one HDFS directory at once:
# Read every CSV file in the HDFS directory /tdir using a glob pattern.
# Fixed typo: the spark-csv option is named "inferSchema", not "interSchema"
# (an unrecognized option name would not enable type inference).
df <- read.df(sqlContext, "/tdir/*.csv", source = "com.databricks.spark.csv", inferSchema = "true")
or :
in the code:
#!/usr/bin/Rscript
directly run R code
./*.R
http://thirteen-01.stat.iastate.edu/snoweye/hpsc/?item=rscript