R通过RJDBC连接Hive相关
连接方式1
require("DBI")
require("rJava")
require("RJDBC")
username <- '******'
password <- '******'
# .jclassLoader()$setDebug(1L)
cp <- dir("/opt/cloudera/parcels/CDH/jars", full.names = TRUE)
.jinit(classpath = cp)
options( java.parameters = "-Xmx8g" )
drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc.jar", identifier.quote="`")
# make sure disconnect the connection after the query
conn <- dbConnect(drv, "jdbc:hive2://172.156.188.28:10000/default", username, password)
options(width = 160)
show_databases <- dbGetQuery(conn, 'select * from library limit 10')
head(show_databases)
library(knitr)
kable(show_databases)
dbDisconnect(conn)
### http://172.16.18.2/
连接方式2
library("DBI")
library("rJava")
library("RJDBC")
username <- '******'
password <- '******'
for(l in list.files('/opt/cloudera/parcels/CDH/jars/', full.names = TRUE)){ .jaddClassPath(l)}
drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc-standalone.jar")
for(l in list.files('/opt/org/apache/hadoop/2.7.3/share/hadoop/common/', full.names = TRUE)){ .jaddClassPath(l)}
conn <- dbConnect(drv, "jdbc:hive2://172.156.188.28:10000/default", username, password)
library(knitr)
kable(show_databases) # table's meta data, for gollum wiki
## Get data from hive
show_databases <- dbGetQuery(conn, 'select * from library limit 10')
head(show_databases)
library(knitr)
kable(show_databases)
dbDisconnect(conn)
连接方式-函数
数据量太大问题
Have you tired to make increase the memory to 4g instead of 1g. I mean set options for the java parameters to be -Xmx4g instead of -Xmx1g
Simply change the java.parameters to 8GB before load RJDBC package:
# options(java.parameters = "-Xmx8048m")
# library("RJDBC")
You can increase the memory as below and restart your Rstudio and it worked for me.
# memory.limit(size=10000000000024)
函数
get_hive_data <- function(oUsername, oPassword, oSql){
# library(DBI)
# library(rJava)
# library(RJDBC)
options( java.parameters = "-Xmx8048m" )
require(DBI)
require(rJava)
require(RJDBC)
username <- oUsername
password <- oPassword
cp <- dir("/opt/cloudera/parcels/CDH/jars", full.names = TRUE)
.jinit(classpath = cp)
# options( java.parameters = "-Xmx8g" )
drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/opt/cloudera/parcels/CDH/lib/hive/lib/hive-jdbc.jar", identifier.quote="`")
# make sure disconnect the connection after the query
conn <- dbConnect(drv, "jdbc:hive2://172.156.188.28:10000/default", username, password)
options(width = 160)
result <- dbGetQuery(conn, oSql)
dbDisconnect(conn)
return(result)
}
oUsername <- '******'
oPassword <- '******'
oSql <- 'select * from library limit 1000000'
# oSql <- 'select count(*) from library'
test_data <- get_hive_data(oUsername, oPassword, oSql)
dim(test_data)
head(test_data)
library(knitr)
kable(test_data)
©哈尔滨商业大学 银河统计工作室
银河统计工作室成员由在校统计、计算机部分师生和企业数据数据分析师组成,维护和开发银河统计网和银河统计博客(技术文档)。专注于数据挖掘技术研究和运用,探索统计学、应用数学和IT技术有机结合,尝试大数据条件下新型统计学教学模式。
邮箱:wanglei@hrbcu.edu.cn 关于我们