neo4j-(12)-spark操作

添加 Maven 依赖

<dependencies>
  <!-- list of dependencies -->
  <dependency>
    <!-- Neo4j Connector for Apache Spark: "_2.11" is the Scala version suffix;
         this 4.0.1 artifact is the build targeting Spark 2.4. -->
    <groupId>org.neo4j</groupId>
    <artifactId>neo4j-connector-apache-spark_2.11</artifactId>
    <version>4.0.1_for_spark_2.4</version>
  </dependency>
</dependencies>
<repositories>
  <!-- list of other repositories -->
  <repository>
    <!-- NOTE(review): Bintray was shut down in May 2021, so this repository URL is
         likely dead. The connector is published to Maven Central, which means this
         extra repository entry is probably no longer needed — verify before relying on it.
         If kept, prefer https over http. -->
    <id>SparkPackagesRepo</id>
    <url>http://dl.bintray.com/spark-packages/maven</url>
  </repository>
</repositories>

使用sparkSession配置

// Build a SparkSession that carries the Neo4j Bolt connection settings in its
// Spark conf, then create the Neo4j wrapper from its SparkContext.
val builder = SparkSession.builder()
  .appName("play")
  .master("local[*]")
  .config("spark.neo4j.bolt.url", "bolt://localhost:7687")
  .config("spark.neo4j.bolt.user", "neo4j")
  .config("spark.neo4j.bolt.password", "hortmt")

val spark = builder.getOrCreate()
import spark.implicits._

// Neo4j reads the spark.neo4j.* settings from the SparkContext's configuration.
val neo = Neo4j(spark.sparkContext)

使用 sparkConf配置

// Same Neo4j Bolt settings, supplied through a SparkConf instead of the
// SparkSession builder.
val conf = new SparkConf()
  .setAppName("neoej")
  .setMaster("local[*]")
  .set("spark.neo4j.bolt.url", "bolt://localhost:7687")
  .set("spark.neo4j.bolt.user", "neo4j")
  .set("spark.neo4j.bolt.password", "hortmt")

val sc = new SparkContext(conf)

// The connector picks the spark.neo4j.* keys up from the SparkContext.
val neo = Neo4j(sc)

通过 Neo4jConfig 来做配置

// Configure the connection explicitly via Neo4jConfig(url, user, password).
val sparkSession = SparkSession.builder()
  .master("local[*]")
  .appName("LoadDataToNeo4j")
  .getOrCreate()

val sc = sparkSession.sparkContext

// BUG FIX: the original passed "localhost:" — a trailing colon with no scheme or
// port is not a valid Bolt URL. Neo4jConfig expects a full URL such as
// "bolt://localhost:7687".
val config = Neo4jConfig("bolt://localhost:7687", "neo4j", Option("root"))

// NOTE(review): `config` is never consumed below — Neo4j(sc) reads its settings
// from the SparkContext's SparkConf (spark.neo4j.bolt.*), not from this object.
// Either pass `config` to an API that accepts a Neo4jConfig, or set the
// spark.neo4j.bolt.* keys on the SparkConf as in the earlier examples.
Neo4j(sc).cypher("CREATE (c:Client {id:1230}) return c").loadRdd
sparkSession.close()

返回类型

选择要返回的数据类型

  • loadRowRdd, loadNodeRdds, loadRelRdd, loadRdd[T]
  • loadDataFrame, loadDataFrame(schema)
  • loadGraph[VD,ED]
  • loadGraphFrame[VD,ED]

使用

loadRowRdd

import org.neo4j.spark._

val neo = Neo4j(sc)

// Run a Cypher query and pull the result back as an RDD[Row].
val idRows = neo.cypher("MATCH (n:Person) RETURN id(n) as id ").loadRowRdd
idRows.count

// The Row schema is inferred from the query's return columns.
idRows.first.schema.fieldNames
//   => ["id"]
idRows.first.schema("id")
//   => StructField(id,LongType,true)

// loadRdd[T] maps a single-column result directly onto an RDD[T].
neo.cypher("MATCH (n:Person) RETURN id(n)").loadRdd[Long].mean
//   => res30: Double = 236696.5

// Query parameters are bound with .param(name, value); {maxId} is the
// old-style Cypher parameter placeholder used by this connector version.
neo.cypher("MATCH (n:Person) WHERE n.id <= {maxId} RETURN n.id").param("maxId", 10).loadRowRdd.count
//   => res34: Long = 10

loadGraphFrame

import org.neo4j.spark._

val neo = Neo4j(sc)

import org.graphframes._

// Load a (Person)-[KNOWS]->(Person) pattern into a GraphFrame, reading with
// 3 partitions and 1000 rows.
val knowsGraph = neo.pattern(("Person","id"),("KNOWS",null), ("Person","id")).partitions(3).rows(1000).loadGraphFrame

knowsGraph.vertices.count
//     => 100
knowsGraph.edges.count
//     => 1000

// Run 5 PageRank iterations and inspect the ranked vertices.
val ranked = knowsGraph.pageRank.maxIter(5).run().vertices
ranked.printSchema()

// Top three vertices by PageRank score.
val top3 = ranked.orderBy(ranked.col("pagerank").desc).take(3)
//     => top3: Array[org.apache.spark.sql.Row]
//     => Array([236716,70,0.62285...], [236653,7,0.62285...], [236658,12,0.62285])

文档: https://neo4j.com/developer/apache-spark/

posted @ 2021-06-02 12:21  宁君  阅读(470)  评论(0编辑  收藏  举报