spark_to_es
package es

import java.io.InputStream
import java.text.SimpleDateFormat
import java.util.{Calendar, Date, Properties}

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark.rdd.EsSpark
import org.slf4j.LoggerFactory

/**
 * Reads the newest shopping-cart goods snapshot from Hive
 * (sospdm.tdm_wbank_opts_t_goods_info_newest_ed) and bulk-writes one document
 * per row into Elasticsearch, using shop_id + gds_cd + gds_add_time as the
 * document id (saveToEsWithMeta metadata key).
 */
object ShoppingcartMarketToEs {

  private val log = LoggerFactory.getLogger(ShoppingcartMarketToEs.getClass)

  // Elasticsearch connection settings come from /elastic.properties on the classpath.
  val prop = new Properties()
  val is: InputStream = this.getClass.getResourceAsStream("/elastic.properties")
  prop.load(is)
  is.close() // FIX: the stream was previously never closed

  // Environment selector: key into elastic.properties ("..._sit" vs "..._prd").
  val ENVIRONMENT_SETING = "es_host_sit"
  val host: String = prop.getProperty(ENVIRONMENT_SETING)

  // Every column produced by the query below, in select order. Each one is
  // copied into the ES document.
  // FIX: "update_time" was selected and read but never written to the document.
  private val Columns = Seq(
    "pid", "shop_id", "gds_cd", "gds_nm", "gds_add_num", "gds_add_time",
    "gds_price", "expect_tran_price", "l4_gds_grp_cd", "l4_gds_grp_nm",
    "category_cd", "category_nm", "brand_cd", "brand_nm",
    "create_user", "update_user", "create_time", "update_time", "@timestamp")

  def main(args: Array[String]): Unit = {
    // NOTE(review): app name differs from the object name — presumably copied
    // from a sibling job; confirm before renaming.
    val sparkConf = new SparkConf().setAppName("ReadSnCategoryToEs")
    sparkConf.set("es.nodes", host)
    sparkConf.set("es.nodes.wan.only", "true")
    // Tuning knobs kept for reference:
    // sparkConf.set("es.batch.size.entries", "5000")
    // sparkConf.set("es.write.operation", "upsert")

    val session = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    try {
      session.sql("use sospdm")

      // Current wall-clock time shifted to UTC and formatted like
      // "2020-01-01T12:00:00.000+0000" for the ES @timestamp field.
      session.udf.register("get_utc_time", () => {
        val cal = Calendar.getInstance()
        cal.setTime(new Date())
        val zoneOffset = cal.get(Calendar.ZONE_OFFSET)
        val dstOffset = cal.get(Calendar.DST_OFFSET)
        cal.add(Calendar.MILLISECOND, -(zoneOffset + dstOffset))
        val utcTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS").format(cal.getTime)
        utcTime.replace(" ", "T") + "+0000"
      })

      val querySql = "select pid,shop_id,gds_cd,gds_nm,gds_add_num,gds_add_time,gds_price,expect_tran_price,l4_gds_grp_cd,l4_gds_grp_nm,category_cd,category_nm,brand_cd,brand_nm,'null' as create_user,'null' as update_user,create_time,update_time,get_utc_time() as `@timestamp` from sospdm.tdm_wbank_opts_t_goods_info_newest_ed"
      val resultDF = session.sql(querySql)

      // Preview a sample everywhere except production.
      if (!ENVIRONMENT_SETING.contains("prd")) {
        resultDF.show(10)
      }

      // (docId, document) pairs for saveToEsWithMeta.
      val tuple = resultDF.rdd.map { row =>
        // FIX: String.valueOf is null-safe (yields "null"), whereas the old
        // getAs[String](col).toString() threw NPE on any null column.
        val doc: Map[String, Object] =
          Columns.map(c => c -> String.valueOf(row.getAs[Any](c))).toMap
        val docId = s"${doc("shop_id")}${doc("gds_cd")}${doc("gds_add_time")}"
        (docId, doc)
      }

      EsSpark.saveToEsWithMeta(tuple, "idx_shop_goods_addcart/idx_shop_goods_addcart")
      log.info("Finished writing goods snapshot to idx_shop_goods_addcart")
    } finally {
      session.stop() // FIX: the session was previously never stopped
    }
  }
}
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>spark-hive</groupId>
    <artifactId>spark-hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.1.0.9</spark.version>
        <spark.artifactId.version>2.11</spark.artifactId.version>
    </properties>

    <dependencies>
        <!-- logging -->
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
            <type>jar</type>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.1</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>

        <!-- hadoop / jdbc -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.2</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.21</version>
        </dependency>

        <!-- spark (NOTE(review): hard-coded 2.1.0 here vs ${spark.version}=2.1.0.9
             for spark-hive below — presumably intentional vendor build; confirm) -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.29</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${spark.artifactId.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- flink dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.5.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.5.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>1.5.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-wikiedits_2.11</artifactId>
            <version>1.5.0</version>
        </dependency>

        <!-- hbase dependency -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase</artifactId>
            <version>0.98.8-hadoop2</version>
            <type>pom</type>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>0.98.8-hadoop2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>0.98.8-hadoop2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>0.98.8-hadoop2</version>
        </dependency>

        <!-- elasticsearch spark connector -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_${spark.artifactId.version}</artifactId>
            <version>6.7.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch -->
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
<version>1.8</version> <executions> <execution> <id>add-source</id> <phase>generate-sources</phase> <goals> <goal>add-source</goal> </goals> <configuration> <sources> <source>src/main/scala</source> <source>src/test/scala</source> </sources> </configuration> </execution> <execution> <id>add-test-source</id> <phase>generate-sources</phase> <goals> <goal>add-test-source</goal> </goals> <configuration> <sources> <source>src/test/scala</source> </sources> </configuration> </execution> </executions> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>2.3.2</version> <configuration> <source>1.7</source> <target>1.7</target> <encoding>${project.build.sourceEncoding}</encoding> </configuration> </plugin> <plugin> <groupId>org.scala-tools</groupId> <artifactId>maven-scala-plugin</artifactId> <executions> <execution> <goals> <goal>compile</goal> <goal>add-source</goal> <goal>testCompile</goal> </goals> </execution> </executions> <configuration> <scalaVersion>2.11.8</scalaVersion> <sourceDir>src/main/scala</sourceDir> <jvmArgs> <jvmArg>-Xms64m</jvmArg> <jvmArg>-Xmx1024m</jvmArg> </jvmArgs> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-release-plugin</artifactId> <version>2.5.3</version> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-deploy-plugin</artifactId> <configuration> <skip>false</skip> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> <version>2.4.1</version> <executions> <execution> <phase>package</phase> <goals> <goal>shade</goal> </goals> <configuration> <filters> <filter> <artifact>*:*</artifact> <excludes> <exclude>META-INF/*.SF</exclude> org.apache.hive <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> </excludes> </filter> </filters> <minimizeJar>false</minimizeJar> </configuration> </execution> </executions> </plugin> 
</plugins> <resources> <resource> <directory>src/main/resources</directory> <filtering>true</filtering> </resource> <resource> <directory>src/main/resources/${profiles.active}</directory> </resource> </resources> <!-- 修复 Plugin execution not covered by lifecycle configuration --> <pluginManagement> <plugins> <plugin> <groupId>org.eclipse.m2e</groupId> <artifactId>lifecycle-mapping</artifactId> <version>1.0.0</version> <configuration> <lifecycleMappingMetadata> <pluginExecutions> <pluginExecution> <pluginExecutionFilter> <groupId>org.codehaus.mojo</groupId> <artifactId>build-helper-maven-plugin</artifactId> <versionRange>[1.8,)</versionRange> <goals> <goal>add-source</goal> <goal>add-test-source</goal> </goals> </pluginExecutionFilter> <action> <ignore></ignore> </action> </pluginExecution> <pluginExecution> <pluginExecutionFilter> <groupId>org.scala-tools</groupId> <artifactId>maven-scala-plugin</artifactId> <versionRange>[1.8,)</versionRange> <goals> <goal>compile</goal> <goal>add-source</goal> <goal>testCompile</goal> </goals> </pluginExecutionFilter> <action> <ignore></ignore> </action> </pluginExecution> </pluginExecutions> </lifecycleMappingMetadata> </configuration> </plugin> </plugins> </pluginManagement> </build> </project>