数据摄取

一:使用 SQL 加载外部文件(HTTP)
-- Load the sample Wikipedia edits file over HTTP into the "wikipedia" datasource.
-- OVERWRITE ALL means the statement replaces the datasource contents rather than appending to them.
REPLACE INTO "wikipedia" OVERWRITE ALL
WITH ext AS (
  -- EXTERN is given three JSON strings, in order:
  --   1. the input source (an HTTP URI pointing at a gzipped JSON file),
  --   2. the input format (JSON),
  --   3. the row signature (name and type of every column exposed to the query).
  SELECT *
  FROM TABLE(
    EXTERN(
      '{"type":"http","uris":["https://druid.apache.org/data/wikipedia.json.gz"]}',
      '{"type":"json"}',
      '[{"name":"isRobot","type":"string"},{"name":"channel","type":"string"},{"name":"timestamp","type":"string"},{"name":"flags","type":"string"},{"name":"isUnpatrolled","type":"string"},{"name":"page","type":"string"},{"name":"diffUrl","type":"string"},{"name":"added","type":"long"},{"name":"comment","type":"string"},{"name":"commentLength","type":"long"},{"name":"isNew","type":"string"},{"name":"isMinor","type":"string"},{"name":"delta","type":"long"},{"name":"isAnonymous","type":"string"},{"name":"user","type":"string"},{"name":"deltaBucket","type":"long"},{"name":"deleted","type":"long"},{"name":"namespace","type":"string"},{"name":"cityName","type":"string"},{"name":"countryName","type":"string"},{"name":"regionIsoCode","type":"string"},{"name":"metroCode","type":"long"},{"name":"countryIsoCode","type":"string"},{"name":"regionName","type":"string"}]'
    )
  )
)
SELECT
  -- The source "timestamp" column is declared as a string; parse it into the __time column.
  TIME_PARSE("timestamp") AS __time,
  isRobot,
  channel,
  flags,
  isUnpatrolled,
  page,
  diffUrl,
  added,
  comment,
  commentLength,
  isNew,
  isMinor,
  delta,
  isAnonymous,
  -- NOTE(review): "user" is a keyword in many SQL dialects; confirm whether it must be double-quoted on the target Druid version.
  user,
  deltaBucket,
  deleted,
  namespace,
  cityName,
  countryName,
  regionIsoCode,
  metroCode,
  countryIsoCode,
  regionName
FROM ext
PARTITIONED BY DAY
二:从 Kafka 摄入数据
{
  "type": "kafka",
  "spec": {
    "ioConfig": {
      "type": "kafka",
      "consumerProperties": {
        "bootstrap.servers": "localhost:9092"
      },
      "topic": "kttm",
      "inputFormat": {
        "type": "json"
      },
      "useEarliestOffset": true
    },
    "tuningConfig": {
      "type": "kafka"
    },
    "dataSchema": {
      "dataSource": "kttm-kafka-supervisor-console",
      "timestampSpec": {
        "column": "timestamp",
        "format": "iso"
      },
      "dimensionsSpec": {
        "dimensions": [
          "session",
          "number",
          "client_ip",
          "language",
          "adblock_list",
          "app_version",
          "path",
          "loaded_image",
          "referrer",
          "referrer_host",
          "server_ip",
          "screen",
          "window",
          {
            "type": "long",
            "name": "session_length"
          },
          "timezone",
          "timezone_offset",
          {
            "type": "json",
            "name": "event"
          },
          {
            "type": "json",
            "name": "agent"
          },
          {
            "type": "json",
            "name": "geo_ip"
          }
        ]
      },
      "granularitySpec": {
        "queryGranularity": "none",
        "rollup": false,
        "segmentGranularity": "day"
      }
    }
  }
}
三:从 HDFS 摄入数据
四:从本地文件摄入
{
  "type": "index_parallel",
  "spec": {
    "dataSchema": {
      "dataSource": "ingestion-tutorial",
      "timestampSpec": {
        "format": "iso",
        "column": "ts"
      },
      "dimensionsSpec": {
        "dimensions": [
          "srcIP",
          {
            "name": "srcPort",
            "type": "long"
          },
          {
            "name": "dstIP",
            "type": "string"
          },
          {
            "name": "dstPort",
            "type": "long"
          },
          {
            "name": "protocol",
            "type": "string"
          }
        ]
      },
      "metricsSpec": [
        {
          "type": "count",
          "name": "count"
        },
        {
          "type": "longSum",
          "name": "packets",
          "fieldName": "packets"
        },
        {
          "type": "longSum",
          "name": "bytes",
          "fieldName": "bytes"
        },
        {
          "type": "doubleSum",
          "name": "cost",
          "fieldName": "cost"
        }
      ],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": "MINUTE",
        "intervals": [
          "2018-01-01/2018-01-02"
        ],
        "rollup": true
      }
    },
    "ioConfig": {
      "type": "index_parallel",
      "inputSource": {
        "type": "local",
        "baseDir": "quickstart/",
        "filter": "ingestion-tutorial-data.json"
      },
      "inputFormat": {
        "type": "json"
      }
    },
    "tuningConfig": {
      "type": "index_parallel",
      "partitionsSpec": {
        "type": "dynamic",
        "maxRowsPerSegment": 5000000
      }
    }
  }
}

posted on 2024-05-03 09:38  天生一对  阅读(4)  评论(0)  编辑  收藏  举报

导航