同步工具之Vector

用于构建可观察性管道的轻量级、超快速工具

[安装]

curl --proto '=https' --tlsv1.2 -sSf https://sh.vector.dev | bash

source ~/.profile

可测试配置文件:
vector --config /root/.vector/config/vector.toml

[使用]

[加入系统管理]

vim /etc/systemd/system/vector.service

# systemd unit that runs Vector as a service.
[Unit]
Description=Vector
Documentation=https://vector.dev
# Pull in network-online.target and start only after it has been reached.
Requires=network-online.target
After=network-online.target

[Service]
# The service runs as the vector user and group.
User=vector
Group=vector
# Optional environment file; the leading "-" tells systemd to ignore it when it is missing.
EnvironmentFile=-/etc/default/vector
# systemd does not expand the glob; the pattern is handed to vector unchanged.
# NOTE(review): relies on Vector's --config accepting wildcard paths — confirm for the installed version.
ExecStart=/usr/bin/vector -c /etc/vector/datacenter/*.yaml
# Reload is done by sending SIGHUP to the main process.
ExecReload=/bin/kill -HUP $MAINPID
# The service is not restarted automatically when it exits.
Restart=no

[Install]
WantedBy=multi-user.target

示例:

YAML数据格式



---

# Kafka consumer that feeds the pipeline from the two test topics.
sources:
  kafka_app_events:
    type: kafka
    bootstrap_servers: "kafka1:9092,kafka2:9092,kafka3:9092"
    group_id: vector-sink-beta
    auto_offset_reset: earliest
    topics: [login_test, button_click_test]

transforms:
  # Parses each Kafka record's `.message` as JSON, attaches the Kafka offset,
  # partition and topic, coerces the fields shared by all event types, and
  # finally replaces the whole event with the resulting object.
  # With drop_on_error enabled, events for which the program fails are dropped.
  remap_public_fields:
    type: remap
    drop_on_error: true
    inputs:
      - kafka_app_events
    # VRL program notes:
    #  - calls written with `!` abort the program when the conversion fails;
    #  - `channel` falls back to 0 when to_int fails;
    #  - `main_channel` starts as `channel`; when `channel` is greater than
    #    10000000 it is replaced by to_int(channel / 10000 ?? 0).
    source: |-
      msg = parse_json!(.message)
      msg.kafka_offset = .offset
      msg.kafka_partition = .partition
      msg.kafka_topic = .topic

      msg.app_id = to_int!(msg.app_id)
      msg.number_id = to_int!(msg.number_id)
      msg.player_id = to_string!(msg.player_id)
      msg.player_type = to_int!(msg.player_type)
      msg.platform = to_int!(msg.platform)
      msg.params = to_string!(msg.params)
      msg.client_version = to_string!(msg.client_version)
      msg.reg_channel = to_int!(msg.reg_channel) 
      msg.channel = to_int(msg.channel)??0
      msg.main_channel = msg.channel
      if msg.channel > 10000000 {
        msg.main_channel = to_int(msg.channel / 10000 ?? 0)
      }
      . = msg    
  
  # Splits events by the Kafka topic they came from; each key under `route`
  # is consumed downstream as route_events.<key>.
  route_events:
    type: route
    inputs: [remap_public_fields]
    route:
      button_click: '.kafka_topic == "button_click_test"'
      login: '.kafka_topic == "login_test"'

  # Button-click branch: coerce button_id to an integer.
  # Events for which the conversion fails are dropped (drop_on_error).
  remap_button_click_test:
    type: remap
    inputs: [route_events.button_click]
    drop_on_error: true
    source: |-
      .button_id = to_int!(.button_id)
  
  # Login branch: coerce is_new to an integer and the coordinates to floats.
  # Events for which any conversion fails are dropped (drop_on_error).
  remap_login_test:
    type: remap
    inputs: [route_events.login]
    drop_on_error: true
    source: |-
      .is_new = to_int!(.is_new)  
      .longitude = to_float!(.longitude)   
      .latitude = to_float!(.latitude)

# ClickHouse destinations, one per event type.
sinks:
  # Writes button-click events to events_beta.button_click_all.
  clickhouse_button_click_test:
    type: clickhouse
    inputs: [remap_button_click_test]
    endpoint: http://xxx.com:8123
    database: events_beta
    table: button_click_all
    compression: gzip
    auth:
      strategy: basic
      user: vector_beta
      password: xxx
    healthcheck:
      enabled: true
    encoding:
      only_fields:
        # Kafka provenance
        - kafka_partition
        - kafka_offset
        # Fields shared by all event types
        - data_time
        - app_id
        - tags
        - player_id
        - number_id
        - player_type
        - params
        - platform
        - reg_channel
        - channel
        - main_channel
        - client_version
        # Button-click specific
        - button_id

  # Writes login events to events_beta.login_all.
  clickhouse_login_test:
    type: clickhouse
    inputs: [remap_login_test]
    endpoint: http://xxx.com:8123
    database: events_beta
    table: login_all
    compression: gzip
    auth:
      strategy: basic
      user: vector_beta
      password: xxx
    healthcheck:
      enabled: true
    encoding:
      only_fields:
        # Kafka provenance
        - kafka_partition
        - kafka_offset
        # Fields shared by all event types
        - data_time
        - app_id
        - tags
        - player_id
        - number_id
        - player_type
        - params
        - platform
        - reg_channel
        - channel
        - main_channel
        - client_version
        # Login specific
        - is_new
        - ip
        - device_id
        - device_os
        - device_brand
        - device_model
        - ppi
        - longitude
        - latitude

实战:

使用vector+clickhouse来收集nginx日志, 最后使用grafana进行展示

1)定义nginx的访问日志格式

# Access-log format named "track": client address, ISO 8601 timestamp, quoted request URI,
# response status, bytes of body sent, and the quoted User-Agent header.
log_format track '$remote_addr - $time_iso8601 "$request_uri" '
                 '$status $body_bytes_sent "$http_user_agent"';

2)假设访问日志的路径为 /var/log/track.log

定义解析日志

# File source that tails /var/log/track.log, starting from the end of the file.
[sources.home]
type = "file"
read_from = "end"
include = [
  "/var/log/track.log",
]


# Parses each raw line (`.message`) with a regex and merges the named captures
# (ip, date, time, url, status, size, agent) into the event.
[transforms.process]
type = "remap"
inputs = ["home"]
# After parsing, status and size are converted to integers and `.time` is
# rebuilt as "<date> <time>". The "T" separator and whatever sits between the
# seconds and the quoted URL are matched but not captured.
source = '''
. |= parse_regex!(.message, r'^(?P<ip>\d+\.\d+\.\d+\.\d+) \- (?P<date>\d+\-\d+\-\d+)T(?P<time>\d+:\d+:\d+).+?"(?P<url>.+?)" (?P<status>\d+) (?P<size>\d+) "(?P<agent>.+?)"$')
.status = to_int!(.status)
.size = to_int!(.size)
.time = .date + " " + .time
'''

# Console sink that prints every processed event JSON-encoded.
[sinks.print]
inputs = ["process"]
type = "console"
encoding = { codec = "json" }

# Ships processed events to table `log` in database `nginx_db` over ClickHouse's HTTP endpoint.
[sinks.clickhouse]
inputs = ["process"]
type = "clickhouse"
endpoint = "http://xx.xx.xx.xx:8123"
database = "nginx_db"
table = "log"
compression = "gzip"
# NOTE(review): presumably lets ClickHouse discard event fields that have no matching column — confirm in the sink docs.
skip_unknown_fields = true
auth = { strategy = "basic", user = "username", password = "password" }
request = { concurrency = "adaptive" }

一般定义三部分:

[sources.***] 定义数据源

[transforms.***] 定义如何解析,处理数据的结构

[sinks.***] 定义数据的接收与存储

这里的"***" 是可以被替换名称的

3)创建clickhouse的数据库和表

-- Destination table for the parsed nginx access-log events.
-- Column names match the fields produced by the Vector remap transform.
CREATE TABLE log
(
    `ip` String,
    `time` DateTime,
    `url` String,
    -- HTTP status codes go up to 599; UInt8 tops out at 255 and cannot hold
    -- values such as 404 or 500, so a 16-bit column is required.
    `status` UInt16,
    `size` UInt32,
    `agent` String
)
ENGINE = MergeTree
ORDER BY date(time)

参考文档: https://vector.dev/docs/

https://medium.com/datadenys/using-vector-to-feed-nginx-logs-to-clickhouse-in-real-time-197745d9e88b

posted @ 2022-05-24 17:12 X-Wolf 阅读(1216) 评论(0) 收藏举报

刷新页面返回顶部

蓝狼爱猫

与其临渊羡鱼，不如退而结网

同步工具之Vector

公告