[Impala] Monitoring Impala Queries with ES + Grafana
I. Preface
We need to monitor the queries that Impala executes. The current approach is a Python script that polls the Cloudera Manager API every 10 minutes and ships the results to Elasticsearch, with Grafana on top for dashboards.
The rough architecture is shown below:
II. The Impala Query REST API
The first step is to find the Cloudera Manager REST APIs that cover Impala queries; everything else builds on them.
1. The CM REST API documentation
The API documentation can be reached quickly from the CM web UI.
2. Finding the Impala-related APIs in the documentation
The following endpoints are the ones related to Impala queries:
/clusters/{clusterName}/services/{serviceName}/impalaQueries
/clusters/{clusterName}/services/{serviceName}/impalaQueries/{queryId}
/clusters/{clusterName}/services/{serviceName}/impalaQueries/{queryId}/cancel
/clusters/{clusterName}/services/{serviceName}/impalaQueries/attributes
/clusters/{clusterName}/services/{serviceName}/impalaUtilization
As shown in the screenshot below:
3. Listing Impala queries
The endpoint for listing Impala queries is /clusters/{clusterName}/services/{serviceName}/impalaQueries; we will use it as the example.
The documentation describes its request parameters and response format in detail.
The parameters that matter here:
- filter: straightforward to use; the documentation has detailed examples
- from: an ISO 8601 timestamp
- to: an ISO 8601 timestamp
- limit: the maximum number of queries to return
The goal here is to request, every 10 minutes, the queries whose state is FINISHED or EXCEPTION.
The request URL looks roughly like this:
http://xxx:7180/api/v18/clusters/cluster/services/impala/impalaQueries?filter=(queryState=FINISHED or queryState=EXCEPTION)&from=fromTime&to=toTime&limit=1000
where fromTime and toTime are placeholders that the script substitutes later.
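Before wiring this into a scheduled job, it is worth checking the endpoint by hand. A minimal sketch with requests (the CM host xxx:7180 and the admin credentials are placeholders) might look like this:
from datetime import datetime, timedelta
import requests
from requests.auth import HTTPBasicAuth

# Last 10 minutes, as the ISO 8601 strings expected by the from/to parameters
toTime = datetime.now().isoformat()
fromTime = (datetime.now() - timedelta(minutes=10)).isoformat()
url = ("http://xxx:7180/api/v18/clusters/cluster/services/impala/impalaQueries"
       "?filter=(queryState=FINISHED or queryState=EXCEPTION)"
       "&from=%s&to=%s&limit=1000" % (fromTime, toTime))

# CM requires HTTP basic auth
resp = requests.get(url, auth=HTTPBasicAuth("username", "password"))
print(resp.status_code, len(resp.json().get("queries", [])))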
III. Fetching the Data with Python
A few points to watch out for:
- Requests to the CM API must carry a username and password for authentication.
See: https://blog.csdn.net/Mrerlou/article/details/115483865
- The from and to parameters use the ISO 8601 time format.
The timestamps therefore need to be converted to ISO format: from is the current time minus 10 minutes, and to is the current time.
The from time is computed like this:
from datetime import datetime, timedelta
dt = datetime.now() + timedelta(minutes=-10)
fromTime = dt.isoformat()
1. impala_sql.py (for reference only)
from datetime import datetime, timedelta
import requests
from requests.auth import HTTPBasicAuth
import time
import es
import isoToTimeStamp
import sql_patten            # author's helper for parsing the SQL statement (not shown in this post)
import count_exception_sql   # author's helper for handling EXCEPTION queries (not shown in this post)
# Compute the from and to times in ISO format
dt = datetime.now() + timedelta(minutes=-10)
fromTime = dt.isoformat()
toTime = datetime.now().isoformat()
nowTime = time.strftime("%Y-%m-%d %H:%M:%S")
print(nowTime)
# CM API request URL
api = "http://127.0.0.1:7180/api/v18/clusters/cluster/services/impala/impalaQueries?filter=(queryState=FINISHED or " \
      "queryState=EXCEPTION)&from=%s&to=%s&limit=1000" % (fromTime, toTime)
cmip = "10.11.4.247"
# CM username and password
cmUser = "username"
cmPasswd = "password"
# Mapping from CM host IDs to hostnames
map = {
"d2fd7746-96be-4ef3-9412-6dc68c108a0d": "dx-hadoop59.dx", "9a4a71dd-8e9b-4546-9688-c8802bdd8e6f": "dx-hadoop62.dx",
"198c19fa-3d68-4391-9824-3f6dedf0147a": "dx-hadoop64.dx", "0e612a40-da39-46a2-9a90-bbd7cf40e11e": "dx-hadoop69.dx",
"e5f5adf0-7a24-4e86-be70-326675d66db4": "dx-hadoop63.dx", "f3546cff-fd2d-41c1-8ecb-228d1ce52970": "dx-hadoop58.dx",
"22b9654f-e9d2-432c-af23-95e4c666b6b6": "dx-hadoop70.dx", "4dad5fa0-bbaf-4291-814a-80a725468a20": "dx-hadoop56.dx",
"013d0331-391b-4df6-91bf-ba6427c5fbb7": "dx-hadoop61.dx", "22227355-9353-499c-8f68-74aff0bf3b12": "dx-hadoop57.dx",
"714b3136-39bc-4ae8-bf1b-23b922275115": "dx-hadoop60.dx", "d9554ced-bc06-4660-a269-3f723ab4fe28": "dx-hadoop65.dx",
"a6c85f95-80ec-4c65-8a07-a56a3c13e678": "dx-hadoop67.dx", "509d0398-12a0-4eff-b800-50512a7b2ff6": "dx-hadoop68.dx"
}
# Collect the Impala queries from the last 10 minutes (by default the API returns 200 records, so limit is set explicitly)
def impala_sql():
    esdata = []
    print(api)
    querys = []
    try:
        res = requests.get(api, auth=HTTPBasicAuth(cmUser, cmPasswd))
        data = res.json()
        querys = data['queries']
    except Exception as e:
        print(e)
    for query in querys:
        raw = {}
        raw['queryId'] = query.get('queryId')
        raw['statement'] = query.get('statement')
        # Same statement stored under two field names: one indexed as keyword, the other as text
        raw['statement_txt'] = query.get('statement')
        raw['queryType'] = query.get('queryType')
        raw['queryState'] = query.get('queryState')
        ISO8610StartTime = query.get('startTime')
        raw['startTime'] = isoToTimeStamp.iso2timestamp(ISO8610StartTime) * 1000
        ISO8610EndTime = query.get('endTime')
        if ISO8610EndTime is None and raw['queryState'] == "EXCEPTION":
            count_exception_sql.cancel_failed_job(raw['queryId'])
        if ISO8610EndTime is not None:
            raw['endTime'] = isoToTimeStamp.iso2timestamp(ISO8610EndTime) * 1000
        else:
            raw['endTime'] = query.get('endTime')
        raw['rowsProduced'] = query.get('rowsProduced')
        attributes = query.get('attributes')
        if raw['queryType'] == 'QUERY' or raw['queryType'] == 'DML':
            raw['parse_sql'] = sql_patten.parse_sql(raw['statement'])
            raw['thread_cpu_time_percentage'] = attributes.get('thread_cpu_time_percentage')
            raw['thread_network_receive_wait_time'] = attributes.get('thread_network_receive_wait_time')
            raw['thread_cpu_time'] = attributes.get('thread_cpu_time')
            raw['hdfs_average_scan_range'] = attributes.get('hdfs_average_scan_range')
            raw['bytes_streamed'] = attributes.get('bytes_streamed')
            raw['hdfs_bytes_read_short_circuit'] = attributes.get('hdfs_bytes_read_short_circuit')
            raw['hdfs_bytes_read_from_cache'] = attributes.get('hdfs_bytes_read_from_cache')
            raw['hdfs_bytes_read'] = attributes.get('hdfs_bytes_read')
            raw['hdfs_scanner_average_bytes_read_per_second'] = attributes.get(
                'hdfs_scanner_average_bytes_read_per_second')
            raw['thread_network_receive_wait_time_percentage'] = attributes.get(
                'thread_network_receive_wait_time_percentage')
            raw['memory_per_node_peak_node'] = attributes.get('memory_per_node_peak_node')
            raw['hdfs_bytes_read_remote'] = attributes.get('hdfs_bytes_read_remote')
            raw['estimated_per_node_peak_memory'] = attributes.get('estimated_per_node_peak_memory')
            raw['hdfs_bytes_read_local_percentage'] = attributes.get('hdfs_bytes_read_local_percentage')
            raw['hdfs_bytes_read_from_cache_percentage'] = attributes.get('hdfs_bytes_read_from_cache_percentage')
            raw['hdfs_bytes_read_remote_percentage'] = attributes.get('hdfs_bytes_read_remote_percentage')
            raw['hdfs_bytes_read_short_circuit_percentage'] = attributes.get('hdfs_bytes_read_short_circuit_percentage')
            raw['thread_total_time'] = attributes.get('thread_total_time')
            raw['thread_network_send_wait_time_percentage'] = attributes.get('thread_network_send_wait_time_percentage')
            raw['thread_storage_wait_time_percentage'] = attributes.get('thread_storage_wait_time_percentage')
            raw['hdfs_bytes_read_local'] = attributes.get('hdfs_bytes_read_local')
            raw['memory_per_node_peak'] = attributes.get('memory_per_node_peak')
            raw['memory_accrual'] = attributes.get('memory_accrual')
            raw['memory_aggregate_peak'] = attributes.get('memory_aggregate_peak')
            raw['thread_network_send_wait_time'] = attributes.get('thread_network_send_wait_time')
            raw['thread_storage_wait_time'] = attributes.get('thread_storage_wait_time')
            raw['pool'] = attributes.get('pool')
        elif raw['queryType'] == 'DDL':
            raw['ddl_type'] = attributes.get('ddl_type')
        raw['client_fetch_wait_time'] = attributes.get('client_fetch_wait_time')
        raw['file_formats'] = attributes.get('file_formats')
        raw['query_status'] = attributes.get('query_status')
        raw['admission_result'] = attributes.get('admission_result')
        raw['original_user'] = attributes.get('original_user')
        raw['session_id'] = attributes.get('session_id')
        raw['stats_corrupt'] = attributes.get('stats_corrupt')
        raw['oom'] = attributes.get('oom')
        raw['planning_wait_time_percentage'] = attributes.get('planning_wait_time_percentage')
        raw['connected_user'] = attributes.get('connected_user')
        raw['impala_version'] = attributes.get('impala_version')
        raw['stats_missing'] = attributes.get('stats_missing')
        raw['planning_wait_time'] = attributes.get('planning_wait_time')
        raw['client_fetch_wait_time_percentage'] = attributes.get('client_fetch_wait_time_percentage')
        raw['network_address'] = attributes.get('network_address')
        raw['session_type'] = attributes.get('session_type')
        raw['user'] = query.get('user')
        coordinator = query.get('coordinator')
        if coordinator is not None:
            hostId = coordinator.get('hostId')
            raw['hostId'] = map.get(hostId)
        raw['detailsAvailable'] = query.get('detailsAvailable')
        raw['database'] = query.get('database')
        raw['durationMillis'] = query.get('durationMillis')
        esdata.append(es.set_es_data(raw['queryId'], raw))
    finshtime = time.strftime("%m-%d %H:%M:%S", time.localtime())
    # Write the collected data to ES
    print(len(esdata))
    if len(esdata) != 0:
        bulklen, bulkstat = es.es_bulk_load(esdata)
        print("%s finish get impala-job %d, error: %s" % (finshtime, bulklen, bulkstat))


if __name__ == '__main__':
    impala_sql()
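Besides requests and the standard library, the script imports three small helper modules that are not included in this post: sql_patten (parses the SQL statement), count_exception_sql (handles EXCEPTION queries that never got an end time, presumably via the .../impalaQueries/{queryId}/cancel endpoint listed in section II), and isoToTimeStamp for the time conversion. As a reference, a minimal isoToTimeStamp.py could look like this (an assumed sketch; CM returns UTC times such as 2021-05-20T08:00:00.000Z, and the caller multiplies the result by 1000 to get milliseconds):
# isoToTimeStamp.py -- assumed minimal implementation of the helper used above
import calendar
from datetime import datetime

def iso2timestamp(iso_str):
    # Drop fractional seconds and the trailing "Z", then convert the UTC time
    # to a Unix timestamp in seconds.
    iso_str = iso_str.split('.')[0].rstrip('Z')
    dt = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%S")
    return calendar.timegm(dt.timetuple())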
2. Loading the data into ES
es.py
- Indexes are created per month, named like hadoop_impala_2021-05
- The index is created automatically from the mapping
- queryId is used as the document ID, so re-ingesting the same query does not produce duplicates
The file contents:
#coding=utf-8
from datetime import datetime, timedelta
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import time
# Current time
now = datetime.now()
# Formatted as year-month
dt = now.strftime("%Y-%m")
# Index name (one index per month)
index = "hadoop_impala_%s" % (dt)
# Current time minus 30 minutes
dtt = now + timedelta(minutes=-30)
# As a Unix timestamp
dtt = int(time.mktime(dtt.timetuple()))
# In milliseconds
dtt = dtt * 1000
# Initialize the ES client
es = Elasticsearch(
    ['127.0.0.1', '127.0.0.2', '127.0.0.3'],
    port=9200,
    sniff_on_start=True,
    sniff_on_connection_fail=True,
    sniffer_timeout=60
)
# Mapping used to create the index
mappings = {
"mappings": {
"event": {
"properties": {
"impala_version": {
"type": "text"
},
"memory_per_node_peak": {
"type": "double"
},
"memory_per_node_peak_node": {
"type": "keyword"
},
"memory_per_node_peak": {
"type": "double"
},
"memory_accrual": {
"type": "double"
},
"memory_aggregate_peak": {
"type": "double"
},
"stats_corrupt": {
"type": "boolean"
},
"planning_wait_time": {
"type": "integer"
},
"connected_user": {
"type": "keyword"
},
"network_address": {
"type": "text"
},
"hdfs_bytes_read_short_circuit": {
"type": "long"
},
"hdfs_bytes_read": {
"type": "long"
},
"client_fetch_wait_time": {
"type": "integer"
},
"hdfs_bytes_read_local": {
"type": "long"
},
"rowsProduced": {
"type": "integer"
},
"thread_storage_wait_time_percentage": {
"type": "integer"
},
"original_user": {
"type": "keyword"
},
"query_status": {
"type": "keyword"
},
"ddl_type": {
"type": "keyword"
},
"hdfs_bytes_read_local_percentage": {
"type": "integer"
},
"hdfs_bytes_read_from_cache": {
"type": "integer"
},
"thread_total_time": {
"type": "long"
},
"client_fetch_wait_time_percentage": {
"type": "integer"
},
"detailsAvailable": {
"type": "boolean"
},
"stats_missing": {
"type": "boolean"
},
"thread_storage_wait_time": {
"type": "long"
},
"thread_network_receive_wait_time_percentage": {
"type": "integer"
},
"pool": {
"type": "keyword"
},
"statement": {
"type": "keyword"
},
"statement_text": {
"type": "text"
},
"sql": {
"type": "keyword"
},
"database": {
"type": "keyword"
},
"table": {
"type": "keyword"
},
"durationMillis": {
"type": "integer"
},
"hdfs_average_scan_range": {
"type": "double"
},
"queryType": {
"type": "keyword"
},
"hdfs_bytes_read_from_cache_percentage": {
"type": "integer"
},
"hdfs_bytes_read_remote": {
"type": "double"
},
"hdfs_bytes_read_remote_percentage": {
"type": "integer"
},
"thread_cpu_time": {
"type": "integer"
},
"session_type": {
"type": "keyword"
},
"oom": {
"type": "boolean"
},
"planning_wait_time_percentage": {
"type": "integer"
},
"user": {
"type": "keyword"
},
"startTime": {
"type": "date"
},
"admission_result": {
"type": "keyword"
},
"bytes_streamed": {
"type": "double"
},
"thread_cpu_time_percentage": {
"type": "integer"
},
"thread_network_send_wait_time_percentage": {
"type": "integer"
},
"file_formats": {
"type": "keyword"
},
"estimated_per_node_peak_memory": {
"type": "long"
},
"database": {
"type": "keyword"
},
"session_id": {
"type": "text"
},
"thread_network_receive_wait_time": {
"type": "long"
},
"hdfs_scanner_average_bytes_read_per_second": {
"type": "double"
},
"queryId": {
"type": "text"
},
"thread_network_send_wait_time": {
"type": "long"
},
"hostId": {
"type": "keyword"
},
"queryState": {
"type": "keyword"
},
"endTime": {
"type": "date"
},
"hdfs_bytes_read_short_circuit_percentage": {
"type": "integer"
}
}
}
}
}
# Create the index if it does not exist yet
def index_is_exists(index):
    indexExist = es.indices.exists(index)
    if indexExist:
        print("index already exists")
    else:
        create_index(index)


# Create the ES index from the mapping above
def create_index(index):
    print("creating index")
    es.indices.create(index=index, body=mappings)
    print("index created")


# Build a bulk action; the Impala queryId is used as the ES document id
def set_es_data(queryId, data):
    raw = {
        "_index": index,
        "_type": "event",
        "_id": queryId,
        "_source": data
    }
    return raw


# Bulk-insert the documents
def es_bulk_load(datas):
    index_is_exists(index)
    res = helpers.bulk(es, datas)
    print(res)
    return res


if __name__ == '__main__':
    index_is_exists(index)
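After the first run, a quick way to confirm that documents actually landed in the monthly index is to count them with the same client (a small sanity-check sketch, reusing the es client and index variable defined above):
# Print how many Impala query documents are in this month's index
print(es.count(index=index)['count'])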
3. Scheduling the script
- Make it executable:
chmod 764 impala_sql.py
- Create a cron job:
vim /etc/cron.d/impala_sql
with the following content:
*/10 * * * * root cd /home/impala_job && flock -xn run.lock -c "./impala_sql.py" &>> run.log
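Note that for cron to execute ./impala_sql.py directly, the script's first line needs a Python interpreter shebang (not shown in the listing above), for example:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Alternatively, the cron command can invoke the interpreter explicitly, e.g. python impala_sql.py.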
With that in place, the data is written to ES on a schedule.
IV. Exploring the ES Data in Kibana
- Add the index pattern
- Use startTime as the time field
- After clicking Create, the index pattern is shown as follows:
- Kibana is now connected to the ES index
- A simple Kibana query
For example, find queries of type "QUERY" whose HDFS reads were 100% local. More Kibana query syntax can be found at https://blog.csdn.net/Mrerlou/article/details/116017240
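Against the mapping above, that search can be written in Lucene query syntax roughly as follows (a sketch; adjust the field names if your documents were indexed differently):
queryType:QUERY AND hdfs_bytes_read_local_percentage:100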
With that, Kibana is hooked up to ES as well.
V. Hooking Grafana up to ES
1. Add the data source
- Add an Elasticsearch data source
- Enter the ES URL
2. Create a dashboard
- Create the dashboard
- Rename the dashboard
- Set the name, folder, refresh interval, etc.
- Add some variables
The details are shown below:
3. Add monitoring panels
- Add a panel
- Write the queries (see the example after this list)
- Select the time variable created earlier
- Choose the chart type
- Configure the X and Y axes
- Fill in the title and description
- The finished panel renders successfully
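For instance, a panel tracking failed queries could use the Lucene query queryState:EXCEPTION with a Count metric and a Date Histogram on the startTime field (a sketch based on the mapping above; the exact panel options depend on the Grafana version).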
The overall dashboard looks like this:
That wraps up the Impala query monitoring setup.