使用bloom 加速sqler + gitbase 的代码统计分析情况
我们基于gitbase 暴露的mysql 服务,可以方便地查询数据,但是如果是需要长时间计算的查询就不太好了
其实我们可以通过bloom以配置的方式解决这个问题,以下是一个实践以及一些问题的解决方法
环境准备
- docker-compose 文件
# docker-compose stack: openresty (lb) -> bloom (cache) -> sqler (REST) -> gitbase (git SQL)
# NOTE(review): the original paste had lost all YAML indentation, which makes
# the file invalid; nesting restored below with the standard 2-space indent.
version: "3"
services:
  lb:
    image: openresty/openresty:alpine
    volumes:
      - "./nginx-lb.conf:/usr/local/openresty/nginx/conf/nginx.conf"
    ports:
      - "9000:80"
  bloom:
    image: dalongrong/bloom:v1.28.0
    volumes:
      - "./bloom.cfg:/etc/bloom.cfg"
    ports:
      - "8811:8811"   # bloom control port
      - "9001:8080"   # bloom caching-proxy port (nginx proxies to 8080 internally)
  redis:
    image: redis
    ports:
      - "6379:6379"
  gitbase:
    container_name: gitbase
    hostname: gitbase
    image: srcd/gitbase:v0.24.0-rc2
    volumes:
      - "./git-demos:/opt/repos"   # git repositories to analyze
    ports:
      - "3306:3306"
  sqler:
    image: dalongrong/sqler:2.2-pprof
    volumes:
      - "./sqler.hcl:/app/config.example.hcl"
    environment:
      - "DSN=root@tcp(gitbase:3306)/gitbase?multiStatements=true"
    ports:
      - "3678:3678"
      - "8025:8025"   # sqler REST port that bloom's shard points at
- 运行原理
如下图:openresty进行api 的代理(自动添加header头以及proxy,bloom需要),bloom cache 请求api,sqler 暴露rest api
gitbase 提供git sql 引擎服务
配置说明
- openresty
# nginx-lb.conf — openresty front for bloom: injects the Bloom-Request-Shard
# header that bloom requires, and proxies the cacheable API paths to it.
#
# Fixes vs the original paste:
#  - /repos and /apps had an HTML anchor tag (<a href="...">...</a>) pasted
#    around the proxy_pass URL — a hard nginx syntax error; removed.
#  - C-style "//" comments replaced with nginx's "#" comment syntax.
worker_processes 1;
user root;

events {
    worker_connections 1024;
}

http {
    include mime.types;
    default_type application/octet-stream;
    lua_need_request_body on;
    gzip on;

    # Docker's embedded DNS so the "bloom" service name resolves at runtime
    resolver 127.0.0.11 ipv6=off;

    real_ip_header X-Forwarded-For;
    real_ip_recursive on;

    server {
        listen 80;
        charset utf-8;
        default_type text/html;

        location / {
            default_type text/plain;
            index index.html;
        }

        location /codecounts {
            proxy_pass http://bloom:8080;
            # core: bloom routes the request to a shard via this header
            proxy_set_header Bloom-Request-Shard 0;
            proxy_set_header Host $host;
            proxy_read_timeout 10000;
            proxy_send_timeout 10000;
            proxy_buffer_size 1M;
            proxy_buffers 8 1M;
            proxy_busy_buffers_size 1M;
            proxy_temp_file_write_size 1M;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $server_name;
        }

        location /repos {
            proxy_pass http://bloom:8080;
            # core: bloom routes the request to a shard via this header
            proxy_set_header Bloom-Request-Shard 0;
            proxy_set_header Host $host;
            proxy_read_timeout 10000;
            proxy_send_timeout 10000;
            proxy_buffer_size 1M;
            proxy_buffers 8 1M;
            proxy_busy_buffers_size 1M;
            proxy_temp_file_write_size 1M;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $server_name;
        }

        location /apps {
            proxy_pass http://bloom:8080;
            # core: bloom routes the request to a shard via this header
            proxy_set_header Bloom-Request-Shard 0;
            proxy_set_header Host $host;
            proxy_read_timeout 10000;
            proxy_send_timeout 10000;
            proxy_buffer_size 1M;
            proxy_buffers 8 1M;
            proxy_busy_buffers_size 1M;
            proxy_temp_file_write_size 1M;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $server_name;
        }

        error_page 500 502 503 504 /50x.html;
        location = /50x.html {
            root html;
        }
    }
}
- bloom
基于toml 的配置
# bloom.cfg — bloom HTTP cache sitting between openresty and sqler.

# Caching-proxy listener; nginx's proxy_pass targets this port.
[server]
log_level = "debug"
inet = "0.0.0.0:8080"

# Control channel (e.g. cache purge commands), published as 8811 in compose.
[control]
inet = "0.0.0.0:8811"
tcp_timeout = 900

[proxy]

# shard 0 must match the "Bloom-Request-Shard 0" header nginx injects;
# the upstream is the sqler REST endpoint (sqler:8025).
[[proxy.shard]]
shard = 0
host = "sqler"
port = 8025

# Cached responses are stored in redis for ttl_default seconds by default.
[cache]
ttl_default = 600
executor_pool = 64
disable_read = false
disable_write = false
compress_body = true

# Redis connection pool used as the cache backing store.
[redis]
host = "redis"
port = 6379
database = 0
pool_size = 80
max_lifetime_seconds = 60
idle_timeout_seconds = 600
connection_timeout_seconds = 1
max_key_size = 256000
max_key_expiration = 2592000
- sqler 代码统计查询
# codecounts: per-repository, per-day added/removed line counts, computed
# from gitbase's COMMIT_STATS() UDF over all commits reachable from refs.
#
# Fixes vs the original:
#  - SUM(expr AS alias) is invalid SQL (the alias was inside the function
#    call); rewritten as SUM(expr) AS alias.
#  - ORDER BY put MONTH before YEAR, which interleaves different years;
#    reordered to YEAR, MONTH, DAY.
codecounts {
  exec = <<SQL
SET inmemory_joins = 1;
SET SQL_SELECT_LIMIT = 200;
SELECT
  repo,
  MONTH,
  YEAR,
  DAY,
  SUM(JSON_EXTRACT(info, '$.Code.Additions')) AS code_lines_added,
  SUM(JSON_EXTRACT(info, '$.Code.Deletions')) AS code_lines_removed
FROM (
  SELECT
    repository_id AS repo,
    commit_stats(repository_id, commit_hash) AS info,
    commits.commit_author_when AS commit_when,
    YEAR(committer_when) AS YEAR,
    MONTH(committer_when) AS MONTH,
    DAY(committer_when) AS DAY
  FROM ref_commits
  NATURAL JOIN commits
) a
GROUP BY repo, YEAR, MONTH, DAY
ORDER BY YEAR, MONTH, DAY
LIMIT 500;
SQL
}

cache {
  # NOTE(review): "* 1 * * *" fires every minute during the 01:00 hour;
  # "0 1 * * *" (once a day at 01:00) was probably intended — confirm.
  cron = "* 1 * * *"
  trigger {
    webhook = "http://lb/codecounts"
  }
}
# apps: trivial macro (returns the constant row 1); useful for smoke-testing
# the lb -> bloom -> sqler request chain end to end.
apps {
exec = <<SQL
SET SQL_SELECT_LIMIT=200;
SET inmemory_joins = 1;
select 1;
SQL
}
# repos: per-repository, per-language line counts (code / comment / blank)
# using gitbase's LANGUAGE() and LOC() UDFs, restricted to the HEAD ref of
# each repository so each file is counted once.
repos {
exec = <<SQL
SET SQL_SELECT_LIMIT=500;
SELECT
repository_id,
LANGUAGE(file_path, blob_content) as lang,
SUM(JSON_EXTRACT(LOC(file_path, blob_content), '$.Code')) as code,
SUM(JSON_EXTRACT(LOC(file_path, blob_content), '$.Comment')) as comments,
SUM(JSON_EXTRACT(LOC(file_path, blob_content), '$.Blank')) as blanks,
COUNT(1) as files
FROM refs
NATURAL JOIN commit_files
NATURAL JOIN blobs
WHERE ref_name='HEAD'
GROUP BY lang,repository_id;
SQL
}
几个问题
- gzip 问题
因为bloom是不处理数据解压缩的,但是sqler默认gzip了,所以使用中会有问题,解决方法,源码编译,禁用gzip部分
参考
server_rest.go 注意我同时添加了pprof,方便分析性能问题
// Copyright 2018 The SQLer Authors. All rights reserved.
// Use of this source code is governed by a Apache 2.0
// license that can be found in the LICENSE file.
package main
import (
"net/http"
_ "net/http/pprof"
"strings"
"github.com/labstack/echo"
"github.com/labstack/echo/middleware"
)
// initRESTServer builds the echo HTTP server that exposes each configured
// macro as a REST endpoint (/:macro) and blocks serving on
// *flagRESTListenAddr.
//
// Patch vs upstream sqler: the Gzip middleware is disabled (bloom caches
// response bodies verbatim and does not decompress them) and net/http/pprof
// handlers are exposed under /debug/pprof/ for performance analysis.
func initRESTServer() error {
e := echo.New()
e.HideBanner = true
e.HidePort = true
e.Pre(middleware.RemoveTrailingSlash())
e.Use(middleware.CORS())
// gzip intentionally disabled — see note above about bloom and compression
// e.Use(middleware.GzipWithConfig(middleware.GzipConfig{Level: 9}))
e.Use(middleware.Recover())
e.GET("/", routeIndex)
e.Any("/:macro", routeExecMacro, middlewareAuthorize)
// pprof handlers were registered on http.DefaultServeMux by the blank
// import of net/http/pprof; wrap that mux so echo can serve them
e.GET("/debug/pprof/*", echo.WrapHandler(http.DefaultServeMux))
return e.Start(*flagRESTListenAddr)
}
- 大量代码走sqler查询很慢
这个问题应该是gitbase 的实现问题:用ide工具直接查询测试很快,但是走sqler就是很慢。后来发现如果我们限制了
查询的数据量(分页)就快了,实际推荐还是走分页方式处理,如果结果集大小是可估算的,可以通过
SET SQL_SELECT_LIMIT=200;
解决
参考资料
https://github.com/rongfengliang/bloom-sqler-gitbase
https://github.com/alash3al/sqler
https://github.com/valeriansaliou/bloom