运维自动化平台
1、项目背景
经过几年的发展,公司最早是人工发布工程代码,之后使用了jenkins,大部分工作只要新建一次脚本、nginx配置后就能比较自动地完成,但是回退依然使用人工处理的方式。自从公司业务发展起来以后,每个月2次的活动,都要上下主机,根据工程扩容。其中主要工作有新建ecs主机,初始化主机环境,添加dns解析,添加监控,添加jenkins发布脚本配置,配置项目发布模板等,这些给运维工作带来了许多重复而且非常容易出错的过程,效率也非常低下。同时活动结束后还要完成下线:删除nginx配置,删除jenkins发布脚本配置,删除dns解析,删除ecs等。一次活动扩容、缩容往往要耗费2个运维1天的时间,而且频繁的线上文本配置变更已经不止一次地出现人为事故。刚开始的时候为了降低工作疲劳度,甚至提前2天开始扩容,这样下来对公司业务基础设施的成本也有不小的开销。在这个时候开始思考使用可视化、平台化的运维去解决这样的问题,并且为后期公司技术人员的扩容做好运维支撑。(感谢公司前端大神路飞的给力支持,用react快速地搞出了一套界面)
2、平台功能
因为不是专业的开发和产品,整个平台的需求和后端设计都是由运维自己完成,大方向上有如下几个需求:新建ecs主机;新建工程;根据工程关联ecs主机形成发布调用;可以实时查看发布日志;批量和串行的发布支持;发布单的概念以便日后的发布权限审批。但是做着做着就发现细节的东西越来越多,比如说和前端的对接方式,代码变更日志等,处理这些小细节实际上耗费了不少时间。整个平台使用python django web框架开发完成。
3、流程图
ecs与工程:
工程构建与发布:
下发salt发布命令流程:
4、平台数据库设计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
class project_type(models.Model):
    """Project type names, used by the front end."""

    ProjectType = models.CharField(max_length=30, default=None, null=True)


class project_info(models.Model):
    """Project (application) information."""

    # NOTE(review): max_length on a TextField is, per the Django docs, not
    # enforced at the model/database level -- confirm this is intended.

    # Project name (unique).
    ProjectName = models.CharField(max_length=50, default=None, null=False, unique=True)
    # Name of the package produced by the Java build.
    PackageName = models.CharField(max_length=50, default=None, null=True)
    # Directory the project runs from.
    ProjectDir = models.TextField(max_length=300, default=None, null=True)
    # Name of the nginx upstream.
    UpsName = models.CharField(max_length=50, default=None, null=True)
    # Repository type: git or svn.
    Repertory = models.CharField(max_length=10, default=None, null=True)
    # Repository address.
    DeployAddress = models.TextField(max_length=300, default=None, null=True)
    # Branch.
    Branch = models.CharField(max_length=100, default=None, null=True)
    # Project type.
    ProjectType = models.CharField(max_length=30, default=None, null=True)
    # Creation time.
    CreationTime = models.IntegerField(default=None, null=True)
    # TCP port.
    Port = models.CharField(max_length=10, default=None, null=True)
    # Arguments for the Java (maven) packaging step.
    MavenArgs = models.CharField(max_length=100, default=None, null=True)
    # Status of the most recent build.
    LastBuildStatus = models.CharField(max_length=30, default=None, null=True)
    # Time of the most recent build.
    LastBuildTime = models.IntegerField(default=None, null=True)


class server_project_r(models.Model):
    """Project <-> ECS host relation: records that a host belongs to a project."""

    # Project name.
    ProjectName = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # ECS instance id.
    InstanceId = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # Creation time.
    CreationTime = models.IntegerField(default=None, null=True)


class server_info(models.Model):
    """ECS host information."""

    # ECS instance id (unique).
    InstanceId = models.CharField(max_length=50, default=None, null=False, unique=True)
    # ECS instance name.
    InstanceName = models.CharField(max_length=50, default=None, null=True, db_index=True)
    # Zone the instance lives in, e.g. Hangzhou B.
    ZoneId = models.CharField(max_length=50, default=None, null=True)
    # VPC network IP.
    PrivateIp = models.CharField(max_length=50, default=None, null=True)
    # Number of CPUs.
    Cpu = models.CharField(max_length=50, default=None, null=True)
    # Memory size.
    Memory = models.CharField(max_length=50, default=None, null=True)
    # OS type: linux or windows.
    OsType = models.CharField(max_length=50, default=None, null=True)
    # Billing type: prepaid or postpaid.
    PayType = models.CharField(max_length=50, default=None, null=True)
    # Instance status, e.g. running, stoped.
    Status = models.CharField(max_length=30, default=None, null=True)
    # Creation time.
    CreationTime = models.IntegerField(default=None, null=True)


class build_history(models.Model):
    """Project build history."""

    # Build id (unique).
    BuildId = models.CharField(max_length=50, default=None, null=False, unique=True)
    # Project name.
    ProjectName = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # Build status.
    Status = models.CharField(max_length=30, default=None, null=True)
    # Path of the build log.
    BuildLog = models.TextField(max_length=1000, default=None, null=True)
    # Creation date.
    CreationTime = models.IntegerField(default=None, null=True)
    # Free-form note.
    Note = models.TextField(max_length=1000, default=None, null=True)


class deploy_build_r(models.Model):
    """Build <-> deploy relation."""

    # Build id (indexed; uniqueness is not enforced on this table).
    BuildId = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # Deploy id (indexed; uniqueness is not enforced on this table).
    DeployId = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # Project name.
    ProjectName = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # Creation date.
    CreationTime = models.IntegerField(default=None, null=True)
    # Deploy status.
    Status = models.CharField(max_length=30, default=None, null=True)


class deploy_server(models.Model):
    """Per-host record of a deploy."""

    # Deploy id.
    DeployId = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # ECS instance id.
    InstanceId = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # VPC private IP.
    PrivateIp = models.CharField(max_length=50, default=None, null=True)
    # Project name.
    ProjectName = models.CharField(max_length=50, default=None, null=False, db_index=True)
    # Creation date.
    CreationTime = models.IntegerField(default=None, null=True)
    # Free-form note.
    Note = models.TextField(max_length=1000, default=None, null=True)
    # Deploy status.
    Status = models.CharField(max_length=30, default=None, null=True)


class ecs_type(models.Model):
    """ECS instance types, used by the front end."""

    TypeName = models.CharField(max_length=30, default=None, null=False)
    TypeId = models.CharField(max_length=50, default=None, null=False)


class vswitch(models.Model):
    """Aliyun network vswitch ids."""

    VswitchName = models.CharField(max_length=30, default=None, null=False)
    VswitchId = models.CharField(max_length=50, default=None, null=False)
5、核心业务代码实现
其中有用到的python库如下:svn,gitpython,json,django,salt,aliyunsdkcore,aliyunsdkecs,shutil,threadpool,time,datetime,subprocess,uuid,base64,jinja2,sqlalchemy等
下载git或者svn仓库代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
def getcode(Repertory=None, DeployAddress=None, buildProjectDir=None,
            Branch=None, BuildLog=None, ProjectName=None):
    """Fetch the project's source code and append its change log to the build log.

    Args:
        Repertory: repository type, "git" or "svn".
        DeployAddress: repository address.
        buildProjectDir: local directory to clone / check out into.
        Branch: branch to clone (git only).
        BuildLog: path of the build log file, opened in append mode.
        ProjectName: project whose ``LastBuildTime`` bounds the change log.

    Returns:
        True when the code was fetched, False when the fetch failed or the
        repository type is not supported.
    """
    project = project_info.objects.filter(ProjectName=ProjectName).get()
    LastBuildTime = project.LastBuildTime

    # The original test used `or`, which is always true; with no previous
    # build (None) the svn branch then crashed in utcfromtimestamp(None).
    has_previous_build = bool(LastBuildTime) and LastBuildTime != "null"
    header = "代码仓库日志如下,如果第一次构建或者无变更记录则为空:\n\n"

    if Repertory == "git":
        try:
            Repo.clone_from(DeployAddress, buildProjectDir, branch=Branch)
        except Exception as e:
            print(e)
            return False
        with open(BuildLog, "a") as codelog:
            codelog.write(header)
            if has_previous_build:
                since = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(LastBuildTime))
                log = Git(buildProjectDir).log("--since=" + since)
                codelog.write(log.encode("utf8"))
        return True

    if Repertory == "svn":
        try:
            svncmd = svn.remote.RemoteClient(DeployAddress,
                                             username="xxxxx", password="xxxxx")
            # The checkout is inside the try so that a failed checkout is
            # reported as False, the same way a failed git clone is.
            svncmd.checkout(buildProjectDir)
        except Exception as e:
            print(e)
            return False
        localsvn = svn.local.LocalClient(buildProjectDir,
                                         username="xxxxx", password="xxxxx")
        with open(BuildLog, "a") as codelog:
            codelog.write(header)
            if has_previous_build:
                # NOTE(review): the lower bound is built in UTC and the upper
                # bound in local time -- confirm which one svn expects.
                entries = localsvn.log_default(
                    timestamp_from_dt=datetime.datetime.utcfromtimestamp(LastBuildTime),
                    timestamp_to_dt=datetime.datetime.now())
                for entry in entries:
                    codelog.write(entry.author.encode("utf8") + " "
                                  + entry.msg.encode("utf8") + " "
                                  + entry.date.strftime("%Y-%m-%d %H:%M:%S") + "\n")
        return True

    # Unknown repository type: nothing was fetched, so report failure
    # explicitly instead of falling off the end and returning None.
    print("unsupported repository type: %s" % Repertory)
    return False
maven打包实现:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
def mavenProject(ProjectName=None, BuildId=None, BuildLog=None, Repertory=None,
                 DeployAddress=None, Branch=None, ProjectType=None,
                 MavenArgs=None, PackageName=None):
    """Check out a project, build it with maven and record the result.

    The source is fetched into a fresh ``/data/src/<ProjectName>`` directory,
    maven is run there, and its combined stdout/stderr is appended to
    ``BuildLog`` as it is produced. The outcome ("Success" / "Failed") is
    written through ``mavenWriteDb``; on success the artifact is also handed
    to ``backupMaven``.

    Args:
        ProjectName: project name; also the build directory name.
        BuildId: id of this build, passed to the result/backup helpers.
        BuildLog: path of the build log file.
        Repertory, DeployAddress, Branch: forwarded to ``getcode``.
        ProjectType, PackageName: forwarded to ``backupMaven``.
        MavenArgs: extra maven command-line arguments (may be None).
    """
    basedir = "/data/src/"
    buildProjectDir = basedir + ProjectName
    # NOTE(review): the command is run through the shell, so ProjectName and
    # MavenArgs must come from trusted input.
    cmd = ("cd " + buildProjectDir
           + ";/opt/apache-maven/bin/mvn -B -f pom.xml"
           + " -s /opt/apache-maven/conf/settings.xml"
           + " -gs /opt/apache-maven/conf/settings.xml "
           + (MavenArgs or ""))  # a project without extra arguments stores None
    print(cmd)

    # Always build from a clean directory.
    if os.path.exists(buildProjectDir):
        shutil.rmtree(buildProjectDir)
    os.makedirs(buildProjectDir)

    # Fetch the repository code.
    coderesult = getcode(Repertory=Repertory, DeployAddress=DeployAddress,
                         buildProjectDir=buildProjectDir, Branch=Branch,
                         BuildLog=BuildLog, ProjectName=ProjectName)
    if coderesult is False:
        mavenWriteDb(ProjectName=ProjectName, BuildId=BuildId, Result="Failed")
        # Without source there is nothing to build; the original fell through
        # and ran maven anyway, recording a second result.
        return

    # Build the project, streaming its output into the log file.
    with open(BuildLog, "a") as loggin:
        loggin.write("\n打包日志日志如下:\n\n")
    buildcommand = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)
    with open(BuildLog, "ab") as loggin:
        # Read until EOF rather than until poll() reports exit, so output
        # still buffered in the pipe when the process ends is not lost.
        for line in iter(buildcommand.stdout.readline, b""):
            loggin.write(line)
            loggin.flush()  # keep the log readable while the build runs
    buildcommand.stdout.close()
    buildcommand.wait()

    if buildcommand.returncode == 0:
        backupMaven(ProjectType=ProjectType, PackageName=PackageName,
                    buildProjectDir=buildProjectDir, BuildId=BuildId)
        mavenWriteDb(ProjectName=ProjectName, BuildId=BuildId, Result="Success")
    else:
        mavenWriteDb(ProjectName=ProjectName, BuildId=BuildId, Result="Failed")
saltstack调用方式:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
def SaltDepoly(PrivateIp=None, DeployId=None, ProjectName=None, ProjectType=None):
    """Run the salt deploy module for one host of a deploy.

    The host's ``deploy_server`` row is set to "InProcessing", then the salt
    function matching ``ProjectType`` is called on the minion targeted by
    ``PrivateIp`` with ``[ProjectName, DeployId]`` as arguments.

    Args:
        PrivateIp: private IP of the target host (salt ``ipcidr`` target).
        DeployId: id of the deploy this host belongs to.
        ProjectName: project being deployed.
        ProjectType: "tomcat" or "dubbo".

    Returns:
        True when salt returned a result, False when the project type is not
        supported, the salt call raised, or no minion answered.
    """
    deploy_functions = {
        "tomcat": "projectdeploy.deploy",
        "dubbo": "dubbodeploy.deploy",
    }

    record = deploy_server.objects.get(DeployId=DeployId, PrivateIp=PrivateIp)
    record.Status = "InProcessing"
    record.save()

    def mark_failed():
        # Only rows still "InProcessing" are flipped, so a status already
        # written for this host by another writer is not overwritten.
        deploy_server.objects.filter(
            DeployId=DeployId, PrivateIp=PrivateIp, Status="InProcessing"
        ).update(Status="Failed")

    fun = deploy_functions.get(ProjectType)
    if fun is None:
        # The original only failed here through an accidental NameError on
        # `result`, and left the row "InProcessing".
        print("unsupported project type: %s" % ProjectType)
        mark_failed()
        return False

    config_file_path = "/etc/salt/master"
    projectargs = [ProjectName, DeployId]
    try:
        print(projectargs)
        client = salt.client.LocalClient(config_file_path)
        result = client.cmd(tgt=PrivateIp, tgt_type="ipcidr", fun=fun,
                            arg=projectargs, timeout=600)
        print(result)
    except Exception as e:
        print(e)
        mark_failed()
        return False

    if not result:
        # No minion answered, so nothing on the host will ever update the
        # row; without this it would stay "InProcessing".
        mark_failed()
        return False
    return True
发布项目控制实现部分:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
if len (IpList) > 3 : #并行发布 print "主机大于3台,开始第一轮发布" pool = threadpool.ThreadPool( len (IpList)) args_list = list () for i in range ( 0 , len (IpList) / 2 ): args_tup = ([IpList[i], DeployId, ProjectName, ProjectType], None ) args_list.append(args_tup) request = threadpool.makeRequests(SaltDepoly, args_list) [pool.putRequest(req) for req in request] pool.wait() #检测这个DeployId中是否有fail的主机,如果有报错并且退出 if GodeployCheck(DeployId = DeployId) ! = True : GoDeployStatus(DeployId = DeployId, Action = "close" ) print "发布过程中有部分主机失败,退出发布主进程" return elif GodeployCheck(DeployId = DeployId) = = True : GoDeployStatus(DeployId = DeployId, Action = "close" ) print "开始第二轮发布" pool = threadpool.ThreadPool( len (IpList)) args_list = list () for i in range ( len (IpList) / 2 , len (IpList)): args_tup = ([IpList[i], DeployId, ProjectName, ProjectType], None ) args_list.append(args_tup) request = threadpool.makeRequests(SaltDepoly, args_list) [pool.putRequest(req) for req in request] pool.wait() #检测这个DeployId中是否有fail的主机,如果有报错并且退出 if GodeployCheck(DeployId = DeployId) ! = True : GoDeployStatus(DeployId = DeployId, Action = "close" ) print "发布过程中有部分主机失败,退出发布主进程" return elif GodeployCheck(DeployId = DeployId) = = True : GoDeployStatus(DeployId = DeployId, Action = "close" ) print "全部发布成功" else : #串行发布 for i in range ( 0 , len (IpList)): ip = IpList[i] print ip SaltDepoly(ip, DeployId, ProjectName, ProjectType) if GodeployCheck(DeployId = DeployId) ! = True : GoDeployStatus(DeployId = DeployId, Action = "close" ) print "发布过程中有部分主机失败,退出发布主进程" return elif GodeployCheck(DeployId = DeployId) = = True : GoDeployStatus(DeployId = DeployId, Action = "close" ) print "一台发布成功" |
jwt检测,装饰器实现:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
def CheckToken(func):
    """Decorator for Django views: reject requests without a valid JWT cookie.

    The wrapped view's first positional argument must be the request. The
    ``token`` cookie is decoded with the shared secret and must contain
    ``userInfo.id``; otherwise a JSON body with ``isSuccess: 0`` is returned
    and the view is not called.
    """
    @wraps(func)
    def Check(*keys, **kw):
        request = keys[0]
        # `in` replaces dict.has_key(), which only exists on Python 2.
        if "token" not in request.COOKIES:
            print("没有token,禁止访问")
            result = json.dumps({"isSuccess": 0, "message": ""})
            return HttpResponse(result, content_type="application/json")

        secret_key = "xxxxxx"
        tokenstring = request.COOKIES.get("token", "")
        try:
            # The lookup doubles as validation: a token without userInfo.id
            # is rejected like an undecodable one.
            # NOTE(review): newer PyJWT releases require an explicit
            # `algorithms=[...]` argument here -- confirm the signing
            # algorithm and pass it.
            jwt.decode(tokenstring, key=secret_key)["userInfo"]["id"]
        except Exception as e:
            print(e)
            print("Token不正确,禁止访问")
            result = json.dumps({"isSuccess": 0, "message": "接口不支持单独调用"})
            return HttpResponse(result, content_type="application/json")
        return func(*keys, **kw)
    return Check


# Using the decorator in Django.
@CheckToken
def webEcsList(request):
    """ECS listing endpoint: return one page of instances as JSON (GET only)."""
    if request.method != "GET":
        # The original returned None for other methods, which Django turns
        # into a server error; answer with an explicit 405 instead.
        result = json.dumps({"isSuccess": 0, "message": "Method not allowed"})
        return HttpResponse(result, content_type="application/json", status=405)

    PageSize = request.GET.get("PageSize", None)
    PageNumber = request.GET.get("PageNumber", None)
    aliyun = request.GET.get("aliyun", None)
    result = getDescribeInstances(PageNumber=PageNumber, PageSize=PageSize,
                                  aliyun=aliyun)
    return HttpResponse(result, content_type="application/json")
6、salt扩展模块
用过salt的人应该知道,salt在执行起来比ansible要快,而且配置方法比chef要舒服很多。但是有个比较致命的缺点,就是sdk调用的时候只要内部的python代码执行不出错,salt就无法明确地告诉你本次调用是否真的达到了你想要的结果,所以我采取了一个思路:用salt模块去检测每台ecs的发布结果然后落库。
其实整个平台的核心不止是http的调用和显示,salt这边也非常重要和复杂。其中要先写最基础的发布bash脚本给salt调用,然后判断project类型,如果是tomcat项目还要用jinja2渲染配置文件。最终判断结果落库,下面是一些核心的代码。其中落库使用了sqlalchemy这个非常著名的python orm。
渲染配置:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
def rendering_argv(self):
    """Create the project's tomcat directory from the template on first deploy.

    Does nothing when ``self.ProjectDir`` already exists. Otherwise
    ``/data/tomcat_template`` is copied to ``self.ProjectDir`` and two files
    are rendered with jinja2 from the template directory:

    * ``conf/server.xml`` with the http, shutdown, ajp and redirect ports
      (``Port``, ``Port + 1``, ``Port + 2``, ``Port + 3``);
    * ``bin/catalina.sh`` with the jmx port (``Port + 4``).

    Raises:
        subprocess.CalledProcessError: when copying the template fails.
    """
    if os.path.exists(self.ProjectDir):
        return

    import subprocess

    # An argv list avoids building a shell string from ProjectDir, and
    # check_call stops here on a failed copy instead of failing later while
    # writing into a half-created directory.
    subprocess.check_call(["cp", "-rp", "/data/tomcat_template", self.ProjectDir])

    port = int(self.Port)

    def render(template_dir, template_name, target, **context):
        # Render one template from the shared template tree into the project.
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
        rendered = env.get_template(template_name).render(**context)
        with open(self.ProjectDir + target, "w") as f:
            f.write(rendered)

    render("/data/tomcat_template/conf", "server_template.xml", "/conf/server.xml",
           http_port=port, shutdown_port=port + 1,
           ajp_port=port + 2, redirect_port=port + 3)
    render("/data/tomcat_template/bin", "catalina_template.sh", "/bin/catalina.sh",
           jmx_port=port + 4)
启动、停止业务程序:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
def _record_status(self, status):
    """Write the deploy status and the current time to this host's deploy_server row."""
    rows = self.db.sessiondb.query(deploy_server).filter(
        and_(deploy_server.DeployId == self.DeployId,
             deploy_server.PrivateIp == self.myaddr))
    rows.update({deploy_server.Status: status,
                 deploy_server.CreationTime: int(time.time())})
    self.db.sessiondb.commit()


def stopApp(self, ProjectDir=None):
    """Stop the application and empty its ``webapps`` directory.

    Runs ``<self.ProjectDir>/bin/stop.sh``. When the script fails, "Failed"
    is recorded for this host and the process exits with status 1; otherwise
    every non-hidden entry under ``webapps`` is removed.

    Args:
        ProjectDir: unused; kept for interface compatibility.

    Raises:
        SystemExit: when ``stop.sh`` returns a non-zero status.
    """
    import glob
    import shutil
    import subprocess

    stoppath = self.ProjectDir + "/bin/stop.sh"
    if subprocess.call(["sh", stoppath]) != 0:
        # Record the failure before exiting; the original exited without
        # writing any status for this host.
        self._record_status("Failed")
        raise SystemExit(1)

    # The original ran `cd <dir>;rm -rf webapps/*` through the shell, which
    # deletes ./webapps/* of the current directory if the cd fails. Removing
    # by absolute path cannot hit another directory. Like `rm -rf`, the
    # clean-up is best effort.
    for entry in glob.glob(os.path.join(self.ProjectDir, "webapps", "*")):
        if os.path.isdir(entry) and not os.path.islink(entry):
            shutil.rmtree(entry, ignore_errors=True)
        else:
            try:
                os.remove(entry)
            except OSError as e:
                print(e)


def startApp(self, ProjectDir=None):
    """Fetch the package, start the application and record the outcome.

    Calls ``self.getPackage()``, then runs ``<self.ProjectDir>/bin/start.sh``
    with ``self.myaddr`` as its argument. "Success" or "Failed" is written to
    this host's ``deploy_server`` row; on failure the process exits with
    status 1.

    Args:
        ProjectDir: unused; kept for interface compatibility.

    Raises:
        SystemExit: when ``start.sh`` returns a non-zero status.
    """
    import subprocess

    self.getPackage()
    startpath = self.ProjectDir + "/bin/start.sh"
    if subprocess.call(["sh", startpath, self.myaddr]) != 0:
        print("发布失败")
        self._record_status("Failed")
        raise SystemExit(1)

    self._record_status("Success")
    print("发布成功")
7、效果图
工程界面
ecs界面
工程详情
构建界面
发布界面
8、结束语
这套平台上线后,日常耗费半天甚至一天时间的扩容和缩容,基本上15分钟就能搞定。当然其中还是有很多不足和需要改进的地方,后续还会慢慢优化。