Greenplum Python专用库gppylib学习——GpArray
gparray.py依赖的python包(datetime、copy、traceback、os),依赖的gp包(gplog、utils、db、gpversion、commands.unix)
1 from datetime import date 2 import copy 3 import traceback 4 from gppylib.utils import checkNotNone, checkIsInt 5 from gppylib import gplog 6 from gppylib.db import dbconn 7 from gppylib.gpversion import GpVersion 8 from gppylib.commands.unix import * 9 import os
代码分析
QD(Query Dispatcher)包含master和standby master,QE(Query Executor)包含primary和mirror。每个postgres数据库实例的信息使用GpDB对象表示。Segment对象代表primaryDB和其对应的零个、一个或多个mirrorDB。GpArray对象就是master、standbyMaster和多个Segment对象的组合。
GpDB类
GpDB类保存单个dbid所指的postgres数据库实例的配置信息。其余成员都很好理解,这里只说明__filespaces成员:它是一个字典,key为filespace的oid,value为该filespace在本实例上对应的文件目录路径。构造函数用{SYSTEM_FILESPACE: datadir}初始化该字典,因此GpDB类构造函数的datadir就是SYSTEM_FILESPACE(oid为3052)所处的文件路径。
1 class GpDB: 2 def __init__(self, content, preferred_role, dbid, role, mode, status, hostname, address, port, datadir, replicationPort): 3 self.content=content 4 self.preferred_role=preferred_role 5 self.dbid=dbid 6 self.role=role 7 self.mode=mode 8 self.status=status 9 self.hostname=hostname 10 self.address=address 11 self.port=port 12 self.datadir=datadir 13 self.replicationPort=replicationPort 14 # Filespace mappings for this segment 15 self.__filespaces = { SYSTEM_FILESPACE: datadir } # SYSTEM_FILESPACE oid of the system filespace 3052 16 # Pending filespace creation 17 self.__pending_filespace = None 18 # Catalog directory for each database in this segment 19 self.catdirs = None 20 # Todo: Remove old dead code 21 self.valid = (status == 'u') 22 def __str__(self): # 构造GpDB类可打印的字符串表示 23 def __repr__(self): 24 fsOids = [oid for oid in self.__filespaces] # 取出__filespaces中所有的key,及数据库对象对应的oid 25 fsOids.sort() # sort for determinism 26 filespaces = [] 27 for fsoid in fsOids: 28 if fsoid not in [SYSTEM_FILESPACE]: 29 filespaces.append("%d:%s" % (fsoid, self.__filespaces[fsoid])) # 以oid:datadir字符串为item,放入filespaces 30 return '%d|%d|%s|%s|%s|%s|%s|%s|%d|%s|%s|%s|%s' % (self.dbid,self.content,self.role,self.preferred_role,self.mode,self.status,self.hostname,self.address,self.port,self.replicationPort,self.datadir,','.join(filespaces),','.join(self.catdirs) if self.catdirs else []) 31 def __cmp__(self,other): # 使用__reper__函数序列化GpDB对象,并进行比较 32 def equalIgnoringModeAndStatusAndReplicationPort(self, other): # 如果核心属性(比如filespace)都相同则返回true,该方法在updateSystemConfig函数调用(在移除mirror segment或再添加mirror segmnet时会造成catalog改变) 33 def copy(self): 34 def isSegmentQD(self): 35 def isSegmentMaster(self, current_role=False): 36 ... 37 def isSegmentModeInResynchronization(self): 38 def getSegmentDbId(self): 39 def getSegmentContentId(self): 40 ... 41 def getSegmentFilespaces(self): 42 def setSegmentDbId(self, dbId): 43 def setSegmentContentId(self, contentId): 44 ... 
45 def setSegmentDataDirectory(self, dataDirectory): 46 def addSegmentFilespace(self, oid, path): 47 def getSegmentPendingFilespace(self): 48 @staticmethod 49 def getDataDirPrefix(datadir): 50 retValue = "" 51 retValue = datadir[:datadir.rfind('/')] 52 return retValue
成员函数createTemplate根据GpDB的信息创建模板。第一步确保dstDir有足够的空间存放segment和其filespace(遍历__filespaces中存放的oid和目录路径,用DiskUsage累加各目录所占用的空间大小);第二步获取磁盘空闲空间(DiskFree.get_size_local(name = "Check for available free space for segment template", directory = dstDir)),空闲空间不足时抛出异常;第三步使用LocalDirCopy类对象将segment数据目录拷贝到目标目录dstDir;第四步先判别__filespaces中除了SYSTEM_FILESPACE(oid为3052)之外是否还有其他filespace,如果有,先检查dstDir + "/fs_directory"目录是否存在,再依次为__filespaces中存放的每个oid在该目录下创建子目录,并把对应目录下的文件拷贝过去;第五步,删除目标路径下的gp_dbid文件(dstDir + '/gp_dbid'),对dstDir设置0700权限。
def createTemplate(self, dstDir):
    """
    Create a segment template in dstDir from the information in this GpDB.

    Steps, in order: check that dstDir has enough free space, copy the
    segment data directory into dstDir, copy every filespace directory
    under dstDir/DESTINATION_FILE_SPACES_DIRECTORY (only when more than
    one filespace is registered), remove the copied gp_dbid file, and
    set the template permissions to 0700.

    dstDir -- destination directory that receives the template.

    Raises Exception when the free space reported for dstDir is not
    strictly greater than the computed required size.
    """
    # Required size = usage reported for dstDir + usage of every
    # non-system filespace directory.
    # NOTE(review): the command is named "srcDir" but it measures dstDir;
    # confirm whether the source data directory was intended.
    usageCmd = DiskUsage(name = "srcDir", directory = dstDir)
    usageCmd.run(validateAfter=True)
    requiredSize = usageCmd.get_bytes_used()

    for fsOid, fsPath in self.__filespaces.items():
        if fsOid != SYSTEM_FILESPACE:
            usageCmd = DiskUsage("segcopy filespace get_size", fsPath)
            usageCmd.run(validateAfter=True)
            requiredSize += usageCmd.get_bytes_used()

    dstBytesAvail = DiskFree.get_size_local(name = "Check for available free space for segment template", directory = dstDir)
    if not dstBytesAvail > requiredSize:
        raise Exception("Not enough space on directory: '%s'. Currently %d bytes free but need %d bytes." % (dstDir, int(dstBytesAvail), int(requiredSize)))

    logger.info("Starting copy of segment dbid %d to location %s" % (int(self.getSegmentDbId()), dstDir))

    # Copy the system data directory itself.
    copyCmd = LocalDirCopy("Copy system data directory", self.getSegmentDataDirectory(), dstDir)
    copyCmd.run(validateAfter = True)
    res = copyCmd.get_results()

    hasExtraFilespaces = len(self.__filespaces) > 1
    if hasExtraFilespaces:
        # Make directory to hold file spaces
        fullPathFsDir = "/".join([dstDir, DESTINATION_FILE_SPACES_DIRECTORY])
        existsCmd = FileDirExists( name = "check for existance of template filespace directory", directory = fullPathFsDir)
        existsCmd.run(validateAfter = True)
        MakeDirectory.local("gpexpand make directory to hold file spaces", fullPathFsDir)

        # Every registered filespace (the loop does not skip the system
        # one) is copied into its own <fullPathFsDir>/<oid> directory.
        for fsOid, fsPath in self.__filespaces.items():
            MakeDirectory.local("gpexpand make directory to hold file space oid: " + str(fsOid), fullPathFsDir)
            destDir = fullPathFsDir + "/" + str(fsOid)
            MakeDirectory.local("gpexpand make directory to hold file space: " + destDir, destDir)
            copyCmd = LocalDirCopy("GpSegCopy %s to %s" % (fsPath, destDir), fsPath, destDir)
            copyCmd.run(validateAfter = True)
            res = copyCmd.get_results()

    # Remove the gp_dbid file from the data dir
    dbidFile = os.path.normpath(dstDir + '/gp_dbid')
    RemoveFile.local('Remove gp_dbid file', dbidFile)
    logger.info("Cleaning up catalog for schema only copy on destination")

    # We need 700 permissions or postgres won't start
    Chmod.local('set template permissions', dstDir, '0700')
静态成员函数initFromString(s)为工厂函数,从字符串中初始化GpDB对象,该字符串和repr()输出兼容。
1 @staticmethod 2 def initFromString(s): 3 tup = s.strip().split('|') 4 # Old format: 8 fields Todo: remove the need for this, or rework it to be cleaner 5 if len(tup) == 8: 6 # This describes the gp_configuration catalog (pre 3.4) 7 content = int(tup[0]) 8 ... 9 datadir = tup[7] 10 # Calculate new fields from old ones 11 # Note: this should be kept in sync with the code in 12 # GpArray.InitFromCatalog() code for initializing old catalog 13 # formats. 14 preferred_role = ROLE_PRIMARY if definedprimary else ROLE_MIRROR 15 role = ROLE_PRIMARY if isprimary else ROLE_MIRROR 16 hostname = None 17 mode = MODE_SYNCHRONIZED # ??? 18 status = STATUS_UP if valid else STATUS_DOWN 19 replicationPort = None 20 filespaces = "" 21 catdirs = "" 22 # Catalog 3.4 format: 12 fields 23 elif len(tup) == 12: 24 # This describes the gp_segment_configuration catalog (3.4) 25 dbid = int(tup[0]) 26 ... 27 catdirs = "" 28 # Catalog 4.0+: 13 fields 29 elif len(tup) == 13: 30 # This describes the gp_segment_configuration catalog (3.4+) 31 dbid = int(tup[0]) 32 ... 33 catdirs = tup[12] 34 else: 35 raise Exception("GpDB unknown input format: %s" % s) 36 # Initialize segment without filespace information 37 gpdb = GpDB(content=content,preferred_role=preferred_role,dbid=dbid,role=role,mode=mode,status=status,hostname=hostname,address=address,port=port,datadir=datadir,replicationPort=replicationPort) 38 # Add in filespace information, if present 39 for fs in filespaces.split(","): 40 if fs == "": 41 continue 42 (fsoid, fselocation) = fs.split(":") 43 gpdb.addSegmentFilespace(fsoid, fselocation) 44 # Add Catalog Dir, if present 45 gpdb.catdirs = [] 46 for d in catdirs.split(","): 47 if d == "": 48 continue 49 gpdb.catdirs.append(d) 50 # Return the completed segment 51 return gpdb
Segment类
Segment类代表相同contentID的SegmentDBs,目前至多一个primary SegDB和单个mirror SegDB,在后续版本中会支持多mirror SegDB。
1 class Segment: 2 primaryDB=None #primary (GpDB实例) 3 mirrorDBs =None 4 def __init__(self): 5 self.mirrorDBs = [] #mirror (GpDB实例) 6 pass 7 def addPrimary(self,segDB) #设置primary 8 def addMirror(self,segDB) #追加mirror 9 def get_dbs(self) #返回Primary和Mirror实例组成的列表(GpDB实例列表) 10 def get_hosts(self) #返回Primary和Mirror所在主机的主机名的列表 11 def is_segment_pair_valid(self): 12 """Validates that the primary/mirror pair are in a valid state""" 13 for mirror_db in self.mirrorDBs: 14 prim_status = self.primaryDB.getSegmentStatus() 15 prim_mode = self.primaryDB.getSegmentMode() 16 mirror_status = mirror_db.getSegmentStatus() 17 mirror_role = mirror_db.getSegmentMode() 18 if (prim_status, prim_mode, mirror_status, mirror_role) not in VALID_SEGMENT_STATES: 19 return False 20 return True
primary和mirror对的合法状态如下,各个字段含义如下:primaryDB.getSegmentStatus、primaryDB.getSegmentMode、mirror_db.getSegmentStatus、mirror_db.getSegmentMode。
# Allowed state combinations for a primary/mirror pair. Each tuple is
# (primary status, primary mode, mirror status, mirror mode), the same
# order in which is_segment_pair_valid() builds its lookup key.
VALID_SEGMENT_STATES = [
(STATUS_UP, MODE_CHANGELOGGING, STATUS_DOWN, MODE_SYNCHRONIZED),
(STATUS_UP, MODE_CHANGELOGGING, STATUS_DOWN, MODE_RESYNCHRONIZATION),
(STATUS_UP, MODE_RESYNCHRONIZATION, STATUS_UP, MODE_RESYNCHRONIZATION),
(STATUS_UP, MODE_SYNCHRONIZED, STATUS_UP, MODE_SYNCHRONIZED)
]
- primaryDB状态为up,模式为CHANGELOGGING,mirrorDB状态为down,模式可以为SYNCHRONIZED、RESYNCHRONIZATION
- primaryDB状态为up,模式为RESYNCHRONIZATION,mirrorDB状态为up,模式为RESYNCHRONIZATION
- primaryDB状态为up,模式为SYNCHRONIZED,mirrorDB状态为up,模式为SYNCHRONIZED
如果要返回primaryDB的主机名,可使用segment1.primaryDB.getSegmentHostName()。
GpArray类
GpArray类构造函数接受包含QD和QE的GpDB的列表segments
1 class GpArray: 2 def __init__(self, segments, segmentsAsLoadedFromDb=None, strategyLoadedFromDb=None): 3 self.master =None #GpDB实例 4 self.standbyMaster = None #GpDB实例 5 self.segments = [] #Segment实例列表 6 self.expansionSegments=[] 7 self.numPrimarySegments = 0 8 self.recoveredSegmentDbids = [] 9 self.__version = None 10 self.__segmentsAsLoadedFromDb = segmentsAsLoadedFromDb 11 self.__strategyLoadedFromDb = strategyLoadedFromDb 12 self.__strategy = FAULT_STRATEGY_NONE # FAULT_STRATEGY_NONE = 'n' # mirrorless systems 无mirror系统 13 self.setFilespaces([]) 14 for segdb in segments: 15 # Handle QD nodes # 处理QD节点 16 if segdb.isSegmentMaster(True): 17 if self.master != None: 18 logger.error("multiple master dbs defined") 19 raise Exception("GpArray - multiple master dbs defined") 20 self.master = segdb 21 elif segdb.isSegmentStandby(True): 22 if self.standbyMaster != None: 23 logger.error("multiple standby master dbs defined") 24 raise Exception("GpArray - multiple standby master dbs defined") 25 self.standbyMaster = segdb 26 # Handle regular segments # 处理QE节点 27 elif segdb.isSegmentQE(): 28 if segdb.isSegmentMirror(): 29 self.__strategy = FAULT_STRATEGY_FILE_REPLICATION # FAULT_STRATEGY_FILE_REPLICATION = 'f' # valid for versions 4.0+ # 有mirror节点 30 self.addSegmentDb(segdb) 31 else: 32 # Not a master, standbymaster, primary, or mirror? 33 # shouldn't even be possible. 
34 logger.error("FATAL - invalid dbs defined") 35 raise Exception("Error: GpArray() - invalid dbs defined") 36 # Make sure we have a master db 37 if self.master is None: 38 logger.error("FATAL - no master dbs defined!") 39 raise Exception("Error: GpArray() - no master dbs defined") 40 def __str__(self): 41 def hasStandbyMaster(self): 42 def addSegmentDb(self, segdb): # segdb是GpDB实例,向self.segments中加入新的segment或向原有的segment对象添加GpDB实例(addPrimary或addMirror) 43 def isStandardArray(self): 44 def is_array_valid(self): 45 def dumpToFile(self, filename): 46 def setFaultStrategy(self, strategy): 47 def getFaultStrategy(self): 48 ....
initFromCatalog从数据库中获取GpArray对象各数据成员的数据,形参为数据库URL(dbURL)以及是否以utility模式连接的标志(utility,默认为False)。函数主体是一些查询数据库状态信息的SQL,作为DBA需要收集学习这些SQL,以备后续学习运维使用。
@staticmethod
def initFromCatalog(dbURL, utility=False):
    """
    Factory: build a GpArray from the catalog of a running database.

    dbURL   -- connection description handed to dbconn.connect().
    utility -- passed through to dbconn.connect() as its second argument.

    Returns a GpArray whose version, recoveredSegmentDbids, fault
    strategy and filespaces are filled in from the query results.

    Raises Exception when the fault-strategy query returns zero rows or
    more than one row. The connection is closed even when a query or the
    row processing fails.
    """
    conn = dbconn.connect(dbURL, utility)
    try:
        # Get the version from the database:
        version_str = None
        for row in dbconn.execSQL(conn, "SELECT version()"):
            version_str = row[0]
        version = GpVersion(version_str)

        if version.getVersionRelease() in ("3.0", "3.1", "3.2", "3.3"):
            # In older releases we get the fault strategy using the
            # gp_fault_action guc.
            strategy_rows = dbconn.execSQL(conn, "show gp_fault_action")

            # Note: Mode may not be "right", certainly 4.0 concepts of mirroring
            # mode do not apply to 3.x, so it depends on how the scripts are
            # making use of mode. For now it is initialized to synchronized.
            #
            # Note: hostname is initialized to null since the catalog does not
            # contain this information. Initializing a hostcache using the
            # resulting gparray will automatically fill in a value for hostname.
            #
            # Note: this should be kept in sync with the code in
            # GpDB.InitFromString() code for initializing old catalog formats.
            config_rows = dbconn.execSQL(conn, '''
                SELECT dbid, content,case when isprimary then 'p' else 'm' end as role,
                case when definedprimary then 'p' else 'm' end as preferred_role,
                's' as mode,case when valid then 'u' else 'd' end as status,
                null as hostname,hostname as address,port,null as replication_port,
                %s as fsoid,datadir as fselocation FROM pg_catalog.gp_configuration
                ORDER BY content, preferred_role DESC
            ''' % str(SYSTEM_FILESPACE))

            # no filespace support in older releases.
            filespaceArr = []
        else:
            strategy_rows = dbconn.execSQL(conn, '''
                SELECT fault_strategy FROM gp_fault_strategy
            ''')
            config_rows = dbconn.execSQL(conn, '''
                SELECT dbid, content, role, preferred_role, mode, status,
                hostname, address, port, replication_port, fs.oid,
                fselocation
                FROM pg_catalog.gp_segment_configuration
                JOIN pg_catalog.pg_filespace_entry on (dbid = fsedbid)
                JOIN pg_catalog.pg_filespace fs on (fsefsoid = fs.oid)
                ORDER BY content, preferred_role DESC, fs.oid
            ''')
            filespaceRows = dbconn.execSQL(conn, '''
                SELECT oid, fsname FROM pg_filespace ORDER BY fsname;
            ''')
            filespaceArr = [GpFilespaceObj(fsRow[0], fsRow[1]) for fsRow in filespaceRows]

        # Todo: add checks that all segments should have the same filespaces?
        recoveredSegmentDbids = []
        segments = []
        seg = None
        for row in config_rows:
            # Extract fields from the row
            (dbid, content, role, preferred_role, mode, status, hostname,
             address, port, replicationPort, fsoid, fslocation) = row

            # If we have segments which have recovered, record them.
            if preferred_role != role and content >= 0:
                if mode == MODE_SYNCHRONIZED and status == STATUS_UP:
                    recoveredSegmentDbids.append(dbid)

            # The query returns all the filespaces for a segment on separate
            # rows. If this row is the same dbid as the previous row simply
            # add this filespace to the existing list, otherwise create a
            # new segment.
            if seg and seg.getSegmentDbId() == dbid:
                seg.addSegmentFilespace(fsoid, fslocation)
            else:
                seg = GpDB(content, preferred_role, dbid, role, mode, status,
                           hostname, address, port, fslocation, replicationPort)
                segments.append(seg)

        # One row per (dbid, database): the catalog directory of that
        # database on that segment.
        datcatloc = dbconn.execSQL(conn, '''
            select fsloc.dbid, fsloc.fselocation || '/' || case when db.dattablespace = 1663
            then 'base' else db.dattablespace::text end || '/'||db.oid as catloc
            from pg_Database db, pg_tablespace ts,
            (SELECT dbid, fs.oid, fselocation
            FROM pg_catalog.gp_segment_configuration
            JOIN pg_catalog.pg_filespace_entry on (dbid = fsedbid)
            JOIN pg_catalog.pg_filespace fs on (fsefsoid = fs.oid)) fsloc
            where db.dattablespace = ts.oid
            and ts.spcfsoid = fsloc.oid''')
    finally:
        # Closed at the same point as before on success, and no longer
        # leaked when any of the statements above raises.
        conn.close()

    # Group catalog directories by dbid (setdefault instead of the
    # Python-2-only dict.has_key).
    catlocmap = {}
    for row in datcatloc:
        catlocmap.setdefault(row[0], []).append(row[1])

    for seg in segments:
        seg.catdirs = catlocmap[seg.dbid]

    origSegments = [seg.copy() for seg in segments]

    if strategy_rows.rowcount == 0:
        raise Exception("Database does not contain gp_fault_strategy entry")
    if strategy_rows.rowcount > 1:
        raise Exception("Database has too many gp_fault_strategy entries")
    strategy = strategy_rows.fetchone()[0]

    array = GpArray(segments, origSegments, strategy)
    array.__version = version
    array.recoveredSegmentDbids = recoveredSegmentDbids
    # override the preliminary default `__strategy` with the database state, if available
    array.setFaultStrategy(strategy)
    array.setFilespaces(filespaceArr)
    return array
initFromFile函数从文件中读取GpArray的信息,通过GpDB的initFromString函数,并使用GpArray构造函数创建GpArray对象。
@staticmethod
def initFromFile(filename):
    """
    Factory: build a GpArray from a file holding one GpDB description per line.

    Every line of the file is handed to GpDB.initFromString(); the
    resulting GpDB objects are passed to the GpArray constructor.

    filename -- path of the file to read.

    Any exception raised by open(), GpDB.initFromString() or GpArray()
    propagates to the caller; the file is closed in every case.
    """
    # 'with' guarantees the handle is released even if a line fails to parse.
    with open(filename, 'r') as fp:
        segdbs = [GpDB.initFromString(line) for line in fp]
    return GpArray(segdbs)
使用
通过gppylib的system文件夹下提供的configurationInterface接口,注册配置Provider,并初始化Provider,通过调用loadSystemConfig函数加载GpArray对象。get_gparray_from_config函数返回GpArray对象。
def get_gparray_from_config():
    """
    Load the system configuration through the GPDB-catalog provider.

    Reads the master data directory from the MASTER_DATA_DIRECTORY
    environment variable, registers a
    GpConfigurationProviderUsingGpdbCatalog as the configuration
    provider, initializes it with the master port and returns the result
    of loadSystemConfig(useUtilityMode=True).

    Raises KeyError when MASTER_DATA_DIRECTORY is not set.
    """
    # Keep these imports local: moving them to the top of the module
    # seems to cause an import error in a unit test (dependency issue).
    from gppylib.system import configurationInterface
    from gppylib.system import configurationImplGpdb
    from gppylib.system.environment import GpMasterEnvironment

    masterEnv = GpMasterEnvironment(os.environ['MASTER_DATA_DIRECTORY'], False)

    catalogProvider = configurationImplGpdb.GpConfigurationProviderUsingGpdbCatalog()
    configurationInterface.registerConfigurationProvider(catalogProvider)

    registered = configurationInterface.getConfigurationProvider()
    initialized = registered.initializeProvider(masterEnv.getMasterPort())
    return initialized.loadSystemConfig(useUtilityMode=True)
代码来自于greenplum-db-5.27.1源代码