Geoprocessing scripts 利用多核进行计算
基于Python 2.5,利用多核进行计算,并非多核计算或并行计算。
作者:Flyingis
本文欢迎友情转载,但请注明作者及原文链接,严禁用于商业目的!
让ArcGIS桌面软件进行GP脚本并行运算确实不是一件容易的事情,一方面是粗粒度AO对象的限制,另外一方面是Python本身的问题。
Python是解释型的语言,使用GIL全局解释器锁在内部禁止并行运算,因此在相同时间内只能有一条指令被执行,为什么存在GIL?是因为Python解释器后台的不可见变量,比如为了进行垃圾回收而维护的引用计数,如果没有GIL,则可能出现由于线程切换导致的对同一对象释放两次的情况(参考该文),Jython和IronPython没有GIL问题,倒是可以拿来一试。对于ArcGIS 9.3.1,使用的Python版本为2.5,目前最新的Python 3.0都发布了,从2.6开始,Python新增multiprocessing模块来解决这个问题,ArcGIS 9.4将支持这一版本。
那么,Parallel Python不能解决所有问题,又必须得使用Python 2.5,如何有效利用多核计算(注意,是利用多核计算,并非并行计算)?通常情况下,我们会将多个任务分解成不同的脚本,开启多个Python解释器或ArcCatalog运行,每个程序占用一个核,多核可以同时利用起来,各自处理完成后,进行汇总,得到最终计算结果。ESRI Resource Center上提供了一种类似的方法,差别在于将所有任务都写在一个Python脚本中,实际上和前者的原理相同,作为参考。
代码
1 # Author: ESRI
2 # Date: August 2009
3 # Purpose: This script demonstrates a methodology for parallel geoprocessing in ArcGIS Desktop.
4 # The input is split into parts which are processed by separate subprocesses,
5 # their outputs are appended into the final result. In this specific case, the
6 # geoprocessing undertaken is address geocoding, but any feasible geoprocessing
7 # can be done by simple modification of this script.
8 # Look for the comment line "Begin code block for GP process of interest"
9 try:
10 import arcgisscripting, os, subprocess, sys, time, traceback
11 gp = arcgisscripting.create(9.3)
12 gp.overwriteoutput = True
13 startTime = time.time()
14
15 #Get the number of concurrent GP processes desired.
16 #It is recommended this agree with the number of your available cores.
17 pCount = gp.getparameter(0)
18 inObject = gp.getparameterastext(1)
19 result = gp.getcount_management(inObject)
20 count = int(result.getoutput(0))
21 gp.addmessage("Processing " + inObject + " in " + str(pCount) + " concurrent processes.")
22 #We will use a generic approach to splitting the input into parts, namely slicing it
23 #with modulo arithmetic. For example, if we want to run 4 concurrent processes
24 #we create 4 parts from the input using "ObjectID modulo 4 = 0 (then 1,2,3)"
25
26 #Build the splitting expressions(Note you will need to edit this code block if using input from SDE on Oracle)
27 inDesc = gp.describe(inObject)
28 delimitedOIDName = str(gp.addfielddelimiters(inObject,str(inDesc.oidfieldname)))
29 wsType = str(gp.describe(str(inDesc.path)).workspacetype)
30 queryList = []
31 for q in range(pCount):
32 if wsType == "RemoteDatabase": #SDE
33 #Non-Oracle
34 queryList.append(delimitedOIDName + " % " + str(pCount) + " = " + str(q))
35 #Oracle - comment out the above line and uncomment the below line when using Oracle inputs
36 #queryList.append("mod(\"" + delimitedOIDName + "\"," + str(pCount) + ") = " + str(q))
37 elif wsType == "FileSystem": #DBase
38 queryList.append("mod(\"" + delimitedOIDName + "\"," + str(pCount) + ") = " + str(q))
39 elif wsType == "LocalDatabase" and inDesc.path.endswith(".mdb"): #Personal GDB
40 queryList.append(delimitedOIDName + " mod " + str(pCount) + " = " + str(q))
41 elif wsType == "LocalDatabase" and inDesc.path.endswith(".gdb"): #File GDB
42 queryList.append("mod(\"" + delimitedOIDName + "\"," + str(pCount) + ") = " + str(q))
43 #Create a seed name for the parts
44 if inDesc.name.endswith((".dbf",".shp")):
45 seedName = str(inDesc.name).split(".")[0]
46 elif inDesc.name.count(".") > 1: #Fully qualified DBMS object name
47 seedName = str(inDesc.name).split(".")[-1]
48 else:
49 seedName = str(inDesc.name)
50
51 #Make file GDBs at a well known path (to the system); edit this to use fast disk if you have it.
52 #The child scripts will write into these separate file GDBs so as to avoid lock conflicts.
53 #import tempfile
54 #wkPath = tempfile.gettempdir()
55 wkPath = r"C:\Temp"
56 gdbNameList = []
57 j = 0
58 for i in range(pCount):
59 gdbName = wkPath+"\parallel"+str(i)+".gdb"
60 while gp.exists(gdbName): #Do not clobber other parallel processes
61 j+=1
62 gdbName = wkPath+"\parallel"+str(j)+".gdb"
63 gdbNameList.append(gdbName)
64 result = gp.createfilegdb_management(wkPath,os.path.basename(gdbName))
65 gp.workspace = "in_memory"
66
67 #Create scripts that geoprocess the parts, write these into the file GDB directories created above
68 scriptPathList = []
69 for r in range(pCount):
70 gdbName = gdbNameList[r]
71 sName = gdbName+"/parallel.py"
72 scriptPathList.append(sName)
73 scriptLines = []
74 scriptLines.append("import arcgisscripting\n")
75 scriptLines.append("gp = arcgisscripting.create(9.3)\n")
76 scriptLines.append("gp.overwriteoutput = True\n")
77 if str(inDesc.datatype) == "Table" or str(inDesc.datatype) == "DbaseTable":
78 scriptLines.append("result = gp.tabletotable_conversion("+\
79 "\""+inObject+"\","+\
80 "\""+gdbName+"\","+\
81 "\""+seedName+str(r)+"\","+\
82 "\""+queryList[r]+"\")\n")
83 else:
84 scriptlines.append("result = gp.featureclasstofeatureclass_conversion("+\
85 "\""+inObject+"\","+\
86 "\""+gdbName+"\","+\
87 "\""+seedName+str(r)+"\","+\
88 "\""+queryList[r]+"\")\n")
89 scriptLines.append("part = result.getoutput(0)\n")
90 #
91 #Begin code block for GP process of interest:
92 scriptLines.append("aLocator = r\"D:\Dev\Python\GP_multicore\TestData.gdb\NZ Locator\""+"\n")
93 #We could use a service. Note the \\a in \\arcgis below would be a BELL literal if not escaped as in \a (rcgis)
94 #scriptLines.append("aLocator = r\"GIS Servers\\arcgis on bruceh\CA_Locator.GeocodeServer\""+"\n")
95 scriptLines.append("fieldMap = \"Address Address;Suburb Suburb;Mailtown Mailtown;Postcode Postcode\"\n")
96 scriptLines.append("outFC = "+"\""+gdbName+"\\"+"Result"+str(r)+"\"\n")
97 scriptLines.append("result = gp.geocodeaddresses_geocoding(part,aLocator,fieldMap,outFC)\n")
98 scriptLines.append("result = result.getoutput(0)\n")
99 #End code block for GP process of interest:
100 #
101 #Make stdio aware of completion - this message will be returned to the GP stdio message pipe
102 scriptLines.append("print "+ "\"Finished Part" + str(r) + "\"\n")
103 scriptLines.append("del gp\n")
104 #Write script
105 f = open(sName,'w')
106 f.writelines(scriptLines)
107 f.flush
108 f.close()
109 #Launch the parallel processes
110 gp.addmessage("Launching parallel processes\n")
111 processList = []
112 for r in range(pCount):
113 processList.append(subprocess.Popen([r"C:\Python25\python.exe",scriptPathList[r]],\
114 shell=True,\
115 stdout=subprocess.PIPE,\
116 stderr=subprocess.PIPE))
117 gp.addmessage("Launched process " + str(r)+ "\n")
118 time.sleep(2)
119 #
120 #Wait for completion
121 messageList = []
122 Done = False
123 while not Done:
124 Done = True
125 for p in processList:
126 if p.poll() is None:
127 Done = False
128 else:
129 messageList.append("Process " + str(processList.index(p)) + " complete at " + str(time.ctime()) + "\n")
130 stdOutLines = p.stdout.readlines()
131 for s in stdOutLines:
132 messageList.append(s)
133 stdErrorLines = p.stderr.readlines()
134 for e in stdErrorLines:
135 messageList.append(e)
136 if not Done: #Save 2 seconds if complete
137 time.sleep(2)
138 for m in messageList:
139 gp.addmessage(m)
140 #Merge the results
141 outFC = gp.getparameterastext(2)
142 fm = gp.createobject("fieldmappings")
143 vt = gp.createobject("valuetable")
144 for g in gdbNameList:
145 gp.workspace = g
146 if len(gp.listfeatureclasses("Result*")) > 0:
147 fcDesc = gp.describe(gp.listfeatureclasses("Result*")[0])
148 fm.addtable(fcDesc.catalogpath)
149 vt.addrow(fcDesc.catalogpath)
150 gp.workspace = "in_memory"
151 result = gp.merge_management(vt,outFC,fm)
152 #Clean up
153 for g in gdbNameList:
154 result = gp.delete_management(g)
155 #Done!
156
157 endTime = time.time()
158 gp.addmessage("Geoprocessing duration was " + str(int(endTime - startTime)) + " seconds.")
159 gp.addmessage("Net performance was " + \
160 str(int(1.0 * count / (endTime - startTime) * 3600.0)) + " records per hour.")
161 del gp
162
163 except:
164 tb = sys.exc_info()[2]
165 tbinfo = traceback.format_tb(tb)[0]
166 pymsg = "PYTHON ERRORS:\nTraceback Info:\n" + tbinfo + "\nError Info:\n " + \
167 str(sys.exc_type)+ ": " + str(sys.exc_value) + "\n"
168 gp.AddError(pymsg)
169
170 msgs = "GP ERRORS:\n" + gp.GetMessages(2) + "\n"
171 gp.AddError(msgs)
2 # Date: August 2009
3 # Purpose: This script demonstrates a methodology for parallel geoprocessing in ArcGIS Desktop.
4 # The input is split into parts which are processed by separate subprocesses,
5 # their outputs are appended into the final result. In this specific case, the
6 # geoprocessing undertaken is address geocoding, but any feasible geoprocessing
7 # can be done by simple modification of this script.
8 # Look for the comment line "Begin code block for GP process of interest"
9 try:
10 import arcgisscripting, os, subprocess, sys, time, traceback
11 gp = arcgisscripting.create(9.3)
12 gp.overwriteoutput = True
13 startTime = time.time()
14
15 #Get the number of concurrent GP processes desired.
16 #It is recommended this agree with the number of your available cores.
17 pCount = gp.getparameter(0)
18 inObject = gp.getparameterastext(1)
19 result = gp.getcount_management(inObject)
20 count = int(result.getoutput(0))
21 gp.addmessage("Processing " + inObject + " in " + str(pCount) + " concurrent processes.")
22 #We will use a generic approach to splitting the input into parts, namely slicing it
23 #with modulo arithmetic. For example, if we want to run 4 concurrent processes
24 #we create 4 parts from the input using "ObjectID modulo 4 = 0 (then 1,2,3)"
25
26 #Build the splitting expressions(Note you will need to edit this code block if using input from SDE on Oracle)
27 inDesc = gp.describe(inObject)
28 delimitedOIDName = str(gp.addfielddelimiters(inObject,str(inDesc.oidfieldname)))
29 wsType = str(gp.describe(str(inDesc.path)).workspacetype)
30 queryList = []
31 for q in range(pCount):
32 if wsType == "RemoteDatabase": #SDE
33 #Non-Oracle
34 queryList.append(delimitedOIDName + " % " + str(pCount) + " = " + str(q))
35 #Oracle - comment out the above line and uncomment the below line when using Oracle inputs
36 #queryList.append("mod(\"" + delimitedOIDName + "\"," + str(pCount) + ") = " + str(q))
37 elif wsType == "FileSystem": #DBase
38 queryList.append("mod(\"" + delimitedOIDName + "\"," + str(pCount) + ") = " + str(q))
39 elif wsType == "LocalDatabase" and inDesc.path.endswith(".mdb"): #Personal GDB
40 queryList.append(delimitedOIDName + " mod " + str(pCount) + " = " + str(q))
41 elif wsType == "LocalDatabase" and inDesc.path.endswith(".gdb"): #File GDB
42 queryList.append("mod(\"" + delimitedOIDName + "\"," + str(pCount) + ") = " + str(q))
43 #Create a seed name for the parts
44 if inDesc.name.endswith((".dbf",".shp")):
45 seedName = str(inDesc.name).split(".")[0]
46 elif inDesc.name.count(".") > 1: #Fully qualified DBMS object name
47 seedName = str(inDesc.name).split(".")[-1]
48 else:
49 seedName = str(inDesc.name)
50
51 #Make file GDBs at a well known path (to the system); edit this to use fast disk if you have it.
52 #The child scripts will write into these separate file GDBs so as to avoid lock conflicts.
53 #import tempfile
54 #wkPath = tempfile.gettempdir()
55 wkPath = r"C:\Temp"
56 gdbNameList = []
57 j = 0
58 for i in range(pCount):
59 gdbName = wkPath+"\parallel"+str(i)+".gdb"
60 while gp.exists(gdbName): #Do not clobber other parallel processes
61 j+=1
62 gdbName = wkPath+"\parallel"+str(j)+".gdb"
63 gdbNameList.append(gdbName)
64 result = gp.createfilegdb_management(wkPath,os.path.basename(gdbName))
65 gp.workspace = "in_memory"
66
67 #Create scripts that geoprocess the parts, write these into the file GDB directories created above
68 scriptPathList = []
69 for r in range(pCount):
70 gdbName = gdbNameList[r]
71 sName = gdbName+"/parallel.py"
72 scriptPathList.append(sName)
73 scriptLines = []
74 scriptLines.append("import arcgisscripting\n")
75 scriptLines.append("gp = arcgisscripting.create(9.3)\n")
76 scriptLines.append("gp.overwriteoutput = True\n")
77 if str(inDesc.datatype) == "Table" or str(inDesc.datatype) == "DbaseTable":
78 scriptLines.append("result = gp.tabletotable_conversion("+\
79 "\""+inObject+"\","+\
80 "\""+gdbName+"\","+\
81 "\""+seedName+str(r)+"\","+\
82 "\""+queryList[r]+"\")\n")
83 else:
84 scriptlines.append("result = gp.featureclasstofeatureclass_conversion("+\
85 "\""+inObject+"\","+\
86 "\""+gdbName+"\","+\
87 "\""+seedName+str(r)+"\","+\
88 "\""+queryList[r]+"\")\n")
89 scriptLines.append("part = result.getoutput(0)\n")
90 #
91 #Begin code block for GP process of interest:
92 scriptLines.append("aLocator = r\"D:\Dev\Python\GP_multicore\TestData.gdb\NZ Locator\""+"\n")
93 #We could use a service. Note the \\a in \\arcgis below would be a BELL literal if not escaped as in \a (rcgis)
94 #scriptLines.append("aLocator = r\"GIS Servers\\arcgis on bruceh\CA_Locator.GeocodeServer\""+"\n")
95 scriptLines.append("fieldMap = \"Address Address;Suburb Suburb;Mailtown Mailtown;Postcode Postcode\"\n")
96 scriptLines.append("outFC = "+"\""+gdbName+"\\"+"Result"+str(r)+"\"\n")
97 scriptLines.append("result = gp.geocodeaddresses_geocoding(part,aLocator,fieldMap,outFC)\n")
98 scriptLines.append("result = result.getoutput(0)\n")
99 #End code block for GP process of interest:
100 #
101 #Make stdio aware of completion - this message will be returned to the GP stdio message pipe
102 scriptLines.append("print "+ "\"Finished Part" + str(r) + "\"\n")
103 scriptLines.append("del gp\n")
104 #Write script
105 f = open(sName,'w')
106 f.writelines(scriptLines)
107 f.flush
108 f.close()
109 #Launch the parallel processes
110 gp.addmessage("Launching parallel processes\n")
111 processList = []
112 for r in range(pCount):
113 processList.append(subprocess.Popen([r"C:\Python25\python.exe",scriptPathList[r]],\
114 shell=True,\
115 stdout=subprocess.PIPE,\
116 stderr=subprocess.PIPE))
117 gp.addmessage("Launched process " + str(r)+ "\n")
118 time.sleep(2)
119 #
120 #Wait for completion
121 messageList = []
122 Done = False
123 while not Done:
124 Done = True
125 for p in processList:
126 if p.poll() is None:
127 Done = False
128 else:
129 messageList.append("Process " + str(processList.index(p)) + " complete at " + str(time.ctime()) + "\n")
130 stdOutLines = p.stdout.readlines()
131 for s in stdOutLines:
132 messageList.append(s)
133 stdErrorLines = p.stderr.readlines()
134 for e in stdErrorLines:
135 messageList.append(e)
136 if not Done: #Save 2 seconds if complete
137 time.sleep(2)
138 for m in messageList:
139 gp.addmessage(m)
140 #Merge the results
141 outFC = gp.getparameterastext(2)
142 fm = gp.createobject("fieldmappings")
143 vt = gp.createobject("valuetable")
144 for g in gdbNameList:
145 gp.workspace = g
146 if len(gp.listfeatureclasses("Result*")) > 0:
147 fcDesc = gp.describe(gp.listfeatureclasses("Result*")[0])
148 fm.addtable(fcDesc.catalogpath)
149 vt.addrow(fcDesc.catalogpath)
150 gp.workspace = "in_memory"
151 result = gp.merge_management(vt,outFC,fm)
152 #Clean up
153 for g in gdbNameList:
154 result = gp.delete_management(g)
155 #Done!
156
157 endTime = time.time()
158 gp.addmessage("Geoprocessing duration was " + str(int(endTime - startTime)) + " seconds.")
159 gp.addmessage("Net performance was " + \
160 str(int(1.0 * count / (endTime - startTime) * 3600.0)) + " records per hour.")
161 del gp
162
163 except:
164 tb = sys.exc_info()[2]
165 tbinfo = traceback.format_tb(tb)[0]
166 pymsg = "PYTHON ERRORS:\nTraceback Info:\n" + tbinfo + "\nError Info:\n " + \
167 str(sys.exc_type)+ ": " + str(sys.exc_value) + "\n"
168 gp.AddError(pymsg)
169
170 msgs = "GP ERRORS:\n" + gp.GetMessages(2) + "\n"
171 gp.AddError(msgs)
Flyingis @ China
email: dev.vip#gmail.com
blog: http://flyingis.cnblogs.com/