从阿里云DATAV GeoAtlas接口抽取行政区划数据
阿里云提供的地理信息接口
https://datav.aliyun.com/tools/atlas/
有两个接口, 一个是[行政编码].json, 一个是[行政编码]_full.json, 从接口中可以提取到区县一级的行政区划信息. 提取的过程中遇到的一些问题:
- 从[行政编码].json中读取的信息中, 可能parent = null, 出现这种情况的大都是一些撤县改区的节点, 要将其设为上一级节点的行政编码
- 从[行政编码].json中读到的parent的adcode, 可能与[父节点行政编码]_full.json中读到的parent的adcode不一致, 例如从110000_full.json中得到的节点列表, 其parent都是110000, 但是在取其子节点110101.json时会发现, parent变成了110100, 这时候要使用110100这个行政编码
- 因为从上至下遍历时, 是不会遇到110100这个节点的, 所以在遍历的过程中, 要检查是否出现了未知的行政编码, 如果有, 需要额外读取并入库
- 有部分节点, 其json无法读取(不存在), 例如密云110118.json, 延庆110119.json, 这时候要用前一步得到的信息入库
使用生成的行政区划数据时, 对于香港澳门的数据, 因为没有level=city的这一级, 所以需要特殊处理一下, 例如在读取province这一级的子节点时, 如果发现没有level=city的节点, 那么就返回一个虚拟的节点, 这个节点各字段值和自己一样, 但是level=city.
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
"""Crawl administrative regions from the Aliyun DataV GeoAtlas API into MySQL.

Two endpoints are used for every administrative code (adcode):

* ``<adcode>.json``      - properties of the region itself
* ``<adcode>_full.json`` - the list of its direct sub-regions

The tree is walked top-down starting from adcode ``100000`` and every region
found is written to the ``s_region`` table through ``rbcommon.mysqlclient``.
"""
import json
import traceback

import rbcommon

# Base URL of the boundary endpoint, e.g.
# https://geo.datav.aliyun.com/areas/bound/140000.json
_BASE_URL = 'https://geo.datav.aliyun.com/areas/bound/'

# Numeric level stored for each level name returned by the API.
_LEVELS = {'country': 0, 'province': 1, 'city': 2, 'district': 3}

# adcodes (always str) that have already been passed to insert().
regions = set()


def _toAdcode(value):
    """Return *value* as a string adcode, keeping None as None.

    adcodes are compared with each other, stored in the ``regions`` set and
    concatenated into URLs, so they must all share one type.
    """
    return None if value is None else str(value)


def _setLevel(target, level_name):
    """Store the numeric level for *level_name* in ``target['level']``.

    An unknown level name leaves the key unset, so insert() later reports the
    record instead of storing a made-up level.
    """
    if level_name in _LEVELS:
        target['level'] = _LEVELS[level_name]


def _fetchJson(url, verbose=True):
    """Download *url* and return the decoded JSON object, or None on failure.

    :param url: full URL of the JSON document
    :param verbose: when True the URL is printed before the request
    :return: the parsed JSON, or None when the request failed or the body
        starts with ``<?`` (treated as "not found")
    """
    if verbose:
        print(url)
    # NOTE(review): the trailing arguments (20, 10) are passed through to
    # rbcommon.requestGet unchanged; their meaning is defined there.
    echo = rbcommon.requestGet(url, 'UTF-8', 20, 10)
    if echo is None:
        print('URL request failed: ' + url)
        return None
    if echo.startswith('<?'):
        # presumably an XML error document returned for a missing file
        print('Not found: ' + url)
        return None
    return json.loads(echo)


def readRegion(adcode, parent_code = None):
    """Read one region and its direct sub-regions from the API.

    :param adcode: administrative code of the region to read
    :param parent_code: adcode used as parent when the API reports none
    :return: dict with keys ``name``, ``adcode``, ``telecode``, ``level``,
        ``parent`` and ``children`` (a list of sub-region dicts), or None when
        ``<adcode>.json`` could not be read. ``children`` is empty when
        ``<adcode>_full.json`` could not be read.
    """
    adcode = str(adcode)

    own = _fetchJson(_BASE_URL + adcode + '.json')
    if own is None:
        return None
    props = own['features'][0]['properties']

    region = {
        'name': props['name'],
        # normalised to str so it compares equal to the children's adcodes
        'adcode': _toAdcode(props['adcode']),
        'telecode': props['telecode'],
    }
    _setLevel(region, props['level'])
    if props.get('parent') is not None:
        region['parent'] = _toAdcode(props['parent']['adcode'])
    else:
        # the API may report parent = null; fall back to the caller's value
        region['parent'] = _toAdcode(parent_code)

    # read sub regions
    children = []
    region['children'] = children
    full = _fetchJson(_BASE_URL + adcode + '_full.json')
    if full is None:
        return region

    for feature in full['features']:
        sub_props = feature['properties']
        sub_adcode = str(sub_props['adcode'])
        if sub_adcode == region['adcode']:
            # the list may contain the region itself; skipping it prevents
            # readAllRegion() from recursing into the same node forever
            continue
        sub_region = {
            'adcode': sub_adcode,
            'name': sub_props['name'],
            'telecode': None,
        }
        _setLevel(sub_region, sub_props['level'])
        sub_region['parent'] = adcode
        children.append(sub_region)

    # The parent seen from a child's own json may differ from the adcode whose
    # _full.json listed it (e.g. 110101 reports 110100, not 110000). Probe the
    # first child and, if it disagrees, apply its parent to all children.
    if children:
        first = children[0]
        probe = _fetchJson(_BASE_URL + first['adcode'] + '.json', verbose=False)
        if probe is not None:
            probe_props = probe['features'][0]['properties']
            if probe_props.get('parent') is not None:
                actual_parent = _toAdcode(probe_props['parent']['adcode'])
                if actual_parent is not None and actual_parent != first['parent']:
                    print('Update parent from {} to {}'.format(
                        first['parent'], actual_parent))
                    for child in children:
                        child['parent'] = actual_parent
    return region


def readAllRegion(parent_region):
    """Recursively read *parent_region* and its descendants and store them.

    :param parent_region: dict with at least ``adcode`` and ``parent``; when
        the region cannot be re-read from the API this dict itself is stored,
        so it should also carry the fields insert() needs.
    """
    region = readRegion(parent_region['adcode'], parent_region['parent'])
    if region is None:
        # the region's own json is unavailable: keep what the parent's
        # _full.json told us about it
        regions.add(parent_region['adcode'])
        insert(parent_region)
        return

    parent = region['parent']
    if parent is not None and parent not in regions:
        # an intermediate node the top-down walk never visits (e.g. 110100):
        # read and store it separately
        new_region = readRegion(parent, parent_region['parent'])
        if new_region is not None:
            regions.add(new_region['adcode'])
            insert(new_region)

    regions.add(region['adcode'])
    insert(region)
    for sub_region in region['children']:
        readAllRegion(sub_region)


def insert(region):
    """Insert *region* into ``s_region``; existing ids are ignored.

    Any error is printed together with the offending record and swallowed so
    that one bad record does not abort the crawl.
    """
    sql = 'INSERT IGNORE INTO `s_region` (`id`, `parent_id`, `level`, `name`, `tele_code`, `short_name`, ' \
          '`full_name`) VALUES (%s, %s, %s, %s, %s, %s, %s)'
    try:
        with rbcommon.mysqlclient.cursor() as cursor:
            cursor.execute(sql, (
                region['adcode'],
                region.get('parent'),
                region['level'],
                region['name'],
                region['telecode'],
                region['name'],
                '{}'))
        rbcommon.mysqlclient.commit()
    except Exception:
        print(json.dumps(region))
        traceback.print_exc()


def main():
    """Crawl the whole tree starting from the country node 100000."""
    root = readRegion('100000')
    if root is None:
        # readRegion() has already reported why the root could not be read
        return
    readAllRegion(root)


### MAIN ###
if __name__ == '__main__':
    main()
其中rbcommon.mysqlclient的初始化方法如下:
# Shared MySQL connection used by the crawler; every setting comes from the
# 'mysql' section of cfg, and rows are returned as dicts (DictCursor).
_mysql_cfg = cfg['mysql']
mysqlclient = pymysql.connect(
    host=_mysql_cfg['host'],
    port=_mysql_cfg['port'],
    user=_mysql_cfg['user'],
    password=_mysql_cfg['password'],
    db=_mysql_cfg['db'],
    charset=_mysql_cfg['charset'],
    cursorclass=pymysql.cursors.DictCursor,
)