Python crawler: fetching the National Bureau of Statistics region codes and urban-rural division codes and loading them into a database
```python
import pymysql
from bs4 import BeautifulSoup
import requests
import time
from lxml import etree


def get_area(year):
    year = str(year)
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/index.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    response = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    response.encoding = 'UTF-8'
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    # All province-level <tr> rows; the page groups the provinces into 4 <tr>
    # elements, the first running from 北京市 to 黑龙江省
    all_province = soup.find_all('tr', class_='provincetr')
    """
    The markup looks like this:
    <tr class="provincetr"><td><a href="11.html">北京市<br/></a></td>
    <td><a href="12.html">天津市<br/></a></td>
    <td><a href="13.html">河北省<br/></a></td>
    <td><a href="14.html">山西省<br/></a></td>
    <td><a href="15.html">内蒙古自治区<br/></a></td>
    <td><a href="21.html">辽宁省<br/></a></td><td>
    """
    # For easier processing, join the province rows into one string
    province_str = ""
    for i in range(len(all_province)):
        province_str = province_str + str(all_province[i])
    # Extract the href and text of every <a> tag
    province = {}
    provinceList = []
    province_soup = BeautifulSoup(province_str, 'lxml')
    province_href = province_soup.find_all("a")  # all <a> tags
    for i in province_href:
        a_tag = BeautifulSoup(str(i), 'lxml').find("a")
        # Build the province dict {name: link}; the 2-digit href prefix padded
        # to 12 digits is the province code
        province.update({a_tag.text: a_tag["href"]})
        provinceList.append([{a_tag["href"][:2] + '0000000000': a_tag.text}])
    """
    The province dict:
    {'北京市': '11.html', '天津市': '12.html', '河北省': '13.html', '山西省': '14.html',
     '内蒙古自治区': '15.html', '辽宁省': '21.html', '吉林省': '22.html', '黑龙江省': '23.html',
     '上海市': '31.html', '江苏省': '32.html', '浙江省': '33.html', '安徽省': '34.html',
     '福建省': '35.html', '江西省': '36.html', '山东省': '37.html', '河南省': '41.html',
     '湖北省': '42.html', '湖南省': '43.html', '广东省': '44.html', '广西壮族自治区': '45.html',
     '海南省': '46.html', '重庆市': '50.html', '四川省': '51.html', '贵州省': '52.html',
     '云南省': '53.html', '西藏自治区': '54.html', '陕西省': '61.html', '甘肃省': '62.html',
     '青海省': '63.html', '宁夏回族自治区': '64.html', '新疆维吾尔自治区': '65.html'}
    """
    # Using the province dict, crawl the next (city) level and build the city dicts
    city = []
    city_list = []
    city_tr = []
    temp_list = []
    for item in province.items():
        city_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/" + item[1]
        city_html = requests.get(city_url, headers=headers)
        city_html.encoding = 'UTF-8'
        city_text = city_html.text
        # All city-level <tr> rows; city_tr ends up with 31 entries,
        # one per province or municipality
        city_tr.append(BeautifulSoup(city_text, 'lxml').find_all('tr', class_="citytr"))
    # Build the city dicts of the form {"name": "link"}
    province_key = list(province.keys())  # province names, in insertion order
    num = 0
    for i in city_tr:
        for j in i:
            etree_ = etree.HTML(str(j))
            temp_list.append({etree_.xpath('//tr/td[2]/a/text()')[0]:
                              etree_.xpath('//tr/td[2]/a/@href')[0]})
            # href looks like '34/3401.html'; chars 3..7 are the 4-digit city prefix
            city_list.append([{etree_.xpath('//tr/td[2]/a/@href')[0][3:7] + '00000000':
                               etree_.xpath('//tr/td[2]/a/text()')[0]}])
        city.append({province_key[num]: temp_list})
        num = num + 1
        temp_list = []
    """
    city[11]:
    {'安徽省': [{'合肥市': '34/3401.html'}, {'芜湖市': '34/3402.html'}, {'蚌埠市': '34/3403.html'},
               {'淮南市': '34/3404.html'}, {'马鞍山市': '34/3405.html'}, {'淮北市': '34/3406.html'},
               {'铜陵市': '34/3407.html'}, {'安庆市': '34/3408.html'}, {'黄山市': '34/3410.html'},
               {'滁州市': '34/3411.html'}, {'阜阳市': '34/3412.html'}, {'宿州市': '34/3413.html'},
               {'六安市': '34/3415.html'}, {'亳州市': '34/3416.html'}, {'池州市': '34/3417.html'},
               {'宣城市': '34/3418.html'}]}
    """
    # City dicts done; the last step: the county (area) level
    area_list = []
    temp_area_list = []
    for item1 in city:
        for k1, v1 in item1.items():
            # Normalize short province names to their full official names
            province_name = k1
            if province_name in ["北京", "天津", "上海", "重庆"]:
                province_name = province_name + "市"
            if province_name == "宁夏":
                province_name = province_name + "回族自治区"
            if province_name in ["西藏", "内蒙古"]:
                province_name = province_name + "自治区"
            if province_name == "新疆":
                province_name = province_name + "维吾尔自治区"
            if province_name == "广西":
                province_name = province_name + "壮族自治区"
            if province_name == "黑龙江":
                province_name = province_name + "省"
            if len(province_name) == 2 and province_name not in ["西藏", "宁夏", "新疆", "广西",
                                                                 "北京", "天津", "上海", "重庆"]:
                province_name = province_name + "省"
            for item2 in v1:
                for k2, v2 in item2.items():
                    area_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" + year + "/" + v2
                    area_response = requests.get(area_url, headers=headers)
                    area_response.encoding = 'UTF-8'
                    area_text = area_response.text
                    area_soup = BeautifulSoup(area_text, 'lxml')
                    area_tr = area_soup.find_all("tr", class_="countytr")
                    for i in range(len(area_tr)):
                        etree_area = etree.HTML(str(area_tr[i]))
                        # Some county rows (e.g. 市辖区) have no <a> tag, so fall back to plain text
                        try:
                            area_name = etree_area.xpath("//tr/td[2]/a/text()")[0]
                        except IndexError:
                            area_name = etree_area.xpath("//tr/td[2]/text()")[0]
                        try:
                            temp_area_list.append({etree_area.xpath("//tr/td[1]/a/text()")[0]: area_name})
                        except IndexError:
                            temp_area_list.append({etree_area.xpath("//tr/td[1]/text()")[0]: area_name})
                    area_list.append(temp_area_list)
                    temp_area_list = []
                    time.sleep(1)  # be polite to the server
    return provinceList + city_list + area_list


def into_mysql(year):
    year = str(year)
    conn, cursor = get_mysql_conn()
    res = get_area(year)
    # print(res)
    try:
        # Each item is a list of {code: name} dicts (one entry for provinces and
        # cities, several for the counties of one city), so iterate the whole list
        for item in res:
            for entry in item:
                for k, v in entry.items():
                    sql = "insert into base_position (areaCode, name) values (%s, %s)"
                    cursor.execute(sql, (k, v))  # parameterized to avoid SQL injection
                    conn.commit()
    except Exception as e:
        print("An error occurred:", e)
    close_conn(conn, cursor)
    return None


def query(sql, *args):
    """
    Generic query wrapper.
    :param sql: SQL statement, optionally with %s placeholders
    :param args: parameters for the placeholders
    :return: the query result as a tuple of tuples, ((), ())
    """
    conn, cursor = get_mysql_conn()
    cursor.execute(sql, args or None)
    res = cursor.fetchall()
    close_conn(conn, cursor)
    return res


# ------------------------------------------------------------------------------------


def get_mysql_conn():
    """
    :return: connection, cursor
    """
    # Create the connection (fill in your own host/user/password/db)
    conn = pymysql.connect(host="", user="", password="", db="", charset="utf8")
    # Create the cursor; result sets are returned as tuples by default
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()


if __name__ == '__main__':
    into_mysql('2022')
```
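The script assumes a `base_position` table already exists. Below is a minimal sketch of a matching table, created through the script's own `get_mysql_conn()` / `close_conn()` helpers; only the column names (areaCode, name) come from the INSERT statement above, while the types, lengths, and primary key are my assumptions:

```python
# Create the target table with the script's own helpers. Only the column
# names (areaCode, name) come from the INSERT statement; the types, lengths,
# and primary key are assumptions.
def create_table():
    conn, cursor = get_mysql_conn()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS base_position (
            areaCode VARCHAR(12) NOT NULL,  -- 12-digit division code, zero-padded
            name     VARCHAR(64) NOT NULL,  -- division name, e.g. 北京市
            PRIMARY KEY (areaCode)
        ) DEFAULT CHARSET = utf8mb4
    """)
    conn.commit()
    close_conn(conn, cursor)
```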
The data format and logic can be changed to suit your own needs! For example, you can slice the loaded data afterwards with the `query()` helper, as shown below.
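A small usage sketch: province-level codes end in ten zeros, so a LIKE pattern pulls back just the province rows. The table layout matches the script; the pattern itself is an assumption about how you want to slice the data.

```python
# List all province-level rows (codes ending in ten zeros) using the
# query() helper defined in the script above.
rows = query("SELECT areaCode, name FROM base_position WHERE areaCode LIKE '%0000000000'")
for code, name in rows:
    print(code, name)
```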
If anyone spots something wrong, please point it out and I will correct it! One practical note: the stats.gov.cn server can be slow to respond, which is presumably why the script sleeps a second between county pages; the sketch below shows one way to add retries on top.
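A hypothetical retry wrapper (`fetch` is my name, not part of the original script) that could be swapped in for the bare `requests.get()` calls if the crawl keeps failing midway:

```python
import time
import requests

# Retry a GET a few times with a pause before giving up, then return the
# decoded page text. All parameters and the helper itself are assumptions.
def fetch(url, headers, retries=3, delay=2):
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            resp.encoding = 'UTF-8'
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)
```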