Python 爬虫:获取国家统计局区划代码和城乡划分代码并添加到数据库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import pymysql
from bs4 import BeautifulSoup
import requests
import time
from lxml import etree
 
# Root of the statistics-bureau division-code pages; the year and page name
# are appended to it.
_STATS_BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/"
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
# Seconds to wait for a response before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30


def _fetch_rows(url, row_class):
    """Download *url* and return every ``<tr>`` tag whose class is *row_class*."""
    # ``headers`` must be passed by keyword: the second positional argument of
    # ``requests.get`` is ``params`` (the query string), not the headers.
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    return BeautifulSoup(response.text, 'lxml').find_all('tr', class_=row_class)


def _collect_provinces(base_url):
    """Return ``(hrefs, records)`` for the province index page.

    ``hrefs`` maps province name -> relative link (e.g. ``'北京市': '11.html'``);
    ``records`` holds one ``[{code: name}]`` entry per province.
    """
    hrefs = {}
    records = []
    # The index page lays provinces out as rows such as
    #   <tr class="provincetr"><td><a href="11.html">北京市<br/></a></td>...
    for row in _fetch_rows(base_url + "index.html", "provincetr"):
        for link in row.find_all("a"):
            name = link.get_text()
            href = link["href"]
            hrefs[name] = href
            # First two digits of the link are the province prefix; pad to 12.
            records.append([{href[:2] + '0000000000': name}])
    return hrefs, records


def _collect_cities(base_url, province_href):
    """Return ``(hrefs, records)`` for the cities listed on one province page.

    ``hrefs`` lists the relative links to each city's county page (e.g.
    ``'34/3401.html'``); ``records`` holds one ``[{code: name}]`` per city.
    """
    hrefs = []
    records = []
    for row in _fetch_rows(base_url + province_href, "citytr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        name = cells[1].get_text()
        link = cells[1].find("a")
        if link is None:
            # No sub-page to follow: take the code printed in the first cell.
            code = cells[0].get_text()
        else:
            href = link["href"]
            # Links look like "34/3401.html": characters 3-6 are the 4-digit
            # city prefix, padded here to a 12-digit code.
            code = href[3:7] + '00000000'
            hrefs.append(href)
        records.append([{code: name}])
    return hrefs, records


def _collect_counties(base_url, city_href):
    """Return one ``[{code: name}]`` record per county row on a city page."""
    records = []
    for row in _fetch_rows(base_url + city_href, "countytr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        # ``get_text`` covers both linked cells (<td><a>..</a></td>) and plain
        # cells (<td>..</td>), so no exception-driven fallback is needed.
        records.append([{cells[0].get_text(): cells[1].get_text()}])
    return records


def get_area(year):
    """Crawl province, city and county division codes for *year*.

    Three page levels are walked: the index page (``provincetr`` rows), each
    province page (``citytr`` rows) and each city page (``countytr`` rows).

    :param year: edition of the code tables; converted with ``str`` and used
        as a path segment of the page URLs.
    :return: a list whose items are single-element lists ``[{code: name}]``,
        ordered as all provinces, then all cities, then all counties.
        Province and city codes are the numeric link prefix padded with
        zeros to 12 characters; county codes are the text of the first
        table cell.
    :raises requests.RequestException: if a page cannot be downloaded
        (including a timeout after ``_REQUEST_TIMEOUT`` seconds).
    """
    base_url = _STATS_BASE_URL + str(year) + "/"

    province_hrefs, province_list = _collect_provinces(base_url)

    city_list = []
    area_list = []
    for province_href in province_hrefs.values():
        city_hrefs, city_records = _collect_cities(base_url, province_href)
        city_list.extend(city_records)
        for city_href in city_hrefs:
            area_list.extend(_collect_counties(base_url, city_href))
        # Pause between provinces to avoid hammering the server.
        time.sleep(1)

    return province_list + city_list + area_list
  
  
def into_mysql(year):
    """Crawl the division codes for *year* and insert them into MySQL.

    Every ``{code: name}`` pair returned by ``get_area`` becomes one row of
    ``base_position (areaCode, name)``. All rows are written in a single
    transaction: on a database error the transaction is rolled back and the
    error is printed rather than raised.

    :param year: edition of the code tables, forwarded to ``get_area``.
    :return: always ``None``.
    """
    year = str(year)
    conn, cursor = get_mysql_conn()
    # Placeholders let the driver escape the values, so names containing
    # quotes cannot break (or inject into) the statement.
    sql = "insert into base_position (areaCode, name) values (%s, %s)"
    try:
        # Crawl errors are not caught here and propagate to the caller.
        res = get_area(year)
        try:
            for item in res:
                # Each item is a one-element list holding a {code: name} dict.
                for code, name in item[0].items():
                    cursor.execute(sql, (code, name))
            conn.commit()
        except Exception as exc:
            conn.rollback()
            print("出现错误", exc)
    finally:
        # Close both the cursor and the connection, whatever happened above.
        close_conn(conn, cursor)
    return None
  
  
def query(sql, *args):
    """Run a query on a fresh connection and return all result rows.

    :param sql: SQL statement; may contain ``%s`` placeholders.
    :param args: values bound to the placeholders, in order. When none are
        given the statement is executed as-is, so a literal ``%`` in *sql*
        keeps working.
    :return: whatever ``cursor.fetchall()`` yields, e.g. ``((), ())``.
    """
    conn, cursor = get_mysql_conn()
    try:
        # ``args or None``: an empty tuple would still trigger %-formatting
        # of the statement inside the driver, so pass None instead.
        cursor.execute(sql, args or None)
        return cursor.fetchall()
    finally:
        # Release the connection even when execute/fetchall raises.
        close_conn(conn, cursor)
  
  
"""
------------------------------------------------------------------------------------
"""
  
  
def get_mysql_conn():
    """Open a MySQL connection and create a cursor on it.

    The connection settings are blank placeholders and must be filled in
    before use.

    :return: ``(connection, cursor)``; the cursor is the driver's default
        cursor type.
    """
    settings = {
        "host": "",
        "user": "",
        "password": "",
        "db": "",
        "charset": "utf8",
    }
    connection = pymysql.connect(**settings)
    return connection, connection.cursor()
  
  
def close_conn(conn, cursor):
    """Close the cursor first, then the connection, skipping falsy values.

    :param conn: connection object to close, or a falsy value to skip it.
    :param cursor: cursor object to close, or a falsy value to skip it.
    """
    for resource in (cursor, conn):
        if not resource:
            continue
        resource.close()
  
  
# Script entry point: when run directly (not imported), crawl the '2022'
# edition and insert the results into the database via into_mysql.
if __name__ == '__main__':
    into_mysql('2022')

  

 

 数据格式和逻辑可根据自己需求更改!

posted @   zaijinyang  阅读(410)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· Docker 太简单,K8s 太复杂?w7panel 让容器管理更轻松!
点击右上角即可分享
微信分享提示