全国地区 4级 5级 mysql 数据

数据来源于 

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/

 注意表结构和字段编码为 utf8mb4

CREATE TABLE `area_5` (
  `area_id` bigint(13) NOT NULL AUTO_INCREMENT COMMENT '地区id',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '地区名称',
  `p_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '父级地区名称',
  `pid` bigint(13) NOT NULL DEFAULT '0' COMMENT '父级地区ID',
  `zip_code` varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '邮编',
  `level` tinyint(2) NOT NULL DEFAULT '1' COMMENT '地区层级',
  `sort` smallint(6) NOT NULL DEFAULT '0' COMMENT '排序值',
  PRIMARY KEY (`area_id`) USING BTREE,
  KEY `pid` (`pid`) USING BTREE
) ENGINE=MyISAM AUTO_INCREMENT=820008000001 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci COMMENT='区域表';

 

 

 

 

 

4级地区

 

五级地区

 

 

爬取代码

import scrapy

class AreaSpider(scrapy.Spider):
    name = 'area'

    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html']

    def parse(self, response):


        print("开始运行")

        href_list = response.xpath("//tr[@class='provincetr']/td/a/@href").extract()
        area_list = response.xpath("//tr[@class='provincetr']/td/a/@href").extract()

        i=0

        for href in href_list:
            url = response.urljoin(href)

            add_params = {}
            add_params['area_id'] = area_list[i]

            i=i+1

            yield scrapy.Request(url=url, callback=self.parse_city,cb_kwargs=add_params)

    # 城市的数据,传入父级省份的id,解开注释的yield 就返回城市数据,下方其他地区同理
    def parse_city(self, response,area_id):
        # for quote in response.css('tr.citytr'):
        #     yield {
        #         'area_id': quote.css('td a::text').get(),
        #         'name': quote.xpath('./td[2]/a/text()').get(),
        #         'url':area_id
        #     }

        href_list = response.xpath("//tr[@class='citytr']/td[1]/a/@href").extract()
        area_list = response.xpath("//tr[@class='citytr']/td[1]/a//text()").extract()

        i=0

        for href in href_list:
            url = response.urljoin(href)
            add_params = {}
            add_params['area_id'] = area_list[i]

            i = i + 1

            yield scrapy.Request(url=url, callback=self.parse_area, cb_kwargs=add_params)

    ##地区的数据
    def parse_area(self, response,area_id):

        href_list = response.xpath("//tr[@class='countytr']/td[1]/a/@href").extract()
        area_list = response.xpath("//tr[@class='countytr']/td[1]/a//text()").extract()

        print(href_list)

        i = 0

        for href in href_list:
            url = response.urljoin(href)

            add_params = {}
            add_params['area_id'] = area_list[i]

            i = i + 1

            yield scrapy.Request(url=url, callback=self.parse_street, cb_kwargs=add_params)

        # for quote in response.css('tr.countytr'):
        #     yield {
        #         'area_id': quote.xpath('./td[1]//text()').get(),
        #         'name': quote.xpath('./td[2]//text()').get(),
        #         'pid':area_id
        #     }

    ##街道数据
    def parse_street(self, response,area_id):

        href_list = response.xpath("//tr[@class='towntr']/td[1]/a/@href").extract()
        area_list = response.xpath("//tr[@class='towntr']/td[1]/a//text()").extract()

        print(href_list)

        i = 0

        for href in href_list:
            url = response.urljoin(href)

            add_params = {}
            # time.sleep(1)  # 休眠0.1秒

            print(i)
            add_params['area_id'] = area_list[i]

            i = i + 1

            yield scrapy.Request(url=url, callback=self.parse_cun, cb_kwargs=add_params)

        # for quote in response.css('tr.towntr'):
        #     yield {
        #         'area_id': quote.xpath('./td[1]//text()').get(),
        #         'name': quote.xpath('./td[2]//text()').get(),
        #         'pid':area_id
        #     }

    ##村\居委会数据
    def parse_cun(self, response,area_id):
        ##没有下级了就直接返回了
        for quote in response.css('tr.villagetr'):
            yield {
                'area_id': quote.xpath('./td[1]//text()').get(),
                'name': quote.xpath('./td[3]//text()').get(),
                'type': quote.xpath('./td[2]//text()').get(),
                'pid':area_id
            }
View Code

 

posted @ 2022-10-04 23:25  万隆  阅读(224)  评论(0编辑  收藏  举报