使用 PHP 抓取国家统计局最新的县及县以上行政区划代码

起因:

  前两天突然想找一个省、市、县的行政区划代码库,却发现网上的数据要么不是最新的,要么需要账号、积分或者付费,让人很烦,于是就写了这个脚本。

数据库结构:

 1 CREATE TABLE IF NOT EXISTS `area` (
 2   `id` int(11) NOT NULL auto_increment,
 3   `code` varchar(6) NOT NULL,
 4   `name` varchar(20) NOT NULL,
 5   `citycode` varchar(6) NOT NULL,
 6   PRIMARY KEY  (`id`)
 7 ) ENGINE=Innodb  DEFAULT CHARSET=utf8;
 8  
 9 CREATE TABLE IF NOT EXISTS `city` (
10   `id` int(11) NOT NULL auto_increment,
11   `code` varchar(6) NOT NULL,
12   `name` varchar(20) NOT NULL,
13   `provincecode` varchar(6) NOT NULL,
14   PRIMARY KEY  (`id`)
15 ) ENGINE=Innodb  DEFAULT CHARSET=utf8;
16 
17  
18 CREATE TABLE IF NOT EXISTS `province` (
19   `id` int(11) NOT NULL auto_increment,
20   `code` varchar(6) NOT NULL,
21   `name` varchar(20) NOT NULL,
22   PRIMARY KEY  (`id`)
23 ) ENGINE=Innodb  DEFAULT CHARSET=utf8 ;

脚本文件:

  1 <?php
  2 set_time_limit(0);
  3 /**
  4  * 
  5  */
  6 class get_city_code {
  7     //
  8     private $html = '';
  9     public  $code_rt;
 10     private static $instance = '';
 11     private $db = '';
 12     private $box = array();
 13     private $url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html';
 14     //
 15     private function __construct() {
 16         
 17     }
 18 
 19     public static function getInstance() {
 20         if ( ! self::$instance instanceof get_city_code ) {
 21             self::$instance = new self();
 22         }
 23         return self::$instance;
 24     }
 25     public  function start() {
 26         //
 27         $this->connect_tongji_html();
 28         $this->code_rt = new code_result();
 29         $this->code_rt->html = $this->html;
 30         $this->code_rt->filter_all_data();
 31     }
 32     private function connect_tongji_html() {
 33         $ch = curl_init();
 34         $url = $this->getUrl();
 35 
 36         curl_setopt($ch, CURLOPT_URL, $url);
 37         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
 38         curl_setopt($ch, CURLOPT_HEADER, 0);
 39         //执行并获取HTML文档内容
 40         $this->html = curl_exec($ch);
 41 
 42         //释放curl句柄
 43         curl_close($ch);
 44     }
 45 
 46     //获取url
 47     private function getUrl() {
 48         return $this->url;
 49     }
 50 
 51     //插入数据库
 52     public function insert() {
 53         $i=0;
 54         $box = array();
 55         foreach($this->code_rt->code_data as $k => $v) {
 56             $k_arr = str_split($k,2);
 57             $first = $k_arr[0];
 58             $second = $k_arr[1];
 59             $three = $k_arr[2];
 60 
 61             if( !empty( $box[$first]) ){
 62                 if( !empty($box[$first][$second]) ) {
 63                     $box[$first][$second][$three] = $v;
 64                 }else{                    
 65                     $box[$first][$second][] = $v;
 66                 }
 67             }else{
 68                 $box[$first][] = array($v);
 69             }
 70         }
 71         
 72         $this->get_db();
 73         //print_r($box);
 74         foreach( $box as $k1=>$v1){
 75 
 76             $code1 = $k1."0000";
 77             $name1 = $v1[0][0];
 78             $sql = "insert into province values (NULL,'".$code1."','".$name1."')";
 79             $this->db->query($sql);
 80             foreach ( $v1 as $k2 => $v2 ) {
 81                 if($k2 == 0) {
 82                     continue;
 83                 }
 84                 $code2 = $k1.$k2."00";
 85                 $name2 = $v2[0]=='市辖区' ? $name1 : $v2[0];
 86                 if( $name2 == '县'){
 87                     continue;
 88                 }
 89                 $sql = "insert into city values (NULL,'".$code2."','".$name2."','".$code1."')";
 90                 $this->db->query($sql);
 91 
 92                 foreach( $v2 as $k3=>$v3 ) {
 93                     if($k3 == 0){
 94                         continue;
 95                     }
 96                     $code3 = $k1.$k2.$k3;
 97                     $name3 = $v3;
 98                     $sql = "insert into area values (NULL,'".$code3."','".$name3."','".$code2."')";
 99                     $this->db->query($sql);
100                 }
101             }
102         }
103 
104         $this->db->close();
105     }
106 
107     //
108     private function get_db () {
109         $db = new mysqli('localhost','root','sunl','blog');
110         $db->set_charset('utf8');
111         $this->db = $db;
112     }
113 }
114 
/**
 * Extracts "6-digit code + Chinese name" pairs from the downloaded page.
 *
 * Set $html to the raw page, call filter_all_data(), then read $code_data
 * (or getCodeData()).
 */
class code_result {
    /** @var string Raw HTML before filter_all_data(); the tag- and whitespace-free text afterwards. */
    public $html = '';
    /** @var array Map of division code => division name. */
    public $code_data = array();
    /** @var array Codes in document order, parallel to $name_arr. */
    private $code_arr  = array();
    /** @var array Names in document order, parallel to $code_arr. */
    private $name_arr  = array();

    public function __construct () {

    }

    /**
     * Parses $this->html into $this->code_data.
     *
     * Takes the span from the first "<p" to the last "></p>" on one line,
     * strips tags, "&nbsp;" entities and all whitespace, then collects every
     * run of exactly six digits that is directly followed by Chinese
     * characters. Code and name are captured together, so stray numbers or
     * stray text cannot shift the pairing. The text is processed as UTF-8.
     *
     * When nothing matches, or the text is not valid UTF-8, $code_data is
     * left as an empty array.
     */
    public function filter_all_data() {
        $this->code_data = array();
        $this->code_arr  = array();
        $this->name_arr  = array();

        // Grab the paragraph markup that carries the code table.
        $patten = "/<p.*><\/p>/";
        if ( !is_string($this->html) || !preg_match($patten, $this->html, $p) ) {
            $this->html = '';
            return;
        }

        // Drop the markup and the literal &nbsp; entities.
        $text = str_replace('&nbsp;', '', strip_tags($p[0]));

        // Remove every kind of spacing, including the no-break space (U+00A0)
        // and the ideographic space (U+3000) that a byte-wise \s does not cover.
        $clean = preg_replace('/[\s\x{00A0}\x{3000}]+/u', '', $text);
        if ( $clean === null ) {
            // preg_replace() returns null on error, e.g. input that is not valid UTF-8.
            $this->html = $text;
            return;
        }
        $this->html = $clean;

        // Capture each code together with the name that follows it.
        $patten2 = '/(\d{6})([\x{4e00}-\x{9fa5}]+)/u';
        if ( !preg_match_all($patten2, $this->html, $m) ) {
            return;
        }

        $this->code_arr  = $m[1];
        $this->name_arr  = $m[2];
        $this->code_data = array_combine($this->code_arr, $this->name_arr);
    }

    /**
     * Returns the parsed code => name map.
     *
     * @return array
     */
    public function getCodeData() {
        return $this->code_data;
    }
}

// Run the import: download and parse the page, then write the rows to MySQL.
try {
    $code = get_city_code::getInstance();
    $code->start();
    $code->insert();
} catch (Exception $e) {
    // End with a short log message and a non-zero exit status instead of an
    // uncaught-exception fatal error.
    error_log('Import of administrative division codes failed: ' . $e->getMessage());
    exit(1);
}
?>

 

posted @ 2015-10-19 14:53  jayson.s  阅读(882)  评论(0)  编辑  收藏  举报