PHP 抓取国家统计局最新县及县以上行政区划代码
起因:
前两天突然想找一个省、市、县的行政区划代码库,发现网上的要么不是最新的,要么需要账号、积分,甚至要付费,实在让人心烦,于是就写了这个脚本。
数据库结构:
-- County-level divisions. `citycode` stores the 6-digit `code` of the
-- parent row in `city`.
CREATE TABLE IF NOT EXISTS `area` (
    `id`       INT(11)     NOT NULL AUTO_INCREMENT,
    `code`     VARCHAR(6)  NOT NULL,
    `name`     VARCHAR(20) NOT NULL,
    `citycode` VARCHAR(6)  NOT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Prefecture-level divisions. `provincecode` stores the 6-digit `code` of
-- the parent row in `province`.
CREATE TABLE IF NOT EXISTS `city` (
    `id`           INT(11)     NOT NULL AUTO_INCREMENT,
    `code`         VARCHAR(6)  NOT NULL,
    `name`         VARCHAR(20) NOT NULL,
    `provincecode` VARCHAR(6)  NOT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- Province-level divisions (top of the hierarchy).
CREATE TABLE IF NOT EXISTS `province` (
    `id`   INT(11)     NOT NULL AUTO_INCREMENT,
    `code` VARCHAR(6)  NOT NULL,
    `name` VARCHAR(20) NOT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
脚本文件:
<?php
set_time_limit(0);

/**
 * Downloads the administrative-division code list published at $url and
 * stores it in the `province`, `city` and `area` tables.
 *
 * Usage: get_city_code::getInstance(), then start() (download + parse),
 * then insert() (write to MySQL).
 */
class get_city_code {
    // Raw HTML of the downloaded page.
    private $html = '';
    // code_result instance holding the parsed code => name map (set by start()).
    public $code_rt;
    private static $instance = null;
    // mysqli connection, opened by get_db().
    private $db = '';
    private $box = array();
    private $url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html';

    // Construction goes through getInstance().
    private function __construct() {

    }

    /**
     * Returns the single shared instance, creating it on first use.
     *
     * @return get_city_code
     */
    public static function getInstance() {
        if ( ! self::$instance instanceof get_city_code ) {
            self::$instance = new self();
        }
        return self::$instance;
    }

    /**
     * Downloads the page and parses it into $this->code_rt->code_data.
     *
     * @throws RuntimeException when the download or the parsing fails
     */
    public function start() {
        $this->connect_tongji_html();
        $this->code_rt = new code_result();
        $this->code_rt->html = $this->html;
        $this->code_rt->filter_all_data();
    }

    /**
     * Fetches the page at getUrl() with cURL and stores the body in $this->html.
     *
     * @throws RuntimeException when cURL cannot be initialised or the request fails
     */
    private function connect_tongji_html() {
        $url = $this->getUrl();
        $ch = curl_init();
        if ($ch === false) {
            throw new RuntimeException('Unable to initialise cURL');
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);

        // Execute the request; with RETURNTRANSFER the result is the document
        // body, or false on failure.
        $html = curl_exec($ch);
        $error = curl_error($ch);

        // Release the cURL handle before reporting any failure.
        curl_close($ch);

        if ($html === false) {
            throw new RuntimeException('Failed to download ' . $url . ': ' . $error);
        }
        $this->html = $html;
    }

    // Returns the URL of the page to download.
    private function getUrl() {
        return $this->url;
    }

    /**
     * Writes the parsed codes into the province / city / area tables.
     *
     * Rows are inserted through prepared statements because the names come
     * from a remote page and must not be spliced into SQL text.
     *
     * @throws LogicException   when called before start()
     * @throws RuntimeException when the connection or any statement fails
     */
    public function insert() {
        if ( ! $this->code_rt instanceof code_result ) {
            throw new LogicException('start() must be called before insert()');
        }

        $box = $this->group_codes($this->code_rt->code_data);

        $this->get_db();
        try {
            $stmt_province = $this->prepare_or_fail("insert into province values (NULL, ?, ?)");
            $stmt_city = $this->prepare_or_fail("insert into city values (NULL, ?, ?, ?)");
            $stmt_area = $this->prepare_or_fail("insert into area values (NULL, ?, ?, ?)");

            // bind_param() binds by reference: assigning to these variables
            // below changes what the next execute() sends.
            $code1 = $name1 = $code2 = $name2 = $code3 = $name3 = '';
            $stmt_province->bind_param('ss', $code1, $name1);
            $stmt_city->bind_param('sss', $code2, $name2, $code1);
            $stmt_area->bind_param('sss', $code3, $name3, $code2);

            foreach ( $box as $k1 => $v1 ) {
                $code1 = $k1 . "0000";
                $name1 = $v1[0][0];
                $this->execute_or_fail($stmt_province);

                foreach ( $v1 as $k2 => $v2 ) {
                    // Slot 0 holds the province's own name, not a city.
                    if ( (int) $k2 === 0 ) {
                        continue;
                    }
                    $code2 = $k1 . $k2 . "00";
                    // A "市辖区" group is stored under the province's name.
                    $name2 = $v2[0] === '市辖区' ? $name1 : $v2[0];
                    // NOTE(review): skipping a "县" group also skips every
                    // county listed under it - confirm this is intended.
                    if ( $name2 === '县' ) {
                        continue;
                    }
                    $this->execute_or_fail($stmt_city);

                    foreach ( $v2 as $k3 => $v3 ) {
                        // Slot 0 holds the city's own name, not a county.
                        if ( (int) $k3 === 0 ) {
                            continue;
                        }
                        $code3 = $k1 . $k2 . $k3;
                        $name3 = $v3;
                        $this->execute_or_fail($stmt_area);
                    }
                }
            }
        } catch (Exception $e) {
            // Do not leave the connection open when the import aborts.
            $this->db->close();
            throw $e;
        }

        $this->db->close();
    }

    /**
     * Groups the flat code => name map by the three 2-digit parts of each code.
     *
     * Result shape: $box[province][0][0] is the province name,
     * $box[province][city][0] the city name and
     * $box[province][city][county] a county name.
     *
     * @param array $code_data map of 6-digit code => name, in page order
     * @return array
     */
    private function group_codes(array $code_data) {
        $box = array();
        foreach ( $code_data as $k => $v ) {
            // Numeric-string array keys come back as integers; cast before splitting.
            $k_arr = str_split((string) $k, 2);
            $first = $k_arr[0];
            $second = $k_arr[1];
            $three = $k_arr[2];

            if ( !empty($box[$first]) ) {
                if ( !empty($box[$first][$second]) ) {
                    $box[$first][$second][$three] = $v;
                } else {
                    // First entry seen for this city prefix: the city itself.
                    $box[$first][$second][] = $v;
                }
            } else {
                // First entry seen for this province prefix: the province itself.
                $box[$first][] = array($v);
            }
        }
        return $box;
    }

    // Prepares $sql on the open connection or throws with the server's error text.
    private function prepare_or_fail($sql) {
        $stmt = $this->db->prepare($sql);
        if ($stmt === false) {
            throw new RuntimeException('Failed to prepare "' . $sql . '": ' . $this->db->error);
        }
        return $stmt;
    }

    // Executes a prepared statement or throws with the statement's error text.
    private function execute_or_fail($stmt) {
        if ( ! $stmt->execute() ) {
            throw new RuntimeException('Insert failed: ' . $stmt->error);
        }
    }

    /**
     * Opens the MySQL connection (utf8) and stores it in $this->db.
     *
     * @throws RuntimeException when connecting or selecting the charset fails
     */
    private function get_db () {
        $db = new mysqli('localhost','root','sunl','blog');
        if ($db->connect_errno) {
            throw new RuntimeException('Database connection failed: ' . $db->connect_error);
        }
        if ( ! $db->set_charset('utf8') ) {
            throw new RuntimeException('Unable to select the utf8 charset: ' . $db->error);
        }
        $this->db = $db;
    }
}

/**
 * Parses the downloaded HTML into a map of 6-digit code => division name.
 */
class code_result {
    // Input HTML; replaced by the tag- and whitespace-free text after parsing.
    public $html = '';
    // Parsed result: 6-digit code => name, in page order.
    public $code_data = array();
    private $code_arr = array();
    private $name_arr = array();

    public function __construct () {

    }

    /**
     * Extracts every "code + name" pair from $this->html into $this->code_data.
     *
     * Codes and names are matched together as pairs, so a stray number or a
     * stray run of Chinese text elsewhere in the page cannot shift the
     * pairing.
     *
     * @throws RuntimeException when the text cannot be processed or holds no pairs
     */
    public function filter_all_data() {
        // Narrow to the paragraph markup when present; otherwise use the
        // whole document instead of reading an undefined match.
        $source = $this->html;
        if ( preg_match('/<p\b.*<\/p>/s', $this->html, $p) === 1 ) {
            $source = $p[0];
        }

        // Drop tags, &nbsp; entities and every kind of blank (ASCII
        // whitespace, no-break space, ideographic space).
        // NOTE(review): the published listing showed the entity patterns as
        // a plain blank, presumably a rendered &nbsp; - confirm against the
        // author's original file.
        $text = preg_replace('/&nbsp;|[\s\x{00A0}\x{3000}]+/u', '', strip_tags($source));
        if ($text === null) {
            throw new RuntimeException('Unable to clean the page text (invalid UTF-8?)');
        }
        $this->html = $text;

        // A code is exactly six digits not preceded by another digit,
        // immediately followed by its Chinese name.
        $pair_pattern = "/(?<!\d)(\d{6})([\x{4e00}-\x{9fa5}]+)/u";
        $found = preg_match_all($pair_pattern, $this->html, $m);
        if ( ! $found ) {
            throw new RuntimeException('No administrative-division codes found in the page');
        }

        $this->code_arr = array($m[1]);
        $this->name_arr = array($m[2]);

        $this->code_data = array();
        foreach ( $m[1] as $i => $code ) {
            // As with array_combine(), a repeated code keeps its last name.
            $this->code_data[$code] = $m[2][$i];
        }
    }

    // Returns the parsed code => name map.
    public function getCodeData() {
        return $this->code_data;
    }
}

$code = get_city_code::getInstance();
$code->start();
$code->insert();
?>
做好记录,以后总会用到!