Python: crawl China urban-rural division codes, 5 levels (province / city / county / town / village)
#!/bin/env python
# coding:utf-8
"""Crawl China's urban-rural division codes (5 levels: province, city, county,
town, village) from stats.gov.cn.

Strategy: seed a URL work-queue file with the province index page, then
repeatedly pop the first URL, parse its table rows, append the parsed rows to
the data file and the child URLs back onto the queue (breadth-first crawl).

Output files (Windows paths, as in the original script):
  - D:\curl.txt  -- URL work queue, one URL per line
  - D:\city.txt  -- parsed rows: [level, parent_code, code, name, url]
"""
import re
import time

import requests
from bs4 import BeautifulSoup
from urllib import parse

header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.stats.gov.cn",
    "Referer": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
}
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'

# Raw strings so the Windows paths can never be mangled by escape sequences.
URL_FILE = r'D:\curl.txt'
DATA_FILE = r'D:\city.txt'


def gethtml(url_str='data'):
    """Fetch *url_str* and return its HTML decoded as GBK.

    The site declares gb2312 but `requests` guesses latin1, hence the
    encode("latin1").decode("gbk") round-trip.  The site intermittently
    serves a non-gb2312 error page, so we retry until the declared charset
    is gb2312.  Returns '' on any failure (fix: the original returned None
    from the except path, which crashed callers inside re.findall).
    """
    try:
        resp = req.get(url_str)
        if resp.status_code != 200:
            return ''
        html = resp.text.encode("latin1").decode("gbk")
        while re.findall('charset=(.*?)"', html) != ['gb2312']:
            print('进入while')
            resp = req.get(url_str)
            html = resp.text.encode("latin1").decode("gbk")
        return html
    except (requests.RequestException, UnicodeError) as e:
        # Narrowed from BaseException: we only expect network/decode errors.
        print('抓取出现错误:', e)
        return ''


def writetxt(text, str_type):
    """Append one line to the URL queue ('url') or to the data file (else).

    Fix: the parameter was named `str`, shadowing the builtin.  Files are
    opened with an explicit utf-8 encoding for portability.
    """
    file_str = URL_FILE if str_type == 'url' else DATA_FILE
    with open(file_str, 'a+', encoding='utf-8') as s_f:
        s_f.writelines(text + '\n')


def province(url_str):
    """Parse the index page and enqueue every province (level 1)."""
    if url_str == '':
        return
    page = gethtml(url_str)
    found = re.findall("</td></tr>\r\n(.*)\r\n</table>", page, re.S)
    soup = BeautifulSoup(found[0] if found else '', 'lxml')
    for tr in soup.findAll('tr', class_='provincetr'):
        for a in tr.find_all(name='a'):
            sid = a['href'].split('.')[0]
            txt = a.get_text()
            child_url = parse.urljoin(url_str, a['href'])
            print('level_1', ['0', sid, txt, child_url])
            writetxt(str(['level_1', '0', sid, txt, child_url]), 'data')
            writetxt(child_url, 'url')


def getcity(url_str):
    """Parse one level-2..5 page and enqueue its rows.

    The level is derived from the parent code embedded in the URL: a 2-digit
    code means this page lists cities (level 2), and so on down to villages.
    Fix: all regexes with backslash classes are now raw strings (the originals
    triggered invalid-escape DeprecationWarnings on modern Python).
    """
    if url_str == '':
        return
    page = gethtml(url_str)
    found = re.findall("</td></tr>\r\n(.*)\r\n</table>", page, re.S)
    soup = BeautifulSoup(found[0] if found else '', 'lxml')
    parent = re.findall(r"(\d+).html", url_str)
    parent = parent[0] if parent else ''
    level = str(int(len(parent) / 2 + 1))
    class_str = {'2': 'citytr', '3': 'countytr', '4': 'towntr', '5': 'villagetr'}
    for tr in soup.findAll('tr', class_=class_str[level]):
        sid = re.findall(r'\d+', tr.get_text())
        sid = sid[0] if sid else ''
        txt = re.findall(r'\D+', tr.get_text())
        txt = txt[0] if txt else ''
        child = re.findall('href="(.*?)">', str(tr))
        child = parse.urljoin(url_str, child[0]) if child else ''
        print('level_' + level, [parent, sid, txt, child])
        writetxt(str(['level_' + level, parent, sid, txt, child]), 'data')
        writetxt(child, 'url')


def updateurl():
    """Pop the head of the URL queue file (rewrite it without line 0)."""
    with open(URL_FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    with open(URL_FILE, 'w+', encoding='utf-8') as f_w:
        if lines:
            lines[0] = ''
        f_w.writelines(lines)


def geturl():
    """Return the URL at the head of the queue, or '' if the queue is empty."""
    with open(URL_FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return lines[0].strip() if lines else ''


req = requests.session()
req.headers = header


def main():
    """Seed the queue with the provinces, then drain it breadth-first."""
    province(url)
    while True:  # fix: was `while 1 != 2`
        try:
            current_url = geturl()
            if current_url == '':
                # Fix: the original busy-spun forever once the queue was
                # drained; stop when there is nothing left to crawl.
                break
            getcity(current_url)
            updateurl()
        except Exception as e:  # narrowed from BaseException
            print(e)
            time.sleep(1)


if __name__ == '__main__':
    main()