国家统计局区划码爬取
目标数据
Oracle 存储表格
-- Create table
-- One row per administrative division; columns mirror the values the crawler
-- inserts (area_code, area_name, city_village_code).
--
-- No TABLESPACE clause: the original placed the table in SYSTEM, which Oracle
-- uses for the data dictionary. Omitting the clause creates the table in the
-- schema owner's default tablespace instead.
--
-- NOTE(review): id is documented below as an auto-increment ID, but no
-- sequence, trigger or identity clause is defined here, and inserts that omit
-- id will leave it NULL unless one exists elsewhere -- confirm.
create table VILLAGE_CODE
(
  id                INTEGER,
  area_code         VARCHAR2(500),
  city_village_code VARCHAR2(500),
  area_name         VARCHAR2(500)
)
pctfree 10
pctused 40
initrans 1
maxtrans 255
storage
(
  initial 64K
  next 1M
  minextents 1
  maxextents unlimited
);

-- Add comments to the columns
-- id: auto-increment ID
comment on column VILLAGE_CODE.id
  is '自增ID';
-- area_code: division code used for statistics
comment on column VILLAGE_CODE.area_code
  is '统计用区划代码';
-- city_village_code: urban-rural classification code
comment on column VILLAGE_CODE.city_village_code
  is '城乡分类代码 ';
-- area_name: name of the division
comment on column VILLAGE_CODE.area_name
  is '名称';
爬取代码
#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: 1499418300@qq.com
@file: areacode.py
@time: 2018/9/29 14:40
@desc: Single-process crawler. Starting from the index page it walks the
       province -> city -> county -> town -> village pages depth-first and
       inserts every (code, name) pair it finds into the village_code table.
'''
import urllib2, re
from time import sleep
from random import random
from config import DBSession
# NOTE(review): assumes config.DBSession is a SQLAlchemy sessionmaker -- confirm.
from sqlalchemy import text

# The header value must not repeat the header name; the original sent
# "User-Agent: User-Agent: Mozilla/...".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}

# Cells of a row whose code and name are both hyperlinks to the next level:
# group 1 = href, group 2 = code, group 3 = name.
LINK_CELLS = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"

# Bound parameters keep scraped text (which may contain quotes) out of the SQL.
INSERT_SQL = text(
    "insert into village_code(area_code, area_name, city_village_code) "
    "VALUES (:area_code, :area_name, :city_village_code)"
)

session = DBSession()


def insertVillage(code, name, city_village_code='-1'):
    """Insert one division row and commit immediately.

    :param code: value stored in area_code.
    :param name: value stored in area_name.
    :param city_village_code: value stored in city_village_code; defaults to
        '-1' for levels whose pages do not carry that column.
    """
    print('%s %s %s' % (code, name, city_village_code))
    session.execute(INSERT_SQL, {
        'area_code': code,
        'area_name': name,
        'city_village_code': city_village_code,
    })
    session.commit()


def openUrl(url, type):
    """Fetch ``url`` and return its body decoded from GBK, or None on failure.

    A random pause of up to 0.5 s precedes every request. On failure the URL
    and the crawl level ``type`` are appended to error.txt.

    Only ``Exception`` is caught, and the return is outside ``finally``, so
    Ctrl-C (KeyboardInterrupt) is no longer silently swallowed.
    """
    try:
        sleep(random() * 0.5)
        request = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(request, timeout=10).read().decode('gbk')
    except Exception:
        html = None
        with open('error.txt', 'a+') as f:
            f.write(url + ' ' + str(type) + '\n')
    return html


def _descend(baseUrl, lastUrl):
    """Move the first path segment of ``lastUrl`` onto ``baseUrl``.

    Returns ``(newBaseUrl, remainingPath)`` so that hrefs found on the fetched
    page can be resolved relative to that page's own directory.
    """
    segments = lastUrl.split('/')
    return baseUrl + segments[0] + '/', '/'.join(segments[1:])


def _crawlLinkedLevel(baseUrl, lastUrl, level, rowClass, nextParser):
    """Fetch one page, store each linked row and recurse into its href."""
    html = openUrl(baseUrl + lastUrl, level)
    if not html:
        return
    for tr in re.findall("<tr class='%s'>.+?</tr>" % rowClass, html):
        for href, code, name in re.findall(LINK_CELLS, tr):
            insertVillage(code, name)
            nextParser(baseUrl, href)


def parseCode1(baseUrl, lastUrl):
    """Level 1: follow every province link on the index page (nothing stored)."""
    html = openUrl(baseUrl + lastUrl, 1)
    if html:
        for tr in re.findall("<tr class='provincetr'>.+?</tr>", html):
            for td in re.findall("<a href='(.+?html)'>(.+?)<br/>", tr):
                parseCode2(baseUrl, td[0])


def parseCode2(baseUrl, lastUrl):
    """Level 2: store the 'citytr' rows of a province page and descend."""
    _crawlLinkedLevel(baseUrl, lastUrl, 2, 'citytr', parseCode3)


def parseCode3(baseUrl, lastUrl):
    """Level 3: store the 'countytr' rows of a city page and descend."""
    baseUrl, lastUrl = _descend(baseUrl, lastUrl)
    _crawlLinkedLevel(baseUrl, lastUrl, 3, 'countytr', parseCode4)


def parseCode4(baseUrl, lastUrl):
    """Level 4: store the 'towntr' rows of a county page and descend."""
    baseUrl, lastUrl = _descend(baseUrl, lastUrl)
    _crawlLinkedLevel(baseUrl, lastUrl, 4, 'towntr', parseCode5)


def parseCode5(baseUrl, lastUrl):
    """Level 5: store the 'villagetr' rows of a town page (leaf level).

    Each row has three plain cells; the first is stored as area_code, the
    second as city_village_code and the third as area_name.
    """
    baseUrl, lastUrl = _descend(baseUrl, lastUrl)
    html = openUrl(baseUrl + lastUrl, 5)
    if html:
        for tr in re.findall("<tr class='villagetr'>.+?</tr>", html):
            for td in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", tr):
                insertVillage(td[0], td[2], td[1])


if __name__ == "__main__":
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    parseCode1(baseUrl, 'index.html')
分布式爬取
纯手写
#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: 1499418300@qq.com
@file: areacode.py
@time: 2018/9/29 14:40
@desc: Distributed crawler. Page URLs are exchanged through the Redis lists
       'area_code2' .. 'area_code5' (one per crawl level); workers running
       run() pop URLs, store the rows they find in village_code and push the
       next level's URLs. URLs that fail to download go to 'area_code_error'.
'''
import urllib2, re, os, redis
from time import sleep
from random import random
from sqlalchemy import *
from sqlalchemy.orm import sessionmaker
from multiprocessing import Process

os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'
engine = create_engine('oracle://xxx:xxx@xxx:1521/xe', pool_size=100, encoding='utf8')
DBSession = sessionmaker(bind=engine)
session = DBSession()
pool = redis.ConnectionPool(host='xxx', port=6379)
MRedis = redis.Redis(connection_pool=pool)

# The header value must not repeat the header name; the original sent
# "User-Agent: User-Agent: Mozilla/...".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}

# Cells of a row whose code and name are both hyperlinks to the next level:
# group 1 = href, group 2 = code, group 3 = name.
LINK_CELLS = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"

# Bound parameters keep scraped text (which may contain quotes) out of the SQL.
INSERT_SQL = text(
    "insert into village_code(area_code, area_name, city_village_code) "
    "VALUES (:area_code, :area_name, :city_village_code)"
)


def insertVillage(code, name, city_village_code='-1'):
    """Insert one division row and commit immediately.

    :param code: value stored in area_code.
    :param name: value stored in area_name.
    :param city_village_code: value stored in city_village_code; defaults to
        '-1' for levels whose pages do not carry that column.
    """
    print('%s %s %s' % (code, name, city_village_code))
    session.execute(INSERT_SQL, {
        'area_code': code,
        'area_name': name,
        'city_village_code': city_village_code,
    })
    session.commit()


def openUrl(url):
    """Fetch ``url`` and return its body decoded from GBK, or None on failure.

    A random pause of up to 0.5 s precedes every request. On failure the URL
    is pushed onto the Redis list 'area_code_error'.

    Only ``Exception`` is caught, and the return is outside ``finally``, so
    Ctrl-C (KeyboardInterrupt) is no longer silently swallowed.
    """
    try:
        sleep(random() * 0.5)
        request = urllib2.Request(url, headers=headers)
        html = urllib2.urlopen(request, timeout=10).read().decode('gbk')
    except Exception:
        html = None
        MRedis.lpush('area_code_error', url)
    return html


def run():
    """Worker loop: drain the level queues in order, forever.

    Each pass empties 'area_code2', then 'area_code3', 'area_code4' and
    'area_code5', downloading every popped URL and handing the page to the
    matching parser. When a full pass finds every queue empty the worker
    sleeps for a second instead of polling Redis in a tight loop.
    """
    # Resolved here (not at import time) because the parsers are defined below.
    levels = (
        ('area_code2', parseCode2),
        ('area_code3', parseCode3),
        ('area_code4', parseCode4),
        ('area_code5', parseCode5),
    )
    while True:
        handledAny = False
        for queueName, parser in levels:
            pageUrl = MRedis.lpop(queueName)
            while pageUrl:
                handledAny = True
                html = openUrl(pageUrl)
                if html:
                    parser(html, pageUrl)
                pageUrl = MRedis.lpop(queueName)
        if not handledAny:
            sleep(1)


def _storeAndEnqueue(html, url, rowClass, nextQueue):
    """Store every linked row of ``html`` and queue its href for the next level.

    Hrefs are resolved against the directory of ``url`` (everything before
    the last '/').
    """
    baseUrl = '/'.join(url.split('/')[:-1])
    for tr in re.findall("<tr class='%s'>.+?</tr>" % rowClass, html):
        for href, code, name in re.findall(LINK_CELLS, tr):
            insertVillage(code, name)
            MRedis.lpush(nextQueue, baseUrl + '/' + href)


def parseCode1(baseUrl, lastUrl):
    """Seed step: download the index page and queue every province page."""
    html = openUrl(baseUrl + lastUrl)
    if html:
        for tr in re.findall("<tr class='provincetr'>.+?</tr>", html):
            for td in re.findall("<a href='(.+?html)'>(.+?)<br/>", tr):
                MRedis.lpush('area_code2', baseUrl + td[0])


def parseCode2(html, url):
    """Store the 'citytr' rows and queue their pages on 'area_code3'."""
    _storeAndEnqueue(html, url, 'citytr', 'area_code3')


def parseCode3(html, url):
    """Store the 'countytr' rows and queue their pages on 'area_code4'."""
    _storeAndEnqueue(html, url, 'countytr', 'area_code4')


def parseCode4(html, url):
    """Store the 'towntr' rows and queue their pages on 'area_code5'."""
    _storeAndEnqueue(html, url, 'towntr', 'area_code5')


def parseCode5(html, url=None):
    """Store the 'villagetr' rows of a town page (leaf level).

    Each row has three plain cells; the first is stored as area_code, the
    second as city_village_code and the third as area_name.

    :param url: accepted (and ignored) so that run() can call every parser
        as ``parser(html, url)``; the original one-argument signature made
        that call raise TypeError.
    """
    for tr in re.findall("<tr class='villagetr'>.+?</tr>", html):
        for td in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", tr):
            insertVillage(td[0], td[2], td[1])


if __name__ == "__main__":
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    # Seeding only: queues the province pages on 'area_code2'.
    parseCode1(baseUrl, 'index.html')
    # Workers are not started here. To consume the queues, start one or more
    # worker processes, e.g.:
    #     Process(target=run).start()
Become a Linux Programmer