国家统计局区划码爬取

目标数据

 

Oracle 存储表格

-- Create table
-- Holds one row per administrative division scraped by the crawler.
-- No TABLESPACE / physical-attribute clause is specified on purpose:
-- application tables should not live in the SYSTEM tablespace (it holds the
-- data dictionary), so the table is created in the owning schema's default
-- tablespace with the database's default storage settings.
-- NOTE(review): id is described as an auto-increment ID, but nothing in this
-- script populates it (no sequence, trigger or identity clause) and the
-- crawler's INSERT does not supply it - confirm how it is meant to be filled.
create table VILLAGE_CODE
(
  id                INTEGER,
  area_code         VARCHAR2(500),
  city_village_code VARCHAR2(500),
  area_name         VARCHAR2(500)
);
-- Add comments to the columns 
comment on column VILLAGE_CODE.id
  is '自增ID';
comment on column VILLAGE_CODE.area_code
  is '统计用区划代码';
comment on column VILLAGE_CODE.city_village_code
  is '城乡分类代码    ';
comment on column VILLAGE_CODE.area_name
  is '名称';

 

爬取代码

#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: 1499418300@qq.com
@file: areacode.py
@time: 2018/9/29 14:40
@desc:
'''

import urllib2, re
from time import sleep
from random import random
from config import DBSession


# HTTP headers sent with every request; a browser-style User-Agent string.
# The value must be the agent string only: repeating the "User-Agent: " field
# name inside the value would send "User-Agent: User-Agent: Mozilla/...".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
# Module-level database session (DBSession comes from the project's config
# module); insertVillage() executes and commits every INSERT through it.
session = DBSession()


def insertVillage(code, name, city_village_code='-1'):
    """Insert one division row into village_code and commit it immediately.

    :param code: division code, stored in the area_code column
    :param name: division name, stored in the area_name column
    :param city_village_code: value for the city_village_code column; callers
        that do not pass one get the default '-1'
    """
    # Local import: this script only imports DBSession at module level.
    from sqlalchemy import text

    print('%s %s %s' % (code, name, city_village_code))
    # Bind the scraped values instead of formatting them into the statement:
    # text taken from a web page may contain quotes, which would break (or
    # inject into) string-built SQL.
    session.execute(
        text("insert into village_code(area_code, area_name, city_village_code) "
             "VALUES (:area_code, :area_name, :city_village_code)"),
        {
            'area_code': code,
            'area_name': name,
            'city_village_code': city_village_code,
        })
    session.commit()


def openUrl(url, type):
    """Download ``url`` and return its body decoded as GBK, or None on failure.

    A failed download (network error, timeout, decode error, ...) is appended
    to 'error.txt' together with ``type`` so it can be retried later.

    :param url: absolute URL of the page to fetch
    :param type: level marker supplied by the caller (1-5 in this script);
        it is only written to the error log
    :return: the decoded page text, or None when fetching/decoding failed
    :raises: errors raised while writing 'error.txt' are not suppressed
    """
    try:
        # Random pause of up to 0.5 s before each request.
        sleep(random()*0.5)
        request = urllib2.Request(url,headers=headers)
        return urllib2.urlopen(request,timeout=10).read().decode('gbk')
    except Exception:
        # Catch ordinary errors only: a bare "except" combined with a
        # "return" inside "finally" also swallowed KeyboardInterrupt and
        # SystemExit, so the crawl could not be interrupted.
        with open('error.txt', 'a+') as f:
            f.write(url+'                   '+str(type)+'\n')
        return None
    
    
def parseCode1(baseUrl, lastUrl):
    """Fetch the index page and crawl every province page linked from it.

    :param baseUrl: directory URL that the relative links are resolved against
    :param lastUrl: file name of the index page below ``baseUrl``
    """
    page = openUrl(baseUrl+lastUrl,1)
    if not page:
        return
    for row in re.findall("<tr class='provincetr'>.+?</tr>", page):
        # Each match is (relative link, link text); only the link is used.
        for href, _label in re.findall("<a href='(.+?html)'>(.+?)<br/>", row):
            parseCode2(baseUrl, href)
        

def parseCode2(baseUrl, lastUrl):
    """Fetch a province page, store each 'citytr' row and crawl its link.

    :param baseUrl: directory URL that the relative links are resolved against
    :param lastUrl: relative path of the page below ``baseUrl``
    """
    page = openUrl(baseUrl + lastUrl,2)
    if not page:
        return
    row_pattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
    for row in re.findall("<tr class='citytr'>.+?</tr>", page):
        # Each match is (relative link, code cell, name cell).
        for href, code, name in re.findall(row_pattern, row):
            insertVillage(code, name)
            parseCode3(baseUrl, href)
        

def parseCode3(baseUrl, lastUrl):
    """Fetch a city page, store each 'countytr' row and crawl its link.

    :param baseUrl: directory URL the incoming link is relative to
    :param lastUrl: relative link whose first path segment is a sub-directory
    """
    # Move the first path segment of the link into the base URL so that the
    # links found on this page resolve against the page's own directory.
    folder, _sep, page_name = lastUrl.partition('/')
    baseUrl = baseUrl + folder + '/'
    page = openUrl(baseUrl + page_name,3)
    if not page:
        return
    row_pattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
    for row in re.findall("<tr class='countytr'>.+?</tr>", page):
        # Each match is (relative link, code cell, name cell).
        for href, code, name in re.findall(row_pattern, row):
            insertVillage(code, name)
            parseCode4(baseUrl, href)
        

def parseCode4(baseUrl, lastUrl):
    """Fetch a county page, store each 'towntr' row and crawl its link.

    :param baseUrl: directory URL the incoming link is relative to
    :param lastUrl: relative link whose first path segment is a sub-directory
    """
    # Move the first path segment of the link into the base URL so that the
    # links found on this page resolve against the page's own directory.
    folder, _sep, page_name = lastUrl.partition('/')
    baseUrl = baseUrl + folder + '/'
    page = openUrl(baseUrl + page_name,4)
    if not page:
        return
    row_pattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
    for row in re.findall("<tr class='towntr'>.+?</tr>", page):
        # Each match is (relative link, code cell, name cell).
        for href, code, name in re.findall(row_pattern, row):
            insertVillage(code, name)
            parseCode5(baseUrl, href)
        

def parseCode5(baseUrl, lastUrl):
    """Fetch a town page and store every 'villagetr' row found on it.

    :param baseUrl: directory URL the incoming link is relative to
    :param lastUrl: relative link whose first path segment is a sub-directory
    """
    folder, _sep, page_name = lastUrl.partition('/')
    baseUrl = baseUrl + folder + '/'
    page = openUrl(baseUrl + page_name,5)
    if not page:
        return
    for row in re.findall("<tr class='villagetr'>.+?</tr>", page):
        # Three cells per row: the first goes to area_code, the second to
        # city_village_code and the third to area_name.
        for first, second, third in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", row):
            insertVillage(first, third, second)
        

if __name__ == "__main__":
    # Start the crawl at index.html below the 2017 directory.
    parseCode1('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/', 'index.html')

 

 

分布式爬取

纯手写实现(不依赖爬虫框架,使用 Redis 列表作为任务队列)

#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: 1499418300@qq.com
@file: areacode.py
@time: 2018/9/29 14:40
@desc:
'''

import urllib2, re, os, redis
from time import sleep
from random import random
from sqlalchemy import *
from sqlalchemy.orm import sessionmaker
from multiprocessing import Process

# Set the Oracle client NLS_LANG variable before the engine is created.
os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'
# The user, password and host in the connection URL are placeholders ('xxx').
engine = create_engine('oracle://xxx:xxx@xxx:1521/xe', pool_size=100, encoding='utf8')
DBSession = sessionmaker(bind=engine)
# Module-level session; insertVillage() executes and commits through it.
session = DBSession()
# Redis client used for the URL lists 'area_code2' .. 'area_code5' and the
# failed-URL list 'area_code_error'. The host is a placeholder ('xxx').
pool = redis.ConnectionPool(host='xxx', port=6379)
MRedis = redis.Redis(connection_pool=pool)

# HTTP headers sent with every request; a browser-style User-Agent string.
# The value must be the agent string only: repeating the "User-Agent: " field
# name inside the value would send "User-Agent: User-Agent: Mozilla/...".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}


def insertVillage(code, name, city_village_code='-1'):
    """Insert one division row into village_code and commit it immediately.

    :param code: division code, stored in the area_code column
    :param name: division name, stored in the area_name column
    :param city_village_code: value for the city_village_code column; callers
        that do not pass one get the default '-1'
    """
    print('%s %s %s' % (code, name, city_village_code))
    # Bind the scraped values instead of formatting them into the statement:
    # text taken from a web page may contain quotes, which would break (or
    # inject into) string-built SQL. text() is provided by the module-level
    # "from sqlalchemy import *".
    session.execute(
        text("insert into village_code(area_code, area_name, city_village_code) "
             "VALUES (:area_code, :area_name, :city_village_code)"),
        {
            'area_code': code,
            'area_name': name,
            'city_village_code': city_village_code,
        })
    session.commit()


def openUrl(url):
    """Download ``url`` and return its body decoded as GBK, or None on failure.

    A failed download (network error, timeout, decode error, ...) pushes the
    URL onto the Redis list 'area_code_error' so it can be retried later.

    :param url: absolute URL of the page to fetch
    :return: the decoded page text, or None when fetching/decoding failed
    :raises: errors raised while pushing to Redis are not suppressed
    """
    try:
        # Random pause of up to 0.5 s before each request.
        sleep(random() * 0.5)
        request = urllib2.Request(url, headers=headers)
        return urllib2.urlopen(request, timeout=10).read().decode('gbk')
    except Exception:
        # Catch ordinary errors only: a bare "except" combined with a
        # "return" inside "finally" also swallowed KeyboardInterrupt and
        # SystemExit, so a worker could not be interrupted.
        MRedis.lpush('area_code_error', url)
        return None


def run():
    """Worker loop: drain the Redis URL lists level by level, forever.

    On every pass the lists 'area_code2' .. 'area_code5' are each emptied in
    turn; every popped URL is downloaded and, if the download succeeded,
    handed to the parser of that level. The function never returns.
    """
    # (Redis list, parser) pairs in the order they are drained. parseCode5
    # only needs the page itself, so it is called with the HTML alone; the
    # previous two-argument call did not match its one-argument definition.
    stages = (
        ('area_code2', parseCode2),
        ('area_code3', parseCode3),
        ('area_code4', parseCode4),
        ('area_code5', lambda html, url: parseCode5(html)),
    )
    while True:
        handled_any = False
        for queue_name, parse in stages:
            url = MRedis.lpop(queue_name)
            while url:
                handled_any = True
                html = openUrl(url)
                if html:
                    parse(html, url)
                url = MRedis.lpop(queue_name)
        if not handled_any:
            # Every list was empty: wait a moment instead of hammering Redis
            # with back-to-back LPOP calls.
            sleep(1)


def parseCode1(baseUrl, lastUrl):
    """Fetch the index page and queue every province page on 'area_code2'.

    :param baseUrl: directory URL that the relative links are resolved against
    :param lastUrl: file name of the index page below ``baseUrl``
    """
    page = openUrl(baseUrl+lastUrl)
    if not page:
        return
    for row in re.findall("<tr class='provincetr'>.+?</tr>", page):
        # Each match is (relative link, link text); only the link is used.
        for href, _label in re.findall("<a href='(.+?html)'>(.+?)<br/>", row):
            MRedis.lpush('area_code2', baseUrl+href)


def parseCode2(html, url):
    """Store each 'citytr' row of a page and queue its link on 'area_code3'.

    :param html: decoded page text
    :param url: URL the page was fetched from; row links are relative to it
    """
    # Everything before the last '/' of the page URL.
    parent = url.rpartition('/')[0]
    row_pattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
    for row in re.findall("<tr class='citytr'>.+?</tr>", html):
        # Each match is (relative link, code cell, name cell).
        for href, code, name in re.findall(row_pattern, row):
            insertVillage(code, name)
            MRedis.lpush('area_code3', parent +'/'+ href)


def parseCode3(html, url):
    """Store each 'countytr' row of a page and queue its link on 'area_code4'.

    :param html: decoded page text
    :param url: URL the page was fetched from; row links are relative to it
    """
    # Everything before the last '/' of the page URL.
    parent = url.rpartition('/')[0]
    row_pattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
    for row in re.findall("<tr class='countytr'>.+?</tr>", html):
        # Each match is (relative link, code cell, name cell).
        for href, code, name in re.findall(row_pattern, row):
            insertVillage(code, name)
            MRedis.lpush('area_code4', parent + '/' + href)


def parseCode4(html, url):
    """Store each 'towntr' row of a page and queue its link on 'area_code5'.

    :param html: decoded page text
    :param url: URL the page was fetched from; row links are relative to it
    """
    # Everything before the last '/' of the page URL.
    parent = url.rpartition('/')[0]
    row_pattern = "<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>"
    for row in re.findall("<tr class='towntr'>.+?</tr>", html):
        # Each match is (relative link, code cell, name cell).
        for href, code, name in re.findall(row_pattern, row):
            insertVillage(code, name)
            MRedis.lpush('area_code5', parent + '/' + href)


def parseCode5(html, url=None):
    """Store every 'villagetr' row found in a page.

    :param html: decoded page text
    :param url: URL the page was fetched from. It is not used here (nothing
        is queued at this level); it is accepted so the function can be
        called as ``parseCode5(html, url)`` the way run() calls it, which
        raised a TypeError with the one-argument signature.
    """
    for tr in re.findall("<tr class='villagetr'>.+?</tr>", html):
        # Three cells per row: the first goes to area_code, the second to
        # city_village_code and the third to area_name.
        for td in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", tr):
            insertVillage(td[0], td[2], td[1])
        

if __name__=="__main__":
    # Seed step: parse the index page and push every province page URL onto
    # the Redis list 'area_code2'.
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    parseCode1(baseUrl, 'index.html')
    # Worker step, left disabled here: each process would execute run() and
    # consume the Redis lists. Uncomment to start three local workers.
    # p1 = Process(target=run)
    # p1.start()
    # p2 = Process(target=run)
    # p2.start()
    # p3 = Process(target=run)
    # p3.start()

 

posted @ 2018-09-29 16:22  吃饭睡觉打逗逗  阅读(565)  评论(0)  编辑  收藏  举报