python抓取网页例子

python抓取网页例子

最近在学习python,刚刚完成了一个网页抓取的例子,通过python抓取全世界所有的学校以及学院的数据,并存为xml文件。数据源是人人网。

因为刚学习python,写的代码还不够Pythonic。

核心代码如下:



#!/usr/bin/python


import urllib.request
from html.parser import HTMLParser  
import json
import time
import xml.dom.minidom
import os


class Dept():
    id = 0
    name = ''
    
class University(Dept):
    depts = []

class City(Dept):
    universities  = []    
    
class Country(Dept):
    cities = []

class MyHtmlParser(HTMLParser):
    def __init__(self):   
        HTMLParser.__init__(self)   
        self.links = []  
        self.depts = [] 
    def handle_starttag(self, tag, attrs):   
        if tag == 'option':
            for att in attrs:
                for a in att:
                    if a != 'value' and a != '':
                        self.depts.append(a)
def readDept(code):
    depts = []
    html = ''
    for word in urllib.request.urlopen('http://www.renren.com/GetDep.do?id=' + str(code)).readlines():
        real = word.strip().decode('gbk') 
        html =  html  + real
    hp = MyHtmlParser()   
    hp.feed(html)  
    for inst in hp.depts:
        dept = Dept()
        dept.name = inst
        depts.append(dept) 
    return depts

def writeXml(city):
    impl = xml.dom.minidom.getDOMImplementation()
    dom = impl.createDocument(None, 'city', None)
    root = dom.documentElement  
    filename = city.name + '.xml'
    if os.path.isfile(filename): 
        os.remove(filename)
    nameE = dom.createElement('name')
    nameT = dom.createTextNode(city.name)
    idE = dom.createElement('id')
    idT = dom.createTextNode(str(city.id))
    nameE.appendChild(nameT)
    idE.appendChild(idT)
    root.appendChild(nameE)
    root.appendChild(idE)
    univs = dom.createElement('universities')
    root.appendChild(univs)
    for uni in city.universities:
#        print('write xml' + city.name + '\t' + uni.name)
        universityE = dom.createElement('university')
        univs.appendChild(universityE)
        uniE = dom.createElement('name')
        uniT = dom.createTextNode(uni.name)
        uidE = dom.createElement('id')
        uidT = dom.createTextNode(str(uni.id))
        uniE.appendChild(uniT)
        uidE.appendChild(uidT)
        universityE.appendChild(uniE)
        universityE.appendChild(uidE)
        deptsE = dom.createElement('departments')
        universityE.appendChild(deptsE)
        for dep in uni.depts:
            deptE = dom.createElement('department')
            deptsE.appendChild(deptE)
            deptNameE = dom.createElement('name')
            deptIdE = dom.createElement('id')
            deptT = dom.createTextNode(dep.name)
            deptIdT = dom.createTextNode(str(dep.id))
            deptNameE.appendChild(deptT)
            deptIdE.appendChild(deptIdT)
            deptE.appendChild(deptNameE)
            
    f= open(filename, 'w', encoding='utf-8')
    dom.writexml(f, addindent='  ', newl='\n',encoding='utf-8')
    print('write xml :' + city.name + '.xml')
    f.close()  
    

def mkdir(path):

    path=path.strip()
    path=path.rstrip("/")
    isExists=os.path.exists(path)
    if not isExists:
        os.makedirs(path)

def readData(content):
    counties = []
    jdata = json.loads(content)
    for i in range(0,100):
        try:
            country = Country()
            country.name = jdata[i]['name'] 
            country.id = jdata[i]['id'] 
            provs = jdata[i]['provs'] 
            for prov in provs:
                city = City()
                city.name = prov['name']
                city.id = prov['id']
                country.cities.append(city)
                city.universities = []
                for dic in prov['univs']:
                    university = University() 
                    university.id = dic['id']
                    university.name = dic['name']
#                    print('get data: \t' + university.name)
                    university.depts = readDept(university.id)
                    city.universities.append(university)
                    print('city = ' + city.name + '\tuniversity = ' + university.name)
                writeXml(city)
            counties.append(country)
        except IndexError:
            break;
    return counties


print('开始时间:' + time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))
f = open('data','r' )
content = f.read()
f.close()
counties = readData(content)
print('结束时间:' + time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))



其中data是从如下网站拿到的

http://s.xnimg.cn/allunivlist.js
posted @ 2015-09-03 21:52  wardensky  阅读(444)  评论(0编辑  收藏  举报