spider autohome (1)
Code:
#!/usr/bin/python # -*- coding: UTF-8 -*- import re import urllib import time def getHtml(url): """ This function just simply get all the data by the url you get.and then decode and code to utf-8 which you need. """ page = urllib.urlopen(url) html=page.read() uni_str = html.decode('gb2312') utf_str = uni_str.encode('utf-8') return utf_str def getInfo(html): """ This function just simply get the data from the html and filter some data which we are interest,and then return a list. """ reg = r'config = {(.+?)};' config_re = re.compile(reg) config_list = re.findall(config_re,html) return config_list def getEachCar(config_lists): """ This function will parse the data,and then return a list include the all information of each car,the each item of the car's information split by '|'. """ each_car={} for sp in config_lists: config_str='{'+sp+'}' config_str=config_str.replace("null","None") regx=r'{"specid":\d{5},"value":.+?}' cc=re.compile(regx) xx=re.findall(regx,config_str) for x in xx: x=eval(x) akey=repr(x['specid']) if each_car.has_key(akey): each_car[akey]=each_car[akey]+x["value"]+"|" else: each_car[akey]=x['value'] jobs=[] for each in each_car: ter_data="|"+each_car[each] jobs.append(ter_data) return jobs if __name__ == '__main__': # html = getHtml("http://car.autohome.com.cn/config/spec/21308.html#pvareaid=100679") html = getHtml("http://car.autohome.com.cn/config/spec/18239.html") config_lists=getInfo(html) each_car=getEachCar(config_lists) for acar in each_car: print acar
Result:
Can we drop this masquerade