Python 爬取媒体文件(使用Chrome代理,启动客户端,有防火墙)
# -*- coding: utf-8 -*-
"""Convert Chinese addresses to longitude/latitude.

Each value in the ``ADDRESS`` column of ``./data/test.csv`` is sent to the
Baidu geocoder URL through a Selenium-driven Chrome session, and the
collected ``[address, lng, lat]`` rows are written to
``./data/result_test.csv``.
"""
import json
import time
import urllib.parse
import urllib.request

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# NOTE(review): the API key is committed in source; consider loading it from
# the environment or a config file instead.
AK = 'C2hKkyF9fHbmzESq6dmSArZIzw8wEiS1'

# Raw string so the backslashes of the Windows path are kept literally
# instead of relying on unrecognised escape sequences such as "\P".
CHROME_DRIVER_PATH = r'D:\Program Files (x86)\ChromeDriver\chromedriver.exe'

table = pd.read_csv('./data/test.csv', encoding='utf-8')
outfp = open('./data/result_test.csv', 'w', encoding='utf-8')


class LoadData:
    """Geocode addresses one by one and accumulate the results.

    Attributes are plain instance fields:
      m_driver   -- the Chrome WebDriver used to fetch geocoder pages.
      loc_result -- list of ``[addr, lng, lat]`` rows gathered so far.
    """

    def __init__(self, driver_path=CHROME_DRIVER_PATH):
        """Start a Chrome session.

        Args:
            driver_path: Location of the chromedriver executable; defaults
                to the path the script has always used.
        """
        print("start")
        # NOTE(review): newer Selenium releases take the driver path via a
        # Service object rather than positionally -- confirm against the
        # installed Selenium version.
        self.m_driver = webdriver.Chrome(driver_path)
        self.loc_result = []

    def get_uri(self, addr, city=''):
        """Geocode *addr* and return its ``(lng, lat)`` pair.

        Args:
            addr: Address text sent as the ``address`` query parameter.
            city: Optional value for the ``city`` query parameter.

        Returns:
            ``(lng, lat)`` taken from the response's ``result.location``
            object, or ``(None, None)`` when the page does not contain a
            usable location (no ``<pre>`` element, invalid JSON, or a
            missing ``result``/``location`` entry).
        """
        server = 'http://api.map.baidu.com/geocoder/v2/?'
        params = urllib.parse.urlencode(
            {'address': addr, 'city': city, 'ak': AK, 'output': 'json'}
        )
        self.m_driver.get(server + params)

        # The JSON body is read from the page's <pre> element; that element
        # is absent when the browser shows something else (e.g. an error
        # page), so guard against it.
        bs = BeautifulSoup(self.m_driver.page_source, 'lxml')
        pre = bs.pre
        if pre is None:
            return None, None

        try:
            payload = json.loads(pre.get_text())
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            return None, None

        # Walk payload -> result -> location defensively: a failed lookup
        # presumably omits "result" or returns a non-dict there.
        result = payload.get('result') if isinstance(payload, dict) else None
        location = result.get('location') if isinstance(result, dict) else None
        if not isinstance(location, dict):
            # Always return a 2-tuple so callers can unpack safely.
            return None, None

        return location.get('lng'), location.get('lat')

    def get_lng_lat(self, addr):
        """Geocode *addr* and append ``[addr, lng, lat]`` to ``loc_result``.

        Prints ``error`` when either coordinate is missing; the row is
        recorded regardless so the output stays aligned with the input.
        """
        lng, lat = self.get_uri(addr)
        if lng is None or lat is None:
            print("error")
        self.loc_result.append([addr, lng, lat])

    def main(self):
        """Geocode every ``ADDRESS`` in ``table`` and write the results.

        The collected rows are written to ``outfp`` as the ``str()`` of the
        result list (not as CSV rows), matching the original output format.
        """
        for addr in table['ADDRESS'].tolist():
            self.get_lng_lat(addr)
        outfp.write(str(self.loc_result))
        outfp.flush()

    def close(self):
        """Shut down the Chrome session started in ``__init__``."""
        self.m_driver.quit()


if __name__ == '__main__':
    # time.clock() no longer exists in current Python; perf_counter() is
    # the replacement for measuring elapsed time.
    tStart = time.perf_counter()
    try:
        LD = LoadData()
        try:
            LD.main()
        finally:
            LD.close()
    finally:
        outfp.close()
    tEnd = time.perf_counter()
    print("%s s" % (tEnd - tStart))
附录:
chromedriver.exe与Chrome版本映射及下载链接
https://blog.csdn.net/mmayanshuo/article/details/78962398