在pyspark上并行预测python模型
一、运行xgboost模型
# Train an XGBoost binary classifier on the first 100 iris samples, save and
# reload the model, then score the held-out rows through a Spark RDD.
import numpy as np
import xgboost as xgb
from pyspark import SparkConf, SparkContext
from sklearn import datasets

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fall back to the legacy module path used by older scikit-learn releases.
    from sklearn.cross_validation import train_test_split

iris = datasets.load_iris()
data = iris.data[:100]
print(data.shape)  # (100, 4): 100 samples, 4 features each
label = iris.target[:100]
print(label)

# Split into training and test sets.
train_x, test_x, train_y, test_y = train_test_split(data, label, random_state=0)

# Build the xgboost input matrices.
dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# xgboost model parameters.
# NOTE(review): newer xgboost releases appear to replace `silent` with
# `verbosity` -- confirm against the installed version before changing it.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1,
}
watchlist = [(dtrain, 'train')]

# Train the model.
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)

# Predict on the test set.
ypred = bst.predict(dtest)

# Save the model, then load it back into a second booster.
bst.save_model('/root/xgb2.model')
bst2 = xgb.core.Booster(model_file='/root/xgb2.model')

# Distribute the test rows across 5 RDD partitions.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
s = sc.parallelize(test_x, 5)

# Parallel prediction: each row is reshaped into a single-row matrix and
# scored with the reloaded booster; collect() gathers one result per row.
s.map(lambda x: bst2.predict(xgb.DMatrix(np.array(x).reshape((1, -1))))).collect()