Use Spark to train scikit-learn models 100x faster

sk-dist: https://github.com/Ibotta/sk-dist

sk-dist (imported as skdist, installable with pip install sk-dist) is Ibotta's library for distributing the training of scikit-learn meta-estimators, such as grid search cross-validation, over a Spark cluster. The candidate/fold fits in a grid search are independent of one another, so each one can run as its own Spark task. The example below tunes an SVC on the digits dataset with DistGridSearchCV, which follows the GridSearchCV interface but additionally takes a SparkContext; run it somewhere pyspark is available (e.g. via spark-submit).

import time

from sklearn import datasets, svm
from skdist.distribute.search import DistGridSearchCV
from pyspark.sql import SparkSession

# instantiate a spark session and grab its context
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

# load the digits dataset
digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

# create a classifier: a support vector classifier
classifier = svm.SVC()
param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 20.0, 50.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"]
}
scoring = "f1_weighted"
cv = 10

# hyperparameter optimization: each of the 105 candidate x 10 fold
# fits is dispatched to the cluster as a Spark task
start = time.time()
model = DistGridSearchCV(
    classifier, param_grid,
    sc=sc, cv=cv, scoring=scoring,
    verbose=True
)
model.fit(X, y)
print("Train time: {0}".format(time.time() - start))
print("Best score: {0}".format(model.best_score_))
------------------------------
Spark context found; running with spark
Fitting 10 folds for each of 105 candidates, totalling 1050 fits
Train time: 3.380601406097412
Best score: 0.981450024203508
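
A nice property of sk-dist is that the fitted search object is meant to behave like an ordinary fitted scikit-learn estimator: prediction and serialization happen locally, with no Spark context required. A minimal sketch continuing from the fit above (the slice X[:10] and the output filename are illustrative, not from the original):

import pickle

# local prediction with the refit best estimator; no Spark needed here
preds = model.predict(X[:10])
print("Predictions: {0}".format(preds))
print("Best params: {0}".format(model.best_params_))

# the fitted model should pickle like any sklearn object, e.g. for serving
with open("dist_svc.pkl", "wb") as f:
    pickle.dump(model, f)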

 

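To put the roughly 3.4-second train time in perspective, a single-node baseline can be run with sklearn's own GridSearchCV over the identical grid, using n_jobs=-1 for local core-level parallelism. This sketch is for comparison only; the actual speedup from sk-dist depends on your hardware and the size of the Spark cluster, so the "100x" figure should be read as workload-dependent:

import time

from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

digits = datasets.load_digits()
X = digits["data"]
y = digits["target"]

param_grid = {
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 20.0, 50.0],
    "gamma": ["scale", "auto", 0.001, 0.01, 0.1],
    "kernel": ["rbf", "poly", "sigmoid"]
}

# the same 1050 fits (105 candidates x 10 folds), confined to one machine
start = time.time()
local_model = GridSearchCV(
    svm.SVC(), param_grid,
    cv=10, scoring="f1_weighted", n_jobs=-1
)
local_model.fit(X, y)
print("Train time: {0}".format(time.time() - start))
print("Best score: {0}".format(local_model.best_score_))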