我的代码-random forest


# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score,recall_score,average_precision_score,auc

import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score,recall_score,average_precision_score,auc


# In[137]:


# Load the raw modelling table into `data`. SMOTE is imported in this cell
# rather than in the import cell at the top of the notebook.
# NOTE(review): absolute, machine-specific Windows path -- the notebook cannot
# be re-run on another machine without editing it.
from imblearn.over_sampling import SMOTE
data = pd.read_csv(r"D:\Users\sgg91044\Desktop\model_data_1.csv")


# In[138]:


# Preview the first rows of the loaded frame.
data.head()


# In[50]:


# Discard five columns, then reshape from long to wide: one row per
# (eqpid, Chamber, lotid, slotid, stage, Recipie_Name, finishtime) and one
# column per distinct Param_Name, each cell holding the sum of `data1`.
data= data.drop(columns=['Stg','RNK','parametername','ooc','oos'])
p= pd.pivot_table(data, index=['eqpid','Chamber','lotid','slotid','stage','Recipie_Name','finishtime'],values='data1', columns='Param_Name', aggfunc=np.sum)


# In[54]:


# NOTE(review): this bare `p` is not the last expression of the cell, so it
# displays nothing.
p
# Persist the pivoted table, keeping the index levels and the header row.
# NOTE(review): absolute, machine-specific output path.
p.to_csv(r'D:\Users\sgg91044\Desktop\more_parameter\more_parameter_pivot.csv', index=True, header=True)


# In[236]:


# Remove two columns from the pivoted table in place.
# NOTE(review): the columns of `p` come from the Param_Name values; whether
# "waferid" and "defect_count" are among them is not visible here -- verify.
p.drop(columns=["waferid","defect_count"],inplace=True)


# In[237]:


# Preview `data` (the frame that was pivoted, not the pivot `p`).
data.head()


# In[184]:


# Rebind `data` to a table read from disk.
# NOTE(review): absolute path that mixes "\" and "/" separators.
data = pd.read_csv(r"D:\Users\sgg91044\Desktop/MEP_data_pivot.csv")


# In[6]:


# Force columns 0..16 to numeric; values that cannot be parsed become NaN.
# NOTE(review): this touches 17 columns, while the imputation loop in the
# next cell walks range(0, 18) -- confirm which bound is intended.
data.iloc[:,0:17] = data.iloc[:,0:17].apply(pd.to_numeric,errors='coerce')


# In[239]:


# Fill missing values in the first 18 columns of `data` with the median of
# that column's non-missing entries. The loop body is indented here; as
# written in the source it sat at column 0 and did not parse.
for i in range(0,18):
    # dropna() selects the same values as the original isna() == False mask.
    med = np.median(data.iloc[:,i].dropna())
    data.iloc[:,i] = data.iloc[:,i].fillna(med)


# In[139]:


# Store the label column as a pandas categorical.
data["Target"] = data["Target"].astype("category")


# In[140]:


# Separate the features (X) from the label (Y).
X = data.drop("Target", axis=1)
Y = data["Target"]


# In[195]:


# Encoder instances shared by the encoding cells that follow.
le, ohe = LabelEncoder(), OneHotEncoder()


# In[246]:


# Drop the identifier columns from the feature frame.
# NOTE(review): later cells still select 'eqpid', 'Chamber' and
# 'Recipie_Name' from X -- confirm the intended cell order.
X=X.drop(columns=['eqpid','lotid','Chamber','Recipie_Name'])
# Fill missing values in the first 18 columns of `data` (not X) with the
# median of that column's non-missing entries. The loop body is indented
# here; as written in the source it sat at column 0 and did not parse.
# The bare `X` statement that preceded the loop displayed nothing (it was not
# the last expression of the cell) and has been removed.
for i in range(0,18):
    med = np.median(data.iloc[:,i].dropna())
    data.iloc[:,i] = data.iloc[:,i].fillna(med)


# In[243]:


# Integer-code three columns of X, selected by position.
# NOTE(review): presumably positions 0 and 1 are the equipment id and the
# chamber, and position 2 (once 'Step' has been dropped) is the recipe name
# -- verify against the actual column order of X.
# The same LabelEncoder is re-fitted for every column, so afterwards `le`
# only holds the classes of the last column it was fitted on.
X["eqp_encoded"] = le.fit_transform(X.iloc[:,0])
X["chmbr_encoded"] = le.fit_transform(X.iloc[:,1])
X.drop(columns=['Step'],inplace=True)
X['recipe_encoded'] = le.fit_transform(X.iloc[:,2])


# In[135]:


# Dense one-hot matrices for the three integer-coded columns. `ohe` is
# re-fitted for each column, ending with the recipe codes.
X_eqp = ohe.fit_transform(X["eqp_encoded"].values.reshape(-1, 1)).toarray()
X_chamber = ohe.fit_transform(X["chmbr_encoded"].values.reshape(-1, 1)).toarray()
X_recipie = ohe.fit_transform(X["recipe_encoded"].values.reshape(-1, 1)).toarray()

# Append each matrix to X as indicator columns named <prefix><position>.
dfOneHot = pd.DataFrame(X_eqp, columns=["Eqp_%d" % col for col in range(X_eqp.shape[1])])
X = pd.concat([X, dfOneHot], axis=1)

dfOneHot = pd.DataFrame(X_chamber, columns=["Chamber_%d" % col for col in range(X_chamber.shape[1])])
X = pd.concat([X, dfOneHot], axis=1)

dfOneHot = pd.DataFrame(X_recipie, columns=["Recipie_%d" % col for col in range(X_recipie.shape[1])])
X = pd.concat([X, dfOneHot], axis=1)


# In[136]:


# Side-by-side table of the three original categorical columns and their
# integer-coded counterparts.
# NOTE(review): an earlier cell drops 'eqpid', 'Chamber' and 'Recipie_Name'
# from X; if that cell has run, this selection cannot find them -- check the
# intended cell order.
Trace_back = pd.concat([X[["eqpid","Chamber","Recipie_Name"]],X[["eqp_encoded","chmbr_encoded","recipe_encoded"]]],axis=1)


# In[137]:


# Remove every column that was copied into Trace_back from X, in place.
X.drop(columns=list(Trace_back.columns),inplace=True)


# In[197]:


# Run sklearn's Normalizer over the first 19 columns of X and write the
# result back into the same positions.
# NOTE(review): per the scikit-learn documentation Normalizer rescales each
# row (sample) to unit norm; it is not a per-column scaler -- confirm that
# row-wise scaling is what is wanted here.
nz = Normalizer()
X.iloc[:,0:19]=pd.DataFrame(nz.fit_transform(X.iloc[:,0:19]),columns=X.iloc[:,0:19].columns)


# In[150]:


# Rebuild Y and X directly from `data`.
# NOTE(review): this rebinds X, so whatever the preceding cells did to X is
# no longer reachable through that name.
data.Target = data.Target.astype("category")
Y = data.Target
X = data.drop(columns='Target')


# In[124]:


# Oversample with SMOTE on the whole of X / Y (no train/test split here).
# NOTE(review): `ratio` and `fit_sample` are the spellings used by older
# imbalanced-learn releases; newer releases call them `sampling_strategy`
# and `fit_resample` -- check against the installed version.
sm = SMOTE(random_state=12, ratio = 1.0)
X_smote, Y_smote = sm.fit_sample(X, Y)


# In[237]:


# Start again from the model data file on disk.
data = pd.read_csv(r"D:\Users\sgg91044\Desktop\model_data_1.csv")


# In[238]:


# Store the three identifier columns as pandas categoricals.
data["eqpid"] = data["eqpid"].astype("category")
data["chamber"] = data["chamber"].astype("category")
data["wafer"] = data["wafer"].astype("category")


# In[239]:


# Categorical label, then separate the features (X) from the label (Y).
data["Target"] = data["Target"].astype("category")
X = data.drop("Target", axis=1)
Y = data["Target"]


# In[240]:


# Hold out 20% of the rows as a test set; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.2, random_state=8)


# In[241]:


# Oversample the training part only; X_test / y_test are left untouched.
# NOTE(review): `ratio` and `fit_sample` are the spellings used by older
# imbalanced-learn releases -- check against the installed version.
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)


# In[242]:


# Class counts of the training labels before SMOTE (value_counts) and after
# SMOTE (bincount). The source passed y_train to both calls, which printed the
# same distribution twice; the equivalent cell after the second split passes
# y_train_smote, and this cell now does the same.
print(y_train.value_counts(), np.bincount(y_train_smote))


# In[243]:


from sklearn.ensemble import RandomForestClassifier

# Build a seeded 100-tree random forest; oob_score=True asks for the
# out-of-bag score, n_jobs=-1 uses every available core, verbose=1 logs
# progress.
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, oob_score = True, verbose = 1, n_jobs = -1)


# In[244]:


# Train on the SMOTE-resampled training data
random_forest.fit(x_train_smote,y_train_smote)


# In[245]:


# Predict on the held-out test split (which was not resampled)
y_pred = random_forest.predict(X_test)


# In[246]:


# Per-class precision / recall / F1 on the test split.
print(classification_report(y_pred=y_pred,y_true=y_test))


# In[247]:


# Confusion matrix on the test split.
print(confusion_matrix(y_pred=y_pred,y_true=y_test))


# In[235]:


# NOTE(review): `sklearn.externals.joblib` is only shipped by older
# scikit-learn releases; newer ones require the standalone `joblib` package.
from sklearn.externals import joblib

# Save the fitted forest to a file in the current working directory
joblib_file = "model_RF.pkl"
joblib.dump(random_forest, joblib_file)


# In[229]:


# Load a second feature table (X_Nov) and a label table (Y_Nov) from disk.
# NOTE(review): absolute, machine-specific paths.
X_Nov=pd.read_csv(r'D:\Users\sgg91044\Desktop\sep_oct_data\Nov_good_imputed.csv')
Y_Nov=pd.read_csv(r'D:\Users\sgg91044\Desktop\sep_oct_data\Y_Nov.csv')


# In[230]:


def encode_eqpid(eqpid):
    """Return the integer formed by the last two characters of eqpid, minus one.

    Raises ValueError if those two characters are not a valid integer.
    The body is indented here; in the source it sat at column 0 and the
    definition did not parse.
    """
    return int(eqpid[-2:]) - 1

def encode_chamber(chamber):
    """Return 0 when chamber equals 'A', and 1 for any other value.

    The body is indented here; in the source it sat at column 0 and the
    definition did not parse.
    """
    return 0 if chamber == 'A' else 1

def encode_wafer(wafer):
    """Return wafer - 1 for a positive wafer value, otherwise None.

    The source fell off the end of the function for non-positive input, which
    yields None implicitly; that result is now spelled out. The body is also
    indented here; in the source it sat at column 0 and did not parse.
    """
    if wafer > 0:
        return wafer - 1
    return None
# NOTE(review): this rebinds `data` to an example file, so the modelling table
# loaded earlier is no longer reachable through that name.
data=pd.read_csv(r"D:\Users\sgg91044\Desktop\normalizing_example.csv")
nz = Normalizer()
# Run the Normalizer over column positions 8..9 and then 0..2, writing each
# result back in place.
# NOTE(review): presumably these two ranges correspond to the two column
# groups transformed on X_Nov in the next cell (same widths) -- verify.
data.iloc[:,8:10]=pd.DataFrame(nz.fit_transform(data.iloc[:,8:10]),columns=data.iloc[:,8:10].columns)
data.iloc[:,0:3]=pd.DataFrame(nz.fit_transform(data.iloc[:,0:3]),columns=data.iloc[:,0:3].columns)


# In[231]:


# Encode the three identifier columns with the helper functions and store the
# codes as pandas categoricals.
X_Nov["eqpid"] = X_Nov["eqpid"].apply(encode_eqpid).astype("category")
X_Nov["chamber"] = X_Nov["chamber"].apply(encode_chamber).astype("category")
X_Nov["wafer"] = X_Nov["wafer"].apply(encode_wafer).astype("category")

# Run the Normalizer over column positions 11..12, then 3..5, in place.
X_Nov.iloc[:, 11:13] = nz.transform(X_Nov.iloc[:, 11:13])
X_Nov.iloc[:, 3:6] = nz.transform(X_Nov.iloc[:, 3:6])

# SUM_ETCM: element-wise total of the three ETCM phase columns.
X_Nov["SUM_ETCM"] = (
    np.asarray(X_Nov["ETCM_PHA4"])
    + np.asarray(X_Nov["ETCM_PHB4"])
    + np.asarray(X_Nov["ETCM_PHC4"])
)


# In[232]:


# Remove the label column so that only feature columns are passed to the model.
X_Nov=X_Nov.drop(columns="Target")


# In[233]:


# Predict on X_Nov with the fitted forest.
# NOTE(review): this rebinds `y_pred`; from here on it no longer holds the
# predictions for X_test.
y_pred = random_forest.predict(X_Nov)


# In[234]:


# Per-class precision / recall / F1 against the labels loaded into Y_Nov.
print(classification_report(y_pred=y_pred,y_true=Y_Nov))


# In[129]:


# The lines below report random-forest scores against y_test. At this point
# `y_pred` holds the predictions for X_Nov, whose rows do not correspond to
# y_test, so the forest is asked for X_test predictions explicitly instead
# of relying on whatever `y_pred` last contained.
rf_test_pred = random_forest.predict(X_test)
print("Accuracy of Random_forest:",round(accuracy_score(y_pred=rf_test_pred,y_true=y_test) * 100,2),"%")


# In[130]:


print("Sensitivity of Random_forest:",round(recall_score(y_pred=rf_test_pred,y_true=y_test)*100,2),"%")


# In[18]:


# NOTE(review): repeated import; `sklearn.externals.joblib` is only shipped by
# older scikit-learn releases.
from sklearn.externals import joblib

# Write the fitted forest to the deployment folder.
# NOTE(review): absolute, machine-specific path.
joblib.dump(random_forest, r'D:\Users\sgg91044\Desktop\deployment\model_RF.pkl')


# In[217]:

 

# Three separate search grids for the SVC: rbf (gamma x C), linear (C only)
# and poly (degree only; every other setting stays at the SVC default).
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
{'kernel':['poly'],'degree':[2,3,5]}]
# 3-fold cross-validated search that selects the candidate with the best
# recall, fitted on the SMOTE-resampled training data.
clf = GridSearchCV(SVC(),param_grid=tuned_parameters,cv=3,scoring='recall',verbose=True)
clf.fit(x_train_smote,y_train_smote)


# In[218]:


# Kernel, C and gamma of the winning estimator.
# NOTE(review): this list is not the last expression of the cell, so it
# displays nothing.
[clf.best_estimator_.kernel,clf.best_estimator_.C,clf.best_estimator_.gamma]
# Predict on the test split with the refitted best estimator. This rebinds
# `y_pred`, which the cells below then read.
y_pred = clf.predict(X_test)


# In[219]:


# Per-class precision / recall / F1 of the grid-search model on the test split.
print(classification_report(y_pred=y_pred,y_true=y_test))


# In[165]:


# Write the predictions to CSV, with the row index and a header line.
# NOTE(review): absolute, machine-specific path.
df=pd.DataFrame(y_pred)
df.to_csv(r'D:\Users\sgg91044\Desktop\df_pred.csv', index=True, header=True)


# In[223]:


from sklearn.linear_model import LogisticRegression
# Seeded logistic regression, fitted on the original training split
# (X_train / y_train), not on the SMOTE-resampled arrays.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,y_train)


# In[224]:


# Predictions are kept in their own name (`y_test_pred`), so `y_pred` is left
# unchanged by this cell.
y_test_pred=classifier.predict(X_test)


# In[225]:


# Per-class precision / recall / F1 of the logistic regression on the test split.
print(classification_report(y_pred=y_test_pred,y_true=y_test))


# In[120]:


# The output of these cells is labelled "Random_forest", but at this point
# `y_pred` holds the SVC grid-search predictions. Ask the forest for X_test
# predictions explicitly so the F1, accuracy and sensitivity shown here
# belong to the model named in the labels.
rf_test_pred = random_forest.predict(X_test)
f1_score(y_pred=rf_test_pred,y_true=y_test)


# In[121]:


print("Accuracy of Random_forest:",round(accuracy_score(y_pred=rf_test_pred,y_true=y_test) * 100,2),"%")


# In[122]:


print("Sensitivity of Random_forest:",round(recall_score(y_pred=rf_test_pred,y_true=y_test)*100,2),"%")


# In[30]:


# Split again, this time holding out 30% of the rows and using a different
# seed. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=0)


# In[31]:


# Oversample the new training part only.
# NOTE(review): `ratio` and `fit_sample` are the spellings used by older
# imbalanced-learn releases -- check against the installed version.
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)


# In[32]:


# Class counts before SMOTE (value_counts) and after SMOTE (bincount).
print(y_train.value_counts(), np.bincount(y_train_smote))


# In[86]:


from sklearn.ensemble import RandomForestClassifier

# Build a seeded 100-tree random forest (no oob_score this time). This rebinds
# `random_forest`, replacing the previously fitted model.
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)


# In[89]:


# Train on the SMOTE-resampled training data
random_forest.fit(x_train_smote,y_train_smote)


# In[90]:


# Predict on the held-out test split (which was not resampled)
y_pred = random_forest.predict(X_test)


# In[91]:


# Metrics for the predictions made in the previous cell.
# Per-class precision / recall / F1.
print(classification_report(y_pred=y_pred,y_true=y_test))


# In[92]:


# Confusion matrix.
print(confusion_matrix(y_pred=y_pred,y_true=y_test))


# In[93]:


# F1 score, shown as the cell output.
f1_score(y_pred=y_pred,y_true=y_test)


# In[220]:


# Accuracy as a percentage, rounded to two decimals.
print("Accuracy of Random_forest:",round(accuracy_score(y_pred=y_pred,y_true=y_test) * 100,2),"%")


# In[221]:


# Recall as a percentage, rounded to two decimals.
print("Sensitivity of Random_forest:",round(recall_score(y_pred=y_pred,y_true=y_test)*100,2),"%")


# In[96]:


# Predicted probabilities from the forest for each row of X_test, shown as
# the cell output.
y_pred_rf = random_forest.predict_proba(X_test)
y_pred_rf


# In[99]:


# roc_curve is called below but is not imported anywhere else in the
# notebook, so import it here to avoid a NameError.
from sklearn.metrics import roc_curve

# The random forest model by itself: keep only the second probability column
# and derive the ROC points from it.
y_pred_rf = random_forest.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)


# In[83]:


import matplotlib.pyplot as plt
# roc_curve is called below but is not imported anywhere else in the notebook.
from sklearn.metrics import roc_curve

# ROC curve of the random forest against the chance diagonal.
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# Compute micro-average ROC curve and ROC area
# The three result dictionaries were indexed in the source without ever being
# created, which raises NameError; create them first.
fpr, tpr, roc_auc = {}, {}, {}
# np.asarray(...).ravel() yields the same flat array as y_test.ravel() without
# depending on a Series-level ravel method.
fpr["micro"], tpr["micro"], _ = roc_curve(np.asarray(y_test).ravel(), y_pred_rf.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print ("AUC of Random_forest:", roc_auc["micro"])

 

posted on 2018-12-19 10:24  Aimee0207  阅读(298)  评论(0编辑  收藏  举报

导航