Class 28: SVM & Model Optimization
Use one word in the Zoom chat to say how your portfolio check 2 is going
Log onto Prismia and share any final questions you have about the portfolio
# %load http://drsmb.co/310
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn import cluster
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Load the iris dataset as a (features, targets) pair of arrays.
iris_X, iris_y = datasets.load_iris(return_X_y=True)

# Hold out part of the data for evaluation. No random_state is passed, so the
# split -- and every score computed from it -- changes from run to run.
(iris_X_train, iris_X_test,
 iris_y_train, iris_y_test) = train_test_split(iris_X, iris_y)

# Baseline model: a support vector classifier with a linear kernel, trained
# on the training portion only.
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(iris_X_train, iris_y_train)
SVC(kernel='linear')
# Evaluate the baseline on the held-out split (for a classifier, `score`
# reports mean accuracy).
svm_clf.score(iris_X_test, iris_y_test)
1.0
# Search space: 2 kernels x 3 values of C = 6 candidate settings.
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.5, 1, 10],
}
svm_opt = GridSearchCV(estimator=svm_clf, param_grid=param_grid)

# NOTE(review): the search is fit on the full iris_X / iris_y, which includes
# the rows held out as iris_X_test above. Any later evaluation of svm_opt on
# iris_X_test is therefore not an out-of-sample estimate.
svm_opt.fit(iris_X, iris_y)
GridSearchCV(estimator=SVC(kernel='linear'),
param_grid={'C': [0.5, 1, 10], 'kernel': ['linear', 'rbf']})
# The best estimator found by the search is itself a fitted model object.
type(svm_opt.best_estimator_)
sklearn.svm._classes.SVC
# Parameter combination with the top-ranked mean cross-validation score.
svm_opt.best_params_
{'C': 0.5, 'kernel': 'linear'}
# Full search log as a dict of arrays: fit/score timings, the parameters
# tried, per-split test scores, and their mean, std and rank per candidate.
svm_opt.cv_results_
{'mean_fit_time': array([0.00061193, 0.00076184, 0.00062265, 0.00072966, 0.00060325,
0.00064535]),
'std_fit_time': array([1.05270116e-04, 7.82869614e-05, 4.61122982e-05, 4.40765221e-05,
2.30151865e-05, 3.56278736e-05]),
'mean_score_time': array([0.00028019, 0.00037417, 0.00029044, 0.00034089, 0.00027933,
0.00030575]),
'std_score_time': array([5.55222356e-05, 6.42569874e-05, 2.11074649e-05, 2.79559992e-05,
3.98430475e-05, 2.54760488e-05]),
'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10],
mask=[False, False, False, False, False, False],
fill_value='?',
dtype=object),
'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
mask=[False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'C': 0.5, 'kernel': 'linear'},
{'C': 0.5, 'kernel': 'rbf'},
{'C': 1, 'kernel': 'linear'},
{'C': 1, 'kernel': 'rbf'},
{'C': 10, 'kernel': 'linear'},
{'C': 10, 'kernel': 'rbf'}],
'split0_test_score': array([0.96666667, 0.93333333, 0.96666667, 0.96666667, 1. ,
0.96666667]),
'split1_test_score': array([1. , 0.96666667, 1. , 0.96666667, 1. ,
1. ]),
'split2_test_score': array([1. , 0.96666667, 0.96666667, 0.96666667, 0.9 ,
0.96666667]),
'split3_test_score': array([0.96666667, 0.93333333, 0.96666667, 0.93333333, 0.96666667,
0.96666667]),
'split4_test_score': array([1., 1., 1., 1., 1., 1.]),
'mean_test_score': array([0.98666667, 0.96 , 0.98 , 0.96666667, 0.97333333,
0.98 ]),
'std_test_score': array([0.01632993, 0.02494438, 0.01632993, 0.02108185, 0.03887301,
0.01632993]),
'rank_test_score': array([1, 6, 2, 5, 4, 2], dtype=int32)}
import pandas as pd
# Show cv_results_ as a table: one row per parameter combination, one column
# per recorded quantity, which is easier to scan than the raw dict.
pd.DataFrame(svm_opt.cv_results_)
| | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_kernel | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000612 | 0.000105 | 0.000280 | 0.000056 | 0.5 | linear | {'C': 0.5, 'kernel': 'linear'} | 0.966667 | 1.000000 | 1.000000 | 0.966667 | 1.0 | 0.986667 | 0.016330 | 1 |
1 | 0.000762 | 0.000078 | 0.000374 | 0.000064 | 0.5 | rbf | {'C': 0.5, 'kernel': 'rbf'} | 0.933333 | 0.966667 | 0.966667 | 0.933333 | 1.0 | 0.960000 | 0.024944 | 6 |
2 | 0.000623 | 0.000046 | 0.000290 | 0.000021 | 1 | linear | {'C': 1, 'kernel': 'linear'} | 0.966667 | 1.000000 | 0.966667 | 0.966667 | 1.0 | 0.980000 | 0.016330 | 2 |
3 | 0.000730 | 0.000044 | 0.000341 | 0.000028 | 1 | rbf | {'C': 1, 'kernel': 'rbf'} | 0.966667 | 0.966667 | 0.966667 | 0.933333 | 1.0 | 0.966667 | 0.021082 | 5 |
4 | 0.000603 | 0.000023 | 0.000279 | 0.000040 | 10 | linear | {'C': 10, 'kernel': 'linear'} | 1.000000 | 1.000000 | 0.900000 | 0.966667 | 1.0 | 0.973333 | 0.038873 | 4 |
5 | 0.000645 | 0.000036 | 0.000306 | 0.000025 | 10 | rbf | {'C': 10, 'kernel': 'rbf'} | 0.966667 | 1.000000 | 0.966667 | 0.966667 | 1.0 | 0.980000 | 0.016330 | 2 |
# Use the best model from the search to predict labels for the test rows.
# NOTE(review): svm_opt was fit on all of iris_X, which contains these rows,
# so these predictions are not on unseen data.
svm_opt.best_estimator_.predict(iris_X_test)
array([2, 0, 0, 2, 2, 2, 0, 0, 1, 0, 2, 2, 2, 2, 1, 2, 0, 1, 2, 1, 0, 2,
0, 0, 2, 2, 0, 0, 2, 2, 1, 0, 0, 2, 0, 2, 0, 1])
from sklearn import tree
Find the optimal criterion, max_depth and min_samples_leaf for a decision tree on the iris data
# Untuned decision tree; the grid search below supplies the hyperparameters.
dt = tree.DecisionTreeClassifier()

# Search space: 2 criteria x 3 depths x 9 leaf sizes = 54 combinations.
params_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4],
    'min_samples_leaf': list(range(2, 20, 2)),  # even values 2 through 18
}

dt_opt = GridSearchCV(estimator=dt, param_grid=params_dt)
# Cross-validated search over the full iris data.
dt_opt.fit(iris_X, iris_y)
GridSearchCV(estimator=DecisionTreeClassifier(),
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [2, 3, 4],
'min_samples_leaf': [2, 4, 6, 8, 10, 12, 14, 16, 18]})
# Best criterion / max_depth / min_samples_leaf combination found by the search.
dt_opt.best_params_
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2}