Class 28: SVM & Model Optimization

  1. Use one word in the Zoom chat to say how your portfolio check 2 is going

  2. Log onto Prismia & share any final questions you have about the portfolio

# %load http://drsmb.co/310
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn import cluster
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# load the iris data and make a train/test split
iris_X, iris_y = datasets.load_iris(return_X_y=True)
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y)
# fit a linear-kernel support vector classifier on the training split
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(iris_X_train, iris_y_train)
SVC(kernel='linear')
svm_clf.score(iris_X_test, iris_y_test)
1.0
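
The fitted linear SVM keeps only the training points that define the margin. A quick peek at them (assuming the cells above have run) uses two attributes of the fitted classifier:

# number of support vectors kept for each of the three iris classes
svm_clf.n_support_
# the support vectors themselves, one row per support vector
svm_clf.support_vectors_.shape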
# search over the kernel type and the regularization strength C
param_grid = {'kernel': ['linear', 'rbf'], 'C': [.5, 1, 10]}
svm_opt = GridSearchCV(svm_clf, param_grid)
svm_opt.fit(iris_X, iris_y)
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [0.5, 1, 10], 'kernel': ['linear', 'rbf']})
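
By default GridSearchCV uses 5-fold cross validation (hence the split0 through split4 scores below), so the 2 kernels times 3 values of C give 6 candidate models and 30 fits in total. A quick sanity check on the number of candidates, assuming the fit above ran:

# number of parameter combinations tried by the grid search
len(svm_opt.cv_results_['params'])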
type(svm_opt.best_estimator_)
sklearn.svm._classes.SVC
svm_opt.best_params_
{'C': 0.5, 'kernel': 'linear'}
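
Since refit=True by default, svm_opt.best_estimator_ has already been retrained with these parameters on all of the data passed to fit. The mean cross-validation accuracy that made this combination the winner is stored too; a quick look, assuming the fit above ran:

# mean cross-validated accuracy of the best parameter combination
svm_opt.best_score_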
svm_opt.cv_results_
{'mean_fit_time': array([0.00061193, 0.00076184, 0.00062265, 0.00072966, 0.00060325,
        0.00064535]),
 'std_fit_time': array([1.05270116e-04, 7.82869614e-05, 4.61122982e-05, 4.40765221e-05,
        2.30151865e-05, 3.56278736e-05]),
 'mean_score_time': array([0.00028019, 0.00037417, 0.00029044, 0.00034089, 0.00027933,
        0.00030575]),
 'std_score_time': array([5.55222356e-05, 6.42569874e-05, 2.11074649e-05, 2.79559992e-05,
        3.98430475e-05, 2.54760488e-05]),
 'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.5, 'kernel': 'linear'},
  {'C': 0.5, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'}],
 'split0_test_score': array([0.96666667, 0.93333333, 0.96666667, 0.96666667, 1.        ,
        0.96666667]),
 'split1_test_score': array([1.        , 0.96666667, 1.        , 0.96666667, 1.        ,
        1.        ]),
 'split2_test_score': array([1.        , 0.96666667, 0.96666667, 0.96666667, 0.9       ,
        0.96666667]),
 'split3_test_score': array([0.96666667, 0.93333333, 0.96666667, 0.93333333, 0.96666667,
        0.96666667]),
 'split4_test_score': array([1., 1., 1., 1., 1., 1.]),
 'mean_test_score': array([0.98666667, 0.96      , 0.98      , 0.96666667, 0.97333333,
        0.98      ]),
 'std_test_score': array([0.01632993, 0.02494438, 0.01632993, 0.02108185, 0.03887301,
        0.01632993]),
 'rank_test_score': array([1, 6, 2, 5, 4, 2], dtype=int32)}
import pandas as pd
pd.DataFrame(svm_opt.cv_results_)
mean_fit_time std_fit_time mean_score_time std_score_time param_C param_kernel params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 0.000612 0.000105 0.000280 0.000056 0.5 linear {'C': 0.5, 'kernel': 'linear'} 0.966667 1.000000 1.000000 0.966667 1.0 0.986667 0.016330 1
1 0.000762 0.000078 0.000374 0.000064 0.5 rbf {'C': 0.5, 'kernel': 'rbf'} 0.933333 0.966667 0.966667 0.933333 1.0 0.960000 0.024944 6
2 0.000623 0.000046 0.000290 0.000021 1 linear {'C': 1, 'kernel': 'linear'} 0.966667 1.000000 0.966667 0.966667 1.0 0.980000 0.016330 2
3 0.000730 0.000044 0.000341 0.000028 1 rbf {'C': 1, 'kernel': 'rbf'} 0.966667 0.966667 0.966667 0.933333 1.0 0.966667 0.021082 5
4 0.000603 0.000023 0.000279 0.000040 10 linear {'C': 10, 'kernel': 'linear'} 1.000000 1.000000 0.900000 0.966667 1.0 0.973333 0.038873 4
5 0.000645 0.000036 0.000306 0.000025 10 rbf {'C': 10, 'kernel': 'rbf'} 0.966667 1.000000 0.966667 0.966667 1.0 0.980000 0.016330 2
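
Because seaborn is already imported, one way to compare the parameter combinations at a glance is to plot the mean test scores. A sketch using the column names shown in the table above:

cv_results_df = pd.DataFrame(svm_opt.cv_results_)
# compare mean cross-validation accuracy across values of C, split by kernel
sns.barplot(data=cv_results_df, x='param_C', y='mean_test_score', hue='param_kernel')
plt.show()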
svm_opt.best_estimator_.predict(iris_X_test)
array([2, 0, 0, 2, 2, 2, 0, 0, 1, 0, 2, 2, 2, 2, 1, 2, 0, 1, 2, 1, 0, 2,
       0, 0, 2, 2, 0, 0, 2, 2, 1, 0, 0, 2, 0, 2, 0, 1])
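
One caveat: the grid search above was fit on all of iris_X, so iris_X_test was seen during model selection and any score on it is optimistic. A sketch of the more careful pattern (the svm_opt_train name is just for illustration), searching on the training split only so the test split stays held out:

# run the same grid search, but only on the training split
svm_opt_train = GridSearchCV(svm.SVC(), param_grid)
svm_opt_train.fit(iris_X_train, iris_y_train)
# the test split now gives a genuinely held-out performance estimate
svm_opt_train.score(iris_X_test, iris_y_test)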
from sklearn import tree

Find the optimal criterion, max_depth, and min_samples_leaf for a decision tree on the iris data

# set up a parameter grid for a decision tree and search it with cross validation
dt = tree.DecisionTreeClassifier()
params_dt = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4],
             'min_samples_leaf': list(range(2, 20, 2))}
dt_opt = GridSearchCV(dt, params_dt)
dt_opt.fit(iris_X, iris_y)
GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4],
                         'min_samples_leaf': [2, 4, 6, 8, 10, 12, 14, 16, 18]})
dt_opt.best_params_
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2}
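
As with the SVM, the best decision tree has already been refit on the full dataset. One way to inspect it (assuming the fit above ran) is to check its cross-validation accuracy and draw the fitted tree:

# mean cross-validated accuracy of the best decision tree
dt_opt.best_score_
# draw the refit tree; filled=True colors nodes by majority class
tree.plot_tree(dt_opt.best_estimator_, filled=True)
plt.show()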