Class 28: SVM & Model Optimization

  1. Use one word in the Zoom chat to say how your portfolio check 2 is going

  2. Log onto Prismia & share any final questions you have about the portfolio

# %load http://drsmb.co/310
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn import cluster
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# load the iris data and make a train/test split
iris_X, iris_y = datasets.load_iris(return_X_y=True)
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(iris_X, iris_y)
# fit a linear-kernel support vector classifier on the training split
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(iris_X_train, iris_y_train)
SVC(kernel='linear')
svm_clf.score(iris_X_test, iris_y_test)
1.0
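
The fitted linear SVM keeps only the training points that define the margin. A quick peek at them (assuming the cells above have run) uses two attributes of the fitted classifier:

# number of support vectors kept for each of the three iris classes
svm_clf.n_support_
# the support vectors themselves, one row per support vector
svm_clf.support_vectors_.shape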
# search over the kernel type and the regularization strength C
param_grid = {'kernel': ['linear', 'rbf'], 'C': [.5, 1, 10]}
svm_opt = GridSearchCV(svm_clf, param_grid)
svm_opt.fit(iris_X, iris_y)
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [0.5, 1, 10], 'kernel': ['linear', 'rbf']})
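
By default GridSearchCV uses 5-fold cross validation (hence the split0 through split4 scores below), so the 2 kernels times 3 values of C give 6 candidate models and 30 fits in total. A quick sanity check on the number of candidates, assuming the fit above ran:

# number of parameter combinations tried by the grid search
len(svm_opt.cv_results_['params'])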
type(svm_opt.best_estimator_)
sklearn.svm._classes.SVC
svm_opt.best_params_
{'C': 0.5, 'kernel': 'linear'}
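
Since refit=True by default, svm_opt.best_estimator_ has already been retrained with these parameters on all of the data passed to fit. The mean cross-validation accuracy that made this combination the winner is stored too; a quick look, assuming the fit above ran:

# mean cross-validated accuracy of the best parameter combination
svm_opt.best_score_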
svm_opt.cv_results_
{'mean_fit_time': array([0.00061193, 0.00076184, 0.00062265, 0.00072966, 0.00060325,
        0.00064535]),
 'std_fit_time': array([1.05270116e-04, 7.82869614e-05, 4.61122982e-05, 4.40765221e-05,
        2.30151865e-05, 3.56278736e-05]),
 'mean_score_time': array([0.00028019, 0.00037417, 0.00029044, 0.00034089, 0.00027933,
        0.00030575]),
 'std_score_time': array([5.55222356e-05, 6.42569874e-05, 2.11074649e-05, 2.79559992e-05,
        3.98430475e-05, 2.54760488e-05]),
 'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.5, 'kernel': 'linear'},
  {'C': 0.5, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'}],
 'split0_test_score': array([0.96666667, 0.93333333, 0.96666667, 0.96666667, 1.        ,
        0.96666667]),
 'split1_test_score': array([1.        , 0.96666667, 1.        , 0.96666667, 1.        ,
        1.        ]),
 'split2_test_score': array([1.        , 0.96666667, 0.96666667, 0.96666667, 0.9       ,
        0.96666667]),
 'split3_test_score': array([0.96666667, 0.93333333, 0.96666667, 0.93333333, 0.96666667,
        0.96666667]),
 'split4_test_score': array([1., 1., 1., 1., 1., 1.]),
 'mean_test_score': array([0.98666667, 0.96      , 0.98      , 0.96666667, 0.97333333,
        0.98      ]),
 'std_test_score': array([0.01632993, 0.02494438, 0.01632993, 0.02108185, 0.03887301,
        0.01632993]),
 'rank_test_score': array([1, 6, 2, 5, 4, 2], dtype=int32)}
import pandas as pd
pd.DataFrame(svm_opt.cv_results_)
mean_fit_time std_fit_time mean_score_time std_score_time param_C param_kernel params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 0.000612 0.000105 0.000280 0.000056 0.5 linear {'C': 0.5, 'kernel': 'linear'} 0.966667 1.000000 1.000000 0.966667 1.0 0.986667 0.016330 1
1 0.000762 0.000078 0.000374 0.000064 0.5 rbf {'C': 0.5, 'kernel': 'rbf'} 0.933333 0.966667 0.966667 0.933333 1.0 0.960000 0.024944 6
2 0.000623 0.000046 0.000290 0.000021 1 linear {'C': 1, 'kernel': 'linear'} 0.966667 1.000000 0.966667 0.966667 1.0 0.980000 0.016330 2
3 0.000730 0.000044 0.000341 0.000028 1 rbf {'C': 1, 'kernel': 'rbf'} 0.966667 0.966667 0.966667 0.933333 1.0 0.966667 0.021082 5
4 0.000603 0.000023 0.000279 0.000040 10 linear {'C': 10, 'kernel': 'linear'} 1.000000 1.000000 0.900000 0.966667 1.0 0.973333 0.038873 4
5 0.000645 0.000036 0.000306 0.000025 10 rbf {'C': 10, 'kernel': 'rbf'} 0.966667 1.000000 0.966667 0.966667 1.0 0.980000 0.016330 2
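
Because seaborn is already imported, one way to compare the parameter combinations at a glance is to plot the mean test scores. A sketch using the column names shown in the table above:

cv_results_df = pd.DataFrame(svm_opt.cv_results_)
# compare mean cross-validation accuracy across values of C, split by kernel
sns.barplot(data=cv_results_df, x='param_C', y='mean_test_score', hue='param_kernel')
plt.show()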
svm_opt.best_estimator_.predict(iris_X_test)
array([2, 0, 0, 2, 2, 2, 0, 0, 1, 0, 2, 2, 2, 2, 1, 2, 0, 1, 2, 1, 0, 2,
       0, 0, 2, 2, 0, 0, 2, 2, 1, 0, 0, 2, 0, 2, 0, 1])
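
One caveat: the grid search above was fit on all of iris_X, so iris_X_test was seen during model selection and any score on it is optimistic. A sketch of the more careful pattern (the svm_opt_train name is just for illustration), searching on the training split only so the test split stays held out:

# run the same grid search, but only on the training split
svm_opt_train = GridSearchCV(svm.SVC(), param_grid)
svm_opt_train.fit(iris_X_train, iris_y_train)
# the test split now gives a genuinely held-out performance estimate
svm_opt_train.score(iris_X_test, iris_y_test)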
from sklearn import tree

Find the optimal criterion, max_depth, and min_samples_leaf for a decision tree on the iris data

# set up a parameter grid for a decision tree and search it with cross validation
dt = tree.DecisionTreeClassifier()
params_dt = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4],
             'min_samples_leaf': list(range(2, 20, 2))}
dt_opt = GridSearchCV(dt, params_dt)
dt_opt.fit(iris_X, iris_y)
GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4],
                         'min_samples_leaf': [2, 4, 6, 8, 10, 12, 14, 16, 18]})
dt_opt.best_params_
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2}
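
As with the SVM, the best decision tree has already been refit on the full dataset. One way to inspect it (assuming the fit above ran) is to check its cross-validation accuracy and draw the fitted tree:

# mean cross-validated accuracy of the best decision tree
dt_opt.best_score_
# draw the refit tree; filled=True colors nodes by majority class
tree.plot_tree(dt_opt.best_estimator_, filled=True)
plt.show()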