Class: More Representations of Text

[[1],[1],[1],[1],[1]]

# %load http://drsmb.co/310
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import datasets
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import numpy as np
# Fetch only the two newsgroups being compared; return_X_y=True returns the
# (documents, labels) pair directly instead of a Bunch object.
ng_X, ng_y = datasets.fetch_20newsgroups(
    categories=['comp.graphics', 'sci.crypt'],
    return_X_y=True,
)

# Bag-of-words representation: a sparse matrix of per-document term counts.
count_vec = text.CountVectorizer()
ng_X_vec = count_vec.fit_transform(ng_X)

# Slice the sparse matrix *before* densifying so that only the three rows
# being previewed are expanded, not the whole document-term matrix.
ng_X_vec[:3].toarray()
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])
# Dimensions of the count matrix returned by CountVectorizer.fit_transform:
# one row per document, one column per vocabulary term.
ng_X_vec.shape
(1179, 24257)
# Largest entry in the count matrix, i.e. the highest number of times any
# single term occurs within any single document.
np.max(ng_X_vec)
549
# Re-weight the raw term counts with TF-IDF (term frequency scaled by inverse
# document frequency), using the transformer's default settings.
tfidf = text.TfidfTransformer()
ng_X_tfidf = tfidf.fit_transform(ng_X_vec)

# Slice the sparse matrix *before* densifying so that only the three rows
# being previewed are expanded, not the whole document-term matrix.
ng_X_tfidf[:3].toarray()
array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.05125476, 0.0576842 , 0.        , ..., 0.        , 0.        ,
        0.        ]])
# Dimensions of the TF-IDF matrix; the transformer re-weights the entries of
# ng_X_vec, so compare this against ng_X_vec.shape.
ng_X_tfidf.shape
(1179, 24257)
# Largest TF-IDF weight in the matrix; compare with the maximum raw count
# above to see how the re-weighting changes the scale of the features.
np.max(ng_X_tfidf)
0.938424962381379

Does this representation improve our classification for this task?

# Count bigrams only: ngram_range=(2, 2) sets both the minimum and maximum
# n-gram length to two tokens, so single words are not included as features.
counts_bigram = text.CountVectorizer(ngram_range=(2, 2))
counts_bigram.fit_transform(ng_X)
<1179x149885 sparse matrix of type '<class 'numpy.int64'>'
	with 313557 stored elements in Compressed Sparse Row format>
# Same bigram counts, but with scikit-learn's built-in English stop-word list
# removed before the bigrams are formed.
counts_bigram = text.CountVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
)
counts_bigram.fit_transform(ng_X)
<1179x125898 sparse matrix of type '<class 'numpy.int64'>'
	with 192349 stored elements in Compressed Sparse Row format>