22. More text representations
Important
We will have 2 speakers to wrap up the semester:
12/7: Nirmal Keshava
12/12: Tiffany Sithiphone
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import datasets
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Load just two of the newsgroups so the task is a two-class problem.
ng_X, ng_y = datasets.fetch_20newsgroups(
    categories=['comp.graphics', 'sci.crypt'],
    return_X_y=True,
)

# Bag-of-words encoding: one row per document, one column per vocabulary term.
count_vec = text.CountVectorizer()
ng_vec = count_vec.fit_transform(ng_X)
ng_vec
<1179x24257 sparse matrix of type '<class 'numpy.int64'>'
with 188291 stored elements in Compressed Sparse Row format>
# Number of cells a dense (documents x vocabulary) array would need,
# read from the matrix itself instead of retyping the numbers from its repr.
ng_vec.shape[0] * ng_vec.shape[1]
28599003
# Rough cost of the sparse form: three numbers per stored entry
# (presumably its value plus a row and a column index -- confirm intent).
# nnz is the "stored elements" count shown in the matrix repr.
ng_vec.nnz * 3
564873
# Hold out part of the count matrix for evaluation.
# NOTE: no random_state is passed, so the split (and the score) varies per run.
ng_vec_train, ng_vec_test, ng_y_train, ng_y_test = train_test_split(ng_vec, ng_y)

# Multinomial naive Bayes on the raw term counts; the final expression
# displays the score on the held-out documents.
clf = MultinomialNB()
clf.fit(ng_vec_train, ng_y_train)
clf.score(ng_vec_test, ng_y_test)
0.9830508474576272
# Encode the same documents again, this time with TF-IDF weights.
tfidf = text.TfidfVectorizer()
ng_tfidf = tfidf.fit_transform(ng_X)
# Raw count in vocabulary column 21691 for the first two documents.
ng_vec[:2].toarray()[:,21691]
array([41, 0])
# Same column and documents in the TF-IDF matrix: a float weight
# rather than an integer count.
ng_tfidf[:2].toarray()[:,21691]
array([0.21375056, 0. ])
from sklearn import tree
# Split both representations in a single call so the count and TF-IDF rows
# stay aligned: both trees are then scored on the same held-out documents.
(
    ng_vec_train, ng_vec_test,
    ng_tfidf_train, ng_tfidf_test,
    ng_y_train, ng_y_test,
) = train_test_split(ng_vec, ng_tfidf, ng_y)

# Decision tree trained and scored on the raw counts.
dt = tree.DecisionTreeClassifier()
dt.fit(ng_vec_train, ng_y_train)
dt.score(ng_vec_test, ng_y_test)
0.9525423728813559
# Refit the same tree object on the TF-IDF features and score it on the
# matching TF-IDF test rows.
dt.fit(ng_tfidf_train, ng_y_train)
dt.score(ng_tfidf_test, ng_y_test)
0.9457627118644067
# First 200 characters of the raw text of document 1 (headers included).
ng_X[1][:200]
'From: lulagos@cipres.cec.uchile.cl (admirador)\nSubject: OAK VGA 1Mb. Please, I needd VESA TSR!!! 8^)\nOriginator: lulagos@cipres\nNntp-Posting-Host: cipres.cec.uchile.cl\nOrganization: Centro de Computac'
# Labels of the first five documents
# (presumably 0 = comp.graphics, 1 = sci.crypt -- confirm against the dataset's target_names).
ng_y[:5]
array([0, 0, 1, 0, 0])
# toarray() on a one-row slice is 2-D with shape (1, n_features), so a bare
# [:50] only slices the row axis and hands back the entire row. Slice the
# column axis to see the first 50 term counts of document 1.
ng_vec[1].toarray()[:, :50]
array([[0, 0, 0, ..., 0, 0, 0]])
counts = text.CountVectorizer(
Cell In[20], line 1
counts = text.CountVectorizer(
^
SyntaxError: unexpected EOF while parsing