Text Analysis with NLP: Exploring SMS Spam Detection

In this blog post, we explore SMS spam detection with NLP techniques, using the nlpia library together with scikit-learn's machine learning tools.

Introduction to SMS Spam Detection

With the proliferation of mobile devices and messaging platforms, spam messages have become increasingly prevalent. SMS spam detection aims to identify and filter out unwanted messages, thereby improving user experience and privacy. By applying NLP techniques, we can analyze the textual content of SMS messages to distinguish legitimate communications from spam.

In [ ]:
!pip install nlpia  # the companion toolkit from "Natural Language Processing in Action"
import pandas as pd
from nlpia.data.loaders import get_data

pd.options.display.width = 120  # widen the pandas display so long messages stay readable
sms = get_data('sms-spam')      # labeled SMS dataset with 'spam' (0/1) and 'text' columns

Getting Started with nlpia

Our journey begins with installing the nlpia library, a versatile toolkit for natural language processing tasks. Using its get_data function, we load a dataset of SMS messages containing both legitimate (ham) and spam messages. This dataset serves as the foundation for building and evaluating our spam detection pipeline.
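
As a quick sanity check on what we just loaded (a small sketch, not from the original notebook; it assumes the sms DataFrame above with its binary spam column):

sms.head()               # first few labeled messages
sms.spam.value_counts()  # class balance: ham (0) vs. spam (1)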

In [ ]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.tokenize.casual import casual_tokenize  # tokenizer that copes with SMS slang and emoticons
sms
Out[ ]:
      spam                                               text
0        0  Go until jurong point, crazy.. Available only ...
1        0  Ok lar... Joking wif u oni...
2        1  Free entry in 2 a wkly comp to win FA Cup fina...
3        0  U dun say so early hor... U c already then say...
4        0  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
4832     1  This is the 2nd time we have tried 2 contact u...
4833     0  Will ü b going to esplanade fr home?
4834     0  Pity, * was in mood for that. So...any other s...
4835     0  The guy did some bitching but I acted like i'd...
4836     0  Rofl. Its true to its name

4837 rows × 2 columns

In [ ]:
# Rebuild the index so each label encodes its row number, with a '!' flagging spam
index = ['sms{}{}'.format(i, '!' * j) for (i, j) in zip(range(len(sms)), sms.spam)]
sms.index = index
sms
Out[ ]:
          spam                                               text
sms0         0  Go until jurong point, crazy.. Available only ...
sms1         0  Ok lar... Joking wif u oni...
sms2!        1  Free entry in 2 a wkly comp to win FA Cup fina...
sms3         0  U dun say so early hor... U c already then say...
sms4         0  Nah I don't think he goes to usf, he lives aro...
...        ...                                                ...
sms4832!     1  This is the 2nd time we have tried 2 contact u...
sms4833      0  Will ü b going to esplanade fr home?
sms4834      0  Pity, * was in mood for that. So...any other s...
sms4835      0  The guy did some bitching but I acted like i'd...
sms4836      0  Rofl. Its true to its name

4837 rows × 2 columns

Text Preprocessing and Feature Extraction

To prepare the textual data for analysis, we tokenize the SMS messages with NLTK's casual tokenizer and apply TF-IDF (Term Frequency-Inverse Document Frequency) vectorization. TF-IDF transforms each message into a numerical feature vector, weighting each word by how important it is to that message relative to its frequency across the whole corpus. Because PCA expects zero-mean features, we will also center the resulting matrix by subtracting each column's mean before modeling.
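
To make the weighting concrete before running it on the full corpus, here is a minimal toy sketch (the three messages and all names are invented for illustration, not part of the notebook):

from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = [
    'free entry to win a prize',   # spam-flavored
    'are you free for lunch',      # ham-flavored
    'win a free prize now',        # spam-flavored
]
toy_tfidf = TfidfVectorizer()
toy_matrix = toy_tfidf.fit_transform(toy_corpus)

# 'free' appears in every message, so IDF down-weights it everywhere;
# terms unique to one message, like 'lunch', score relatively higher there.
print(sorted(toy_tfidf.vocabulary_))
print(toy_matrix.toarray().round(2))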

In [ ]:
# casual_tokenize handles SMS-style text (slang, emoticons) better than the default pattern
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()

len(tfidf.vocabulary_)  # number of unique tokens found in the corpus
/usr/local/lib/python3.9/dist-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn(
Out[ ]:
9232
In [ ]:
print(tfidf_docs)  # mostly zeros: each message uses only a handful of the 9,232 terms
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
In [ ]:
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs = tfidf_docs - tfidf_docs.mean()  # center each term column: PCA assumes zero-mean features
tfidf_docs.shape
Out[ ]:
(4837, 9232)
In [ ]:
tfidf_docs
Out[ ]:
0 1 2 3 4 5 6 7 8 9 ... 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231
0 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
1 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
2 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 0.096125 0.127340 0.124007 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
3 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
4 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4832 0.063691 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
4833 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
4834 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
4835 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055
4836 -0.025643 -0.00584 -0.000228 -0.000053 -0.000156 -0.000943 -0.000463 -0.006695 -0.004035 -0.002745 ... -0.000264 -0.000426 -7.667659e-07 -0.001598 -0.000148 -0.000099 -0.00066 -0.000055 -0.000055 -0.000055

4837 rows × 9232 columns

In [ ]:
sms.spam.sum()  # how many of the 4,837 messages are labeled spam
Out[ ]:
638

Dimensionality Reduction with PCA

Given the high-dimensional nature of the TF-IDF feature space, we employ Principal Component Analysis (PCA) to reduce the dimensionality and extract latent topics from the SMS messages. By projecting the data onto a lower-dimensional space, PCA enables us to identify key patterns and structures within the text, facilitating more efficient modeling and interpretation.
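
In matrix terms (standard PCA/LSA background, not code from the notebook): if X is the mean-centered TF-IDF matrix with singular value decomposition X = U·S·Vᵀ, then keeping the top k = 16 right singular vectors V_k gives the topic vectors T = X·V_k = U_k·S_k. The pca.components_ attribute below holds V_kᵀ, one row per topic.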

In [ ]:
pca = PCA(n_components=16)  # keep 16 principal components ("topics")
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=columns, index=index)
pca_topic_vectors.round(3)
Out[ ]:
topic0 topic1 topic2 topic3 topic4 topic5 topic6 topic7 topic8 topic9 topic10 topic11 topic12 topic13 topic14 topic15
sms0 0.201 0.003 0.037 0.011 -0.019 -0.053 0.039 -0.066 0.013 -0.082 0.005 -0.009 -0.019 -0.019 -0.006 0.032
sms1 0.404 -0.094 -0.078 0.051 0.100 0.047 0.023 0.065 0.023 -0.023 -0.002 0.038 -0.045 -0.016 0.046 -0.044
sms2! -0.030 -0.048 0.090 -0.067 0.091 -0.043 -0.000 -0.002 -0.057 0.048 0.122 0.022 -0.035 0.012 -0.032 0.048
sms3 0.329 -0.033 -0.035 -0.016 0.052 0.056 -0.166 -0.074 0.062 -0.105 0.021 0.031 -0.080 -0.028 0.018 -0.070
sms4 0.002 0.031 0.038 0.034 -0.075 -0.093 -0.044 0.061 -0.044 0.028 0.028 -0.014 -0.020 0.053 -0.074 -0.016
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
sms4832! -0.126 -0.160 0.091 -0.074 0.194 0.103 -0.161 0.021 0.095 -0.113 -0.050 0.042 -0.134 -0.090 0.239 0.054
sms4833 0.070 0.078 0.003 -0.063 -0.031 0.046 -0.018 -0.079 -0.090 0.162 0.070 0.029 0.097 -0.018 -0.017 0.205
sms4834 0.077 0.043 -0.019 0.060 0.016 -0.009 -0.024 -0.022 -0.062 -0.071 -0.078 -0.097 0.005 0.050 0.024 -0.024
sms4835 -0.029 0.007 0.001 -0.015 -0.066 -0.101 -0.028 0.033 -0.111 0.035 0.018 0.033 -0.015 -0.027 0.031 -0.055
sms4836 -0.038 -0.078 0.016 -0.064 -0.007 0.007 0.038 -0.007 -0.054 -0.004 0.049 -0.003 -0.045 0.000 -0.053 -0.006

4837 rows × 16 columns
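
How much signal do these 16 topics retain? A quick check (a sketch, not from the original notebook; it assumes the fitted pca object above):

print(pca.explained_variance_ratio_.round(3))        # per-topic share of the variance
print(pca.explained_variance_ratio_.sum().round(3))  # total variance kept by the 16 topics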

In [ ]:
tfidf.vocabulary_  # maps each token to its column number in the TF-IDF matrix

Exploring Topic Vectors and Insights

We examine the resulting PCA topic vectors to gain insight into the underlying themes in the SMS messages. By inspecting the topic weights and their associations with vocabulary terms, we uncover patterns related to positive and negative sentiment, as well as the presence of deal-related content.

In [ ]:
# Sort the vocabulary by column number so terms line up with the matrix columns
column_nums, terms = zip(*sorted(zip(tfidf.vocabulary_.values(),
                                     tfidf.vocabulary_.keys())))
In [ ]:
# One row per topic, one column per term: each term's PCA loading ("weight") in that topic
weights = pd.DataFrame(pca.components_, columns=terms,
                       index=['topic{}'.format(i) for i in range(16)])
In [ ]:
pd.options.display.max_columns = 8
weights.head(4).round(3)
Out[ ]:
! " # #150 ... … ┾ 〨ud 鈥
topic0 -0.071 0.008 -0.001 -0.000 ... -0.002 0.001 0.001 0.001
topic1 0.063 0.008 0.000 -0.000 ... 0.003 0.001 0.001 0.001
topic2 0.071 0.027 0.000 0.001 ... 0.002 -0.001 -0.001 -0.001
topic3 -0.059 -0.032 -0.001 -0.000 ... 0.001 0.000 0.000 0.000

4 rows × 9232 columns
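
Sorting one topic's row of weights shows which terms dominate it (a sketch, not from the original notebook; it uses the weights DataFrame above, and topic4 is an arbitrary choice):

weights.loc['topic4'].sort_values(ascending=False).head(10)  # terms pushing topic4 positive
weights.loc['topic4'].sort_values().head(10)                 # terms pushing topic4 negative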

In [ ]:
pd.options.display.max_columns = 10
# Pull out the weights for a hand-picked list of "deal"-flavored tokens, scaled by 100 for readability
deals = weights['! ;) :) half off free crazy deal only $ 80 %'.split()].round(3) * 100
deals.head(5)
Out[ ]:
! ;) :) half off ... deal only $ 80 %
topic0 -7.1 0.1 -0.5 -0.0 -0.4 ... -0.1 -2.2 0.3 -0.0 -0.0
topic1 6.3 0.0 7.4 0.1 0.4 ... -0.1 -3.8 -0.1 -0.0 -0.2
topic2 7.1 0.2 -0.1 0.1 0.3 ... -0.1 0.7 0.0 0.0 0.1
topic3 -5.9 -0.3 -7.1 0.2 0.3 ... 0.1 -2.3 0.1 -0.1 -0.3
topic4 38.1 -0.1 -12.5 -0.1 -0.2 ... -0.2 3.0 0.3 0.1 -0.1

5 rows × 12 columns

In [ ]:
deals.T.sum()  # net "deal" weight of each topic, summed over the hand-picked tokens
# Topics 4, 8, 9, 11, 13, and 14 all carry positive "deal" weight, while topics 0, 3,
# 5, and 10 look like "anti-deal" topics: messages about the opposite of "deals".
Out[ ]:
topic0    -11.9
topic1      7.5
topic2     12.8
topic3    -15.5
topic4     38.3
topic5    -33.9
topic6      4.8
topic7     -5.0
topic8     40.6
topic9     32.0
topic10   -29.1
topic11    48.3
topic12     3.5
topic13    47.5
topic14    32.0
topic15    -4.2
dtype: float64

Dimension Reduction using SVD

Additionally, we explore dimension reduction using Truncated Singular Value Decomposition (SVD), another technique for reducing the dimensionality of sparse data matrices. By transforming the TF-IDF matrix with SVD, we obtain topic vectors that capture the latent semantic structure of the SMS messages, facilitating further analysis and interpretation.

In [ ]:
# TruncatedSVD works on the data matrix directly (no internal centering);
# n_iter=100 gives the randomized solver extra passes for accuracy
svd = TruncatedSVD(n_components=16, n_iter=100)
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, columns=columns, index=index)
In [ ]:
svd_topic_vectors.round(3).head(6)
Out[ ]:
topic0 topic1 topic2 topic3 topic4 ... topic11 topic12 topic13 topic14 topic15
sms0 0.201 0.003 0.037 0.011 -0.019 ... -0.007 0.002 -0.036 -0.014 0.037
sms1 0.404 -0.094 -0.078 0.051 0.100 ... 0.036 0.043 -0.021 0.051 -0.042
sms2! -0.030 -0.048 0.090 -0.067 0.091 ... 0.023 0.026 -0.020 -0.042 0.052
sms3 0.329 -0.033 -0.035 -0.016 0.052 ... 0.023 0.073 -0.046 0.022 -0.070
sms4 0.002 0.031 0.038 0.034 -0.075 ... -0.009 0.027 0.034 -0.083 -0.021
sms5! -0.016 0.059 0.014 -0.006 0.122 ... 0.055 -0.037 0.075 -0.001 0.020

6 rows × 16 columns
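
Because tfidf_docs was mean-centered earlier, TruncatedSVD is computing essentially the same decomposition PCA did; each topic's sign is arbitrary, so comparing absolute values should show the agreement (a sketch, not from the original notebook; run it before the normalization in the next cell):

import numpy as np

np.allclose(np.abs(pca_topic_vectors.values),
            np.abs(svd_topic_vectors.values), atol=1e-3)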

In [ ]:
# L2-normalize each row so that dot products between rows become cosine similarities
svd_topic_vectors = (svd_topic_vectors.T / np.linalg.norm(
    svd_topic_vectors, axis=1)).T
In [ ]:
# Pairwise cosine similarity between the first 20 messages in topic space
svd_topic_vectors.iloc[:20].dot(svd_topic_vectors.iloc[:20].T).round(1)
Out[ ]:
sms0 sms1 sms2! sms3 sms4 ... sms15! sms16 sms17 sms18 sms19!
sms0 1.0 0.6 -0.1 0.6 -0.0 ... -0.2 0.4 0.2 -0.1 -0.2
sms1 0.6 1.0 -0.2 0.8 -0.2 ... -0.2 0.3 0.4 -0.0 -0.2
sms2! -0.1 -0.2 1.0 -0.2 0.1 ... 0.3 -0.1 -0.4 -0.2 0.9
sms3 0.6 0.8 -0.2 1.0 -0.2 ... -0.3 0.2 0.7 0.1 -0.2
sms4 -0.0 -0.2 0.1 -0.2 1.0 ... -0.0 -0.1 -0.1 -0.2 0.2
sms5! -0.3 0.0 0.4 -0.3 0.2 ... 0.2 -0.2 -0.5 -0.1 0.4
sms6 -0.3 -0.2 0.0 -0.1 0.0 ... -0.2 -0.2 0.1 0.1 0.2
sms7 -0.1 -0.2 0.3 -0.3 0.1 ... 0.5 -0.1 -0.1 -0.2 0.6
sms8! -0.3 -0.1 0.5 -0.2 -0.4 ... -0.1 -0.2 -0.1 -0.2 0.4
sms9! -0.3 -0.1 0.4 -0.1 -0.2 ... 0.7 -0.2 -0.3 0.2 0.5
sms10 -0.2 -0.3 0.1 -0.3 0.3 ... 0.1 0.4 -0.2 -0.2 0.0
sms11! -0.2 -0.2 0.8 -0.2 0.3 ... 0.3 -0.1 -0.5 -0.3 0.9
sms12! -0.1 -0.1 0.6 -0.2 -0.2 ... 0.2 -0.2 -0.4 -0.1 0.5
sms13 -0.5 -0.4 -0.2 -0.5 -0.1 ... 0.1 -0.2 0.1 0.1 -0.0
sms14 -0.1 -0.1 0.0 -0.2 0.1 ... -0.1 -0.1 0.0 0.0 -0.0
sms15! -0.2 -0.2 0.3 -0.3 -0.0 ... 1.0 -0.1 -0.5 0.4 0.5
sms16 0.4 0.3 -0.1 0.2 -0.1 ... -0.1 1.0 0.1 -0.1 -0.2
sms17 0.2 0.4 -0.4 0.7 -0.1 ... -0.5 0.1 1.0 0.1 -0.4
sms18 -0.1 -0.0 -0.2 0.1 -0.2 ... 0.4 -0.1 0.1 1.0 -0.0
sms19! -0.2 -0.2 0.9 -0.2 0.2 ... 0.5 -0.2 -0.4 -0.0 1.0

20 rows × 20 columns
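
Each entry is the cosine similarity between two messages in topic space, and the spam messages visibly cluster: sms2!, sms11!, and sms19! all score 0.8-0.9 against one another. As a cross-check (a sketch, not from the original notebook), scikit-learn's cosine_similarity normalizes internally and should reproduce the same table:

from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(svd_topic_vectors.iloc[:20]).round(1)  # matches the dot-product table above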
