"""Cluster headlines with TF-IDF + K-Means and show them on a 2-D scatter plot.

Steps performed at import/run time:

1. Read ``sarcasm.json`` into a DataFrame and take its ``headline`` column.
2. Vectorise the headlines with TF-IDF (English stop words removed).
3. Fit K-Means with two clusters on the full TF-IDF matrix.
4. Project the same matrix to two PCA components, used only for plotting.
5. Print a small random sample of (document, cluster) rows and draw the plot.
"""

import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the dataset; only the `headline` column is used by this script.
df = pd.read_json('sarcasm.json')
sentence = df.headline

# Sparse TF-IDF representation of every headline.
vectorizer = TfidfVectorizer(stop_words='english')
vectorized_documents = vectorizer.fit_transform(sentence)

# 2-D projection used purely for visualisation (clustering uses the full space).
# NOTE(review): `.toarray()` materialises the whole TF-IDF matrix densely, which
# can need a lot of memory for a large vocabulary -- consider a sparse-friendly
# reducer such as TruncatedSVD if this becomes a problem.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(vectorized_documents.toarray())

# Cluster in the original TF-IDF space; random_state fixes the initialisation.
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, n_init=5, max_iter=500, random_state=42)
kmeans.fit(vectorized_documents)
labels = kmeans.labels_

# One row per headline with the cluster id it was assigned to.
results = pd.DataFrame({'document': sentence, 'cluster': labels})

# Clamp the sample size so tiny datasets (< 5 rows) do not raise ValueError.
print(results.sample(min(5, len(results))))

colors = ['red', 'green']
# NOTE(review): K-Means cluster ids are arbitrary; these names are assigned by
# index and are not derived from any ground-truth label -- verify the mapping
# before trusting the legend.
cluster = ['Not Sarcastic', 'Sarcastic']

for i in range(num_clusters):
    # Boolean mask selecting the points assigned to cluster i (computed once).
    in_cluster = labels == i
    plt.scatter(
        reduced_data[in_cluster, 0],
        reduced_data[in_cluster, 1],
        s=10,
        color=colors[i],
        label=f' {cluster[i]}',
    )

plt.legend()
plt.show()