-
Notifications
You must be signed in to change notification settings - Fork 0
/
scattertext_model.py
61 lines (43 loc) · 1.72 KB
/
scattertext_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 10 01:19:14 2022
@author: Micha
"""
import json
import pandas as pd
# from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import spacy
import scattertext as st
####Stopwords alternative trial
# from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
# Load the corpus; the CSV is resolved relative to the current working directory.
df = pd.read_csv("Second Party System.csv")
columns = df.columns  # column labels exactly as they appear in the CSV (pre-rename)

# Rename the text column so it has no space in its name.
df = df.rename(columns={"clean text": "clean_text"})
texts = df["clean_text"]
party = df["Party"]

# Parser callable provided by scattertext; kept for interactive use and for the
# commented-out "parsed" column further down.
nlp = st.whitespace_nlp_with_sentences

print("Document Count")
# count() only counts rows whose clean_text is not missing.
print(df.groupby("Party")["clean_text"].count())

print("Word Count")
# Words are whitespace-delimited tokens; a missing text contributes zero words.
words_per_document = texts.fillna("").str.split().str.len()
print(words_per_document.groupby(party).sum())
"""
For each Party we build a scattertext model that contrasts its documents with those of all other parties, feeding the cleaned text into the model
"""
# df["parsed"] = df.clean_text.apply(nlp)
# Produce one interactive scattertext page per party, contrasting that party's
# documents with the documents of every other party.
# Rows with a missing Party are skipped as targets (NaN can name neither a
# category nor an output file); they still fall under 'Other Parties'.
for party_name in df["Party"].dropna().unique():
    # One-vs-rest label: keep the party name where it matches, otherwise
    # collapse the row into 'Other Parties'. Overwritten on every pass.
    df["category"] = df["Party"].where(df["Party"] == party_name, "Other Parties")

    # NOTE(review): the module-level `nlp` parser is never passed to
    # CorpusFromPandas, so scattertext presumably falls back to its default
    # parser here — confirm whether `nlp=nlp` was intended.
    corpus = st.CorpusFromPandas(
        df,
        category_col="category",
        text_col="clean_text",
    ).build()

    html = st.produce_scattertext_explorer(
        corpus,
        category=party_name,
        category_name=party_name,
        not_category_name="Other Parties",
        width_in_pixels=1000,
        minimum_term_frequency=8,
        term_ranker=st.OncePerDocFrequencyRanker,
        # metadata=df['parsed']
    )

    # Written as UTF-8 bytes so the output is identical on every platform
    # (no newline translation). One file per party, in the working directory.
    file_name = f"{party_name}.html"
    with open(file_name, "wb") as fn:
        fn.write(html.encode("utf-8"))