top_wordcloud.py
# -*- coding: utf-8 -*-
"""WordCloud.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Ynd3GOj1I6Vdot79UvTzrvFausmlxGIZ
"""
import matplotlib.pyplot as plt
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import (WordCloud, get_single_color_func)
#from PIL import Image
#import seaborn as sns
#import random
import streamlit as st
import pandas as pd
import numpy as np
import nltk
import re # library for regular expression operations
#import string # for string operations
# module for stop words that come with NLTK
from nltk.corpus import stopwords, wordnet
#from nltk.stem import PorterStemmer # module for stemming
from nltk.stem import WordNetLemmatizer  # module for lemmatization
#from nltk.tokenize import TweetTokenizer # module for tokenizing strings
#from nltk.tokenize import sent_tokenize
#from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#from textblob import TextBlob
#from textblob import Word
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
# Word Cloud
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Load data
tweets = pd.read_csv('tweets_EDA_clean.csv', encoding='utf-8', index_col=0)
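# The CSV is assumed to contain at least a 'text' column holding the raw tweet
# text; that is the only column the pre-processing below reads.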
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Data Pre-Processing #
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def app():
    def clean_text(tweet):
        """Clean tweet text: lowercase it, strip noise, and remove stop words.
        Input: raw tweet text.
        Output: cleaned text.
        """
        temp = tweet.lower()
        temp = re.sub(r'\\n', " ", temp)          # escaped newline markers left in the raw text
        temp = re.sub(r'&\S+', " ", temp)         # HTML entities such as &amp;
        temp = re.sub(r'@[a-z0-9_]+', " ", temp)  # mentions
        temp = re.sub(r'#[a-z0-9_]+', " ", temp)  # hashtags
        temp = re.sub(r'http\S+', " ", temp)      # URLs
        temp = re.sub(r'covid19|covid-19|coronavirus|virus', "covid", temp)
        temp = re.sub(r'vaccine|vaccination', "vaccine", temp)
        temp = re.sub(r'covid\s+vaccine', "vaccine", temp)
        temp = re.sub(r'[()!?]', ' ', temp)
        temp = re.sub(r'\[.*?\]', ' ', temp)      # bracketed fragments
        temp = re.sub(r'[^a-z0-9]', " ", temp)    # anything left that is not alphanumeric
        # Remove English stop words
        stop_words = stopwords.words('english')
        temp = [w for w in temp.split() if w not in stop_words]
        return " ".join(temp)
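    # Illustrative example (hypothetical tweet, not taken from the dataset):
    #   clean_text("@user Check https://t.co/xyz #covid19 vaccines!")
    #   returns "check vaccines" (mention, URL and hashtag stripped, stop words dropped)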
    # Clean data
    tweets['pre_cleaned_text'] = tweets['text'].apply(clean_text)

    # Lemmatize with POS tag
    def get_wordnet_pos(word):
        """Map a POS tag to the first character that lemmatize() accepts."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    # Reduce each word to its root with the lemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    #tweets['cleaned_text'] = tweets['pre_cleaned_text'].apply(lambda x: " ".join(
    #    wordnet_lemmatizer.lemmatize(word, pos='v') for word in x.split()))
    tweets['cleaned_text'] = tweets['pre_cleaned_text'].apply(lambda x: " ".join(
        wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x.split()))
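    # For example, "running" is POS-tagged VBG, which maps to wordnet.VERB, so
    # lemmatize("running", wordnet.VERB) yields "run"; the default noun lookup
    # would have left "running" unchanged.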
    # Initialize VADER
    sentimentAnalyser = SentimentIntensityAnalyzer()

    def calculate_sentiment(text):
        # Run VADER on the text and return its compound score
        scores = sentimentAnalyser.polarity_scores(text)
        return scores['compound']

    # Categorize a 'sentiment_score' value as Positive, Negative, or Neutral
    def getCategory(score):
        if score > 0.05:
            return 'Positive'
        elif score < -0.05:
            return 'Negative'
        else:
            return 'Neutral'
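    # The compound score is VADER's normalized sum of valence scores, bounded to
    # [-1, 1]; the +/-0.05 cut-offs above are the thresholds recommended in the
    # VADER documentation for separating positive, neutral and negative text.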
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #         Sentiment Analysis          #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # Apply the scorer to every row of "cleaned_text", store the results in
    # "sentiment_score", then map each score to a category in "analysis"
    tweets['sentiment_score'] = tweets['cleaned_text'].apply(calculate_sentiment)
    tweets['analysis'] = tweets['sentiment_score'].apply(getCategory)

    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #         Feature Extraction          #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    def get_word_counts(documents):
        """Tokenize the documents and count word occurrences (bag of words)."""
        vectorizer = CountVectorizer()
        words_trans = vectorizer.fit_transform(documents)
        # get_feature_names() was removed in scikit-learn 1.2; use
        # get_feature_names_out() on recent versions
        words = vectorizer.get_feature_names_out()
        return words_trans, words

    # Build the bag of words as a DataFrame
    X, words = get_word_counts(tweets['cleaned_text'])
    words_df = pd.DataFrame(
        list(zip(words, np.ravel(X.sum(axis=0)))),
        columns=["Word", "Word_Count_All_Sentiments"]
    )
    # Sort by overall frequency, most frequent first
    words_df = words_df.sort_values(
        by=["Word_Count_All_Sentiments"], ascending=False)
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #     Overall Sentiment per Word      #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    total_tuple = []
    for word in words_df.Word:  # loop through each word in the vocabulary
        temp2 = []
        # collect the sentiment score of every tweet containing the word
        for ind, text in enumerate(tweets['cleaned_text']):
            if word in text.split():  # match whole tokens, so e.g. "art" does not hit "part"
                temp2.append(tweets.loc[ind, 'sentiment_score'])
        # average the scores for this word (the small epsilon guards against an
        # empty list) and record the (word, mean) pair
        average_num = sum(temp2) / (len(temp2) + 0.0001)
        total_tuple.append((word, average_num))

    # Split words into positive and negative lists; these feed color_to_words below
    res_pos = [sub[0] for sub in total_tuple if sub[1] > 0.05]
    res_neg = [sub[0] for sub in total_tuple if sub[1] < -0.05]
    #res_neu = [sub[0] for sub in total_tuple if (sub[1] <= 0.05) & (sub[1] >= -0.05)]
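    # Example with hypothetical numbers: if "vaccine" occurred in three tweets
    # scored 0.6, -0.2 and 0.2, its average would be ~0.2 > 0.05, so "vaccine"
    # would land in res_pos and be drawn in green.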
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #             Word Cloud              #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    class GroupedColorFunc(object):
        """Color function object that assigns DIFFERENT SHADES of the specified
        colors to certain words, based on a color-to-words mapping.
        Uses wordcloud.get_single_color_func.

        Parameters
        ----------
        color_to_words : dict(str -> list(str))
            A dictionary that maps a color to a list of words.
        default_color : str
            Color assigned to any word that is not a member of any
            value of color_to_words.
        """

        def __init__(self, color_to_words, default_color):
            self.color_func_to_words = [
                (get_single_color_func(color), set(words))
                for (color, words) in color_to_words.items()]
            self.default_color_func = get_single_color_func(default_color)

        def get_color_func(self, word):
            """Return the single_color_func associated with the word."""
            try:
                color_func = next(
                    color_func for (color_func, words) in self.color_func_to_words
                    if word in words)
            except StopIteration:
                color_func = self.default_color_func
            return color_func

        def __call__(self, word, **kwargs):
            return self.get_color_func(word)(word, **kwargs)
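    # Illustrative behaviour (hypothetical lists): a GroupedColorFunc built from
    # {'#00ff00': ['good'], 'red': ['bad']} with default 'grey' draws "good" in a
    # green shade, "bad" in a red shade, and every other word in grey.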
    def wordcloud(text, display_words):
        # Generate a word cloud and recolor it with the sentiment-based mapping
        wc = WordCloud(max_words=display_words, random_state=42,
                       max_font_size=119, width=800, height=500).generate(text)
        color_to_words = {
            # positive words are drawn in shades of green
            '#00ff00': res_pos,
            # negative words are drawn in shades of red
            'red': res_neg
        }
        # Words not in any color_to_words list fall back to grey
        default_color = 'grey'
        # Create a color function with multiple tones
        # (SimpleGroupedColorFunc would give single tones instead)
        grouped_color_func = GroupedColorFunc(color_to_words, default_color)
        # Apply our color function
        wc.recolor(color_func=grouped_color_func)
        # Plot; passing the figure explicitly avoids the deprecated
        # global-pyplot usage that a bare st.pyplot() triggers
        fig = plt.figure(figsize=(25, 25))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        st.pyplot(fig)
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    #        Slider, Word Cloud         #
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # with open('style.css') as f:
    #     st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
    #st.title("*WordCloud for Covid-19 Sentiment on Twitter*")
    title1 = '<p style="color:Blue; font-size: 40px;">Covid-19 Sentiment on Twitter</p>'
    st.markdown(title1, unsafe_allow_html=True)
    title2 = '<p style="color:Blue; font-size: 32px;">Wordcloud</p>'
    st.markdown(title2, unsafe_allow_html=True)
    #st.subheader("Wordcloud")
    st.markdown("The word cloud visualization displays the top words used in the tweets. "
                "The size of a word reflects its frequency: the bigger the word, the more "
                "often it appears. The colors represent the `sentiment scores` associated "
                "with each word, which range from -1 (most negative) to 1 (most positive).")
    text1 = ('<p>Sentiment scores greater than <code>0.05</code> indicate '
             '<em id="positiveSent">Positive Sentiment</em> in <em id="positiveSent">green</em>, '
             'scores less than <code>-0.05</code> indicate '
             '<em id="negativeSent">Negative Sentiment</em> in <em id="negativeSent">red</em>, '
             'and <em id="neutralSent">Neutral Sentiment</em> is shown in '
             '<em id="neutralSent">grey</em>.</p>')
    st.markdown(text1, unsafe_allow_html=True)
    st.markdown("Use the slider to choose how many words to display. The word cloud shows "
                "that many of the most frequent words in the tweets, colored by polarity "
                "as defined above.")

    # --------------------------------------------------------
    all_sentiments = ' '.join(tweets['cleaned_text'])
    num_words = st.slider(label='Number of Words to Display',
                          min_value=5, max_value=150, value=75, step=5)
    st.write("###### Slider number:", num_words)
    wordcloud(all_sentiments, num_words)
    # --------------------------------------------------------
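
# This module is presumably imported by a multipage Streamlit controller that
# calls app(); the entry-point wiring is not shown in this file. To try the page
# standalone (an assumption, not part of the original source), call app() at
# module level and run:  streamlit run top_wordcloud.py
# app()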