index.py
import os
from os import getcwd

import matplotlib.pyplot as plt
from requests.api import get
from wordcloud import WordCloud

from data_pipeline import fetch_data, pre_process_text, get_word_frequency, process_tweet_data, get_processed_tweets
from language_model import generate_model
from utils import build_frequency, get_most_freqeuent_words, write_to_file
# Create the results directory if it does not already exist.
try:
    os.mkdir(f"{getcwd()}/results/")
except OSError:
    pass
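
# Data files read and written by the pipeline.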
all_page_content_file = f"{getcwd()}/data/AllContent.txt"
root_page_content_file = f"{getcwd()}/data/RootContent.txt"
raw_root_page_content_file = f"{getcwd()}/data/RootContentRaw.txt"
lemma_content_file = f"{getcwd()}/data/lemmas.txt"
token_content_file = f"{getcwd()}/data/tokens.txt"
test_tweets_file = f"{getcwd()}/data/ProcessedTestTweets.json"
train_tweets_file = f"{getcwd()}/data/ProcessedTrainTweets.json"
# Uncomment to re-fetch the data from Wikipedia.
# fetch_data(root_page_content_file, all_page_content_file, raw_root_page_content_file)
# Uncomment to re-process the fetched text into the lemma and token files.
# pre_process_text(all_page_content_file, lemma_content_file, token_content_file)

######################################################
## Question 5.1 ###
######################################################
word_frequency, top_words = get_word_frequency(lemma_content_file)
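# word_frequency is expected to map each lemma to its count in the Wikipedia text, and
# top_words to hold the most frequent lemmas (inferred from how the values are used below).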
# print(top_words)
# Uncomment to regenerate the word cloud from the lemma frequencies and display it.
# wc = WordCloud()
# wc.generate_from_frequencies(word_frequency)
# wc.to_file(f"{getcwd()}/results/wiki-covid-cloud.png")
# plt.figure()
# plt.imshow(wc, interpolation='bilinear')
# plt.axis("off")
# plt.show()
######################################################
## Question 5.2 ###
######################################################
# Uncomment to re-process the scraped tweet data.
# process_tweet_data(test_tweets_file, train_tweets_file)
# test_tweets, train_tweets = get_processed_tweets(test_tweets_file, train_tweets_file)
# test_tweet_tokens = test_tweets["tokens"]
# test_tweet_types = test_tweets["types"]
# train_tweet_tokens = train_tweets["tokens"]
# train_tweet_types = train_tweets["types"]
# def get_oov_rate(corpus, vocab):
#     """Return the fraction of items in corpus that do not appear in vocab."""
#     counter = 0
#     for word in corpus:
#         if word not in vocab:
#             counter += 1
#     oov_rate = counter / len(corpus)
#     return oov_rate
# oov_tweet_types_norm = get_oov_rate(test_tweet_types, word_frequency)
# print("OOV rate of tweet word types w.r.t. the Wikipedia vocabulary (OOV types / total types) =", oov_tweet_types_norm)
# oov_tweet_tokens_norm = get_oov_rate(test_tweet_tokens, word_frequency)
# print("OOV rate of tweet word tokens w.r.t. the Wikipedia vocabulary (OOV tokens / total tokens) =", oov_tweet_tokens_norm)
# oov_tweet_tokens_norm = get_oov_rate(test_tweet_tokens, train_tweet_types)
# print("OOV rate of the tweet test set w.r.t. the training-tweet vocabulary =", oov_tweet_tokens_norm)
######################################################
## Question 5.3 ###
######################################################
generate_model(raw_root_page_content_file)
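# Build the language model from the raw root-page text; the implementation lives in language_model.py.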