-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexplore.py
88 lines (73 loc) · 3.13 KB
/
explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from wordcloud import WordCloud
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import nltk
def word_string(df):
# create variables to hold words that appear in each type of repo as a single string
js_words = ' '.join(df[df.language=='JavaScript'].clean)
py_words = ' '.join(df[df.language=='Python'].clean)
all_words = ' '.join(df.clean)
js_words = re.sub(r'\s.\s', '', js_words)
py_words = re.sub(r'\s.\s', '', py_words)
all_words = re.sub(r'\s.\s', '', all_words)
return js_words, py_words, all_words
def word_freq(js_words, py_words, all_words):
# how frequently each word appears
js_freq = pd.Series(js_words.split()).value_counts()
py_freq = pd.Series(py_words.split()).value_counts()
all_freq = pd.Series(all_words.split()).value_counts()
return js_freq, py_freq, all_freq
def word_counts(js_freq, py_freq):
# word frequency dataframe
word_counts = (pd.concat([js_freq, py_freq], axis=1, sort=True)
.set_axis(['js', 'py'], axis=1, inplace=False)
.fillna(0)
.apply(lambda s: s.astype(int))
)
# all counts of the word
word_counts['all'] = word_counts['js'] + word_counts['py']
# proportion of each string that is javascript
word_counts['prop_js'] = word_counts['js']/word_counts['all']
return word_counts
def word_cloud(js_words, py_words):
js_cloud = WordCloud(background_color='lightgray',
height=800, width=800).generate(js_words)
py_cloud = WordCloud(background_color='lightgray',
height=800, width=800).generate(py_words)
plt.figure(figsize=(10,10))
axs = [plt.axes([.25, 1, .5, .5]), plt.axes([.8, 1, .5, .5])]
# imshow => display data as an image
axs[0].imshow(js_cloud)
axs[1].imshow(py_cloud)
axs[0].set_title('JavaScript')
axs[1].set_title('Python')
for ax in axs: ax.axis('off')
def ngrams(js_words, py_words, n):
js_ngrams = pd.Series(list(nltk.ngrams(js_words.split(), n))).value_counts()
py_ngrams = pd.Series(list(nltk.ngrams(py_words.split(), n))).value_counts()
return js_ngrams, py_ngrams
def plot_ngrams(js_bigrams, py_bigrams, js_trigrams, py_trigrams):
plt.subplot(221)
js_bigrams.head(10).plot.barh(color='red', width=.9, figsize=(10, 10), alpha=.8)
plt.title('10 most frequently occurring JavaScript bigrams')
plt.ylabel('Bigram')
plt.xlabel('Frequency')
plt.subplot(222)
py_bigrams.head(10).plot.barh(color='green', width=.9, figsize=(10, 10), alpha=.3)
plt.title('10 most frequently occurring Python bigrams')
plt.ylabel('Bigram')
plt.xlabel('Frequency')
plt.subplot(223)
js_trigrams.head(10).plot.barh(color='orange', width=.9, figsize=(10, 10), alpha=.3)
plt.title('10 most frequently occurring JavaScript trigrams')
plt.ylabel('Trigram')
plt.xlabel('Frequency')
plt.subplot(224)
py_trigrams.head(10).plot.barh(color='navy', width=.9, figsize=(10, 10), alpha=.3)
plt.title('10 most frequently occurring Python trigrams')
plt.ylabel('Trigram')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()