-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordFrequency.py
84 lines (73 loc) · 2.06 KB
/
WordFrequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 27 23:42:40 2018
@author: Dario
"""
import numpy as np
import time
#import os
#import codecs
import operator
#"/Users/Dario/Pictures/PondIce.wav
# Absolute, machine-specific path to the project's data directory.
# NOTE(review): only the commented-out loader below refers to it in this
# part of the file -- confirm whether anything else still uses it.
pathData = '/Users/Dario/Desktop/ETH/Freiwillig/NLU/Project/Data'
# load data
#
##def SentencePreProcessing():
#filename = os.path.join(pathData, 'sentences.train')
#fp = codecs.open(filename, 'r', 'utf-8')
#
#
#file = open(filename, 'rt')
#text = file.read()
#file.close()
def WordFrequencyDist_veryOld(words):
    """Count word occurrences with a linear list scan and rank them by count.

    This is the slowest variant: each word is looked up by scanning a list
    of ``[word, count]`` pairs. The elapsed time is printed once after
    counting and once after ranking.

    Args:
        words: iterable of words (compared with ``==``).

    Returns:
        A 2-D NumPy array with one ``[word, count]`` row per distinct word,
        ordered by descending count; words with equal counts keep their
        first-appearance order. The array is built with ``np.asarray`` from
        mixed ``[word, count]`` rows, so for string words the counts are
        stored as strings too. Empty input yields an empty array of
        shape ``(0, 2)``.
    """
    t_start = time.time()
    dist = []
    for w in words:
        for d in dist:
            if d[0] == w:
                d[1] += 1
                break
        else:
            # No existing entry matched: first occurrence of this word.
            dist.append([w, 1])
    print('time: '+str(time.time()-t_start))
    if not dist:
        ranked = np.empty((0, 2), dtype=str)
    else:
        dist_ar = np.asarray(dist)
        # Rank on the integer counts, not on dist_ar[:, 1]: for string words
        # that column holds strings and would sort lexicographically
        # ('10' < '2'), producing a wrong order.
        counts = np.array([d[1] for d in dist], dtype=np.int64)
        order = np.argsort(-counts, kind='stable')
        ranked = dist_ar[order]
    print('time: '+str(time.time()-t_start))
    return ranked
def WordFrequencyDist_old(words):
    """Count word occurrences with a dict and rank them by count.

    Much faster than the list-scan variant because each lookup is a dict
    access. The elapsed time is printed once after counting and once after
    ranking.

    Args:
        words: iterable of hashable words.

    Returns:
        A 2-D NumPy array with one ``[word, count]`` row per distinct word,
        ordered by descending count. The rows are sorted ascending (stable)
        and then flipped, so words with equal counts appear in reverse
        first-appearance order. NumPy converts the mixed ``(word, count)``
        tuples to a common dtype, so for string words the counts are stored
        as strings. Empty input yields an empty array of shape ``(0, 2)``.
    """
    t_start = time.time()
    dist = {}
    for w in words:
        dist[w] = dist.get(w, 0) + 1
    print('time: '+str(time.time()-t_start))
    ascending = sorted(dist.items(), key=operator.itemgetter(1))
    if ascending:
        sorted_dist = np.flip(ascending, axis=0)
    else:
        # Keep the two-column shape for empty input instead of raising.
        sorted_dist = np.empty((0, 2), dtype=str)
    print('time: '+str(time.time()-t_start))
    return sorted_dist
def WordFrequencyDist_D(words):
    """Count word occurrences with a dict and rank them in pure Python.

    Unlike the NumPy-based variants, the result stays a plain list and the
    counts stay integers. The elapsed time is printed once after counting
    and once after sorting.

    Args:
        words: iterable of hashable words.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count;
        words with equal counts keep their first-appearance order (the sort
        is stable). Empty input yields an empty list.
    """
    t_start = time.time()
    dist = {}
    for w in words:
        dist[w] = dist.get(w, 0) + 1
    print('time: '+str(time.time()-t_start))
    sorted_dist = sorted(dist.items(), key=lambda x: x[1], reverse=True)
    print('time: '+str(time.time()-t_start))
    return sorted_dist