-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract-RussianVerbsClassification.py
140 lines (120 loc) · 5.94 KB
/
extract-RussianVerbsClassification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import argparse
import csv
import sys
from russianwords.clustering import RussianWordsPairsClusters as rwc
from russianwords.clustering import Relation
# Separator placed between the columns of every line this script prints.
colSep = ";"
# Separator used to split a translation cell; only the first piece is kept.
transSepBig = ";"
# NOTE(review): not referenced in the visible code; presumably the separator
# between close variants inside one translation piece -- confirm before use.
transSepSmall = ","
def _latexPairName(name):
    """Return *name* formatted for the LaTeX output column.

    When *name* is an aspect pair written ``first/second`` and the second
    form ends with the first one, the second form is reduced to its prefix
    followed by ``-``. Every ``/`` is then replaced by the LaTeX slash
    command followed by a space.
    """
    if '/' in name:
        # Only the first two forms are considered, as in the input convention
        # "imperfective/perfective".
        imperf, perf = name.split('/')[:2]
        if perf.endswith(imperf):
            prefix = perf[:len(perf) - len(imperf)]
            name = imperf + '/' + prefix + '-'
    # Escaped backslash: "\s" is not a valid escape sequence in a normal
    # string literal.
    return name.replace('/', '\\slash ')


def genCsv(levels, order, yellowCol, yellowWhen):
    """Print the selected verbs of RussianVerbsClassification.csv to stdout.

    The input file is read from the current directory with ``;`` as the
    delimiter and the platform default encoding. One line per verb (or per
    aspect pair) is printed, made of the name, the usage, the French and the
    English translation joined with ``colSep``, plus an optional 0/1 column.
    Diagnostics and the final count go to stderr.

    Args:
        levels: container of accepted values for the 'Уровень' column.
        order: one of 'none', 'abc', 'abc_n_proximity', 'freq' or
            'freq_n_proximity'. Any other value prints no verb and emits a
            warning on stderr.
        yellowCol: name of an input column used to fill the extra 0/1
            column, or None to omit that column.
        yellowWhen: value of ``yellowCol`` for which the extra column is 1.

    Raises:
        KeyError: if an expected column (or ``yellowCol``) is missing from
            the input file.
    """
    allVerbsOrder = []  # names in the order they are first met in the file
    allVerbs = {}       # name -> fields to print
    nVerbsExported = 0

    with open("RussianVerbsClassification.csv", 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for row in reader:
            infinitive = row['Инфинитив']
            if row['Скрытый1-unwantedToExport'] in ("x", "X"):
                sys.stderr.write("ignoring verb unwantedToExport: " + infinitive + "\n")
                continue
            verbRank = row['Ранг ГЛ']
            lvl = row['Уровень']
            pair = row['Пара аспектов']
            # Only the text before the first '-' / first big separator is kept.
            usage = row['Подробности'].split('-')[0]
            transFR = row['По-французски'].split(transSepBig)[0]
            transEN = row['По-английски'].split(transSepBig)[0]
            yellow = row[yellowCol] if yellowCol is not None else None

            # Rows outside the requested levels, or ranked "10000", are skipped.
            if lvl not in levels or verbRank == "10000":
                continue
            nVerbsExported += 1

            # The name of a verb is its aspect pair if available, else the
            # infinitive alone.
            verb = pair if pair != "" else infinitive
            if verb in allVerbs:
                sys.stderr.write("WARN-2 ranked verbs in a pair: " + verb + " already met\n")
            else:
                allVerbsOrder.append(verb)
            # A later row with the same name replaces the stored fields.
            # "yellow" is always stored so that a row lacking that cell
            # yields 0 instead of a KeyError when printing.
            allVerbs[verb] = {
                "finalName": verb,
                "usage": usage,
                "transFr": transFR,
                "transEn": transEN,
                "yellow": yellow,
            }

    # Choose the order in which the verbs are printed.
    if order == 'none':
        newVerbsKeys = list(allVerbs)
    elif order == 'abc':
        newVerbsKeys = sorted(allVerbs)
    elif order == "abc_n_proximity":
        verbsClusters = rwc(sorted(allVerbs))
        newVerbsKeys = rwc.flatten(verbsClusters.getWordsAndClusters([Relation.STEM, Relation.TRANS], True))
    elif order == "freq":
        # Order of first appearance in the input file.
        newVerbsKeys = allVerbsOrder
    elif order == "freq_n_proximity":
        verbsClusters = rwc(allVerbsOrder)
        newVerbsKeys = rwc.flatten(verbsClusters.getWordsAndClusters([Relation.STEM, Relation.TRANS], True))
    else:
        newVerbsKeys = []
        sys.stderr.write("WARN unknown order: " + str(order) + ", no verb printed\n")

    # Build and print the lines of verbs.
    # TODO Verb pairs have their char '/' turned to a Latex function
    for verb in newVerbsKeys:
        currentVerb = allVerbs[verb]
        fields = [
            _latexPairName(currentVerb["finalName"]),
            currentVerb["usage"],
            currentVerb["transFr"],
            currentVerb["transEn"],
        ]
        if yellowCol is not None:
            fields.append('1' if currentVerb["yellow"] == yellowWhen else '0')
        print(colSep.join(fields))

    sys.stderr.write(str(nVerbsExported) + " verbs exported\n")
if __name__ == '__main__':
    # Command-line entry point: parse the options, print the header row, then
    # let genCsv() print one line per selected verb.
    parser = argparse.ArgumentParser(description='Fiters and Extractions on RussianVerbsClassification.csv')
    parser.add_argument('-l', '--cefr-levels', dest='levels', required=True, nargs='+', help='')
    parser.add_argument('-y', '--yellow', dest='yellow', nargs=2, default=None, help='2 strings: the first is the name of the field to use in a new yellow column, the second is the value of the field used to set the yellow field to 1 (else it equals 0)')
    # const makes a bare "-o" fall back to the default order; without it
    # argparse yields None, which genCsv() does not recognise as an ordering.
    parser.add_argument('-o', '--order', dest='order', nargs='?', const='freq', choices=['none', 'abc', 'abc_n_proximity', 'freq', 'freq_n_proximity'], default='freq', help='order to classify data, either by frequency order or by alphabetical order')
    args = parser.parse_args()
    levels = args.levels
    order = args.order

    # Optional extra column: (source column name, value that maps to 1).
    yellowCol = None
    yellowWhen = None
    if args.yellow is not None and len(args.yellow) == 2:
        yellowCol, yellowWhen = args.yellow

    # Header row, using the same column separator as the data lines.
    columnsNameA = ["verb", "usage", "transFr", "transEn"]
    if yellowCol is not None:
        columnsNameA.append("yellow")
    columnsName = colSep.join(columnsNameA)
    print(columnsName)

    genCsv(levels, order, yellowCol, yellowWhen)