-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimpute-from-monomorphemic.py
64 lines (49 loc) · 2.36 KB
/
impute-from-monomorphemic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
# billdthompson@berkeley.edu
import csv
import click
import logging
import warnings
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge as RR
from sklearn.model_selection import train_test_split as tts
# Dimensionality of the skipgram word vectors.
D = 300

# One label per vector component: 'd_0', 'd_1', ..., 'd_<D-1>'.
VECTORDIMENSIONS = ['d_' + str(index) for index in range(D)]
@click.command()
@click.option('--norms', '-n', default='combined-experimental-norms.csv')
@click.option('--vecfile', '-v', default='wiki.en.vec')
@click.option('--expand', '-e', multiple = True)
@click.option('--verbose', default=False, is_flag = True)
def run(norms, vecfile, expand, verbose):
    """Impute norm values for every word from its word vector.

    For each norm column named with --expand, a ridge regression is fitted
    on the rows that have an observed value for that norm and whose
    `nmorph` value is 1, and is then used to predict the norm for all rows.
    Predictions are stored in a new column `<norm>_imputed_monomorph` and
    the resulting table (without the vector columns) is written to
    `<norms minus .csv>-with-<expanded norms>-monomorph-predictions.csv`.

    Parameters:
        norms: path to the CSV of norms; must contain a `word` column and,
            when --expand is given, an `nmorph` column plus each norm.
        vecfile: path to a space-separated vector file whose first line is
            skipped and whose rows are a word followed by the components.
        expand: tuple of norm column names to impute.
        verbose: when set, INFO-level log messages are shown.

    Raises:
        click.ClickException: if a requested norm has no usable training
            rows (no row with nmorph == 1 and an observed value).
    """
    # admin
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO if verbose else None)
    warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

    # norms
    click.secho("Reading norms from: {}".format(norms), fg = 'green')
    data = pd.read_csv(norms)
    # Lower-case BEFORE de-duplicating: otherwise case variants of a word
    # survive de-duplication and yield duplicated rows in the merge below.
    data['word'] = data.word.str.lower()
    data = data.drop_duplicates(subset = 'word')

    # vectors
    click.secho("Reading vectors from: {}".format(vecfile), fg = 'yellow')
    vecs = (
        pd.read_csv(
            vecfile,
            sep = ' ',
            quoting = csv.QUOTE_NONE,
            skiprows = 1,
            header = None,
            # the trailing 'ignore' column absorbs one extra field per row
            # (presumably produced by a trailing separator -- TODO confirm)
            names = ['word'] + VECTORDIMENSIONS + ['ignore'],
        )
        .drop_duplicates(subset = ['word'])
        .drop(columns = 'ignore')
        .merge(data, on = 'word', how = 'inner')
    )

    logging.info("Computing predictions for: {}".format(expand))
    for norm in expand:
        # identify training material; compare nmorph numerically so that the
        # filter works whether pandas parsed the column as text or as numbers
        monomorphemic = pd.to_numeric(vecs.nmorph, errors = 'coerce') == 1
        training = vecs[vecs[norm].notnull() & monomorphemic]
        if training.empty:
            raise click.ClickException(
                "No training rows (nmorph == 1 with an observed value) for norm '{}'".format(norm)
            )

        # learn the regression on mean-centred targets
        mu = training[norm].mean()
        X = training[VECTORDIMENSIONS].values
        y = training[norm].values - mu
        rr = RR()
        rr.fit(X, y)

        # predict new values for every row, adding the mean back
        vecs[norm + '_imputed_monomorph'] = rr.predict(vecs[VECTORDIMENSIONS].values) + mu

    newfn = norms.replace('.csv', '') + '-with-{}-monomorph-predictions.csv'.format('-'.join(expand))
    vecs = vecs.drop(columns = VECTORDIMENSIONS)

    # Write first, then report: the results are on disk before any
    # diagnostics run, and the "Saved" message is truthful.
    vecs.to_csv(newfn, index = False)
    logging.info("Saved results to {}".format(newfn))

    # Only compute the correlation table when it will actually be shown.
    if logging.getLogger().isEnabledFor(logging.INFO):
        # Restrict to numeric columns: DataFrame.corr() raises on string
        # columns such as 'word' in pandas >= 2.0.
        numeric = vecs.select_dtypes(include = 'number')
        reported = [col for col in numeric.columns if col in expand or '_imputed' in col]
        logging.info("Correlation between predicted and observed norms:\n{}".format(numeric.corr()[reported]))

    click.secho("===============================", fg = "yellow")
# Script entry point: invoke the click command only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    run()