-
Notifications
You must be signed in to change notification settings - Fork 0
/
consolidator.py
executable file
·78 lines (57 loc) · 2.81 KB
/
consolidator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Python Librairies
import os
import pandas as pd
import re
from functools import reduce
import sys
import argparse
import time
import numpy as np
# Custom package
import package as pkg
# Variables
parser = argparse.ArgumentParser(description='Variables')
parser.add_argument('--case')
# tokens, sentences, ngrams2, ngrams3, ngrams4, ngrams5
args = parser.parse_args()
# Main: rebuild one expression-frequency table per volume.
# Each volume was processed in several chunk files under ./outputs/<case>/;
# the per-chunk "expression === count" tables are summed (outer join on the
# expression index) into one DataFrame per book, then the seven books are
# summed into a single table for the whole "Recherche".
proust = {}
filenames = tuple(os.listdir(f"./outputs/{args.case}"))
books = (
    "du_cote_de_chez_swann",
    "a_l_ombre_des_jeunes_filles_en_fleurs",
    "le_cote_de_guermantes",
    "sodome_et_gomorrhe",
    "la_prisonniere",
    "albertine_disparue",
    "le_temps_retrouve"
)
for book in books:
    # Chunk files of this volume (filenames embed the book slug).
    parts = [file for file in filenames if book in file]
    if not parts:
        # Fail early with a clear message instead of letting reduce()
        # raise an opaque TypeError on an empty sequence.
        raise FileNotFoundError(
            f"no output files for '{book}' in ./outputs/{args.case}"
        )
    # "===" is used as delimiter so commas inside expressions don't split
    # columns; the python engine is required for multi-char delimiters.
    chunk_tables = [
        pd.read_csv(f"./outputs/{args.case}/" + part, delimiter="===",
                    engine='python', names=["expression", "count"]
                    ).set_index("expression")
        for part in parts
    ]
    # fill_value=0 keeps expressions that appear in only some chunks.
    proust[book] = reduce(lambda acc, df: acc.add(df, fill_value=0),
                          chunk_tables)
# Whole-novel totals: sum of the seven per-volume tables.
proust_recherche = reduce(lambda acc, df: acc.add(df, fill_value=0),
                          [proust[book] for book in books]).astype(int)
# --case sentences: one column per volume (0 where an expression is absent
# from a volume), written as a comparison table.
if args.case == "sentences":
    sentences_comparison = pd.concat(
        [proust[book] for book in books],
        axis=1, sort=False, ignore_index=True
    ).fillna(0).astype(int)
    sentences_comparison.to_csv("./data/proust-volumes_sentences.csv",
                                sep=",", index_label="expression")
# --case ngrams2..ngrams5: keep n-grams above the configured minimum count,
# lowercase/merge them, then keep only the "gem" expressions.
elif "ngrams" in args.case:
    dataframe_values = pd.read_csv("./utilitaries/configuration_ngrams.csv",
                                   index_col="case")
    # Minimum-count threshold for this n-gram size, from the config file.
    value = dataframe_values.loc[args.case, "value"]
    valuable_ngrams = pkg.lowerize(proust_recherche).query(f"count >= {value}")
    proust_words_lowerize = pkg.words(valuable_ngrams)
    # Boolean mask over the index: pkg.gem decides which expressions to keep.
    proust_gems = proust_words_lowerize[proust_words_lowerize.index.map(pkg.gem)]
    proust_gems = proust_gems.sort_values("count", ascending=False)
    proust_gems.to_csv(f"./data/proust-recherche_{args.case}.csv", sep=",")
# --case tokens: distribute counts against each configured reference list,
# once for the whole novel and once per volume.
else:
    distrib_func = pd.read_csv("./utilitaries/configuration_tokens.csv",
                               index_col="file")
    # Keep only the configuration rows whose filename mentions this case.
    distrib_func = distrib_func[distrib_func.index.map(lambda x: args.case in x)]
    for file in distrib_func.index:
        # Reference list of expressions ("=" separator: one column).
        dataframe_source = pd.read_csv(f"./utilitaries/{file}", sep="=",
                                       names=["expression"])
        # Resolve the transform named in the config once per file.
        transform = getattr(pkg, distrib_func.loc[file, "function"])
        distribution_recherche = pkg.against(transform(proust_recherche),
                                             dataframe_source).astype(int)
        distribution_recherche.to_csv(f"./data/proust-recherche_{file}",
                                      sep=",", index_label="expression")
        distribution = pd.concat(
            [pkg.against(transform(proust[book]), dataframe_source)
             for book in books],
            axis=1, sort=False, ignore_index=True
        ).astype(int)
        distribution.to_csv(f"./data/proust-volumes_{file}", sep=",",
                            index_label="expression")