-
Notifications
You must be signed in to change notification settings - Fork 1
/
3. add_links.py
36 lines (28 loc) · 1.62 KB
/
3. add_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from tqdm import tqdm
import pandas as pd
# Load the headword table, reading only the columns this script needs.
words = pd.read_csv(
    "russian3/russian3 - words.csv",
    usecols=["id", "bare", "disabled", "type"],
)
# Keep enabled rows (disabled == 0) that have a type. Rows whose "bare" is
# missing are dropped too: they cannot be used as a link target, and calling
# str.strip on the float NaN that pandas stores for an empty cell raises
# AttributeError. Filtering before stripping also matches how the forms
# table is handled later in this script.
keep = words["type"].notna() & words["bare"].notna() & (words["disabled"] == 0)
words = (
    words[keep]
    .drop(columns=["disabled", "type"])
    .astype({"id": "int", "bare": "string"})
    # Vectorised whitespace strip; assign() works on a fresh copy, so nothing
    # is written into the filtered slice.
    .assign(bare=lambda frame: frame["bare"].str.strip())
)
words.info()
# Map word id -> stripped bare headword; the write loop at the end of the
# script looks link targets up in this dict.
word_dict = dict(zip(words["id"].tolist(), words["bare"].tolist()))
# Load the inflected-forms table, reading only the two columns used here.
words_forms_csv = pd.read_csv(
    "russian3/russian3 - words_forms.csv",
    usecols=["word_id", "_form_bare"],
)
# Drop rows without a form. The explicit copy() makes the column assignment
# below operate on an independent frame instead of a filtered slice (the
# pattern that triggers pandas' SettingWithCopyWarning).
words_forms_csv = words_forms_csv[words_forms_csv["_form_bare"].notna()].copy()
# Trim surrounding whitespace first, then any surrounding parentheses.
words_forms_csv["_form_bare"] = (
    words_forms_csv["_form_bare"].str.strip().str.strip("()")
)
# "-" and "—" are discarded as before. The empty string is discarded as well:
# it is what remains of a value such as "()" and would otherwise be written
# out as a record with an empty headword.
is_placeholder = words_forms_csv["_form_bare"].isin(["", "-", "—"])
# Rows with a NaN type and disabled rows were removed from `words`, so only
# link the forms whose base word is still present in `words`.
has_headword = words_forms_csv["word_id"].isin(words["id"].values)
words_forms_csv = (
    words_forms_csv[~is_placeholder & has_headword]
    .astype({"word_id": "int", "_form_bare": "string"})
    # Identical (word_id, form) pairs would each produce the same link record;
    # keep the first occurrence only. Presumably the table repeats a form once
    # per grammatical slot -- only these two columns are read, so that cannot
    # be confirmed from here.
    .drop_duplicates()
)
words_forms_csv.info(show_counts=True)
words_forms_iter = zip(
    words_forms_csv["word_id"].tolist(),
    words_forms_csv["_form_bare"].tolist(),
)
# Release the DataFrames; from here on only word_dict and the pair iterator
# are needed.
del words
del words_forms_csv
# Append one "@@@LINK=" redirect record per inflected form. The file is opened
# in append mode, so whatever output/Mdx_html.txt already contains is kept.
with open("output/Mdx_html.txt", "a", encoding="utf-8") as f:
    for word_id, form_bare in tqdm(words_forms_iter):
        headword = word_dict[word_id]
        # A form spelled exactly like its own headword would yield a record
        # that redirects to itself; skip it.
        if form_bare == headword:
            continue
        # Record layout: form line, link line, then the "</>" terminator line.
        f.write(f"{form_bare}\n@@@LINK={headword}\n</>\n")