-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexibank_wichmannmixezoquean.py
85 lines (71 loc) · 2.91 KB
/
lexibank_wichmannmixezoquean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from pathlib import Path
import attr
from clldutils.misc import slug
from pylexibank import Dataset as BaseDataset
from pylexibank import FormSpec
from pylexibank import Language as BaseLanguage
@attr.s
class CustomLanguage(BaseLanguage):
    """Language record extended with the extra per-language fields of this dataset.

    Every field defaults to ``None``.
    """

    Name = attr.ib(default=None)
    # Short label; used as the lookup key for languages when building the CLDF data.
    Abbreviation = attr.ib(default=None)
    Note = attr.ib(default=None)
    # Attached as ``Source`` to every form of the language.
    Source = attr.ib(default=None)
class Dataset(BaseDataset):
    """Lexibank dataset ``wichmannmixezoquean``.

    Reads the tab-separated raw files ``Wordlist.txt`` and ``Cognates.txt`` in
    lockstep and writes the forms and their cognate judgements to CLDF.
    """

    dir = Path(__file__).parent
    id = "wichmannmixezoquean"
    writer_options = dict(keep_languages=False, keep_parameters=False)
    language_class = CustomLanguage
    # Form handling configuration: bracket pairs, separator characters and the
    # markers that stand for missing data.
    form_spec = FormSpec(brackets={"(": ")", "[": "]"}, separators=",~", missing_data=("?", "-"))

    def cmd_makecldf(self, args):
        """Convert the raw word list and cognate table into CLDF.

        The two raw files are read row by row in parallel: the first row of
        ``Wordlist.txt`` supplies the language abbreviations, every later row
        holds a concept label followed by one cell per language.  The cell at
        the same position in ``Cognates.txt`` holds the cognate code for that
        word.

        Args:
            args: Command arguments; only ``args.writer`` is used.

        Raises:
            KeyError: If a concept label or a language abbreviation found in
                the raw data is not present in the respective lookup.
        """
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory=lambda lang: lang["Abbreviation"])
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name"
        )

        # Some concepts occupy several rows in the raw data ("<concept> - 1",
        # "<concept> - 2", ...).  Map every such row label onto the ID of the
        # one concept it belongs to.
        concepts.update(
            {
                # The raw labels are inconsistent (missing blanks, an en dash
                # instead of a hyphen), so every spelling is listed verbatim.
                "hair - 1": "36_hair",
                "hair - 2": "36_hair",
                "see - 1": "72_see",
                "see - 2": "72_see",
                "stand - 1": "79_stand",
                "stand - 2": "79_stand",
                "stand -2": "79_stand",
                "walk/go - 1": "92_walkgo",
                "walk/go - 2": "92_walkgo",
                "worm - 1": "109_worm",
                "worm – 2": "109_worm",
                "worm - 2": "109_worm",
            }
        )

        sources = {lang["Abbreviation"]: lang["Source"] for lang in self.languages}

        rows = zip(
            self.raw_dir.read_csv("Wordlist.txt", delimiter="\t"),
            self.raw_dir.read_csv("Cognates.txt", delimiter="\t"),
        )

        # The first row pair is the header; only the word list's header is
        # used, its first column is the concept column.
        header_rows = next(rows, None)
        if header_rows is None:
            return
        header = header_rows[0][1:]

        # Running number used as cognate set ID for words coded as "na".
        cogidx = 1
        for word_row, cog_row in rows:
            concept_id = concepts[word_row[0].strip()]
            for lang_abbrev, word, cog in zip(header, word_row[1:], cog_row[1:]):
                if not word.strip():
                    # No word recorded for this language/concept pair.
                    continue

                # Strip once and use the cleaned code for both the "na" test
                # and the ID, so stray whitespace in a cell can neither leak
                # into the cognate set ID nor split one set into two.
                cog = cog.strip()
                if cog.lower() != "na":
                    cogid = concept_id + "-" + cog
                else:
                    cogid = str(cogidx)
                    cogidx += 1

                for lexeme in args.writer.add_forms_from_value(
                    Language_ID=languages[lang_abbrev],
                    Parameter_ID=concept_id,
                    Value=word,
                    Source=sources[lang_abbrev],
                    Cognacy=cogid,
                ):
                    args.writer.add_cognate(
                        lexeme=lexeme, Cognateset_ID=cogid, Source="Cysouw2006a"
                    )