-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexibank_liusinitic.py
148 lines (129 loc) · 5.43 KB
/
lexibank_liusinitic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from collections import defaultdict
from pathlib import Path
from clldutils.misc import slug
import attr
import lingpy
import pylexibank
def check_entry(wordlist, index, errors=None):
    """Validate one wordlist entry for internal consistency.

    Checks that, for the entry at *index*:
    * the prosodic structure has the same number of morphemes as the tokens,
    * the morpheme glosses match the number of morphemes in the tokens,
    * the partial cognate IDs match the number of morphemes in the tokens,
    * each morpheme's prosodic string has the same length as its token string.

    :param wordlist: a `lingpy.Wordlist` with "structure", "morphemes",
        "tokens", "cogids", "doculect", and "concept" columns.
    :param index: row index of the entry to check.
    :param errors: optional accumulator mapping row index -> list of error
        tags; a fresh one is created when omitted.
    :return: the (possibly newly created) *errors* mapping.
    """
    # NOTE: previously this used a mutable default (`errors=defaultdict(list)`),
    # which is shared across calls and silently accumulates errors from
    # unrelated invocations. Create a fresh mapping per call instead.
    if errors is None:
        errors = defaultdict(list)
    prosody = lingpy.basictypes.lists(wordlist[index, "structure"])
    morphemes = lingpy.basictypes.strings(wordlist[index, "morphemes"])
    tokens = lingpy.basictypes.lists(wordlist[index, "tokens"])
    cogids = lingpy.basictypes.ints(wordlist[index, "cogids"])
    # `.n` splits the sound sequence into morphemes; counts must agree.
    if len(prosody.n) != len(tokens.n):
        errors[index] += ["prostring"]
        print(prosody, tokens)
    if len(morphemes) != len(tokens.n):
        errors[index] += ["morphemes"]
    if len(cogids) != len(tokens.n):
        errors[index] += ["cogids-{0}".format(str(wordlist[index, "cogids"]))]
    # Within each morpheme, prosodic template and tokens must align 1:1.
    for i, (p, t) in enumerate(zip(prosody.n, tokens.n)):
        if len(p) != len(t):
            errors[index] += [
                "prostring-{1}-{2}-{0}".format(
                    i, wordlist[index, "doculect"], wordlist[index, "concept"]
                )
            ]
    return errors
@attr.s
class CustomLexeme(pylexibank.Lexeme):
    """Lexeme row extended with the dataset's extra CLDF columns."""

    # Prosodic structure string (filled from the wordlist "structure" column).
    Prosody = attr.ib(default="")
    # Space-joined morpheme glosses (from the wordlist "morphemes" column).
    Morpheme_Glosses = attr.ib(default=None)
    # Space-joined partial cognate-set IDs (from the wordlist "cogids" column).
    Partial_Cognacy = attr.ib(default=None)
    # Chinese character form (from the wordlist "characters" column).
    Chinese_Characters = attr.ib(default=None)
@attr.s
class CustomConcept(pylexibank.Concept):
    """Concept row extended with a Chinese gloss column."""

    # Chinese gloss of the concept (from the conceptlist "chinese" attribute).
    Chinese_Gloss = attr.ib(default=None)
@attr.s
class CustomCognate(pylexibank.Cognate):
    """Cognate row extended for partial (morpheme-level) cognacy."""

    # 1-based index of the morpheme within the form that this cognate
    # judgment refers to (set per cogid when writing the CLDF data).
    Segment_Slice = attr.ib(default=None)
@attr.s
class CustomLanguage(pylexibank.Language):
    """Language row extended with dialect metadata columns."""

    Latitude = attr.ib(default=None)
    Longitude = attr.ib(default=None)
    # Name of the variety in Chinese characters.
    ChineseName = attr.ib(default=None)
    # All varieties in this dataset are Sinitic (Sino-Tibetan).
    SubGroup = attr.ib(default="Sinitic")
    Family = attr.ib(default="Sino-Tibetan")
    # Identifier of the variety in the original source.
    Source_ID = attr.ib(default=None)
    DialectGroup = attr.ib(default=None)
    Pinyin = attr.ib(default=None)
    AltName = attr.ib(default=None)
class Dataset(pylexibank.Dataset):
    """Lexibank dataset for the Liu (2007) Sinitic dialect wordlists.

    Downloads the raw TSV wordlist from an EDICTOR triple store and
    converts it to CLDF, carrying partial (morpheme-level) cognacy,
    prosodic structures, and Chinese characters along.
    """

    id = "liusinitic"
    dir = Path(__file__).parent
    concept_class = CustomConcept
    language_class = CustomLanguage
    lexeme_class = CustomLexeme
    cognate_class = CustomCognate
    # Cognate sets may span concepts (partial cognates are cross-semantic).
    cross_concept_cognates = True
    writer_options = dict(keep_languages=False, keep_parameters=False)

    def cmd_download(self, args):
        """Fetch the raw wordlist TSV from the EDICTOR triple store."""
        print("updating ...")
        self.raw_dir.download(
            "https://lingulist.de/edictor/triples/get_data.py?file=liusinitic&remote_dbase=liusinitic.sqlite3&columns=DOCULECT|SUBGROUP|CONCEPT|VALUE|IPA|TOKENS|COGIDS|MORPHEMES|STRUCTURE|NOTE|CHARACTERS|CHARACTER_IS",
            "liusinitic.tsv",
        )

    def cmd_makecldf(self, args):
        """Convert the raw wordlist to CLDF.

        Adds sources, languages, and concepts, then writes one form per
        wordlist row plus one cognate judgment per morpheme. Consistency
        problems found by ``check_entry`` are logged and written to
        ``errors.md``.
        """
        # add source
        args.writer.add_sources()
        # read in data
        ds = self.raw_dir / "liusinitic.tsv"
        wl = lingpy.Wordlist(str(ds))
        # add languages
        languages = args.writer.add_languages(lookup_factory="Name")
        # add concepts, keyed by their English gloss for lookup below
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1]+"_"+slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Chinese_Gloss=concept.attributes["chinese"],
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.english] = idx
        # add the concepts which appear in the word list but do not appear in the concepticon list.
        args.writer.add_concept(
            ID="202_heartcompound",
            Name="heart [compound]",
            Chinese_Gloss="心臟",
            Concepticon_ID="1223",
            Concepticon_Gloss="HEART"
        )
        concepts["heart [compound]"] = "202_heartcompound"
        # the wordlist uses "river"/"river_2" where the conceptlist has two
        # separate "river" entries; map them explicitly.
        concepts["river_2"] = "50_river"
        concepts["river"] = "49_river"
        # add forms
        errors = defaultdict(list)
        for idx in pylexibank.progressbar(wl, desc="cldfify the data"):
            # check for mismatch in prosody
            check_entry(wl, idx, errors)
            lexeme = args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "value"],
                Form=wl[idx, "value"],
                # strip slash-annotations ("a/b" -> "a") and drop null segments
                Segments=[y for y in [x.split("/")[0] for x in wl[idx, "tokens"]] if y != "Ø"],
                Prosody=wl[idx, "structure"],
                Source=["Liu2007"],
                Comment=wl[idx, "note"],
                Morpheme_Glosses=" ".join(wl[idx, "morphemes"]),
                Partial_Cognacy=" ".join([str(c) for c in wl[idx, "cogids"]]),
                Chinese_Characters=wl[idx, "characters"]
            )
            # one cognate judgment per morpheme; Segment_Slice is 1-based
            for gloss_index, cogid in enumerate(wl[idx, "cogids"]):
                args.writer.add_cognate(
                    lexeme=lexeme, Cognateset_ID=cogid, Segment_Slice=gloss_index + 1
                )
        if errors:
            # FIX: write explicitly as UTF-8 — error lines can contain
            # non-ASCII doculect/concept strings, and the platform default
            # encoding (e.g. cp1252 on Windows) would raise or corrupt them.
            with open(self.dir.joinpath("errors.md"), "w", encoding="utf-8") as f:
                f.write("# ERRORS found\n")
                for idx, problems in sorted(errors.items()):
                    for error in problems:
                        args.log.warning("{0} {1}".format(idx, error))
                        f.write("* {0} {1}\n".format(error, idx))
        # drop the ISO639P3code column, which this dataset does not fill
        args.writer.cldf["LanguageTable"].tableSchema.columns = [
            col
            for col in args.writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]