Skip to content

Commit

Permalink
Refactorings, release prep, added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
chrzyki committed Nov 11, 2019
1 parent 5eef340 commit df3ee86
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 99 deletions.
27 changes: 27 additions & 0 deletions FORMS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
## Specification of form manipulation


Specification of the value-to-form processing in Lexibank datasets:

The value-to-form processing is divided into two steps, implemented as methods:
- `FormSpec.split`: Splits a string into individual form chunks.
- `FormSpec.clean`: Normalizes a form chunk.

These methods use the attributes of a `FormSpec` instance to configure their behaviour.

- `brackets`: `{'[': ']', '(': ')'}`
Pairs of strings that should be recognized as brackets, specified as `dict` mapping opening string to closing string
- `separators`: `(';', '/', ',')`
Iterable of single-character tokens that should be recognized as word separators
- `missing_data`: `('*', '---', '')`
Iterable of strings that are used to mark missing data
- `strip_inside_brackets`: `True`
Flag signaling whether to strip content in brackets (**and** strip leading and trailing whitespace)
- `replacements`: `[]`
List of pairs (`source`, `target`) used to replace occurrences of `source` in forms with `target` (before stripping content in brackets)
- `first_form_only`: `False`
Flag signaling whether at most one form should be returned from `split` - effectively ignoring any spelling variants, etc.
- `normalize_whitespace`: `True`
Flag signaling whether to normalize whitespace - stripping leading and trailing whitespace and collapsing multi-character whitespace to single spaces
- `normalize_unicode`: `None`
Unicode normalization form to use for input of `split` (`None`, `'NFD'` or `'NFC'`)
12 changes: 6 additions & 6 deletions cldf/cldf-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,31 +18,31 @@
"rdf:type": "prov:Entity",
"dc:title": "Repository",
"rdf:about": "https://github.com/lexibank/marrisonnaga",
"dc:created": "v1.0.1-11-gaa8ffaa"
"dc:created": "v1.0.1-9-g5eef340"
},
{
"rdf:type": "prov:Entity",
"dc:title": "Glottolog",
"rdf:about": "https://github.com/lingulist/glottolog-data",
"rdf:about": "https://github.com/glottolog/glottolog",
"dc:created": "v4.0"
},
{
"rdf:type": "prov:Entity",
"dc:title": "Concepticon",
"rdf:about": "https://github.com/concepticon/concepticon-data/",
"dc:created": "v2.1.0-109-g408e9f8"
"rdf:about": "https://github.com/concepticon/concepticon-data",
"dc:created": "v2.2.0"
},
{
"rdf:type": "prov:Entity",
"dc:title": "CLTS",
"rdf:about": "https://github.com/cldf-clts/clts/",
"rdf:about": "https://github.com/cldf-clts/clts",
"dc:created": "v1.4"
}
],
"prov:wasGeneratedBy": [
{
"dc:title": "python",
"dc:description": "3.7.4"
"dc:description": "3.7.3"
},
{
"dc:title": "python-packages",
Expand Down
80 changes: 40 additions & 40 deletions cldf/languages.csv
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
ID,Name,Glottocode,Glottolog_Name,ISO639P3code,Macroarea,Latitude,Longitude,Family,STEDT_Name,SubGroup,Coverage,Area
Chang,Chang,chan1313,,nbc,,26.2667,94.0833,Sino-Tibetan,Chang,Konyak,936,India
Chokri,Chokri,chok1243,,nri,,25.6833,94.2667,Sino-Tibetan,Chokri,Angami,519,India
ChungliAo,Ao Chungli,aona1235,,njo,,26.3167,94.5167,Sino-Tibetan,Ao (Chungli),Ao,984,India
Dimasa,Dimasa,dima1251,,dis,,25.42,93.18,Sino-Tibetan,Dimasa,Boro,917,Myanmar
Jingpho,Jingpho,jing1260,,kac,,25.461826,97.329866,Sino-Tibetan,Jingpho,Kachin,1029,India
Kezhama,Kezhama,khez1235,,nkh,,25.5167,94.2,Sino-Tibetan,Khezha,Angami,149,India
Khoirao,Khoirao,than1255,,nki,,25.2167,94.0333,Sino-Tibetan,Khoirao,Zemeic,406,India
KhonomaAngami,Angami Khonoma,khon1248,,njm,,25.65,94.0333,Sino-Tibetan,Angami (Khonoma),Angami,842,India
KohimaAngami,Angami Kohima,anga1288,,njm,,25.55,94.1333,Sino-Tibetan,Angami (Kohima),Angami,971,India
Konyak,Konyak,kony1246,,nbe,,26.55,95.05,Sino-Tibetan,Konyak,Konyak,979,India
Liangmai,Liangmai,lian1251,,njn,,25.3667,93.6333,Sino-Tibetan,Liangmei,Zemeic,724,India
Lotha,Lotha,loth1237,,njh,,26.1,94.2667,Sino-Tibetan,Lotha Naga,Lotha,1068,India
Lushai,Lushai,lush1249,,lus,,22.60535,92.629457,Sino-Tibetan,Lushai [Mizo],Kuki Chin-Central,1105,India
Manipuri,Manipuri,mani1292,,mni,,24.44,93.34,Sino-Tibetan,Meithei,Other Tibeto-Burman,970,India
Mao,Mao,maon1238,,nbi,,25.4667,94.1167,Sino-Tibetan,Mao,Angami,712,India
Maram,Maram,mara1379,,nma,,25.4333,94.15,Sino-Tibetan,Maram,Zemeic,352,India
Maring,Maring,mari1416,,nng,,24.05,94.0333,Sino-Tibetan,Maring,Maringic,418,India
Meluri,Meluri,poch1243,,npo,,25.0667,94.6333,Sino-Tibetan,Meluri,Pochuri,316,India
Mikir,Mikir,karb1241,,mjw,,25.735084,93.050494,Sino-Tibetan,Mikir [Karbi],Other Tibeto-Burman,1341,India
MongsenAo,Ao Mongsen,mong1332,,njo,,26.4167,94.4,Sino-Tibetan,Ao (Mongsen: Longchang),Ao,917,India
MoshangTangsa,Tangsa (Moshang),mosa1240,,nst,,,,Sino-Tibetan,Tangsa (Moshang),Yacham-Tengsa,313,India
Mzieme,Mzieme,mzie1235,,nme,,25.5167,93.75,Sino-Tibetan,Mzieme,Zemeic,581,India
Nocte,Nocte,noct1238,,njb,,27.1167,95.4833,Sino-Tibetan,Nocte,Nocte,395,India
Nruanghmei,Nruanghmei,rong1266,,nbu,,25.0,93.05,Sino-Tibetan,Rongmei / Nruanghmei,Zemeic,811,India
Ntenyi,Ntenyi,nort2725,,nnl,,25.9833,94.0333,Sino-Tibetan,Ntenyi,Rengma,636,India
Phom,Phom,phom1236,,nph,,26.6167,94.05,Sino-Tibetan,Phom,Konyak,679,India
Puiron,Puiron,rong1266,,nbu,,25.1,93.79,Sino-Tibetan,Puiron,Zemeic,385,India
Rengma,Rengma,sout2732,,nre,,25.0667,94.6333,Sino-Tibetan,Rengma,Rengma,803,India
Sangtam,Sangtam,sang1321,,nsa,,25.0667,94.8667,Sino-Tibetan,Sangtam,Sangtam,853,India
Sema,Sema,sumi1235,,nsm,,25.85,94.2667,Sino-Tibetan,Sema [Sumi],Angami,920,India
Tangkhul,Tangkhul,sino1246,,nmf,,25.1167,94.3667,Sino-Tibetan,Tangkhul,Tangkhulic,945,India
Tengsa,Tengsa,teng1273,,njo,,26.95,95.0667,Sino-Tibetan,Tengsa,Yacham-Tengsa,5,India
Wancho,Wancho,wanc1238,,nnp,,26.9667,95.8167,Sino-Tibetan,Wancho,Konyak,464,India
WrittenBurmese,Burmese (Written),oldb1235,,,,21.624974,97.126742,Sino-Tibetan,Burmese (Written),Burmese,985,Myanmar
WrittenTibetan,Tibetan (Written),clas1254,,xct,,30.027852,91.158704,Sino-Tibetan,Tibetan (Written),Tibetan,1134,China
Yacham,Yacham,yach1235,,njo,,26.6167,94.7833,Sino-Tibetan,Yacham,Yacham-Tengsa,5,India
YachamTengsa,Yacham-Tengsa,yach1234,,njo,,,,Sino-Tibetan,Yacham-Tengsa,Yacham-Tengsa,270,India
Yimchungru,Yimchungrü,yimc1241,,yim,,25.7167,94.9167,Sino-Tibetan,Yimchungrü,Yimchingric,536,India
YogliTangsa,Tangsa (Yogli),yogl1238,,nst,,,,Sino-Tibetan,Tangsa (Yogli),Yacham-Tengsa,225,India
Zeme,Zeme,zeme1240,,nzm,,25.1833,93.2,Sino-Tibetan,Zeme,Zemeic,834,India
Chang,Chang,chan1313,Chang Naga,nbc,Eurasia,26.2667,94.0833,Sino-Tibetan,Chang,Konyak,936,India
Chokri,Chokri,chok1243,Chokri Naga,nri,Eurasia,25.6833,94.2667,Sino-Tibetan,Chokri,Angami,519,India
ChungliAo,Ao Chungli,aona1235,Ao Naga,njo,Eurasia,26.3167,94.5167,Sino-Tibetan,Ao (Chungli),Ao,984,India
Dimasa,Dimasa,dima1251,Dimasa,dis,Eurasia,25.42,93.18,Sino-Tibetan,Dimasa,Boro,917,Myanmar
Jingpho,Jingpho,jing1260,Jingpho,kac,,25.461826,97.329866,Sino-Tibetan,Jingpho,Kachin,1029,India
Kezhama,Kezhama,khez1235,Khezha Naga,nkh,Eurasia,25.5167,94.2,Sino-Tibetan,Khezha,Angami,149,India
Khoirao,Khoirao,than1255,Thangal Naga,nki,Eurasia,25.2167,94.0333,Sino-Tibetan,Khoirao,Zemeic,406,India
KhonomaAngami,Angami Khonoma,khon1248,Khonoma,njm,Eurasia,25.65,94.0333,Sino-Tibetan,Angami (Khonoma),Angami,842,India
KohimaAngami,Angami Kohima,anga1288,Angami Naga,njm,Eurasia,25.55,94.1333,Sino-Tibetan,Angami (Kohima),Angami,971,India
Konyak,Konyak,kony1246,Konyak,nbe,,26.55,95.05,Sino-Tibetan,Konyak,Konyak,979,India
Liangmai,Liangmai,lian1251,Liangmai Naga,njn,Eurasia,25.3667,93.6333,Sino-Tibetan,Liangmei,Zemeic,724,India
Lotha,Lotha,loth1237,Lotha Naga,njh,Eurasia,26.1,94.2667,Sino-Tibetan,Lotha Naga,Lotha,1068,India
Lushai,Lushai,lush1249,Mizo,lus,Eurasia,22.60535,92.629457,Sino-Tibetan,Lushai [Mizo],Kuki Chin-Central,1105,India
Manipuri,Manipuri,mani1292,Manipuri,mni,Eurasia,24.44,93.34,Sino-Tibetan,Meithei,Other Tibeto-Burman,970,India
Mao,Mao,maon1238,Mao Naga,nbi,Eurasia,25.4667,94.1167,Sino-Tibetan,Mao,Angami,712,India
Maram,Maram,mara1379,Maram Naga,nma,Eurasia,25.4333,94.15,Sino-Tibetan,Maram,Zemeic,352,India
Maring,Maring,mari1416,Maring Naga,nng,Eurasia,24.05,94.0333,Sino-Tibetan,Maring,Maringic,418,India
Meluri,Meluri,poch1243,Pochuri Naga,npo,Eurasia,25.0667,94.6333,Sino-Tibetan,Meluri,Pochuri,316,India
Mikir,Mikir,karb1241,Hills Karbi,mjw,Eurasia,25.735084,93.050494,Sino-Tibetan,Mikir [Karbi],Other Tibeto-Burman,1341,India
MongsenAo,Ao Mongsen,mong1332,Mongsen,njo,Eurasia,26.4167,94.4,Sino-Tibetan,Ao (Mongsen: Longchang),Ao,917,India
MoshangTangsa,Tangsa (Moshang),mosa1240,Mosang,nst,Eurasia,,,Sino-Tibetan,Tangsa (Moshang),Yacham-Tengsa,313,India
Mzieme,Mzieme,mzie1235,Mzieme Naga,nme,Eurasia,25.5167,93.75,Sino-Tibetan,Mzieme,Zemeic,581,India
Nocte,Nocte,noct1238,Nocte Naga,njb,Eurasia,27.1167,95.4833,Sino-Tibetan,Nocte,Nocte,395,India
Nruanghmei,Nruanghmei,rong1266,Rongmei Naga,nbu,Eurasia,25.0,93.05,Sino-Tibetan,Rongmei / Nruanghmei,Zemeic,811,India
Ntenyi,Ntenyi,nort2725,Northern Rengma Naga,nnl,Eurasia,25.9833,94.0333,Sino-Tibetan,Ntenyi,Rengma,636,India
Phom,Phom,phom1236,Phom Naga,nph,Eurasia,26.6167,94.05,Sino-Tibetan,Phom,Konyak,679,India
Puiron,Puiron,rong1266,Rongmei Naga,nbu,Eurasia,25.1,93.79,Sino-Tibetan,Puiron,Zemeic,385,India
Rengma,Rengma,sout2732,Southern Rengma Naga,nre,Eurasia,25.0667,94.6333,Sino-Tibetan,Rengma,Rengma,803,India
Sangtam,Sangtam,sang1321,Sangtam Naga,nsa,Eurasia,25.0667,94.8667,Sino-Tibetan,Sangtam,Sangtam,853,India
Sema,Sema,sumi1235,Sumi Naga,nsm,Eurasia,25.85,94.2667,Sino-Tibetan,Sema [Sumi],Angami,920,India
Tangkhul,Tangkhul,sino1246,Tangkhulic,nmf,,25.1167,94.3667,Sino-Tibetan,Tangkhul,Tangkhulic,945,India
Tengsa,Tengsa,teng1273,Tengsa,njo,,26.95,95.0667,Sino-Tibetan,Tengsa,Yacham-Tengsa,5,India
Wancho,Wancho,wanc1238,Wancho Naga,nnp,Eurasia,26.9667,95.8167,Sino-Tibetan,Wancho,Konyak,464,India
WrittenBurmese,Burmese (Written),oldb1235,Old Burmese,,Eurasia,21.624974,97.126742,Sino-Tibetan,Burmese (Written),Burmese,985,Myanmar
WrittenTibetan,Tibetan (Written),clas1254,Classical Tibetan,xct,Eurasia,30.027852,91.158704,Sino-Tibetan,Tibetan (Written),Tibetan,1134,China
Yacham,Yacham,yach1235,Yacham,njo,,26.6167,94.7833,Sino-Tibetan,Yacham,Yacham-Tengsa,5,India
YachamTengsa,Yacham-Tengsa,yach1234,Yacham-Tengsa,njo,,,,Sino-Tibetan,Yacham-Tengsa,Yacham-Tengsa,270,India
Yimchungru,Yimchungrü,yimc1241,Yimchungru,yim,Eurasia,25.7167,94.9167,Sino-Tibetan,Yimchungrü,Yimchingric,536,India
YogliTangsa,Tangsa (Yogli),yogl1238,Yogli,nst,Eurasia,,,Sino-Tibetan,Tangsa (Yogli),Yacham-Tengsa,225,India
Zeme,Zeme,zeme1240,Zeme Naga,nzm,Eurasia,25.1833,93.2,Sino-Tibetan,Zeme,Zemeic,834,India
35 changes: 15 additions & 20 deletions cldf/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,42 @@ appdirs==1.4.3
atomicwrites==1.3.0
certifi==2019.9.11
chardet==3.0.4
-e git+https://github.com/cldf/cldfbench/@f373855e3b9cde029578e77c26136f0df26a82fa#egg=cldfbench
-e git+https://github.com/cldf/cldfcatalog@f26c9ac38c1ab111a55682ae83f4ac1cb03333c3#egg=cldfcatalog
clldutils==3.2.0
cldfbench==1.0.0
cldfcatalog==1.3.0
clldutils==3.3.0
colorlog==4.0.2
configparser==4.0.2
-e git+https://github.com/cldf/csvw.git@bcd398856fdfe6408567cc02c7ff8b67ba1c8e38#egg=csvw
csvw==1.6.0
decorator==4.4.1
idna==2.8
isodate==0.6.0
jdcal==1.4.1
-e git+https://github.com/lexibank/allenbai.git@2115179090d47dd9b18ab8cf41a4b4d83e82955c#egg=lexibank_allenbai
-e git+https://github.com/lexibank/marrisonnaga.git@aa8ffaae2fde0e1f2e9e4c4aca981f75aa649a1e#egg=lexibank_marrisonnaga
-e git+https://github.com/lexibank/naganorgyalrongic@3f7336c46c182f73b11a6ed5ab2a60dae31e0342#egg=lexibank_naganorgyalrongic
-e git+https://github.com/lexibank/sohartmannchin.git@2df6c14f491c6f83490aea8d83a569b970a3dc76#egg=lexibank_sohartmannchin
-e git+https://github.com/lingpy/lingpy.git@2a1671c1b65886e1d33eccd74818b29bc4ce73dd#egg=lingpy
lingpy==2.6.5
Markdown==3.1.1
networkx==2.2
networkx==2.1
newick==1.0.0
numpy==1.17.3
numpy==1.17.4
openpyxl==3.0.0
packaging==19.2
pluggy==0.13.0
purl==1.5
py==1.8.0
pybtex==0.22.2
pycldf==1.8.2
-e git+https://github.com/cldf-clts/pyclts@4842f1fd9613de6ef20a917dbc3bd723e8d0ffbb#egg=pyclts
-e git+https://github.com/concepticon/pyconcepticon.git@615a048b11cc6e5f8a3fe92619ad7790b23154db#egg=pyconcepticon
pyclts==2.0.0
pyconcepticon==2.5.1
pycountry==19.8.18
-e git+https://github.com/clld/pyglottolog.git@0f24f24a46d1f510c975337e4c0d8c23b357c8bd#egg=pyglottolog
-e git+https://github.com/lexibank/pylexibank.git@9d3001127a62fc063845f02a5afafd43a5be7234#egg=pylexibank
pyglottolog==2.2.1
pylexibank==2.1.0
pytest==5.2.2
regex==2019.8.19
regex==2019.11.1
requests==2.22.0
rfc3986==1.3.2
segments==2.0.2
six==1.12.0
segments==2.1.2
six==1.13.0
SQLAlchemy==1.3.10
tabulate==0.8.5
termcolor==1.1.0
tqdm==4.36.1
tqdm==4.38.0
uritemplate==3.0.0
urllib3==1.25.6
wcwidth==0.1.7
Expand Down
24 changes: 11 additions & 13 deletions lexibank_marrisonnaga.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from pylexibank.dataset import Dataset as BaseDataset
from pylexibank import Language
from pylexibank import FormSpec
from pylexibank.util import pb
from pylexibank.util import progressbar
from clldutils.misc import slug
import lingpy
import attr


@attr.s
class CustomLanguage(Language):
STEDT_Name = attr.ib(default=None)
Expand All @@ -17,31 +18,28 @@ class CustomLanguage(Language):
Latitude = attr.ib(default=None)
Area = attr.ib(default=None)


class Dataset(BaseDataset):
dir = Path(__file__).parent
id = "marrisonnaga"
language_class = CustomLanguage
form_spec = FormSpec(
missing_data=("*", "---", ""),
brackets={"[": "]", "(": ")"}
)
form_spec = FormSpec(missing_data=("*", "---", ""), brackets={"[": "]", "(": ")"})

def cmd_makecldf(self, args):
"""
Convert the raw data to a CLDF dataset.
"""
wl = lingpy.Wordlist(self.raw_dir.joinpath("GEM-CNL.csv").as_posix())
concept_lookup = args.writer.add_concepts(
id_factory=lambda x: x.id.split('-')[-1]+'_'+slug(x.english),
lookup_factory="Name"
)
language_lookup = args.writer.add_languages(
lookup_factory="STEDT_Name")
id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english), lookup_factory="Name"
)
language_lookup = args.writer.add_languages(lookup_factory="STEDT_Name")
args.writer.add_sources()
# check for missing items
missing = defaultdict(int)
for idx, language, concept, value, pos in pb(
wl.iter_rows("doculect", "concept", "reflex", "gfn")):
for idx, language, concept, value, pos in progressbar(
wl.iter_rows("doculect", "concept", "reflex", "gfn")
):
if concept not in concept_lookup:
if pos == "n":
if concept + " (noun)" in concept_lookup:
Expand All @@ -60,5 +58,5 @@ def cmd_makecldf(self, args):
Language_ID=language_lookup[language],
Parameter_ID=concept_lookup[concept],
Value=value,
Source=["Marrison1967"]
Source=["Marrison1967"],
)
29 changes: 9 additions & 20 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,19 @@
import json


with open('metadata.json') as fp:
with open("metadata.json") as fp:
metadata = json.load(fp)


setup(
name='lexibank_marrisonnaga',
description=metadata['title'],
license=metadata.get('license', ''),
url=metadata.get('url', ''),
py_modules=['lexibank_marrisonnaga'],
name="lexibank_marrisonnaga",
description=metadata["title"],
license=metadata.get("license", ""),
url=metadata.get("url", ""),
py_modules=["lexibank_marrisonnaga"],
include_package_data=True,
zip_safe=False,
entry_points={
'lexibank.dataset': [
'marrisonnaga=lexibank_marrisonnaga:Dataset',
]
},
install_requires=[
'pylexibank>=2.0.0',
'segments>=2.0.2'
],
extras_require={
'test': [
'pytest-cldf',
],
},
entry_points={"lexibank.dataset": ["marrisonnaga=lexibank_marrisonnaga:Dataset"]},
install_requires=["pylexibank>=2.1", "segments>=2.0.2"],
extras_require={"test": ["pytest-cldf"]},
)
13 changes: 13 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,15 @@
def test_valid(cldf_dataset, cldf_logger):
    """Assert that ``cldf_dataset.validate`` reports success.

    Validation messages are routed to ``cldf_logger`` via the ``log`` keyword.
    """
    is_valid = cldf_dataset.validate(log=cldf_logger)
    assert is_valid


def test_forms(cldf_dataset):
    """Check the FormTable row count and the presence of one known form."""
    form_rows = list(cldf_dataset["FormTable"])
    assert len(form_rows) == 19200

    # Look the table up a second time for the content check instead of
    # reusing the list built for the count.
    found = False
    for row in cldf_dataset["FormTable"]:
        if row["Form"] == "bu◦thu":
            found = True
            break
    assert found


def test_parameters(cldf_dataset):
    """Check that the ParameterTable contains exactly 626 rows."""
    parameter_rows = list(cldf_dataset["ParameterTable"])
    assert len(parameter_rows) == 626


def test_languages(cldf_dataset):
    """Check that the LanguageTable contains exactly 40 rows."""
    language_rows = list(cldf_dataset["LanguageTable"])
    assert len(language_rows) == 40

0 comments on commit df3ee86

Please sign in to comment.