From 340f41f462b60af10f9dbbdab12654ca36044373 Mon Sep 17 00:00:00 2001
From: Nick Budak <nbudak@princeton.edu>
Date: Thu, 16 Jan 2020 15:48:19 -0500
Subject: [PATCH] move data directory into package

---
 MANIFEST.in                                    | 1 -
 README.md                                      | 2 +-
 {data => dphon/data}/bs_dict.json              | 0
 {data => dphon/data}/dummy_dict.json           | 0
 {data => dphon/data}/dummy_initgroup_dict.json | 0
 {data => dphon/data}/schuessler_dict.json      | 0
 dphon/lib.py                                   | 8 ++++++--
 setup.py                                       | 3 ++-
 8 files changed, 9 insertions(+), 5 deletions(-)
 delete mode 100644 MANIFEST.in
 rename {data => dphon/data}/bs_dict.json (100%)
 rename {data => dphon/data}/dummy_dict.json (100%)
 rename {data => dphon/data}/dummy_initgroup_dict.json (100%)
 rename {data => dphon/data}/schuessler_dict.json (100%)

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 003bee6..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include data *
\ No newline at end of file
diff --git a/README.md b/README.md
index c64c21c..6704d96 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ $ dphon --version
 
 ## methodology
 
-matching sequences are determined by a dictionary file that represents a particular reconstruction of old chinese phonology (you can see some examples in the `data/` folder). these data structures map an input character to an arbitrary sound token ("dummy") that can be matched against other such tokens.
+matching sequences are determined by a dictionary file that represents a particular reconstruction of old chinese phonology (you can see some examples in the `dphon/data/` folder). these data structures map an input character to an arbitrary sound token ("dummy") that can be matched against other such tokens.
 
 the core process of DIRECT is to accept plaintext input, tokenize it according to a particular phonological reconstruction, and search for matches amongst the tokenized text. these matches thus represent resonance: sequences that could have rhymed when they were originally read aloud, despite dissimilarity in their written forms.
 
diff --git a/data/bs_dict.json b/dphon/data/bs_dict.json
similarity index 100%
rename from data/bs_dict.json
rename to dphon/data/bs_dict.json
diff --git a/data/dummy_dict.json b/dphon/data/dummy_dict.json
similarity index 100%
rename from data/dummy_dict.json
rename to dphon/data/dummy_dict.json
diff --git a/data/dummy_initgroup_dict.json b/dphon/data/dummy_initgroup_dict.json
similarity index 100%
rename from data/dummy_initgroup_dict.json
rename to dphon/data/dummy_initgroup_dict.json
diff --git a/data/schuessler_dict.json b/dphon/data/schuessler_dict.json
similarity index 100%
rename from data/schuessler_dict.json
rename to dphon/data/schuessler_dict.json
diff --git a/dphon/lib.py b/dphon/lib.py
index d69ce03..b6d2730 100644
--- a/dphon/lib.py
+++ b/dphon/lib.py
@@ -1,12 +1,16 @@
 import json
 from collections import defaultdict
-from typing import List, Dict, Tuple
 from os.path import basename, splitext
+from typing import Dict, List, Tuple
+
+import pkg_resources
 
 '''Non-alphabetic symbols used in place of a character.'''
 CHAR_MARKERS = ['□']
 
-with open('data/dummy_dict.json', encoding='utf-8') as file:
+'''Dictionary mapping input characters to "dummy" sound tokens, loaded from the packaged data/dummy_dict.json.'''
+schuessler_path = pkg_resources.resource_filename(__package__, 'data/dummy_dict.json')
+with open(schuessler_path, encoding='utf-8') as file:
     DUMMY_DICT = json.loads(file.read())
 
 def phonetic_tokens(string: str) -> str:
diff --git a/setup.py b/setup.py
index a727600..38b052b 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,8 @@ def run(self):
     long_description=long_description,
     long_description_content_type='text/markdown',
     url='https://github.com/direct-phonology/direct',
-    include_package_data=True,  # include extra data files, e.g. dictionaries
+    include_package_data=True,
+    package_data={'dphon': ['data/*.json']},
     author='John O\'Leary, Nick Budak, Gian Rominger',
     author_email='jo10@princeton.edu, nbudak@princeton.edu, gianr@princeton.edu',
     license='MIT',