Merge pull request #45 from direct-phonology/develop

1.1.0 release
direct-phonology · Jan 16, 2020 · b665f78 · b665f78
2 parents 15a59c4 + 340f41f
commit b665f78
Show file tree

Hide file tree

Showing 28 changed files with 376 additions and 253 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 venv
+env
 .vscode
 .pytest_cache
 .python-version

diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@ _Digital Intertextual Resonances in Early Chinese Texts_
 
 ![CI Status](https://github.com/direct-phonology/direct/workflows/test/badge.svg)
 ![Dependency Status](https://pyup.io/repos/github/direct-phonology/direct/shield.svg?t=1568910750251)
-![PyPi Version](https://img.shields.io/pypi/v/dphon.svg?style=flat)
+[![PyPi Version](https://img.shields.io/pypi/v/dphon.svg?style=flat)](https://pypi.org/project/dphon/)
 ![Python Versions](https://img.shields.io/pypi/pyversions/dphon.svg?style=flat)
 
 ## installation
@@ -39,16 +39,28 @@ the output will be a list of character sequences in text_a that have rhyming cou
 解其分 (b: 56)         # and two from b on lines 4 and 56
 ```
 
-note that the sequences ignore non-word characters, including punctuation and numbers. this means that rhymes could span across lines, which will be reflected in the output.
+note that the matches ignore non-word characters, including punctuation and numbers.
+this means that matches could span multiple lines, which will be reflected in the output (line breaks will be represented by the ⏎ character).
 
 you can view the full list of command options with:
 ```sh
 $ dphon --help
 ```
 
+by default, all matches are shown, including those where the text is identical. to limit the output
+to matches where actual graphic variation has occurred, use the `--variants-only` flag:
+```sh
+$ dphon text_a.txt text_b.txt --variants-only
+```
+
+this tool is under active development, and results may vary. to find the version you are running:
+```sh
+$ dphon --version
+```
+
 ## methodology
 
-matching sequences are determined by a dictionary file that represents a particular reconstruction of old chinese phonology (you can see some examples in the `data/` folder). these data structures map an input character to an arbitrary sound token ("dummy") that can be matched against other such tokens.
+matching sequences are determined by a dictionary file that represents a particular reconstruction of old chinese phonology (you can see some examples in the `dphon/data/` folder). these data structures map an input character to an arbitrary sound token ("dummy") that can be matched against other such tokens.
 
 the core process of DIRECT is to accept plaintext input, tokenize it according to a particular phonological reconstruction, and search for matches amongst the tokenized text. these matches thus represent resonance: sequences that could have rhymed when they were originally read aloud, despite dissimilarity in their written forms.
 

diff --git a/dphon/__init__.py b/dphon/__init__.py
@@ -1 +1 @@
-__version__ = '1.0.1'
+__version__ = '1.1.0'
diff --git a/dphon/cli.py b/dphon/cli.py
@@ -2,21 +2,23 @@
 dphon
  
 Usage:
-  dphon <text1> <text2> [--output=<file>]
-  dphon -h | --help
-  dphon -v | --version
+    dphon <text1> <text2> [--n=<n>] [--output=<file>] [--variants-only]
+    dphon -h | --help
+    dphon --version
  
 Options:
-  -h --help         Show this screen.
-  -v --version      Show program version.
+    -h --help           Show this screen.
+    --version           Show program version.
+    --variants-only     Limit to matches with graphic variation.
+    --n=<n>             Limit to matches with length >= n [default: 3].
  
 Examples:
-    dphon 老子甲.txt 老子乙.txt
-    dphon 老子丙.txt 老子乙.txt --output=out.txt
+    dphon 老子丙.txt 老子乙.txt --output=matches.txt
+    dphon 周南.txt 鹿鳴之什.txt --variants-only
  
 Help:
-  For more information on using this tool, please visit the Github repository:
-  https://github.com/direct-phonology/direct
+    For more information on using this tool, please visit the Github repository:
+    https://github.com/direct-phonology/direct
 """
 
 from sys import stderr, stdin, stdout
@@ -29,19 +31,28 @@
 def run():
     """CLI entrypoint."""
     arguments = docopt(__doc__, version=__version__)
+    # read in the two files
     with open(arguments['<text1>'], encoding='utf-8') as file:
         text1 = file.read()
     with open(arguments['<text2>'], encoding='utf-8') as file:
         text2 = file.read()
+    # store their texts and filenames
     c = Comparator(a=text1,
                    b=text2,
                    a_name=arguments['<text1>'],
                    b_name=arguments['<text2>'])
-    matches = c.get_matches()
+    # get and reduce initial matches
+    matches = c.get_matches(min_length=int(arguments['--n']))
+    # if requested, remove matches without graphic variation
+    if arguments['--variants-only']:
+        matches = c.matches_with_graphic_variation(matches)
+    # group matches and format for output
     groups = Comparator.group_matches(matches)
     output = c.resolve_groups(groups)
+    # write to a file if requested
     if arguments['--output']:
         with open(arguments['--output'], mode='w', encoding='utf8') as file:
             file.write(output)
+    # otherwise write to stdout
     else:
         stdout.buffer.write(output.encode('utf-8'))
diff --git a/data/bs_dict.json → dphon/data/bs_dict.json b/data/bs_dict.json → dphon/data/bs_dict.json
diff --git a/data/dummy_dict.json → dphon/data/dummy_dict.json b/data/dummy_dict.json → dphon/data/dummy_dict.json
diff --git a/data/dummy_initgroup_dict.json → dphon/data/dummy_initgroup_dict.json b/data/dummy_initgroup_dict.json → dphon/data/dummy_initgroup_dict.json
diff --git a/data/schuessler_dict.json → dphon/data/schuessler_dict.json b/data/schuessler_dict.json → dphon/data/schuessler_dict.json
diff --git a/dphon/lib.py b/dphon/lib.py
@@ -1,12 +1,26 @@
 import json
 from collections import defaultdict
-from typing import List, Dict, Tuple
 from os.path import basename, splitext
+from typing import Dict, Iterator, List, Tuple
 
+import pkg_resources
 
-with open('data/dummy_dict.json', encoding='utf-8') as file:
+# Non-alphabetic symbols used in place of a character.
+CHAR_MARKERS = ['□']
+
+# Dictionary based on Schuessler's reconstruction of Old Chinese.
+# The JSON file lives inside the package (dphon/data/), so it is resolved
+# relative to the package rather than the current working directory.
+schuessler_path = pkg_resources.resource_filename(__package__, 'data/dummy_dict.json')
+with open(schuessler_path, encoding='utf-8') as file:
     DUMMY_DICT = json.loads(file.read())
 
+def phonetic_tokens(string: str) -> Iterator[str]:
+    """Lazily yield one phonetic token per character of the input string.
+
+    Characters with an entry in DUMMY_DICT are replaced by element 2 of that
+    entry; characters without an entry are yielded unchanged. The result is a
+    one-shot generator, not a string."""
+    return (DUMMY_DICT[char][2] if char in DUMMY_DICT else char for char in string)
+
+def has_char_markers(string: str) -> bool:
+    """Returns True if input string contains any character in CHAR_MARKERS."""
+    # generator (not a list) so any() can stop at the first marker found
+    return any(marker in string for marker in CHAR_MARKERS)
 
 class Match(object):
     a_start: int
@@ -25,7 +39,22 @@ def __str__(self) -> str:
         return 'A (%d - %d) :: B (%d - %d)' % (self.a_start, self.a_end,
                                                self.b_start, self.b_end)
 
-    def resolve(self, a: str, b: str):
+    def has_graphic_variation(self, a: str, b: str) -> bool:
+        """Whether a match contains an actual graphic variant of a character,
+        ignoring punctuation and other differences.
+
+        `a` and `b` are the full texts that this match's inclusive start/end
+        offsets index into."""
+        # keep only alphabetic characters so punctuation and line breaks
+        # don't throw off the position-by-position comparison
+        a_seq = [c for c in a[self.a_start:self.a_end + 1] if c.isalpha()]
+        b_seq = [c for c in b[self.b_start:self.b_end + 1] if c.isalpha()]
+
+        # a character in a that we have a dictionary entry for, but which is
+        # written differently at the same position in b, is graphic variation.
+        # zip() stops at the shorter sequence, so spans that filter down to
+        # different lengths no longer raise IndexError.
+        return any(char_a in DUMMY_DICT and char_a != char_b
+                   for char_a, char_b in zip(a_seq, b_seq))
+
+    def resolve(self, a: str, b: str) -> str:
         """Get the actual text of a match by mapping its locations to texts."""
         return '%s :: %s\t%s' % (
             a[self.a_start:self.a_end + 1],
@@ -71,8 +100,6 @@ def create_numbered_text(text: str):
             output += '%s\t%s' % (i + 1, line)
 
         return output
-        # save the lines into a new string
-        # write the string to a file
 
     @staticmethod
     def get_text_ngrams(text: str, n: int = 3) -> List[Dict]:
@@ -82,7 +109,7 @@ def get_text_ngrams(text: str, n: int = 3) -> List[Dict]:
             raise ValueError('Value for `n` must be 1 or greater.')
         ngrams = []
         for pos, char in enumerate(text):
-            if char.isalpha():
+            if char.isalpha() or char in CHAR_MARKERS:
                 # create a new ngram
                 ngram = {'text': '', 'start': None, 'end': None}
                 # add either the original character or a token if we have one
@@ -104,8 +131,10 @@ def get_text_ngrams(text: str, n: int = 3) -> List[Dict]:
                     except IndexError:
                         continue
                 ngrams.append(ngram)
-        # return all but the last n - 1 ngrams, as they are redundant
-        return ngrams[:len(ngrams) - n + 1]
+        # last n - 1 ngrams are redundant
+        ngrams = ngrams[:len(ngrams) - n + 1]
+        # step through the final list to discard any ngrams with CHAR_MARKERS
+        return [n for n in ngrams if not has_char_markers(n['text'])]
 
     def get_initial_matches(self, n: int = 3) -> List[Match]:
         """Gets a set of initial, overlapping matches between two texts that can
@@ -132,21 +161,40 @@ def reduce_matches(matches: List[Match]) -> List[Match]:
         contiguous matches."""
         for i, match in enumerate(matches):
             # lookahead
-            for candidate in matches[i:]:
+            for candidate in matches[i+1:]:
                 # ignore matches that are fully congruent
                 if candidate.a_start == match.a_start and candidate.a_end == match.a_end:
                     continue
-                # next we should find matches that are overlapping, if any
-                if candidate.a_start < match.a_end and candidate.a_start > match.a_start and candidate.b_start < match.b_end and candidate.b_start > match.b_start:
-                    # move the candidate inside the current match
-                    match.a_end = candidate.a_end
-                    match.b_end = candidate.b_end
-                    matches.remove(candidate)
-                    continue
+                # next we should find matches that are overlapping in A, if any
+                if candidate.a_start < match.a_end and candidate.a_start > match.a_start:
+                    # ignore matches pointing to somewhere else in B
+                    if candidate.b_start >= match.b_end or candidate.b_start <= match.b_start:
+                        continue
+                    # ignore matches in B that are completely inside ours
+                    if candidate.b_start > match.b_start and candidate.b_end < match.b_end:
+                        continue
+                    # if we overlap in both A and B, merge into our match
+                    if candidate.b_start < match.b_end and candidate.b_start > match.b_start:
+                        match.a_end = candidate.a_end
+                        match.b_end = candidate.b_end
+                        matches.remove(candidate)
+                        continue
                 # if we didn't find any overlapping, we're done
                 break
+        # some matches may still be completely subsumed by others
+        for i, match in enumerate(matches):
+            # lookahead to see if any matches fit inside this one
+            for candidate in matches[i+1:]:
+                # if so, remove
+                if candidate.a_start >= match.a_start and candidate.a_end <= match.a_end and candidate.b_start >= match.b_start and candidate.b_end <= match.b_end:
+                    matches.remove(candidate)
+        # return final list
         return matches
 
+    def matches_with_graphic_variation(self, matches: List[Match]) -> List[Match]:
+        """Keep only the matches whose spans in the two texts differ graphically."""
+        kept = []
+        for candidate in matches:
+            # each match decides for itself, given the two full texts
+            if candidate.has_graphic_variation(self.a, self.b):
+                kept.append(candidate)
+        return kept
+
     @staticmethod
     def group_matches(matches: List[Match]) -> Dict[range, List[range]]:
         """Groups a list of matches by position in a text, so that a single
@@ -163,11 +211,15 @@ def resolve_groups(self, matches: Dict[range, List[range]]) -> str:
         """Print grouped matches by mapping their locations to texts."""
         output = ''
         for a, bs in matches.items():
-            output += '%s (%s: %d)\n' % (
-                self.a[a.start:a.stop+1], self.a_name, self.a_linemap[a.start])
+            text = self.a[a.start:a.stop+1]
+            display = '%s (%s: %d)\n' % (text.replace('\n',' ⏎ '),
+                                         self.a_name, self.a_linemap[a.start])
+            output += display
             for b in bs:
-                output += '%s (%s: %d)\n' % (
-                    self.b[b.start:b.stop+1], self.b_name, self.b_linemap[b.start])
+                text = self.b[b.start:b.stop+1]
+                display = '%s (%s: %d)\n' % (text.replace('\n', ' ⏎ '),
+                                             self.b_name, self.b_linemap[b.start])
+                output += display
             output += '\n'
         return output