Merge pull request #487 from moj-analytical-services/doc11

More docs
moj-analytical-services · May 26, 2022 · 77b2396 · 77b2396
2 parents 0d765ff + 582bc99
commit 77b2396
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 15 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "splink"
-version = "3.0.0.dev12"
-description = "Implementation of Fellegi-Sunter's canonical model of record linkage in Apache Spark, including EM algorithm to estimate parameters"
+version = "3.0.0.dev13"
+description = "Fast probabilistic data linkage at scale"
 authors = ["Robin Linacre <robinlinacre@hotmail.com>", "Sam Lindsay", "Theodore Manassis"]
 license = "MIT"
 homepage = "https://github.com/moj-analytical-services/splink"

diff --git a/splink/comparison.py b/splink/comparison.py
@@ -10,14 +10,20 @@
 
 
 class Comparison:
-    """Each comparison defines how one type of data in the input record is compared to
-    assess its similarity.  For example, one comparison may represent the comparison
-    of a person's date or birth.  Another may represent the comparison of a person's
-    name, or location.
+    """Each comparison defines how data from a subset of input columns is
+    compared to assess its similarity.
+
+    For example, one comparison may represent how similarity is assessed for a
+    person's date of birth.  Others may represent the comparison of a person's name or
+    location.
+
+    The method used to assess similarity will depend on the type of data -
+    for instance, the method used to assess similarity of a company's turnover would
+    be different to the method used to assess the similarity of a person's first name.
 
     A linking model thus usually contains several Comparisons.
 
-    Each Comparison contains two or more ComparisonLevels which assess gradations of
+    Each Comparison contains two or more ComparisonLevels which define the gradations of
     similarity between the input columns.  For example, for the date of birth Comparison
     there may be a ComparisonLevel for an exact match, another for a one-character
     difference, and another for all other comparisons.

diff --git a/splink/comparison_level.py b/splink/comparison_level.py
@@ -85,14 +85,32 @@ def _default_u_values(num_levels):
 
 
 class ComparisonLevel:
-    """Defines a way in which the similarity between one or more columns of a pairwise
-    comparisons is assessed.
-
-    For example, the Comparison for first_name comparison may have an exact match level,
-    a levenstein similarity level and an 'all other comparisons' level
-
-    A Comparison will have several ComparisonLevels. A Splink model will have several
-    Comparisons (e.g. name, dob, postcode, etc.)"""
+    """ComparisonLevels define the gradations of similarity within a Comparison.
+
+    For example, a Comparison of the first_name and surname columns may define three
+    ComparisonLevels:
+        An exact match on first name and surname
+        First name and surname have a JaroWinkler score above 0.95
+        All other comparisons
+
+    The method used to assess similarity will depend on the type of data -
+    for instance, the method used to assess similarity of a company's turnover would
+    be different to the method used to assess the similarity of a person's first name.
+
+    To summarise:
+
+    Data Linking Model
+    ├── Comparison: Name
+    │    ├── ComparisonLevel: Exact match on first_name and surname
+    │    ├── ComparisonLevel: first_name and surname have JaroWinkler > 0.95
+    │    └── ComparisonLevel: All other
+    ├── Comparison: Date of birth
+    │    ├── ComparisonLevel: Exact match
+    │    ├── ComparisonLevel: One character difference
+    │    └── ComparisonLevel: All other
+    └── etc.
+
+    """
 
     def __init__(
         self,