Fixes excessive line wrapping. (#529)

* Fixes excessive line wrapping.
* Adds changelog update.
* Reruns with current `black`.
* Adds lxml_html_clean dependency. For context, see: psf/requests-html#558 (comment) and psf/requests-html#569 (comment).
* [nan] Removes specialized selector. It seems that Min Nan has been totally removed from English Wiktionary.
* Updates changelog.
* Adds new dependency to pyproject.toml too.
CUNY-CL · Apr 9, 2024 · 3538190 · 3538190
1 parent 39666bc
commit 3538190
Show file tree

Hide file tree

Showing 8 changed files with 16 additions and 103 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,11 +12,13 @@ Unreleased
 
 ### Under `data/`
 
+-   Fixes excessive line wrapping. (\#529)
 -   Big scrape for 2024. (\#514)
 
 ### Under `src/` and elsewhere
 
 -   Upgrades `black` for Dependabot. (\#530)
+-   Removes Min Nan (`nan`) custom selector. (\#529)
 
 #### Added
 

diff --git a/data/requirements.txt b/data/requirements.txt
@@ -1,7 +1,8 @@
 ipapy>=0.0.9.0
+lxml_html_clean
+prettytable>=2.0.0
+pynini>=2.1.0
 regex>=2019.12.9
 requests
 requests-html
 wikipron>=1.0.0
-pynini>=2.1.0
-prettytable>=2.0.0
diff --git a/data/scrape/lib/languages_update.py b/data/scrape/lib/languages_update.py
@@ -37,23 +37,11 @@ def _detect_best_script_name(
 
     Example: "ژۇرنال" -> ("Arabic", 1.0).
     """
-    script_counts: DefaultDict[
-        str,
-        float,
-    ] = collections.defaultdict(float)
+    script_counts: DefaultDict[str, float] = collections.defaultdict(float)
     for char in word:
         script_counts[unicodedataplus.script(char)] += 1.0
-    script_probs = [
-        (
-            s,
-            script_counts[s] / len(word),
-        )
-        for s in script_counts
-    ]
-    script_probs.sort(
-        key=operator.itemgetter(1),
-        reverse=True,
-    )
+    script_probs = [(s, script_counts[s] / len(word)) for s in script_counts]
+    script_probs.sort(key=operator.itemgetter(1), reverse=True)
     if strict and len(script_probs) != 1:
         return None
     else:
@@ -79,26 +67,14 @@ def _get_alias(
 
 
 def _remove_mismatch_ids(
-    script_dict: Dict[
-        str,
-        Dict[
-            str,
-            str,
-        ],
-    ]
+    script_dict: Dict[str, Dict[str, str]]
 ) -> Dict[str, Dict[str, str]]:
     """Removes [key:value] pairs when the key does not
     match the ISO 15924 code alias for script.
     """
     remove = []
-    for (
-        key,
-        value,
-    ) in script_dict["script"].items():
-        value = value.replace(
-            " ",
-            "_",
-        )
+    for key, value in script_dict["script"].items():
+        value = value.replace(" ", "_")
         if _get_alias(value) != key:
             remove.append(key)
     for i in remove:
@@ -107,11 +83,7 @@ def _remove_mismatch_ids(
 
 
 def main():
-    with open(
-        LANGUAGES_PATH,
-        "r",
-        encoding="utf-8",
-    ) as source:
+    with open(LANGUAGES_PATH, "r", encoding="utf-8") as source:
         languages = json.load(source)
     for filename in os.listdir(TSV_DIRECTORY):
         if filename.endswith(".tsv"):
@@ -126,10 +98,7 @@ def main():
             ) as source:
                 for line in source:
                     if line is not None:
-                        word = line.split(
-                            "\t",
-                            1,
-                        )[0]
+                        word = line.split("\t", 1)[0]
                         script = _detect_best_script_name(word)
                         if script is not None:
                             if "script" not in lang:

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ keywords = [
     "Wiktionary",
 ]
 dependencies = [
+    'lxml_html_clean',
     'python-iso639 >= 2022.11.27',
     'requests',
     'requests-html',

diff --git a/requirements.txt b/requirements.txt
@@ -1,9 +1,10 @@
 black==24.3.0
 build==0.9.0
 flake8==7.0.0
-python-iso639==2022.11.27
+lxml_html_clean==0.1.1
 mypy==1.1.1
 pytest==7.2.0
+python-iso639==2022.11.27
 requests-html==0.10.0
 requests==2.31.0
 segments==2.2.1

diff --git a/src/wikipron/extract/__init__.py b/src/wikipron/extract/__init__.py
@@ -4,7 +4,6 @@
 from wikipron.extract.khb import extract_word_pron_lu
 from wikipron.extract.khm import extract_word_pron_khmer
 from wikipron.extract.lat import extract_word_pron_latin
-from wikipron.extract.nan import extract_word_pron_nan
 from wikipron.extract.shn import extract_word_pron_shan
 from wikipron.extract.tha import extract_word_pron_thai
 from wikipron.extract.vie import extract_word_pron_vie
@@ -20,7 +19,6 @@
     "Khmer": extract_word_pron_khmer,
     "Latin": extract_word_pron_latin,
     "Lü": extract_word_pron_lu,
-    "Min Nan": extract_word_pron_nan,
     "Shan": extract_word_pron_shan,
     "Tai Dam": extract_word_pron_blt,
     "Thai": extract_word_pron_thai,

diff --git a/src/wikipron/extract/nan.py b/src/wikipron/extract/nan.py
diff --git a/tests/test_wikipron/test_scrape.py b/tests/test_wikipron/test_scrape.py
@@ -52,7 +52,6 @@
         },
     ),
     SmokeTestLanguage("yue", "Cantonese", {"skip_spaces_pron": False}),
-    SmokeTestLanguage("nan", "Min Nan", {"skip_spaces_pron": False}),
     SmokeTestLanguage("blt", "Tai Dam", {"narrow": True}),
 ]
@@ -52,7 +52,6 @@
             },
         ),
         SmokeTestLanguage("yue", "Cantonese", {"skip_spaces_pron": False}),
-        SmokeTestLanguage("nan", "Min Nan", {"skip_spaces_pron": False}),
         SmokeTestLanguage("blt", "Tai Dam", {"narrow": True}),
     ]
@@ Expand Down @@