Merge pull request #9 from essteer/tests

Unit tests
essteer · Jun 15, 2024 · 89fd13d · 89fd13d
2 parents e9a8cf0 + d129d29
commit 89fd13d
Show file tree

Hide file tree

Showing 14 changed files with 424 additions and 49 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -0,0 +1,35 @@
+name: test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ['3.10', 3.11, 3.12]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Run tests
+        run: |
+          python -m unittest discover
diff --git a/README.md b/README.md
@@ -1,7 +1,8 @@
 <h1 align="center">Xiwen 析文</h1>
 
 <p align="center">
-  <a href="https://github.com/essteer/xiwen"><img src="https://img.shields.io/badge/Python-3.9_|_3.10_|_3.11_|_3.12-3776AB.svg?style=flat&logo=Python&logoColor=white"></a>
+  <a href="https://github.com/essteer/xiwen/actions/workflows/test.yaml"><img src="https://github.com/essteer/xiwen/actions/workflows/test.yaml/badge.svg"></a>
+  <a href="https://github.com/essteer/xiwen"><img src="https://img.shields.io/badge/Python-3.10_|_3.11_|_3.12-3776AB.svg?style=flat&logo=Python&logoColor=white"></a>
   <a href="https://github.com/astral-sh/ruff"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json"></a>
   <a href="https://snyk.io/test/github/essteer/xiwen"><img src="https://snyk.io/test/github/essteer/xiwen/badge.svg?name=Snyk&style=flat&logo=Snyk"></a>
 </p>
@@ -69,7 +70,7 @@ The table below lists the number of simplified hanzi per grade, and the number o
 
 [![](https://img.shields.io/badge/GitHub-xiwen-181717.svg?flat&logo=GitHub&logoColor=white)](https://github.com/essteer/xiwen)
 
-Clone the `xiwen` repo from GitHub for the full source code. The repo includes the CSV and text files used to generate the character lists and a test suite.
+Clone the `xiwen` repo from GitHub for the full source code, the files used to generate the character lists, and a test suite.
 
 ```console
 $ git clone git@github.com:essteer/xiwen

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
     "masquer>=1.1.1",
     "polars==0.20.31",
     "requests>=2.32.3",
-    "tqdm==4.66.2",
 ]
 requires-python = ">=3.9"
 license = { file = "LICENSE" }
@@ -21,7 +20,6 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -31,6 +29,7 @@ classifiers = [
 dev = [
     "pre-commit==3.7.0",
     "ruff>=0.4.5",
+    "tqdm==4.66.2",
 ]
 
 [project.urls]

diff --git a/requirements.txt b/requirements.txt
@@ -16,7 +16,5 @@ requests==2.32.3
     # via xiwen (pyproject.toml)
 soupsieve==2.5
     # via beautifulsoup4
-tqdm==4.66.3
-    # via xiwen (pyproject.toml)
 urllib3==2.2.1
     # via requests
diff --git a/src/resources/main.py b/src/resources/main.py
@@ -1,4 +1,4 @@
-import pandas as pd
+import polars as pl
 from tqdm import tqdm
 from xiwen.utils.config import (
     ASSETS_DIR,
@@ -24,7 +24,7 @@
 ##########################################################################
 
 # Read hsk30-chars-ext.csv
-df = pd.read_csv(HSK_PATH)
+df = pl.read_csv(HSK_PATH)
 # Extract character columns and HSK grades
 df = df[["Hanzi", "Traditional", "Level"]]
 # Rename columns
@@ -33,8 +33,8 @@
 )
 # Get pinyin based on traditional characters
 trad_hanzi = df["Traditional"].tolist()
-# pinyin_df = pd.DataFrame("Pinyin": get_pinyin(trad_hanzi, pinyin_map))
-df["Pinyin"] = pd.DataFrame({"Pinyin": get_pinyin(trad_hanzi, pinyin_map)})
+# pinyin_df = pl.DataFrame("Pinyin": get_pinyin(trad_hanzi, pinyin_map))
+df["Pinyin"] = pl.DataFrame({"Pinyin": get_pinyin(trad_hanzi, pinyin_map)})
 
 ##########################################################################
 # Add unicode for simplified and traditional hanzi
@@ -67,7 +67,7 @@
 
 # DataFrame of Jun Da character frequencies
 cols = ["Simplified", "JD Rank", "JD Frequency", "JD Percentile"]
-junda_df = pd.DataFrame(junda_freqs, columns=cols)
+junda_df = pl.DataFrame(junda_freqs, columns=cols)
 
 ##########################################################################
 # Map frequencies to HSK set

diff --git a/src/xiwen/utils/analysis.py b/src/xiwen/utils/analysis.py
@@ -1,4 +1,5 @@
 import polars as pl
+import sys
 from .config import HSK_GRADES, STATS_COLUMNS
 from .counters import cumulative_counts, get_counts, granular_counts
 
@@ -20,6 +21,8 @@ def identify_variant(hsk_simp: list, hsk_trad: list) -> str:
     str
         text character variant
     """
+    # Use epsilon to mitigate float rounding errors
+    epsilon = sys.float_info.epsilon
     # Threshold beyond which to decide that text belongs to one variant
     threshold = 0.90
     simp_set = set(hsk_simp) - set(hsk_trad)
@@ -29,9 +32,11 @@ def identify_variant(hsk_simp: list, hsk_trad: list) -> str:
         return "Unknown"
 
     ratio = len(simp_set) / (len(simp_set) + len(trad_set))
-    if ratio >= threshold:
+    if ratio >= threshold - epsilon:
         return "Simplified"
-    return "Traditional"
+    elif ratio <= 1 - threshold + epsilon:
+        return "Traditional"
+    return "Unknown"
 
 
 def compute_stats(
@@ -143,7 +148,6 @@ def analyse(
         "Unknown": trad_list,
     }
     # Get counts of each hanzi
-    # hanzi_df = get_counts(variants[variant], variant, HSK_HANZI)
     hanzi_df = get_counts(variants[variant], variant)
     # Get counts of hanzi by grade
     grade_counts = granular_counts(hanzi_df, hanzi_list, variant)

diff --git a/src/xiwen/utils/counters.py b/src/xiwen/utils/counters.py
@@ -1,6 +1,6 @@
 import polars as pl
 from .config import HSK_GRADES
-from .hanzi import get_hanzi_processor_instance
+from .hanzi import get_HSKHanzi_instance
 
 
 def unit_counts(hanzi: list) -> dict:
@@ -100,15 +100,18 @@ def get_counts(hanzi_subset: list, variant: str) -> pl.DataFrame:
     merged_df : pl.DataFrame
         DataFrame of HSK_HANZI with counts applied
     """
+    # Get DataFrame of full HSK character lists
+    hsk_hanzi = get_HSKHanzi_instance().HSK_HANZI
     # Count occurrences of each character in hanzi_subset
     counts = unit_counts(hanzi_subset)
+
+    # Merge on variant column
+    if variant == "Unknown":
+        variant = "Traditional"
     # Create DataFrame from counts dictionary
     counts_df = pl.DataFrame(
         list(counts.items()), schema={variant: pl.String, "Count": pl.Int32}
     )
-    # Get DataFrame of full HSK character liss
-    hsk_hanzi = get_hanzi_processor_instance().HSK_HANZI
-    # Merge on variant column
     merged_df = hsk_hanzi.join(counts_df, on=variant, coalesce=True, how="left")
     # Fill null values and convert counts to integers
     merged_df = merged_df.fill_null(0).with_columns(pl.col("Count").cast(pl.Int32))

diff --git a/src/xiwen/utils/hanzi.py b/src/xiwen/utils/hanzi.py
@@ -3,7 +3,7 @@
 from .config import ASSETS_DIR, HSK30_HANZI_SCHEMA
 
 
-class HanziProcessor:
+class HSKHanzi:
     """
     Loads and retains HSK character lists
     Singleton pattern -> only one instance exists
@@ -25,7 +25,7 @@ class HanziProcessor:
 
     def __new__(cls):
         if cls._instance is None:
-            cls._instance = super(HanziProcessor, cls).__new__(cls)
+            cls._instance = super(HSKHanzi, cls).__new__(cls)
             cls._instance._initialize()
         return cls._instance
 
@@ -38,8 +38,8 @@ def _initialize(self):
         self.HSK_TRAD = self.HSK_HANZI.select("Traditional").to_series().to_list()
 
 
-def get_hanzi_processor_instance():
+def get_HSKHanzi_instance():
     """
-    Gets and returns the HanziProcessor class
+    Gets and returns the singleton HSKHanzi instance
     """
-    return HanziProcessor()
+    return HSKHanzi()
diff --git a/src/xiwen/utils/transform.py b/src/xiwen/utils/transform.py
@@ -1,4 +1,4 @@
-from .hanzi import get_hanzi_processor_instance
+from .hanzi import get_HSKHanzi_instance
 
 
 def partition_hanzi(hanzi_list: list) -> tuple[list]:
@@ -23,8 +23,8 @@ def partition_hanzi(hanzi_list: list) -> tuple[list]:
     outliers : list
         characters not in above lists
     """
-    hsk_simp = get_hanzi_processor_instance().HSK_SIMP
-    hsk_trad = get_hanzi_processor_instance().HSK_TRAD
+    hsk_simp = get_HSKHanzi_instance().HSK_SIMP
+    hsk_trad = get_HSKHanzi_instance().HSK_TRAD
 
     simp = [zi for zi in hanzi_list if zi in hsk_simp]
     trad = [zi for zi in hanzi_list if zi in hsk_trad]

diff --git a/tests/test_analysis.py b/tests/test_analysis.py
@@ -1,5 +1,5 @@
 import os
-import pandas as pd
+import polars as pl
 import unittest
 from src.xiwen.utils.analysis import identify_variant
 from src.xiwen.utils.config import ENCODING
@@ -9,9 +9,9 @@
 
 TEST_ASSETS = os.path.abspath(os.path.join("tests", "assets"))
 # Combine script directory with relative path to the file
-filepath = os.path.join("src", "xiwen", "assets", "hsk30_hanzi.csv")
+filepath = os.path.join("src", "xiwen", "assets", "hsk30_hanzi.parquet")
 # Load HSK Hanzi database (unigrams only)
-HSK_HANZI = pd.read_csv(filepath)
+HSK_HANZI = pl.read_parquet(filepath)
 HSK_SIMP = list(HSK_HANZI["Simplified"])
 HSK_TRAD = list(HSK_HANZI["Traditional"])
 
@@ -354,3 +354,7 @@ def test_known_figures(self):
             simp, trad, outliers = partition_hanzi(hanzi)
             # Check identified character variant
             self.assertEqual(identify_variant(simp, trad), TEST_CASES[test_case][0])
+
+
+if __name__ == "__main__":
+    unittest.main()