Skip to content

Commit

Permalink
Merge pull request #9 from essteer/tests
Browse files Browse the repository at this point in the history
Unit tests
  • Loading branch information
essteer authored Jun 15, 2024
2 parents e9a8cf0 + d129d29 commit 89fd13d
Show file tree
Hide file tree
Showing 14 changed files with 424 additions and 49 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: test

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
test:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ['3.10', 3.11, 3.12]

runs-on: ${{ matrix.os }}

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Run tests
run: |
python -m unittest discover
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
<h1 align="center">Xiwen 析文</h1>

<p align="center">
<a href="https://github.com/essteer/xiwen"><img src="https://img.shields.io/badge/Python-3.9_|_3.10_|_3.11_|_3.12-3776AB.svg?style=flat&logo=Python&logoColor=white"></a>
<a href="https://github.com/essteer/xiwen/actions/workflows/test.yaml"><img src="https://github.com/essteer/xiwen/actions/workflows/test.yaml/badge.svg"></a>
<a href="https://github.com/essteer/xiwen"><img src="https://img.shields.io/badge/Python-3.10_|_3.11_|_3.12-3776AB.svg?style=flat&logo=Python&logoColor=white"></a>
<a href="https://github.com/astral-sh/ruff"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json"></a>
<a href="https://snyk.io/test/github/essteer/xiwen"><img src="https://snyk.io/test/github/essteer/xiwen/badge.svg?name=Snyk&style=flat&logo=Snyk"></a>
</p>
Expand Down Expand Up @@ -69,7 +70,7 @@ The table below lists the number of simplified hanzi per grade, and the number o

[![](https://img.shields.io/badge/GitHub-xiwen-181717.svg?flat&logo=GitHub&logoColor=white)](https://github.com/essteer/xiwen)

Clone the `xiwen` repo from GitHub for the full source code. The repo includes the CSV and text files used to generate the character lists and a test suite.
Clone the `xiwen` repo from GitHub for the full source code, the files used to generate the character lists, and a test suite.

```console
$ git clone git@github.com:essteer/xiwen
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ dependencies = [
"masquer>=1.1.1",
"polars==0.20.31",
"requests>=2.32.3",
"tqdm==4.66.2",
]
requires-python = ">=3.9"
license = { file = "LICENSE" }
Expand All @@ -21,7 +20,6 @@ classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
Expand All @@ -31,6 +29,7 @@ classifiers = [
dev = [
"pre-commit==3.7.0",
"ruff>=0.4.5",
"tqdm==4.66.2",
]

[project.urls]
Expand Down
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,5 @@ requests==2.32.3
# via xiwen (pyproject.toml)
soupsieve==2.5
# via beautifulsoup4
tqdm==4.66.3
# via xiwen (pyproject.toml)
urllib3==2.2.1
# via requests
10 changes: 5 additions & 5 deletions src/resources/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import pandas as pd
import polars as pl
from tqdm import tqdm
from xiwen.utils.config import (
ASSETS_DIR,
Expand All @@ -24,7 +24,7 @@
##########################################################################

# Read hsk30-chars-ext.csv
df = pd.read_csv(HSK_PATH)
df = pl.read_csv(HSK_PATH)
# Extract character columns and HSK grades
df = df[["Hanzi", "Traditional", "Level"]]
# Rename columns
Expand All @@ -33,8 +33,8 @@
)
# Get pinyin based on traditional characters
trad_hanzi = df["Traditional"].tolist()
# pinyin_df = pd.DataFrame("Pinyin": get_pinyin(trad_hanzi, pinyin_map))
df["Pinyin"] = pd.DataFrame({"Pinyin": get_pinyin(trad_hanzi, pinyin_map)})
# pinyin_df = pl.DataFrame("Pinyin": get_pinyin(trad_hanzi, pinyin_map))
df["Pinyin"] = pl.DataFrame({"Pinyin": get_pinyin(trad_hanzi, pinyin_map)})

##########################################################################
# Add unicode for simplified and traditional hanzi
Expand Down Expand Up @@ -67,7 +67,7 @@

# DataFrame of Jun Da character frequencies
cols = ["Simplified", "JD Rank", "JD Frequency", "JD Percentile"]
junda_df = pd.DataFrame(junda_freqs, columns=cols)
junda_df = pl.DataFrame(junda_freqs, columns=cols)

##########################################################################
# Map frequencies to HSK set
Expand Down
10 changes: 7 additions & 3 deletions src/xiwen/utils/analysis.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import polars as pl
import sys
from .config import HSK_GRADES, STATS_COLUMNS
from .counters import cumulative_counts, get_counts, granular_counts

Expand All @@ -20,6 +21,8 @@ def identify_variant(hsk_simp: list, hsk_trad: list) -> str:
str
text character variant
"""
# Use epsilon to mitigate float rounding errors
epsilon = sys.float_info.epsilon
# Threshold beyond which to decide that text belongs to one variant
threshold = 0.90
simp_set = set(hsk_simp) - set(hsk_trad)
Expand All @@ -29,9 +32,11 @@ def identify_variant(hsk_simp: list, hsk_trad: list) -> str:
return "Unknown"

ratio = len(simp_set) / (len(simp_set) + len(trad_set))
if ratio >= threshold:
if ratio >= threshold - epsilon:
return "Simplified"
return "Traditional"
elif ratio <= 1 - threshold + epsilon:
return "Traditional"
return "Unknown"


def compute_stats(
Expand Down Expand Up @@ -143,7 +148,6 @@ def analyse(
"Unknown": trad_list,
}
# Get counts of each hanzi
# hanzi_df = get_counts(variants[variant], variant, HSK_HANZI)
hanzi_df = get_counts(variants[variant], variant)
# Get counts of hanzi by grade
grade_counts = granular_counts(hanzi_df, hanzi_list, variant)
Expand Down
11 changes: 7 additions & 4 deletions src/xiwen/utils/counters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import polars as pl
from .config import HSK_GRADES
from .hanzi import get_hanzi_processor_instance
from .hanzi import get_HSKHanzi_instance


def unit_counts(hanzi: list) -> dict:
Expand Down Expand Up @@ -100,15 +100,18 @@ def get_counts(hanzi_subset: list, variant: str) -> pl.DataFrame:
merged_df : pl.DataFrame
DataFrame of HSK_HANZI with counts applied
"""
# Get DataFrame of full HSK character lists
hsk_hanzi = get_HSKHanzi_instance().HSK_HANZI
# Count occurrences of each character in hanzi_subset
counts = unit_counts(hanzi_subset)

# Merge on variant column
if variant == "Unknown":
variant = "Traditional"
# Create DataFrame from counts dictionary
counts_df = pl.DataFrame(
list(counts.items()), schema={variant: pl.String, "Count": pl.Int32}
)
# Get DataFrame of full HSK character liss
hsk_hanzi = get_hanzi_processor_instance().HSK_HANZI
# Merge on variant column
merged_df = hsk_hanzi.join(counts_df, on=variant, coalesce=True, how="left")
# Fill null values and convert counts to integers
merged_df = merged_df.fill_null(0).with_columns(pl.col("Count").cast(pl.Int32))
Expand Down
10 changes: 5 additions & 5 deletions src/xiwen/utils/hanzi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .config import ASSETS_DIR, HSK30_HANZI_SCHEMA


class HanziProcessor:
class HSKHanzi:
"""
Loads and retains HSK character lists
Singleton pattern -> only one instance exists
Expand All @@ -25,7 +25,7 @@ class HanziProcessor:

def __new__(cls):
if cls._instance is None:
cls._instance = super(HanziProcessor, cls).__new__(cls)
cls._instance = super(HSKHanzi, cls).__new__(cls)
cls._instance._initialize()
return cls._instance

Expand All @@ -38,8 +38,8 @@ def _initialize(self):
self.HSK_TRAD = self.HSK_HANZI.select("Traditional").to_series().to_list()


def get_hanzi_processor_instance():
def get_HSKHanzi_instance():
"""
Gets and returns the HanziProcessor class
Gets and returns the singleton HSKHanzi instance
"""
return HanziProcessor()
return HSKHanzi()
6 changes: 3 additions & 3 deletions src/xiwen/utils/transform.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .hanzi import get_hanzi_processor_instance
from .hanzi import get_HSKHanzi_instance


def partition_hanzi(hanzi_list: list) -> tuple[list]:
Expand All @@ -23,8 +23,8 @@ def partition_hanzi(hanzi_list: list) -> tuple[list]:
outliers : list
characters not in above lists
"""
hsk_simp = get_hanzi_processor_instance().HSK_SIMP
hsk_trad = get_hanzi_processor_instance().HSK_TRAD
hsk_simp = get_HSKHanzi_instance().HSK_SIMP
hsk_trad = get_HSKHanzi_instance().HSK_TRAD

simp = [zi for zi in hanzi_list if zi in hsk_simp]
trad = [zi for zi in hanzi_list if zi in hsk_trad]
Expand Down
10 changes: 7 additions & 3 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
import pandas as pd
import polars as pl
import unittest
from src.xiwen.utils.analysis import identify_variant
from src.xiwen.utils.config import ENCODING
Expand All @@ -9,9 +9,9 @@

TEST_ASSETS = os.path.abspath(os.path.join("tests", "assets"))
# Combine script directory with relative path to the file
filepath = os.path.join("src", "xiwen", "assets", "hsk30_hanzi.csv")
filepath = os.path.join("src", "xiwen", "assets", "hsk30_hanzi.parquet")
# Load HSK Hanzi database (unigrams only)
HSK_HANZI = pd.read_csv(filepath)
HSK_HANZI = pl.read_parquet(filepath)
HSK_SIMP = list(HSK_HANZI["Simplified"])
HSK_TRAD = list(HSK_HANZI["Traditional"])

Expand Down Expand Up @@ -354,3 +354,7 @@ def test_known_figures(self):
simp, trad, outliers = partition_hanzi(hanzi)
# Check identified character variant
self.assertEqual(identify_variant(simp, trad), TEST_CASES[test_case][0])


if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit 89fd13d

Please sign in to comment.