Skip to content

Commit

Permalink
Fixes excessive line wrapping. (#529)
Browse files Browse the repository at this point in the history
* Fixes excessive line wrapping.

* Adds changelog update.

* Reruns with current `black`.

* Adds lxml_html_clean dependency.

For context, see:

psf/requests-html#558 (comment)
psf/requests-html#569 (comment)

* [nan] Removes specialized selector.

It seems that Min Nan has been totally removed from English Wiktionary.

* Updates changelog.

* Adds new dependency to pyproject.toml too.
  • Loading branch information
kylebgorman authored Apr 9, 2024
1 parent 39666bc commit 3538190
Show file tree
Hide file tree
Showing 8 changed files with 16 additions and 103 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ Unreleased

### Under `data/`

- Fixes excessive line wrapping. (\#529)
- Big scrape for 2024. (\#514)

### Under `src/` and elsewhere

- Upgrades `black` for Dependabot. (\#530)
- Removes Min Nan (`nan`) custom selector. (\#529)

#### Added

Expand Down
5 changes: 3 additions & 2 deletions data/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
ipapy>=0.0.9.0
lxml_html_clean
prettytable>=2.0.0
pynini>=2.1.0
regex>=2019.12.9
requests
requests-html
wikipron>=1.0.0
pynini>=2.1.0
prettytable>=2.0.0
47 changes: 8 additions & 39 deletions data/scrape/lib/languages_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,11 @@ def _detect_best_script_name(
Example: "ژۇرنال" -> ("Arabic", 1.0).
"""
script_counts: DefaultDict[
str,
float,
] = collections.defaultdict(float)
script_counts: DefaultDict[str, float] = collections.defaultdict(float)
for char in word:
script_counts[unicodedataplus.script(char)] += 1.0
script_probs = [
(
s,
script_counts[s] / len(word),
)
for s in script_counts
]
script_probs.sort(
key=operator.itemgetter(1),
reverse=True,
)
script_probs = [(s, script_counts[s] / len(word)) for s in script_counts]
script_probs.sort(key=operator.itemgetter(1), reverse=True)
if strict and len(script_probs) != 1:
return None
else:
Expand All @@ -79,26 +67,14 @@ def _get_alias(


def _remove_mismatch_ids(
script_dict: Dict[
str,
Dict[
str,
str,
],
]
script_dict: Dict[str, Dict[str, str]]
) -> Dict[str, Dict[str, str]]:
"""Removes [key:value] pairs when the key does not
match the ISO 15924 code alias for script.
"""
remove = []
for (
key,
value,
) in script_dict["script"].items():
value = value.replace(
" ",
"_",
)
for key, value in script_dict["script"].items():
value = value.replace(" ", "_")
if _get_alias(value) != key:
remove.append(key)
for i in remove:
Expand All @@ -107,11 +83,7 @@ def _remove_mismatch_ids(


def main():
with open(
LANGUAGES_PATH,
"r",
encoding="utf-8",
) as source:
with open(LANGUAGES_PATH, "r", encoding="utf-8") as source:
languages = json.load(source)
for filename in os.listdir(TSV_DIRECTORY):
if filename.endswith(".tsv"):
Expand All @@ -126,10 +98,7 @@ def main():
) as source:
for line in source:
if line is not None:
word = line.split(
"\t",
1,
)[0]
word = line.split("\t", 1)[0]
script = _detect_best_script_name(word)
if script is not None:
if "script" not in lang:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ keywords = [
"Wiktionary",
]
dependencies = [
'lxml_html_clean',
'python-iso639 >= 2022.11.27',
'requests',
'requests-html',
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
black==24.3.0
build==0.9.0
flake8==7.0.0
python-iso639==2022.11.27
lxml_html_clean==0.1.1
mypy==1.1.1
pytest==7.2.0
python-iso639==2022.11.27
requests-html==0.10.0
requests==2.31.0
segments==2.2.1
Expand Down
2 changes: 0 additions & 2 deletions src/wikipron/extract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from wikipron.extract.khb import extract_word_pron_lu
from wikipron.extract.khm import extract_word_pron_khmer
from wikipron.extract.lat import extract_word_pron_latin
from wikipron.extract.nan import extract_word_pron_nan
from wikipron.extract.shn import extract_word_pron_shan
from wikipron.extract.tha import extract_word_pron_thai
from wikipron.extract.vie import extract_word_pron_vie
Expand All @@ -20,7 +19,6 @@
"Khmer": extract_word_pron_khmer,
"Latin": extract_word_pron_latin,
"Lü": extract_word_pron_lu,
"Min Nan": extract_word_pron_nan,
"Shan": extract_word_pron_shan,
"Tai Dam": extract_word_pron_blt,
"Thai": extract_word_pron_thai,
Expand Down
58 changes: 0 additions & 58 deletions src/wikipron/extract/nan.py

This file was deleted.

1 change: 0 additions & 1 deletion tests/test_wikipron/test_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
},
),
SmokeTestLanguage("yue", "Cantonese", {"skip_spaces_pron": False}),
SmokeTestLanguage("nan", "Min Nan", {"skip_spaces_pron": False}),
SmokeTestLanguage("blt", "Tai Dam", {"narrow": True}),
]

Expand Down

0 comments on commit 3538190

Please sign in to comment.