Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files: browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Oct 2, 2023
1 parent 6c7964f commit ea52f5a
Show file tree
Hide file tree
Showing 26 changed files with 31 additions and 44 deletions.
2 changes: 1 addition & 1 deletion ac_dc/anonymization.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def apply_regex_anonymization(
tag_type=tag_type,
)
if anonymize_condition:
for (ent, start, end, tag) in ner:
for ent, start, end, tag in ner:
# we need to actually walk through and replace by start, end span.
sentence = sentence.replace(ent, f" <{tag}> ")
return sentence, ner
5 changes: 1 addition & 4 deletions ac_dc/deduplicate/self_deduplicate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2022-01-08 22:39:29
# @Author : Chenghao Mou (mouchenghao@gmail.com)
# @Description: Self-deduplication with `datasets`
Expand Down Expand Up @@ -27,8 +26,7 @@


def main(conf: str) -> None:

with open(conf, "r") as f:
with open(conf) as f:
conf = yaml.safe_load(f.read())

if conf["load_from_disk"]["path"]:
Expand Down Expand Up @@ -201,5 +199,4 @@ def main(conf: str) -> None:


if __name__ == "__main__":

typer.run(main)
2 changes: 0 additions & 2 deletions ac_dc/visualization/get_data_for_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def __init__(
path_kenlm_model,
path_save_stats,
):

self.ds = dataset
self.num_iter = num_iter

Expand Down Expand Up @@ -166,7 +165,6 @@ def compute_stats(self):


if __name__ == "__main__":

lang_dataset_id = "en"

dataset_name = "oscar" # "TurkuNLP/register_oscar"
Expand Down
2 changes: 0 additions & 2 deletions ac_dc/visualization/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,6 @@ def filtering_of_words(self):
)

if display_discarded_words_by_filter:

if "len_word" in columns:
cond_filter = np.invert(conds_words["len_word"])
Visualization_for_lang.display_dataset(
Expand Down Expand Up @@ -698,7 +697,6 @@ def is_doc_discarded(key, score):
return score < key[1]

if personal_doc:

st.markdown("Statistics of the document:")

for key in self.keys:
Expand Down
1 change: 0 additions & 1 deletion bertin/evaluation/run_glue.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
1 change: 0 additions & 1 deletion bertin/evaluation/run_ner.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
2 changes: 1 addition & 1 deletion bertin/mc4/mc4.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def _generate_examples(self, filepaths):
for filepath in filepaths:
logger.info("generating examples from = %s", filepath)
if filepath.endswith("jsonl"):
with open(filepath, "r", encoding="utf-8") as f:
with open(filepath, encoding="utf-8") as f:
for line in f:
if line:
example = json.loads(line)
Expand Down
1 change: 0 additions & 1 deletion bertin/run_mlm_flax.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
3 changes: 1 addition & 2 deletions bertin/run_mlm_flax_stream.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))

with open(os.path.join(save_dir, "training_state.json"), "r") as f:
with open(os.path.join(save_dir, "training_state.json")) as f:
training_state = json.load(f)
step = training_state["step"]

Expand Down
2 changes: 1 addition & 1 deletion bertin/utils/dataset_perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_perplexity(doc):


with open("mc4-es-train-50M-stats.csv", "w") as csv:
with open("mc4-es-train-50M-steps.jsonl", "r") as data:
with open("mc4-es-train-50M-steps.jsonl") as data:
for line in tqdm(data):
text = json.loads(line)["text"]
csv.write(f"{len(text.split())},{get_perplexity(text)}\n")
4 changes: 2 additions & 2 deletions cc_pseudo_crawl/python_scripts/download_warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@ def get_warcs(batch):
existing_compressed_warcs,
)

batch["compressed_warc"], batch["download_exception"] = [
batch["compressed_warc"], batch["download_exception"] = (
list(l) for l in zip(*warcs_or_exceptions)
]
)
return batch


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,6 @@ def main(args: PreprocessingConfig) -> None: # Setup logging
]

def process_file(file_name: str):

logger.info(config.HF_DATASETS_CACHE)
processing_name = (
"-".join(args.metadata_to_include)
Expand Down
2 changes: 1 addition & 1 deletion cc_pseudo_crawl/python_scripts/load_all_seed_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def main():

seed_ids = []
for seed_path in args.seed_paths:
with open(seed_path, "r") as fi:
with open(seed_path) as fi:
data = csv.reader(fi)
# First line is all the headers that we remove.
seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]
Expand Down
1 change: 0 additions & 1 deletion kenlm_training/cc_net/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def get_executor(
task_parallelism: int = -1,
options: dict = {},
) -> Executor:

execution_mode = execution.split(",")[0]
options.update(
{kv.split("=", 1)[0]: kv.split("=", 1)[1] for kv in execution.split(",")[1:]}
Expand Down
7 changes: 3 additions & 4 deletions kenlm_training/cc_net/jsonql.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,8 +880,7 @@ def describe(source, columns=None, weights=None, **kwargs):
continue
if "." in k or k == ALL_DOCUMENTS:
continue
for line in display_stats(stats, k, weights=weights, **kwargs):
yield line
yield from display_stats(stats, k, weights=weights, **kwargs)


def shard(lines):
Expand Down Expand Up @@ -961,7 +960,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
if filename.suffix == ".gz":
file: TextIO = gzip.open(filename, "rt") # type: ignore
else:
file = open(filename, "rt")
file = open(filename)

return _close_when_exhausted(file)

Expand Down Expand Up @@ -1015,7 +1014,7 @@ def open_write(
if filename.suffix == ".gz":
return BlockedGzipWriter(Path(filename), mode, block_size="64M")

return open(filename, "wt")
return open(filename, "w")


def parse_size(size):
Expand Down
2 changes: 1 addition & 1 deletion kenlm_training/tests/test_jsonql.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def do(self, x):
def acc(values):
print("acc: started")
res = 0
for (x, _) in values:
for x, _ in values:
res += int(x)
print("acc: done")
yield f"acc: result={res}"
Expand Down
4 changes: 2 additions & 2 deletions pii-manager/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@

def requirements(filename="requirements.txt"):
"""Read the requirements file"""
with io.open(filename, "r") as f:
with open(filename) as f:
return [line.strip() for line in f if line and line[0] != "#"]


def long_description():
"""
Take the README and remove markdown hyperlinks
"""
with open("README.md", "rt", encoding="utf-8") as f:
with open("README.md", encoding="utf-8") as f:
desc = f.read()
desc = re.sub(r"^\[ ([^\]]+) \]: \s+ \S.*\n", r"", desc, flags=re.X | re.M)
return re.sub(r"\[ ([^\]]+) \]", r"\1", desc, flags=re.X)
Expand Down
9 changes: 3 additions & 6 deletions pii-manager/src/pii_manager/api/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,19 @@ def fetch_all_tasks(
"""
taskdict = get_taskdict(debug=debug)
# Language-independent
for task in taskdict[LANG_ANY].values():
yield task
yield from taskdict[LANG_ANY].values()

langdict = taskdict.get(lang, {})
# Country-independent
for task in langdict.get(COUNTRY_ANY, {}).values():
yield task
yield from langdict.get(COUNTRY_ANY, {}).values()
# Country-specific
if country:
if country[0] in (COUNTRY_ANY, "all"):
country = country_list(lang)
for c in country:
if c == COUNTRY_ANY: # already included above
continue
for task in langdict.get(c, {}).values():
yield task
yield from langdict.get(c, {}).values()


def fetch_task(
Expand Down
2 changes: 1 addition & 1 deletion pii-manager/test/unit/api/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def datafile(name: str) -> str:


def readfile(name: str) -> str:
with open(name, "rt", encoding="utf-8") as f:
with open(name, encoding="utf-8") as f:
return f.read().strip()


Expand Down
2 changes: 1 addition & 1 deletion pii-manager/test/unit/api/test_file_taskfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def datafile(name: str) -> str:


def readfile(name: str) -> str:
with open(name, "rt", encoding="utf-8") as f:
with open(name, encoding="utf-8") as f:
return f.read().strip()


Expand Down
5 changes: 4 additions & 1 deletion pii-manager/test/unit/api/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ def test20_info():
info = obj.task_info()

exp = {
(PiiEnum.CREDIT_CARD, None,): [
(
PiiEnum.CREDIT_CARD,
None,
): [
(
"credit card",
"Credit card numbers for most international credit cards (detect & validate)",
Expand Down
4 changes: 2 additions & 2 deletions pii-manager/test/unit/api/test_manager_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test110_call():
obj = PiiManager("en", None, PiiEnum.EMAIL_ADDRESS)
obj.add_tasks([DUMMY_REGEX])

for (doc, exp) in TEST_REGEX:
for doc, exp in TEST_REGEX:
got = obj(doc)
assert got == exp

Expand Down Expand Up @@ -86,6 +86,6 @@ def test200_call():
obj = PiiManager("en")
obj.add_tasks([DUMMY_CLASS])

for (doc, exp) in TEST_CLASS:
for doc, exp in TEST_CLASS:
got = obj(doc)
assert got == exp
4 changes: 2 additions & 2 deletions pii-manager/test/unit/api/test_manager_ctx.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test10_context_regex():
"""
obj = PiiManager("en", mode="extract")
obj.add_tasks([DUMMY_REGEX])
for (text, exp) in TEST:
for text, exp in TEST:
got = obj(text)
assert list(got) == exp

Expand All @@ -64,6 +64,6 @@ def test20_context_class():
"""
obj = PiiManager("en", mode="extract")
obj.add_tasks([DUMMY_CLASS])
for (text, exp) in TEST:
for text, exp in TEST:
got = obj(text)
assert list(got) == exp
4 changes: 2 additions & 2 deletions pii-manager/test/unit/helper/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test10_context_true():
"""
Check valid contexts
"""
for (text, context) in TEST_TRUE:
for text, context in TEST_TRUE:
spec = mod.context_spec(context)
assert mod.context_check(text, spec, 20) is True

Expand All @@ -83,7 +83,7 @@ def test20_context_false():
"""
Check invalid contexts
"""
for (text, context) in TEST_FALSE:
for text, context in TEST_FALSE:
spec = mod.context_spec(context)
assert mod.context_check(text, spec, 20) is False

Expand Down
2 changes: 1 addition & 1 deletion pii-manager/test/unit/helper/test_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ def test10_normalizer():
"""
Create base object
"""
for (text, exp) in TEST:
for text, exp in TEST:
assert mod.normalize(text, "en", whitespace=True, lowercase=True) == exp
1 change: 1 addition & 0 deletions tokenizer/python_script/dedup_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

META_COLUMNS = ["meta"]


# filter text to remove certain lines (e.g. menu items, copyright notice)
def filter_lines(article, skip_set, used_lines):
# TODO discuss the strip
Expand Down

0 comments on commit ea52f5a

Please sign in to comment.