forked from josarago/kaggle--feedback-prize-ell
-
Notifications
You must be signed in to change notification settings - Fork 0
/
english_utils.py
85 lines (60 loc) · 2.36 KB
/
english_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import re
import numpy as np
import pandas as pd
# Characters treated as punctuation by the counting helpers in this module.
PUNCTUATION_CHARACTERS = "'!(),-./:;?" + '"'


def count_punctuation_characters(text,
                                 characters=PUNCTUATION_CHARACTERS):
    """
    Return the total number of occurrences in `text` of every character
    listed in `characters`.

    Each entry of `characters` is counted independently with `str.count`,
    and the per-character counts are added up with `np.sum`.
    """
    occurrences = list(map(text.count, characters))
    return np.sum(occurrences)
def count_missing_trailing_whitespaces(text):
    """
    Count places where one of `.,!?` is directly followed by a character that
    is neither whitespace nor a single or double quotation mark.

    Punctuation that is the very last character of `text` is never counted,
    because the pattern needs a following character in order to match.
    """
    offenders = re.finditer(r"[.,!?](?!\s)[^\"\n\r\']", text)
    return sum(1 for _ in offenders)
def count_extra_leading_whitespaces(text):
    """
    Count occurrences of a whitespace character placed directly before one of
    `.,!?:;`.
    """
    pattern = re.compile(r"\s[.,!?\:;]")
    matches = pattern.findall(text)
    return len(matches)
def count_missing_leading_whitespaces(text):
    """
    Count opening parentheses that are glued to the preceding character,
    i.e. a `(` whose previous character exists and is not whitespace.

    A `(` that is the first character of `text` is not counted: there is no
    preceding character, so no whitespace can be missing.
    """
    # `(?<=\S)` requires an actual non-whitespace character before the
    # parenthesis; a negative lookbehind on `\s` would also succeed at the
    # start of the string and flag a leading parenthesis as an error.
    return len(re.findall(r"(?<=\S)\(", text))
def count_missing_paired_characters(text,
                                    paired_chars=("()", '""', "''")):
    """
    Sum, over every pair in `paired_chars`, the absolute difference between
    the number of opening characters and the number of closing characters
    found in `text`.

    Only the totals are compared; the order in which the characters appear is
    not checked.

    NOTE: a pair whose opening and closing characters are identical (such as
    the default quote pairs) always contributes 0, because the same count is
    subtracted from itself.
    """
    imbalances = []
    for pair in paired_chars:
        opening_count = text.count(pair[0])
        closing_count = text.count(pair[1])
        imbalances.append(np.abs(opening_count - closing_count))
    return np.sum(imbalances)
def squeeze_pattern(text,
                    pattern="\n"):
    """
    Collapse consecutive repetitions of `pattern` in `text`.

    Two adjacent copies of `pattern` are replaced by a single copy, repeatedly,
    until no doubled copy remains. `text` is returned unchanged when `pattern`
    is empty.
    """
    # An empty pattern is "contained" in every string and replacing it changes
    # nothing, so the loop below would never terminate without this guard.
    if not pattern:
        return text
    doubled = pattern * 2
    while doubled in text:
        text = text.replace(doubled, pattern)
    return text
def clean_special_characters(text):
    """
    Remove carriage returns from `text` and replace every run of one or more
    newlines with a single space.

    Carriage returns are stripped first so that Windows-style blank lines
    (`\\r\\n\\r\\n`) collapse to one space, exactly like `\\n\\n` does.
    """
    without_carriage_returns = text.replace("\r", "")
    # One pass: a run of newlines of any length becomes a single space.
    return re.sub(r"\n+", " ", without_carriage_returns)
def number_of_unigrams(series: pd.Series) -> np.ndarray:
    """
    Return, for each text in `series`, the number of pieces obtained by
    cleaning it with `clean_special_characters` and splitting on a single
    space.

    The result is a column array with one row per element of `series`.
    Because the split is on exactly one space character, consecutive spaces
    produce empty pieces, and those are included in the count.
    """
    cleaned = series.apply(clean_special_characters)
    token_counts = cleaned.str.split(" ").apply(len)
    return token_counts.to_numpy().reshape(-1, 1)
def number_of_line_breaks(series: pd.Series) -> np.ndarray:
    """
    Return, for each text in `series`, the number of newline characters it
    contains, as a column array with one row per element of `series`.
    """
    newline_counts = series.str.count("\n")
    return newline_counts.to_numpy().reshape(-1, 1)
def _get_punctuation_error_fraction(text):
    """
    Return the number of detected punctuation errors divided by the number of
    punctuation characters in `text`.

    The checks are a deliberately non-exhaustive and imperfect list of
    punctuation errors. The result can exceed one, since a single character
    can trigger more than one check. Returns 0.0 when `text` contains no
    punctuation characters.
    """
    # Count once and reuse the value for both the guard and the denominator.
    total_punctuation_characters = count_punctuation_characters(text)
    if total_punctuation_characters <= 0:
        return 0.0
    error_counts = (
        count_missing_trailing_whitespaces(text),
        count_extra_leading_whitespaces(text),
        count_missing_leading_whitespaces(text),
        count_missing_paired_characters(text),
    )
    return np.sum(error_counts) / total_punctuation_characters
def get_punctuation_error_fraction(series: pd.Series) -> np.ndarray:
    """
    Build a two-column array with one row per text in `series`.

    Column 0 holds the result of `count_punctuation_characters` for the text,
    column 1 the result of `_get_punctuation_error_fraction`.
    """
    X_count = series.apply(count_punctuation_characters).to_numpy().reshape(-1, 1)
    X_fraction = series.apply(_get_punctuation_error_fraction).to_numpy().reshape(-1, 1)
    return np.concatenate((X_count, X_fraction), axis=1)