Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
Merge branch 'master' of https://github.com/wiki-ai/revscoring
Browse files Browse the repository at this point in the history
  • Loading branch information
halfak committed Nov 1, 2015
2 parents 3593e4c + 627bc70 commit ec49f3f
Show file tree
Hide file tree
Showing 22 changed files with 1,162 additions and 394 deletions.
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ addons:
- libopenblas-dev
- python3-dev
- enchant
- aspell-de
- aspell-nl
- aspell-id
- myspell-en-au
- myspell-en-gb
- myspell-en-us
Expand All @@ -19,7 +22,7 @@ addons:
- myspell-fr
- myspell-es
- myspell-he
- aspell-id
- myspell-it
- hunspell-vi
- myspell-pt
before_install:
Expand Down
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,12 @@ some NLTK data. The following command will get the necessary corpus.
You'll also need to install `enchant <https://enchant.org>`_ compatible
dictionaries of the languages you'd like to use. We recommend the following:

* ``languages.dutch``: myspell-nl
* ``languages.english``: myspell-en-us myspell-en-gb myspell-en-au
* ``languages.french``: myspell-fr
* ``languages.german``: myspell-de-at myspell-de-ch myspell-de-de
* ``languages.indonesian``: aspell-id
* ``languages.italian``: myspell-it
* ``languages.hebrew``: myspell-he
* ``languages.portuguese``: myspell-pt
* ``languages.persian``: myspell-fa
Expand Down
6 changes: 5 additions & 1 deletion revscoring/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,19 @@
collection of language feature sets that work like other features except
that they are language-specific. Language-specific feature sets are
available for the following languages:
:data:`~revscoring.languages.dutch`,
:data:`~revscoring.languages.english`,
:data:`~revscoring.languages.french`,
:data:`~revscoring.languages.german`,
:data:`~revscoring.languages.hebrew`,
:data:`~revscoring.languages.indonesian`,
:data:`~revscoring.languages.italian`,
:data:`~revscoring.languages.persian`,
:data:`~revscoring.languages.portuguese`,
:data:`~revscoring.languages.spanish`,
:data:`~revscoring.languages.turkish`, and
:data:`~revscoring.languages.vietnamese`. See :mod:`revscoring.languages`
:data:`~revscoring.languages.vietnamese`.
See :mod:`revscoring.languages`
Example:
Expand Down
9 changes: 6 additions & 3 deletions revscoring/languages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,29 @@
languages
+++++++++
.. automodule:: revscoring.languages.dutch
.. automodule:: revscoring.languages.english
.. automodule:: revscoring.languages.french
.. automodule:: revscoring.languages.german
.. automodule:: revscoring.languages.hebrew
.. automodule:: revscoring.languages.indonesian
.. automodule:: revscoring.languages.italian
.. automodule:: revscoring.languages.persian
.. automodule:: revscoring.languages.portuguese
.. automodule:: revscoring.languages.spanish
:members:
.. automodule:: revscoring.languages.turkish
:members:
.. automodule:: revscoring.languages.vietnamese
:members:
Base classes
++++++++++++
Expand Down
177 changes: 177 additions & 0 deletions revscoring/languages/dutch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import sys

from .space_delimited import SpaceDelimited

# Dutch Snowball stemmer from NLTK.  A ValueError raised while setting it up
# is re-raised as an ImportError that names this module.
try:
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("dutch")
except ValueError:
    stemmer_failure = "Could not load stemmer for {0}. ".format(__name__)
    raise ImportError(stemmer_failure)

# Dutch stopword set built from the NLTK 'stopwords' corpus.  A LookupError
# is re-raised as an ImportError carrying an installation hint.
try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('dutch'))
except LookupError:
    raise ImportError("".join([
        "Could not load stopwords for {0}. ".format(__name__),
        "You may need to install the nltk 'stopwords' ",
        "corpora. See http://www.nltk.org/data.html",
    ]))

# Import enchant *before* the ``try`` so that a missing enchant package
# surfaces as its own ImportError.  With the import inside the ``try``, the
# ``except`` clause would have to evaluate ``enchant.errors.DictNotFoundError``
# while ``enchant`` was never bound, raising a NameError that hides the real
# problem.
import enchant

# Dutch spelling dictionary.  A missing 'nl' dictionary is re-raised as an
# ImportError carrying an installation hint.
try:
    dictionary = enchant.Dict("nl")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'nl'. " +
                      "Consider installing 'myspell-nl'.")


# Regular-expression fragments for Dutch bad words.  The list is handed to
# SpaceDelimited as ``badwords`` at the bottom of this file.
# NOTE(review): how the patterns are combined and anchored is decided by
# SpaceDelimited, which is not visible here; keep entry order and text as-is.
badwords = [
r"aars",
r"an(aal|us)\w*",
r"balhaar",
r"drol(len)?",
r"fack(en|ing|s)?", "facking",
r"flikkers?",
r"focking",
r"ge(ile?|lul)",
r"geneukt",
r"hoer(en?)?",
r"homos?",
r"kaka?",
r"kak(hoofd|ken)",
r"k[ae]nker",
r"klootzak(ken)?",
r"klote",
r"kont(gat|je)?",
r"pedo",
r"penis(sen)?",
r"peop",
r"piemels?",
r"pijpen",
r"pik",
r"pimel",
r"pipi",
r"poep(chinees?|en|hoofd)?",
r"poep(ie|je|sex|te?)s?",
r"porno?",
r"neuke?",
r"neuken(de)?",
r"neukt(en?)?",
r"stron(d|t)",
r"suck(s|t)?",
r"zuigt",
r"sukkels?",
r"ter(ing|ten)", "tetten",
r"tieten",
r"vagina",
r"verekte",
r"verkracht",
r"dikzak",
r"dildo",
r"mon?g(olen|ool)?", "mooiboy",
r"negers?",
r"shit",
r"sperma",
r"kut(jes?)?",
r"stelletje",
r"losers?",
r"lul(len)?",
r"reet",
r"scheet", "scheten", r"schijt",
r"diaree",
r"slet",
r"lekkerding",
r"likken"
]

# Regular-expression fragments for informal Dutch words.  The list is handed
# to SpaceDelimited as ``informals`` at the bottom of this file.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (this previously merged "doei" and "dombo"
# into the single pattern "doeidombo").
informals = [
    r"aap(jes)?",
    r"banaan",
    r"bent",
    r"boe(it)?",
    r"doei",
    r"dombo",
    r"domme",
    r"eigelijk",
    r"godverdomme",
    r"groetjes",
    r"gwn",
    r"hoi",
    r"hal+o+",
    r"heb",
    r"hee+[jyl]", r"heee+l",
    r"houd?",
    r"(hoi+)+",
    r"hoor",
    r"izan",
    r"jij",
    r"jou",
    r"jullie",
    r"kaas",
    r"klopt",
    r"kots",
    r"kusjes",
    r"le?kke?re?",
    r"maarja",
    r"mama",
    r"nou",
    r"oma",
    r"ofzo",
    r"oke",
    r"sexy?",
    r"snap",
    r"stink(en|t)",
    r"stoer",
    r"swag",
    r"swek",
    r"vies", "vieze",
    r"vind",
    r"vuile",
    r"xxx",
    r"yeah",
    r"zielig",
    r"zooi",
    r"yolo",
    r"zeg",
]

# Replace this module's entry in sys.modules with a SpaceDelimited instance
# built from the stemmer, stopwords, dictionary and word lists defined above.
# ``doc`` is reST text listing the revision / parent_revision / diff
# attributes; it is a runtime string, so its content must not be reformatted.
# NOTE(review): presumably SpaceDelimited exposes ``doc`` as the object's
# documentation for Sphinx -- confirm against space_delimited.py.
sys.modules[__name__] = SpaceDelimited(
__name__,
doc="""
dutch
=======
revision
--------
.. autoattribute:: revision.words
.. autoattribute:: revision.content_words
.. autoattribute:: revision.badwords
.. autoattribute:: revision.misspellings
.. autoattribute:: revision.informals
.. autoattribute:: revision.infonoise
parent_revision
---------------
.. autoattribute:: parent_revision.words
.. autoattribute:: parent_revision.content_words
.. autoattribute:: parent_revision.badwords
.. autoattribute:: parent_revision.misspellings
.. autoattribute:: parent_revision.informals
.. autoattribute:: parent_revision.infonoise
diff
----
.. autoattribute:: diff.words_added
.. autoattribute:: diff.words_removed
.. autoattribute:: diff.badwords_added
.. autoattribute:: diff.badwords_removed
.. autoattribute:: diff.misspellings_added
.. autoattribute:: diff.misspellings_removed
.. autoattribute:: diff.informals_added
.. autoattribute:: diff.informals_removed
""",
badwords=badwords,
dictionary=dictionary,
informals=informals,
stemmer=stemmer,
stopwords=stopwords
)
12 changes: 10 additions & 2 deletions revscoring/languages/english.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,25 +123,33 @@
r"don'?t", r"dum+b*(y|ies|er|est)?(ass)?",
r"d+u+d+e+\w*",
r"good[-_]?bye",
r"h+[aiou]+(h+[aeiou]*)*", r"h+[e]+(h+[aeiou]*)+",
r"(mw?[au]+)?h+[aiou]+(h+[aeiou]*)*", r"h+[e]+(h+[aeiou]*)+",
r"hel+o+", r"h(aa+|e+)y+",
r"h+m+",
r"i", r"i+d+i+o+t+",
r"(la)+",
r"loser",
r"(l+[uo]+l+)([uo]+l+)*",
r"l+m+a+o+",
r"l[ou]+ve?",
r"m+e+o+w+",
r"munch\w*",
r"mom+(y|a)?",
r"moron",
r"nerds?",
r"noo+b(y|ie|s)?\w*",
r"no+pe",
r"o+k+(a+y+)?",
r"\w*o+m+g+\w*",
r"poo+p\w*",
r"\w*retard\w*", r"tard",
r"r+o+f+l+(mao)?",
r"s+e+x+y+",
r"so+rry",
r"shove",
r"smelly",
r"soo+",
r"stinky",
r"stink(s|y)?",
r"\w*s+t+[uo]+p+i+d+\w*",
r"suck(s|ing|er)?", r"sux",
r"shouldn'?t",
Expand Down
Loading

0 comments on commit ec49f3f

Please sign in to comment.