-
Notifications
You must be signed in to change notification settings - Fork 0
/
FilterCorpus.java
83 lines (74 loc) · 2.64 KB
/
FilterCorpus.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
/**
 * Command-line tool that copies a tab-separated corpus file to a filtered
 * file, dropping every row whose first column is a known "spam" word or is
 * shorter than two characters.
 *
 * <p>The spam list is assembled from three sources: the bad-words file (each
 * entry plus its "s", "es", "ed" and "ing" forms), the language-codes file,
 * and a built-in list of extra words. Matching is case-insensitive.
 *
 * <p>All files are read and written as UTF-8. On any I/O failure the tool
 * prints the stack trace and a short message to standard error and exits
 * with status 1.
 */
public class FilterCorpus
{
  private static final String BAD_WORDS_PATH = "src/main/resources/bad-words.txt";
  private static final String LANG_CODES_PATH = "src/main/resources/lang_codes.txt";
  private static final String CORPUS_PATH = "src/main/resources/simpleDeleteCorpus.tsv";
  private static final String FILTERED_PATH =
      "src/main/resources/simpleDeleteCorpus_Filtered.tsv";

  /** Suffixes appended to every bad word so simple inflections are caught too. */
  private static final String[] INFLECTION_SUFFIXES = { "s", "es", "ed", "ing" };

  /** Used for word lists whose entries are taken verbatim. */
  private static final String[] NO_SUFFIXES = {};

  /** Hand-picked words that are filtered in addition to the two word files. */
  private static final List<String> EXTRA_SPAM_WORDS = Arrays.asList("hi", "stub", "not",
      "no", "puto", "easy", "simplify", "hkl", "png", "svg", "lol", "langen", "bjj", "hi",
      "hii", "hiii", "hiiii", "ugly", "bob", "ha", "haha", "hahaha", "hahahaha", "fukin",
      "friggin", "heyyyyy", "heyyyy", "heyyy", "heyy", "hey", "dongs", "mierda", "sucks",
      "bum", "blahblah", "randi", "bastard", "smelly", "benders", "hello", "kiran", "bloody",
      "lmao", "rofl", "hahahahahah", "yep", "shitty", "faisal", "ashton", "smells",
      "butthole", "poopy", "hahahahah", "james", "hiya", "hola", "pizza", "cheese", "yay",
      "wassup", "sucky", "shity", "penus", "noob", "nnn", "jk", "jim", "ji", "iiii", "idk",
      "hihi", "hahahahaha", "gggg", "fucken", "bla");

  /**
   * Builds the spam list and writes the filtered corpus.
   *
   * @param args ignored; all paths are fixed relative to the working directory
   */
  public static void main(String[] args)
  {
    Set<String> spamList = new HashSet<String>();

    try
    {
      loadWords(BAD_WORDS_PATH, spamList, INFLECTION_SUFFIXES);
    }
    catch (IOException e)
    {
      fail("Spam List not loaded", e);
    }

    try
    {
      loadWords(LANG_CODES_PATH, spamList, NO_SUFFIXES);
    }
    catch (IOException e)
    {
      fail("Langs List not loaded", e);
    }

    spamList.addAll(EXTRA_SPAM_WORDS);

    try
    {
      filterCorpus(CORPUS_PATH, FILTERED_PATH, spamList);
    }
    catch (IOException e)
    {
      fail("could not parse corpus", e);
    }
  }

  /**
   * Adds every non-blank line of a word file to {@code target}, lower-cased
   * and trimmed, together with one extra entry per suffix.
   *
   * @param path     file with one word per line
   * @param target   set that receives the words
   * @param suffixes suffixes to append to each word; may be empty
   * @throws IOException if the file cannot be opened or read
   */
  private static void loadWords(String path, Set<String> target, String[] suffixes)
      throws IOException
  {
    try (BufferedReader in = openUtf8(path))
    {
      String line;
      while ((line = in.readLine()) != null)
      {
        String word = line.trim().toLowerCase(Locale.ROOT);
        // A blank line would otherwise register "", "s", "es", "ed" and "ing"
        // as spam and silently remove those ordinary words from the corpus.
        if (word.isEmpty())
        {
          continue;
        }
        target.add(word);
        for (String suffix : suffixes)
        {
          target.add(word + suffix);
        }
      } // while
    }
  }

  /**
   * Copies the rows of {@code inputPath} that pass {@link #isKept} to
   * {@code outputPath}.
   *
   * @throws IOException if either file cannot be opened, read or written
   */
  private static void filterCorpus(String inputPath, String outputPath, Set<String> spamList)
      throws IOException
  {
    // The reader is opened first so that a missing input file does not
    // truncate an existing output file.
    try (BufferedReader in = openUtf8(inputPath);
        PrintWriter out = new PrintWriter(outputPath, StandardCharsets.UTF_8.name()))
    {
      String line;
      while ((line = in.readLine()) != null)
      {
        if (isKept(line, spamList))
        {
          out.println(line);
        }
      } // while
      // PrintWriter never throws on write failure; it only records the error.
      if (out.checkError())
      {
        throw new IOException("error while writing " + outputPath);
      }
    }
  }

  /**
   * Decides whether a corpus row survives the filter.
   *
   * @param line     one row; the text before the first tab is the word tested
   * @param spamList lower-cased words to reject
   * @return {@code true} if the word has at least two characters and is not
   *         in {@code spamList} (ignoring case)
   */
  private static boolean isKept(String line, Set<String> spamList)
  {
    // indexOf/substring instead of split("\t"): split returns an empty array
    // for a line made only of tabs, which would make split[0] throw.
    int tab = line.indexOf('\t');
    String word = (tab < 0) ? line : line.substring(0, tab);
    return word.length() > 1 && !spamList.contains(word.toLowerCase(Locale.ROOT));
  }

  /** Opens {@code path} for buffered reading with an explicit UTF-8 decoder. */
  private static BufferedReader openUtf8(String path) throws IOException
  {
    return new BufferedReader(
        new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8));
  }

  /** Reports a fatal error on standard error and ends the JVM with status 1. */
  private static void fail(String message, Exception cause)
  {
    cause.printStackTrace();
    System.err.println(message);
    System.exit(1);
  }
}