-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClean.R
49 lines (31 loc) · 1.23 KB
/
Clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
library(jsonlite)
library(readr)
library(sjmisc)
library(stringr)
library(stringi)
# Input text files; the cleaning loop reads them by position.
filenames <- c(
  "296-0.txt",
  "57.txt",
  "pg19033.txt",
  "pg23661.txt"
)
# Disabled alternative for collecting the inputs: dir(pattern="*.txt")

# Book titles, presumably listed in the same order as `filenames`.
titles <- c(
  "The Cash Boy",
  "Aladdin and the Magic Lamp",
  "Alice's Adventures in wonderland",
  "The Book of DRAGONS"
)

# Numeric identifiers stored as strings; they match the digits that appear
# in the corresponding entries of `filenames`.
ids <- c(
  "296",
  "57",
  "19033",
  "23661"
)

# Output file names; the cleaning loop writes to them by position.
newName <- c(
  "A1.txt",
  "A2.txt",
  "A3.txt",
  "A4.txt"
)
# Convert a roman numeral to its numeric value.
#
# x: a value accepted by as.roman(), e.g. the string "XIV".
# Returns the result of as.numeric() applied to the parsed roman numeral.
helper <- function(x) {
  as.numeric(as.roman(x))
}
# Clean each input text and write the result to the matching output file.
# `filenames` and `newName` are indexed in parallel, so verify their lengths
# once up front instead of failing part-way through the batch.
stopifnot(length(filenames) == length(newName))

# seq_along() keeps the loop in step with `filenames` rather than assuming
# there are exactly four entries.
for (i in seq_along(filenames)) {
  # Whole file as a single string.
  Rstring <- read_file(filenames[i])

  # Split into paragraphs: a blank line shows up as two consecutive
  # "\r\r\n" line endings.
  split2 <- str_split(Rstring, "\\r\\r\\n\\r\\r\\n")[[1]]

  # Replace the Unicode replacement character (U+FFFD) with a double quote.
  split2 <- str_replace_all(split2, "\uFFFD", '"')

  # Join the remaining line breaks inside each paragraph with a space.
  split2 <- str_replace_all(split2, "\\r\\r\\n", " ")

  # Disabled: convert roman chapter numbers to integers via helper().
  # NOTE(review): a replacement function probably has to return character
  # strings, while helper() returns numbers - confirm before re-enabling.
  #split2 <- unlist(str_replace_all(split2, "(?<=CHAPTER\\s)[:upper:]+", helper))

  # Remove bracketed notes such as "[1]". The quantifier is lazy on purpose:
  # each paragraph is a single line by now, so a greedy ".+" would run from
  # the first "[" to the last "]" and delete the text between two notes.
  split2 <- str_replace_all(split2, "\\[.+?\\]", "")

  # Remove runs of asterisks.
  split2 <- str_replace_all(split2, "\\*+", "")

  # Remove underscores that touch a word character. The two passes are kept
  # separate because merging them into one alternation changes the result
  # for runs of consecutive underscores.
  split2 <- str_replace_all(split2, "(?<=\\w)\\_", "")
  split2 <- str_replace_all(split2, "\\_(?=\\w)", "")

  # One cleaned paragraph per output line.
  writeLines(split2, newName[i])
}