-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdprep_functions.R
153 lines (123 loc) · 3.9 KB
/
dprep_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
library(magrittr)
library(stringi)
library(stringr)
library(rJava)
library(tabulizer)
library(tidyverse)
library(pdftools)
library(jsonlite)
library(rlang)
# Patterns and constants ----

# "4", then 0 or 1, then one digit (i.e. 400-419), optionally followed by
# "-<digit>". The pattern is unanchored, so it matches anywhere in a string.
gri_code <- regex("(4[0-1][0-9])(?:-[0-9])?", ignore_case = FALSE)
# Any alphabetic character; used below to reject page entries with letters.
reg_pn <- regex("[[:alpha:]]")
# A run of one to three digits.
num_regex <- regex("\\d{1,3}")
# Root folder under which the report files are looked up.
saving_dir <- "reports"
getwd()

# Report list ----
# NOTE: this section runs when the file is sourced; it used to be wrapped in a
# list_of_pdf_reports() function that is currently disabled.
#
# Load the list of available reports to process (results of gri_df_prep.R),
# then, in order:
#   1. build the full path of every file from folder and file name;
#   2. keep rows with dld_status "OK" whose GRI index page entry is non-empty
#      and contains no letters;
#   3. convert the page entry to numeric;
#   4. drop rows whose page entry did not convert.
gri_list_df <- read.csv(
  "reports/GriTempUrlList3.csv",
  stringsAsFactors = FALSE
) %>%
  mutate(fullpath = str_c(saving_dir, ffolder, filename, sep = "/")) %>%
  filter(
    dld_status == "OK",
    !str_detect(pdf_contents_page, reg_pn),
    !pdf_contents_page == ""
  ) %>%
  mutate(pdf_contents_page = as.numeric(pdf_contents_page)) %>%
  filter(!is.na(pdf_contents_page))

# Print a compact overview of the resulting table.
glimpse(gri_list_df)
#### functions to get the info from pages of reports
# Extract the tables found on one page of one report.
#
# i: row index into the global `gri_list_df`.
# j: offset added to that row's `pdf_contents_page` to get the page to read.
#
# Returns whatever `extract_tables()` returns for that file and page when
# called with method "decide" and output "data.frame".
extract_one_cpage <- function(i, j) {
  pdf_path <- gri_list_df$fullpath[i]
  target_page <- gri_list_df$pdf_contents_page[i] + j
  extract_tables(
    file = pdf_path,
    pages = target_page,
    method = "decide",
    output = "data.frame"
  )
}
# Read the candidate GRI-index pages of the first `n` reports.
#
# For every report (row of the global `gri_list_df`) the page given by
# `pdf_contents_page` and the four pages after it are passed to
# `extract_one_cpage()`.
#
# n: number of reports (rows of `gri_list_df`, from the top) to process.
#
# Returns a list with one element per report, named by the report's
# `fullpath`. Each element is a list with one entry per page, named by the
# page number. A page entry is either the value returned by
# `extract_one_cpage()` or, when reading failed, the two-element list
# list("Fail to read due to error", <condition>).
extract_gri_indexes <- function(n) {
  page_offsets <- 0:4
  gic <- vector("list", n)

  for (i in seq_len(n)) {
    compname <- gri_list_df$fullpath[i]
    print(compname)

    page_numbers <- gri_list_df$pdf_contents_page[i] + page_offsets
    # One slot per page, named up front, so a failed page can never shift the
    # names of the pages that follow it.
    cp <- vector("list", length(page_offsets))
    names(cp) <- page_numbers

    for (k in seq_along(page_offsets)) {
      pagenum <- page_numbers[k]
      # The handler RETURNS the failure marker; assigning to `cp` inside the
      # handler would only change a variable local to the handler function.
      pagecont <- tryCatch(
        {
          tables <- extract_one_cpage(i, page_offsets[k])
          print(paste("page #", pagenum, "OK "))
          tables
        },
        error = function(e) {
          print(paste("page #", pagenum, "FAIL ", conditionMessage(e)))
          list("Fail to read due to error", e)
        }
      )
      # `cp[k] <- list(x)` keeps the slot even if `x` is NULL
      # (`cp[[k]] <- NULL` would delete it).
      cp[k] <- list(pagecont)
    }

    gic[i] <- list(cp)
    print(paste(compname, "processed "))
    # Collect garbage after every report, as before.
    gc()
  }

  names(gic) <- gri_list_df$fullpath[seq_len(n)]
  gic
}
# Pull every match of the global `num_regex` pattern out of `string`.
#
# string: the text (or vector of texts) to search.
#
# Returns all matches from all elements as one flat character vector.
numextract <- function(string) {
  match_positions <- gregexpr(num_regex, string)
  found <- regmatches(string, match_positions)
  unlist(found)
}
# Build the GRI index for one page from the tables extracted on it.
#
# data_tp: list of the tables found on a page.
#
# Every table is passed through `filter_index()` and then
# `combine_codenpage()`; the per-table results are concatenated with `c()`.
# Returns NULL when `data_tp` is empty.
get_gri_index <- function(data_tp) {
  tl <- c()
  # seq_along() makes an empty page a no-op; 1:length(data_tp) would visit
  # index 1 (and 0) of an empty list and fail.
  for (idx in seq_along(data_tp)) {
    temp_index_table <- data_tp[[idx]]
    tl <- c(tl, combine_codenpage(filter_index(temp_index_table)))
  }
  tl
}
# Keep only the rows of a table whose first column contains a GRI code.
#
# table_tp: a table extracted from a page. GRI codes are assumed to be in the
#   first column.
#
# Returns the matching rows of `table_tp`, or the string
# "Filtering index failed" when `table_tp` has no dimensions or no columns.
# Default `[` dropping is kept on purpose: a one-column table therefore comes
# back as a plain vector, exactly as before.
filter_index <- function(table_tp) {
  if (is.null(dim(table_tp)) || ncol(table_tp) < 1) {
    return("Filtering index failed")
  }
  has_code <- str_detect(table_tp[, 1], gri_code)
  # which() discards NA results; indexing with NA would add all-NA rows.
  table_tp[which(has_code), ]
}
# Find the column that holds the page numbers.
#
# table_tp: a table whose first column holds the GRI codes.
#
# Columns 2..ncol are scanned left to right; the first one in which more than
# half of the rows contain a run of digits wins. Cells that are NA count as
# "no digits".
#
# Returns the index of that column, or NULL when no column qualifies (this
# includes one-column and zero-row tables).
choose_column <- function(table_tp) {
  num_regex <- regex("\\d{1,3}")
  n_rows <- nrow(table_tp)
  # seq_len(...)[-1] is empty for a one-column table; 2:ncol would count down
  # to 1 and index a column that does not exist.
  for (i in seq_len(ncol(table_tp))[-1]) {
    hits <- sum(str_detect(table_tp[, i], num_regex), na.rm = TRUE)
    # isTRUE() also covers 0 / 0 (NaN) for a zero-row table.
    if (isTRUE(hits / n_rows > 0.5)) {
      return(i)
    }
  }
  NULL
}
# Pair every GRI code of a filtered table with the numbers found in its
# page-number column.
#
# table_tp: a table as returned by `filter_index()`; codes are taken from the
#   first column, the page column is picked by `choose_column()`.
#
# Returns
#   * "No tables to process" when `table_tp` has no dimensions;
#   * "No GRI code found" when no row has a non-empty, non-NA code;
#   * otherwise a list with one entry per row that has a code, named by the
#     code and holding the result of `numextract()` on the row's page cell
#     (NULL when no page column could be chosen).
# Rows with an empty or NA code are skipped; they no longer wipe the entries
# collected from other rows.
combine_codenpage <- function(table_tp) {
  if (is.null(dim(table_tp))) {
    return("No tables to process")
  }
  k <- choose_column(table_tp)

  codes <- as.character(table_tp[, 1])
  valid_rows <- which(!is.na(codes) & codes != "")
  if (length(valid_rows) == 0) {
    return("No GRI code found")
  }

  doc_gri_index <- lapply(valid_rows, function(j) {
    if (is.null(k)) {
      NULL
    } else {
      numextract(table_tp[j, k])
    }
  })
  # Names come from the kept rows themselves, so they always line up with the
  # entries regardless of how many rows were skipped.
  names(doc_gri_index) <- codes[valid_rows]
  doc_gri_index
}