-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlisted_danish_comp_analysis.Rmd
144 lines (100 loc) · 2.96 KB
/
listed_danish_comp_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
---
title: "R Notebook"
output: html_notebook
---
```{r}
library(tidyverse)
library(magrittr)
library(stringi)
library(stringr)
library(readtext)
library(dplyr)
library(curl)
library(pdftools)
library(cld2)
library(tabulizer)
library(googleLanguageR)
```
```{r}
# Turn free-form strings (e.g. column headers or company names) into short
# snake_case-like identifiers.
#
# Steps, in order: drop punctuation, lower-case with the English locale, keep
# the first 35 characters, trim/collapse whitespace, replace spaces with "_".
#
# mystring: character vector to normalise.
# Returns a character vector of the same length as `mystring`.
lowerize <- function(mystring) {
  punct_pattern <- regex("[[:punct:]]")
  without_punct <- str_remove_all(mystring, punct_pattern)
  lowered <- str_to_lower(without_punct, locale = "en")
  # Truncation happens before squishing, so surplus whitespace still counts
  # towards the 35-character limit.
  truncated <- str_sub(lowered, start = 1, end = 35)
  squished <- str_squish(truncated)
  str_replace_all(squished, " ", "_")
}
```
```{r}
# Load the company list and normalise its column names with lowerize().
dl_comps <- read_csv("listed_danish_companies.csv")
dl_comps <- setNames(dl_comps, lowerize(names(dl_comps)))
# Show the resulting column names in the notebook output.
names(dl_comps)
```
```{r}
# Build one row per company and publication year, together with the relative
# path at which that year's PDF is expected.
dl_comps <- dl_comps %>%
  mutate(
    # Transliterate the company name to plain ASCII.
    company_name = iconv(company_name, from = "UTF-8", to = "ASCII//TRANSLIT"),
    # Folder name: company name without "A/S", normalised, first 15 characters.
    folder = str_sub(
      lowerize(str_remove_all(company_name, "A/S")),
      start = 1, end = 15
    ),
    # The same three years for every company; expanded into rows by unnest().
    pubyear = list(c("2021", "2020", "2019"))
  ) %>%
  unnest(pubyear) %>%
  mutate(
    full_path = str_c(
      "listed_danish_companies",
      folder,
      str_c(folder, "_", pubyear, ".pdf"),
      sep = "/"
    )
  ) %>%
  # Downstream chunks work on a base data.frame rather than a tibble.
  data.frame()
```
```{r}
# Distinct folder names present in dl_comps.
dirvec <- unique(dl_comps[["folder"]])
# Create one sub-directory per entry of `dirvec` below `base_dir`.
#
# The working directory is never changed: targets are addressed through
# file.path(), so relative paths used elsewhere in the notebook keep working.
#
# dirvec:   character vector of folder names; NA and empty entries are skipped.
# base_dir: directory that receives the folders (created when missing).
# Returns (invisibly) a logical vector named by target path: TRUE where a
# directory was newly created, FALSE where it already existed or could not be
# created (dir.create() warns in the latter case).
create_folders <- function(dirvec, base_dir = "listed_danish_companies") {
  dirvec <- dirvec[!is.na(dirvec) & nzchar(dirvec)]
  targets <- file.path(base_dir, dirvec)
  created <- vapply(
    targets,
    function(path) {
      if (dir.exists(path)) {
        # Already there: nothing to do, and no "already exists" warning.
        return(FALSE)
      }
      dir.create(path, recursive = TRUE)
    },
    logical(1)
  )
  invisible(created)
}
# create_folders(dirvec)
```
```{r}
# Save the path table inside the listed_danish_companies directory. The target
# is given as an explicit path instead of calling setwd(), so the working
# directory that the relative `full_path` values depend on stays untouched.
write_csv(
  dl_comps,
  file.path("listed_danish_companies", "listed_danish_companies_fullpath.csv")
)
```
### Text extraction
```{r}
# Extract the text of every PDF listed in dl_comps$full_path.
# `pages` holds the page numbers of each document and `text` whatever
# extract_text() returns for those pages; both are list columns with one
# element per row of dl_comps.
# NOTE(review): a missing or unreadable PDF still aborts this step with the
# error raised by pdf_info() / extract_text().
dl_comps <- dl_comps %>%
  mutate(
    # seq_len() (unlike 1:n) yields an empty sequence when n is 0.
    pages = map(full_path, function(path) seq_len(pdf_info(path)$pages)),
    text = map2(
      full_path, pages,
      function(path, page_numbers) extract_text(path, page_numbers)
    )
  )
```
```{r}
# Expand the list columns to one row per page and tag each page with the
# language reported by detect_language().
dl_comps_unnested <- dl_comps %>%
  unnest(c(pages, text)) %>%
  mutate(lang = detect_language(text))
# Number of pages per detected language.
dl_comps_unnested %>%
  group_by(lang) %>%
  count()
#dl_comps_unnested_filtered <- dl_comps_unnested %>% filter(lang=="da")
```
```{r}
# dl_comps_unnested_filtered %<>% slice(-206) # produces an error
#translated <- translate(dataset=dl_comps_unnested_filtered,
# content.field="text",
# source.lang="da",
# target.lang="en",
# google.api.key="mykey")

# Authenticate googleLanguageR with the credentials in g_auth.json.
gl_auth("g_auth.json")

# Rows whose detected language is missing or anything other than English.
# is.na() is combined with `|` so that NA languages are selected explicitly.
needs_translation <- which(
  is.na(dl_comps_unnested$lang) | dl_comps_unnested$lang != "en"
)

# One request per page: replace the text with its English translation and
# relabel the row as "en". Iterating over the index vector is a no-op when
# nothing needs translating (or the table is empty).
for (i in needs_translation) {
  translated <- gl_translate(dl_comps_unnested$text[i], target = "en")
  dl_comps_unnested$lang[i] <- "en"
  dl_comps_unnested$text[i] <- translated$translatedText
}

dl_comps_unnested$text#[206]=="en"
#dl_comps_unnested %<>% mutate(text=case_when(lang=="da"~gl_translate(text, target="en")$translatedText))
# Usage reference (open interactively):
# vignette("translation", package = "googleLanguageR")
# A bare gl_translate() call is deliberately not executed here: without a text
# argument it only raises an error and stops the chunk.
# %<>% mutate(lang=detect_language(text))
```
```{r}
# Save the page-level text table as CSV in the current working directory.
write.csv(x = dl_comps_unnested, file = "listed_danish_companies_text.csv")
```