cord-19.Rmd

---
title: "COVID-19 Cleaning/Exploration"
output: html_document
---

```{r setup, include=FALSE}
# Default chunk option: echo each chunk's source code in the knitted output
knitr::opts_chunk$set(echo = TRUE)
```

Screencast: https://www.youtube.com/watch?v=-5HYdBq_PTM

```{r}
library(tidyverse)
library(tidytext)
library(jsonlite)
library(janitor)

# Folder holding my local copy of the 2020-03-13 download
infolder <- "~/Downloads/2020-03-13"

# Read the metadata CSV, convert its column names with janitor::clean_names(),
# then rename `sha` -> `paper_id` and `source_x` -> `source`.
metadata <- infolder %>%
  paste0("/all_sources_metadata_2020-03-13.csv") %>%
  read_csv() %>%
  clean_names() %>%
  rename(
    source = source_x,
    paper_id = sha
  )
```

```{r}
# Read in all the JSON objects as well.
# dir() with recursive = TRUE gives a full vector of file paths, including
# files in subfolders of `infolder`.
# `pattern` is a regular expression, not a shell glob: "*.json" would match
# any name that merely contains "json" after some character, so anchor on a
# literal ".json" extension instead.
json_objects <- dir(infolder,
                    pattern = "\\.json$",
                    full.names = TRUE,
                    recursive = TRUE) %>%
  map(read_json)
```

We then use the `hoist()` function from tidyr to turn the nested data into a rectangle.

```{r articles_hoisted}
# One row per parsed JSON document. Each body_text-derived column is a
# per-document vector/list with one element per body_text entry.
articles_hoisted <- tibble(json = json_objects) %>%
  hoist(
    json,
    paper_id = "paper_id",
    section = list("body_text", function(entries) map_chr(entries, "section")),
    text = list("body_text", function(entries) map_chr(entries, "text")),
    citations = list("body_text", function(entries) map(entries, "cite_spans")),
    bib_entries = "bib_entries"
  ) %>%
  # Whatever was not hoisted is no longer needed
  select(-json)
```

```{r}
# Expand the per-document vectors into one row per body_text entry and number
# the entries within each paper.
paragraphs <- articles_hoisted %>%
  select(-bib_entries) %>%
  unnest(cols = c(text, section, citations)) %>%
  group_by(paper_id) %>%
  mutate(paragraph = seq_len(n())) %>%
  ungroup() %>%
  select(paper_id, paragraph, everything())

# One row per citation span inside a paragraph.
# Could use unnest_wider, but hoist seems to be faster
paragraph_citations <- paragraphs %>%
  select(paper_id, paragraph, citations) %>%
  unnest(citations) %>%
  hoist(
    citations,
    start = "start",
    end = "end",
    text = "text",
    ref_id = "ref_id"
  )
```

```{r}
# Metadata for every article we parsed from JSON, keeping only rows that have
# both a title and an abstract.
#
# Fixes relative to the earlier version of this chunk:
#   * the pipe after select(paper_id) was missing, so the join ran as a
#     separate (broken) statement;
#   * `metadata` already had `sha` renamed to `paper_id` when it was read in,
#     so the join key is `paper_id` on both sides;
#   * only `paper_id` survives the select(), so there is no `json` or
#     `abstract_json` column to coalesce with or drop.
# NOTE(review): if an abstract extracted from the JSON is wanted as a fallback,
# it has to be hoisted in `articles_hoisted` first -- nothing in this file
# creates `abstract_json`.
articles_full <- articles_hoisted %>%
  select(paper_id) %>%
  inner_join(metadata, by = "paper_id") %>%
  select(-has_full_text) %>%
  filter(!is.na(title), !is.na(abstract))
```

Pulling out the details from the article references

```{r}
# One row per bibliography entry of each paper, with the commonly used fields
# pulled out of the nested entry into their own columns.
article_references <- articles_hoisted %>%
  select(paper_id, bib_entries) %>%
  unnest(bib_entries) %>%
  hoist(
    bib_entries,
    ref_id = "ref_id",
    title = "title",
    venue = "venue",
    volume = "volume",
    issn = "issn",
    pages = "pages",
    year = "year",
    # First element of other_ids$DOI
    doi = list("other_ids", "DOI", 1)
  ) %>%
  select(-bib_entries)
```


### Exploratory Data Analysis

```{r}
# Word frequencies across article titles, with stop words removed.
# Uses `articles_full` (the joined article/metadata table built above);
# `article_data` is never defined in this document.
title_words <- articles_full %>%
  unnest_tokens(word, title) %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word")

# Bar chart of the 20 most frequent title words
title_words %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Words that appear in many titles")
```

```{r}
# Word frequencies across article abstracts, with stop words removed.
# Uses `articles_full` (the joined article/metadata table built above);
# `article_data` is never defined in this document.
abstract_words <- articles_full %>%
  unnest_tokens(word, abstract) %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words, by = "word")

# Bar chart of the 20 most frequent abstract words.
# The plot title previously said "titles" (copied from the chunk above).
abstract_words %>%
  head(20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Words that appear in many abstracts")
```

```{r}
library(spacyr)

# Start spaCy with the "en_core_sci_sm" model, using this machine's
# miniconda Python (adjust the path for your own install)
spacy_initialize(
  model = "en_core_sci_sm",
  python_executable = "/opt/miniconda3/bin/python"
)
```

Tidytext can take a custom tokenization function

```{r}
# Custom tokenizer for tidytext::unnest_tokens(): extract the entities found by
# spacyr::spacy_extract_entity() from each document.
#
# text: character vector of documents.
# Returns an unnamed list with exactly one element per input document, in input
# order; each element is a character vector of lower-cased entity strings
# (character(0) when a document has no entities).
#
# The result is aligned to the input explicitly. Grouping on doc_id and nesting
# orders documents lexicographically ("text1", "text10", "text2", ...) and
# drops documents without entities, which attaches entities to the wrong rows
# once there are ten or more documents.
tokenize_scispacy_entities <- function(text) {
  if (length(text) == 0) {
    return(list())
  }

  # Same ids spacyr assigns to unnamed input; setting them ourselves makes the
  # doc_id -> input position mapping explicit
  doc_ids <- paste0("text", seq_along(text))
  entities <- spacy_extract_entity(stats::setNames(as.character(text), doc_ids))

  # NOTE(review): guards the case where nothing is found at all -- confirm
  # whether spacyr returns NULL or a zero-row data frame here
  if (is.null(entities) || NROW(entities) == 0) {
    return(rep(list(character(0)), length(text)))
  }

  # A factor carrying every doc id as a level keeps input order and yields an
  # empty vector for documents without entities
  by_document <- split(
    str_to_lower(entities$text),
    factor(entities$doc_id, levels = doc_ids)
  )

  unname(by_document)
}

# Quick check of the tokenizer on two example sentences
tokenize_scispacy_entities(c("Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity.", "They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC)."))

# Entities from a random sample of 2,000 abstracts, one row per
# (paper, entity) occurrence.
# Uses `articles_full` (the joined article/metadata table built above);
# `article_data` is never defined in this document.
abstract_entities <- articles_full %>%
  select(paper_id, abstract) %>%
  sample_n(2000) %>%
  unnest_tokens(entity, abstract, token = tokenize_scispacy_entities)
```

```{r}
# Bar chart of the 30 most frequent entities in the sampled abstracts
abstract_entities %>%
  count(entity, sort = TRUE) %>%
  head(n = 30) %>%
  mutate(entity = fct_reorder(entity, n)) %>%
  ggplot(mapping = aes(x = entity, y = n)) +
  geom_col() +
  coord_flip()
```

```{r}
library(widyr)

# Pairwise correlation between entities that occur at least 100 times, based on
# which papers they appear in; keep the 400 most correlated pairs
entity_correlations <- abstract_entities %>%
  add_count(entity) %>%
  filter(n >= 100) %>%
  pairwise_cor(item = entity, feature = paper_id, sort = TRUE) %>%
  head(n = 400)

library(ggraph)

# Seed set before the plot is built and rendered
set.seed(2020)

# Network of entities; edge opacity follows the correlation
entity_correlations %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(mapping = aes(edge_alpha = correlation)) +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), repel = TRUE) +
  theme_void() +
  theme(legend.position = "none") +
  labs(
    title = "Entities that often appear together in abstracts",
    subtitle = "Based on the scispacy Named Entity Recognition model"
  )
```

### References

```{r}
# Number of distinct papers that have at least one extracted reference; used
# as the denominator for the percentages below
num_articles <- n_distinct(article_references$paper_id)

# Top 20 most frequently referenced titles, as a share of papers.
# The filter removes reference "titles" matching publisher boilerplate phrases
# (and, because filter() drops NA conditions, references without a title).
article_references %>%
  filter(!str_detect(title, "Submit your next|This article|Springer Nature remains|Publisher's Note")) %>%
  count(title = str_trunc(title, 100), sort = TRUE) %>%
  mutate(percent = n / num_articles) %>%
  head(20) %>%
  mutate(title = fct_reorder(title, percent)) %>%
  ggplot(aes(title, percent)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent_format()) +
  coord_flip() +
  labs(
    title = "What are the most referenced articles in the COVID-19 dataset?",
    # The subtitle was missing its noun ("Based on the N open for ...")
    subtitle = glue::glue(
      "Based on the { scales::comma(num_articles) } articles open for ",
      "commercial use that have references"
    )
  )
```

```{r}
# Distinct (title, year) pairs among references that have a year
referenced_articles <- article_references %>%
  filter(!is.na(year)) %>%
  distinct(title, year)

# Number of referenced titles per two-year bucket (year rounded down to even)
year_totals <- referenced_articles %>%
  mutate(year = 2 * (year %/% 2)) %>%
  count(year, name = "total")

# One row per word of each referenced title
referenced_article_words <- referenced_articles %>%
  unnest_tokens(word, title)

# Word counts per two-year bucket, relative to the bucket's number of titles
by_word_year <- referenced_article_words %>%
  mutate(year = 2 * (year %/% 2)) %>%
  count(year, word) %>%
  filter(year >= 1900, year <= 2020) %>%
  inner_join(year_totals, by = "year") %>%
  mutate(percent = n / total)

by_word_year %>%
  filter(word %in% c("bat", "bats")) %>%
  ggplot(mapping = aes(x = year, y = percent)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "How much do referenced papers refer to bats in the title?")
```

```{r}
# Number of references per venue, most frequent first
count(article_references, venue, sort = TRUE)
```