The-Good-the-Bad-and-the-Ugly.Rmd

---
title: "The Good, the Bad and the Ugly"
author: "Jack Carter"
date: "18/04/2022"
output: github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(ggthemes)
library(knitr)
library(here)

# Standardises a numeric vector into z-scores.
#
# Args:
#   data: numeric vector of values (e.g. the yearly hit counts of one term).
#
# Returns:
#   A bare numeric vector the same length as `data`, holding
#   (value - mean) / standard deviation for each element. Elements are NA
#   when `data` has a single value (sd() is NA) and NaN when the standard
#   deviation is 0. Empty input gives numeric(0).
get_z_score <- function(data) {
  # as.numeric() drops names/attributes so the result is always a plain vector.
  values <- as.numeric(data)
  # Vectorised arithmetic replaces the element-by-element loop, which walked
  # 1:0 (i.e. c(1, 0)) on empty input and produced a spurious NA.
  (values - mean(values)) / sd(values)
}

# Computes the z-score of `hits` within each `term` group.
#
# Args:
#   data: data frame with a `term` column (grouping key) and a numeric
#         `hits` column.
#
# Returns:
#   A numeric vector with one z-score per row of `data`, in the same row
#   order as `data`, so it can be assigned straight back as a column.
#   The previous implementation concatenated the groups in group_split()
#   order, which only lined up with the rows when `data` was already sorted
#   by term; for such pre-sorted input the values are unchanged.
get_z_scores <- function(data) {
  data %>%
    group_by(term) %>%
    # a grouped mutate() keeps every value on its own row.
    mutate(z_score = get_z_score(hits)) %>%
    ungroup() %>%
    pull(z_score)
}

# search terms, grouped by the theme each set is plotted under.
national_belonging <- c("patriotism", "national identity")
partisanship <- c(
  "partisanship",
  "political divide",
  "political differences"
)
discrimination <- c(
  "anti-semitism",
  "sexism",
  "racism",
  "islamophobia",
  "transphobia"
)

# load the article counts from data.csv, located via here().
data <- read_csv(here("data.csv"))

# order the rows alphabetically by term.
index <- order(data$term)
sorted_df <- data[index, ]

# convert each term to title case.
sorted_df[["term"]] <- str_to_title(sorted_df[["term"]])

# attach the per-term z-score of the hit counts as a new column.
sorted_df[["z_score"]] <- get_z_scores(sorted_df)

# shared plot theme: theme_economist_white() without the gray background,
# plus the text, margin, legend and facet overrides listed below.
my_theme <- theme_economist_white(gray_bg = FALSE) +
  theme(
    # title and outer margins.
    plot.title = element_text(
      hjust = 0.5, vjust = 10, size = 10, color = "#474747"
    ),
    plot.margin = unit(c(1.5, 1, 1.5, 1), "cm"),

    # axis tick labels and axis titles.
    axis.text = element_text(size = 9, color = "gray30"),
    axis.text.x = element_text(vjust = -2.5),
    axis.title.x = element_text(size = 9, color = "gray30", vjust = -10),
    axis.title.y = element_text(size = 9, color = "gray30", vjust = 10),

    # legend: vertical, on the right, no title.
    legend.direction = "vertical",
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 11, color = "gray20"),
    legend.margin = margin(1, -15, 1, 0),
    legend.spacing.x = unit(0.25, "cm"),
    legend.key.size = unit(1, "cm"),
    legend.key.height = unit(0.75, "cm"),

    # facet strip labels and spacing between panels.
    strip.text = element_text(
      hjust = 0.5, vjust = 1, size = 10, color = "#474747"
    ),
    panel.spacing = unit(2, "lines")
  )

# Builds the z-score trend plot for one category of terms.
#
# Args:
#   category: character vector of terms; title-cased and matched against
#             the `term` column of the global `sorted_df`.
#   title:    plot title.
#
# Returns:
#   A ggplot object with one smoothed line per term, vertical markers with
#   labels at 2016 and 2020, and `my_theme` applied.
make_plot <- function(category, title) {
  plot_data <- sorted_df %>%
    filter(term %in% str_to_title(category))

  # labels sit just above the highest z-score in the plotted data.
  label_y <- max(plot_data$z_score, na.rm = TRUE) + 0.1
  marker_colour <- "#696969"

  # one vertical rule plus one text label for an election year.
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics redraws it for every row of the data, overplotting the text.
  election_marker <- function(year, label) {
    list(
      geom_vline(xintercept = year,
                 size = 0.25,
                 col = marker_colour),
      annotate("text",
               x = year,
               y = label_y,
               label = label,
               hjust = 1.05,
               size = 2.75,
               colour = marker_colour)
    )
  }

  plot_data %>%
    ggplot(aes(x = year,
               y = z_score,
               col = term)) +
    geom_smooth(se = FALSE,
                span = 0.5,
                size = 0.5) +
    election_marker(2016, "2016 win -") +
    election_marker(2020, "2020 loss -") +
    ggtitle(title) +
    ylab("Z-score") +
    xlab("") +
    my_theme
}

```

## **Summary**
This project uses the number of articles containing certain terms to
show how the conversation in The New York Times (NYT) changed following
the 2016 election of Donald Trump. Just like the characters in the famous
spaghetti western starring Clint Eastwood, Trump's rise highlighted good
(a heightened sense of national belonging for many, albeit mostly white,
Americans), bad (political polarization), and ugly (social
discrimination) characteristics of US society.

&nbsp;

## 1) The Good

```{r, echo = FALSE, message = FALSE, warning = FALSE, dpi=600}

# plot the z-score trends for the national belonging terms.
make_plot(national_belonging, "National Belonging")

```

## 2) The Bad

```{r, echo = FALSE, message = FALSE, warning = FALSE, dpi=600}

# plot the z-score trends for the partisanship terms.
make_plot(partisanship, "Political Polarization")

```

## 3) The Ugly

```{r, echo = FALSE, message = FALSE, warning = FALSE, dpi=600}

# plot the z-score trends for the discrimination terms.
make_plot(discrimination, "Social Discrimination")

```

&nbsp;

## **Disclaimer**

The data above show only relative changes in the number of articles for each term between 2011 and 2022, not how many times a term appeared overall or the context in which it was used. This means any conclusions we draw about good, bad and ugly changes in Trump's America are only assumptions, not necessarily facts.  

&nbsp;

## **Method**

### **1) Choose Terms:**

The terms were selected on the basis of trial and error in an attempt to find underlying trends in the data during Trump's presidency. The table below details the number of articles for each term between 2011 and 2022.

**Terms (articles in 000s)**

```{r, echo = FALSE, message = FALSE, warning = FALSE, dpi=600}

# creates a one-row summary table: total hits per term, in thousands,
# with one column per (title-cased) term.
summary_table <- data %>%
  mutate(term = str_to_title(term)) %>%
  group_by(term) %>%
  # sum the raw counts first and round once at the end, so that rounding
  # each row before summing cannot accumulate error in the totals.
  summarise(total = round(sum(hits) / 10^3, 2)) %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = term, values_from = total)
kable(summary_table)

```

&nbsp;

### **2) Data Collection:**

The data were collected using an API call from the New York Times. A retry loop ensures the full data are collected even if the connection drops on a particular call. 

—EXAMPLE CODE SNIPPET—

```{r, echo = TRUE, message = FALSE, warning = FALSE, dpi=600}

# Queries the NYT Article Search API for a quoted term within a date range,
# retrying when the request fails.
#
# Args:
#   start_dates: value for the `begin_date` query parameter.
#   end_dates:   value for the `end_date` query parameter.
#   terms:       search term; wrapped in URL-encoded double quotes (%22).
#                NOTE(review): the term itself is inserted as-is -- confirm
#                that callers pass URL-safe text.
#
# Relies on `nyt_key` (the API key) and `fromJSON()` (called with
# `flatten = TRUE`; presumably jsonlite's -- confirm) being available in the
# calling environment.
#
# Returns:
#   The parsed response from `fromJSON()`, or an empty list when the first
#   attempt and all 45 retries fail.
get_data <- function(start_dates, end_dates, terms) {
  # https keeps the API key out of clear-text traffic; paste0() has no `sep`
  # argument, so the stray `sep = ""` was dropped.
  url <- paste0("https://api.nytimes.com/svc/search/v2/articlesearch.json?q=%22",
                terms,
                "%22&begin_date=",
                start_dates,
                "&end_date=",
                end_dates,
                "&facet_filter=true&api-key=",
                nyt_key)
  max_retries <- 45L
  # attempt 0 is the initial query; attempts 1..max_retries are retries.
  for (attempt in 0:max_retries) {
    if (attempt > 0L) {
      message("Re-trying query: attempt ", attempt, " of ", max_retries, ".")
      Sys.sleep(1)
    }
    # capture the error object instead of testing exists("query"), which
    # could be fooled by a `query` variable in an enclosing environment.
    query <- tryCatch(fromJSON(url, flatten = TRUE),
                      error = function(e) e)
    if (!inherits(query, "error")) {
      return(query)
    }
    message("Query failed: ", conditionMessage(query))
  }
  message("Retry limit reached: initial query unsuccessful.")
  list()
}

```

&nbsp;

### **3) Z-score Transformation:**

The number of articles is converted to each term's z-score. This allows us to view the term's relative distribution over time. It is calculated as 1) the number of articles less the term's mean, 2) divided by the term's standard deviation.

—EXAMPLE CODE SNIPPET—

```{r, echo = TRUE, message = FALSE, warning = FALSE, dpi=600}

# Standardises a numeric vector into z-scores.
#
# Args:
#   data: numeric vector of values (e.g. the yearly hit counts of one term).
#
# Returns:
#   A bare numeric vector the same length as `data`, holding
#   (value - mean) / standard deviation for each element. Elements are NA
#   when `data` has a single value (sd() is NA) and NaN when the standard
#   deviation is 0. Empty input gives numeric(0).
get_z_score <- function(data) {
  # as.numeric() drops names/attributes so the result is always a plain vector.
  values <- as.numeric(data)
  # Vectorised arithmetic replaces the element-by-element loop, which walked
  # 1:0 (i.e. c(1, 0)) on empty input and produced a spurious NA.
  (values - mean(values)) / sd(values)
}

# Computes the z-score of `hits` within each `term` group.
#
# Args:
#   data: data frame with a `term` column (grouping key) and a numeric
#         `hits` column.
#
# Returns:
#   A numeric vector with one z-score per row of `data`, in the same row
#   order as `data`, so it can be assigned straight back as a column.
#   The previous implementation concatenated the groups in group_split()
#   order, which only lined up with the rows when `data` was already sorted
#   by term; for such pre-sorted input the values are unchanged.
get_z_scores <- function(data) {
  data %>%
    group_by(term) %>%
    # a grouped mutate() keeps every value on its own row.
    mutate(z_score = get_z_score(hits)) %>%
    ungroup() %>%
    pull(z_score)
}

```

&nbsp;

### **4) Loess Transformation:**

The data for each term are plotted using a loess regression line (`geom_smooth` in the code below). This turns the data into a smooth curve, making overall trends easier to see. 

—EXAMPLE CODE SNIPPET—

```{r, echo = TRUE, message = FALSE, warning = FALSE, dpi=600}

# creates a plot with smoothed regression lines (simplified example).
#
# Args:
#   category: character vector of terms; title-cased and matched against
#             the `term` column of the global `sorted_df`.
#   title:    plot title.
#
# Returns:
#   A ggplot object with one smoothed line per term and `my_theme` applied.
#   The earlier version had the pipeline commented out, so `plot` was never
#   assigned and the function returned base R's `plot` function.
make_plot <- function(category, title) {
  sorted_df %>%
    filter(term %in% str_to_title(category)) %>%
    ggplot(aes(x = year,
               y = z_score,
               col = term)) +
    # the smoothing step: span sets how wiggly the curve is, and se = FALSE
    # hides the confidence band.
    geom_smooth(se = FALSE,
                span = 0.5,
                size = 0.5) +
    ggtitle(title) +
    ylab("Articles (z-scores)") +
    xlab("") +
    my_theme
}

```

&nbsp;

## **Sources**

- Boyer (2019) https://www.esquire.com/news-politics/a26454551/donald-trump-interview-new-york-times-media-objectivity/

- New York Times (2021) https://developer.nytimes.com/apis

- Rutenberg (2016) https://www.nytimes.com/2016/08/08/business/balance-fairness-and-a-proudly-provocative-presidential-candidate.html

- Statology (2021) https://www.statology.org/interpret-z-scores/

&nbsp;
&nbsp;
&nbsp;