CAA_analysis.Rmd

---
title: "CAA_Analysis"
author: "SA"
date: "6/23/2020"
output: html_document
---

```{r}
library(tidyverse)
library(data.table)
library(quanteda)
library(stringr)
library(lubridate)

```

Read the data and drop the columns that are not needed
```{r}
# Load the tweet export and discard its first two columns (by position).
caa_dataset <- fread(
  "/Users/shreyaagarwal/Google Drive/Dissertation/Data/unique_rows_hi_eng.csv"
)
caa_dataset <- caa_dataset[, -c(1, 2)]

# Print the remaining column names as a quick visual check.
colnames(caa_dataset)

# Keep only the columns used by the rest of the analysis and store the
# creation timestamp as a Date.
caa_text <- caa_dataset %>%
  select(
    "created_at", "screen_name", "text", "hashtags",
    "source", "description", "lang"
  ) %>%
  mutate(created_at = lubridate::as_date(created_at))

# Frequency table of the raw `hashtags` column values.
hashtags <- as.data.table(table(caa_dataset[["hashtags"]]))
```

Word lists for columns - hashtags, description, screen_name
- assign hashtags to Pro CAA, Anti CAA and Neutral stance
```{r}
# Word lists used with str_detect().
# The three hashtag vectors are collapsed with "|" at the point of use, so each
# element is treated as a regular-expression alternative.

# Hashtags taken to signal an anti-CAA stance.
anti_caa <- c(
  "WeReject_CAA_NPR_NRC", "IndiaAgainstCAA_NRC", "notocaa",
  "kagaj_nahi_dikhayenge", "Anti_CAA", "शाहीनबाग_जीतेगा_मोदी_हारेगा",
  "UNRejectsCAA", "IndiaAgainstCAA", "AmitShahIstifaDo", "RejectCAA",
  "IndiaAgainstCAA_NRC_NPR", "ArrestKapilMishra", "SCSTOBC_Against_Hate",
  "CrimesAgainstHumanity", "HinduTerror"
)

# Hashtags taken to signal a pro-CAA stance.
# "ISupportKapilMishra" was listed twice; the duplicate is dropped (the
# resulting alternation matches exactly the same strings).
pro_caa <- c(
  "CAA_NRC_support", "SupportCAA", "supportCAA", "IndiaSupportsCAA_NRC",
  "Pro_CAA", "शाहीन_बाग_हारेगा_देश_जितेंगा", "IAmIndianISupportCAA",
  "IStandWithKapilMishra", "SaveDelhiHindus", "हिंदुओ_को_मारना_बंद_करो",
  "ISupportKapilMishra", "हर_दिल_में_योगी", "PatrioticIndiansSupportCAA",
  "AntiHinduRI", "Anti CAA agitators killed", "Tahir_hussain_terrorist"
)

# Hashtags treated as stance-neutral.
neutral_hashtags <- c(
  "DelhiRiots2020", "CAA", "ShaheenBagh", "CAA_NRC", "DelhiViolance",
  "Delhigenocide", "Jafrabad", "CitizenshipAmendmentBill",
  "ShaheenBaghProtests", "DelhiCAAClashes", "CAAProtests", "CAAProtest",
  "CitizenshipAmendmentAct", "DelhiBurning", "CitizenshipAmendmentBill2019",
  "CitizenshipAct", "caa", "shaheenbaghs", "DelhiPolice", "Kashmir",
  "AmulyaLeona", "Karnataka", "Aligarh"
)

# Single-string regex over a subset of the neutral hashtags.
# The previous literal had spaces around some "|" separators; inside a regex
# those spaces are part of the alternative, so e.g. " CAA " only matched when
# surrounded by spaces. The alternatives are now joined without padding.
n_hash <- paste(
  c(
    "DelhiRiots2020", "CAA", "ShaheenBagh", "CAA_NRC", "DelhiViolance",
    "Delhigenocide", "Jafrabad", "CitizenshipAmendmentBill",
    "ShaheenBaghProtests", "DelhiCAAClashes", "CAAProtests", "CAAProtest",
    "CitizenshipAmendmentAct", "DelhiBurning", "CitizenshipAmendmentBill2019",
    "CitizenshipAct", "caa", "shaheenbaghs"
  ),
  collapse = "|"
)

# Single-string regex matched against account descriptions to spot news
# organisations. Fixes relative to the previous literal:
#   * padding spaces around "|" removed, so a phrase also matches at the very
#     start/end of a description or next to punctuation;
#   * the "+" in "AJ+" is escaped so it is a literal plus sign rather than a
#     quantifier on "J";
#   * the repeated "news agency" entry is listed once.
# The final catch-all keeps its leading space (" news") on purpose, so it still
# only fires on "news" preceded by a space, exactly as before.
news_words <- paste(
  c(
    "Breaking news", "latest news", "independent news", "Hindi News",
    "digital news", "news agency", "leading news", "online news",
    "Kadak news", "News Services", "news follow", "AJ\\+ is news",
    "daily news", "presenting news", "Regional news", "Asianetnews",
    "international news", "news channel", "source on news", "global news",
    "news ticker", "commentary news", "Deccan Chronicle",
    "Democracy Express", "instant news", "The Economic Times", "Indian news",
    "FinancialExpress", "word in news", "news website", " news"
  ),
  collapse = "|"
)
```


Creating a training set - Tweet classification - Anti_CAA, Pro_CAA
```{r}
library(tidyverse)

# Label each tweet from its hashtags. Every tag list is collapsed into one
# alternation regex; the anti-CAA list is checked first, then the pro-CAA list.
# Rows matching neither list get NA.
caa_text <- caa_text %>%
  mutate(
    CAA_stance = case_when(
      str_detect(hashtags, str_c(anti_caa, collapse = "|")) ~ "Anti_CAA",
      str_detect(hashtags, str_c(pro_caa, collapse = "|")) ~ "Pro_CAA"
    )
  )

# Count of Pro_CAA and Anti_CAA labelled tweets
# (author's note: 115,176 - 21% of tweets are labelled).
table(caa_text$CAA_stance)

```

*Filter news organization from main dataset*

1. Filter out irrelevant tweets from news tweets 
2. Remove duplicated tweets from news deck
3. Total count of distinct tweets from news organisations - 6,286

```{r}

# Tweets from accounts whose description matches the news vocabulary,
# keeping one row per distinct tweet text.
news_tweets <- caa_text %>%
  filter(str_detect(description, news_words)) %>%
  distinct(text, .keep_all = TRUE)

# Approximate clean-up: drop individual (non-organisation) accounts whose
# screen name matches either exclusion pattern.
# NOTE(review): `ind_screen_name`, `ind_screen` and `neutral_screennames` are
# not defined in this document; they must already exist in the session.
news_tweets_dis <- news_tweets %>%
  filter(!str_detect(screen_name, ind_screen_name)) %>%
  filter(!str_detect(screen_name, ind_screen))

# Mark the rows as news and (re)compute the stance column: hashtag lists first,
# then fall back to "Neutral" for the listed neutral screen names.
news_tweets_dis <- news_tweets_dis %>%
  mutate(
    news_notnews = "News",
    CAA_stance = case_when(
      str_detect(hashtags, paste(anti_caa, collapse = "|")) ~ "Anti_CAA",
      str_detect(hashtags, paste(pro_caa, collapse = "|")) ~ "Pro_CAA",
      str_detect(screen_name, paste(neutral_screennames, collapse = "|")) ~ "Neutral"
    )
  )

```


```{r}

# Distinct tweets whose text does not appear among the news-organisation
# tweets, minus one unwanted account, flagged as "Not News".
citizen_tweets <- caa_text %>%
  filter(!(text %in% news_tweets_dis$text)) %>%
  distinct(text, .keep_all = TRUE) %>%
  filter(!str_detect(screen_name, "_cricketkeeda")) %>%
  mutate(news_notnews = "Not News")

# Stance counts (author's note: 22% of tweets are labelled).
table(citizen_tweets$CAA_stance)

# Language counts (author's note: more Hindi than English, En:Hi about 44:56).
table(citizen_tweets$lang)

```

*Dataset with -- Lang, Stance, News/NotNews columns* - 25% of the dataset is labelled - Neutral, Anti, Pro
```{r}

# Stack the citizen rows on top of the news-organisation rows.
caa <- citizen_tweets %>% rbind(news_tweets_dis)

```

Step 1: Clean the text
```{r}
# Reload the combined news + citizen tweets from disk; this replaces any `caa`
# object already in the session.
caa <- fread(
  input = "/Users/shreyaagarwal/Google Drive/Dissertation/Data/combined_news_citizen_tweets.csv",
  header = TRUE
)

#caa <- caa[,-c(1,2)]

# Hindi stopwords, one entry per line, read as UTF-8.
hi_stop <- readLines(
  con = "/Users/shreyaagarwal/Google Drive/Dissertation/Data/final_stopwords.txt",
  encoding = "UTF-8"
)

# Hinglish stopwords, one entry per line.
hiEng_stop <- readLines(
  con = "/Users/shreyaagarwal/Google Drive/Dissertation/Data/hinglish_stopwords.txt"
)

#devtools::install_github("quanteda/stopwords")


# Step 1: scrub the tweet text - markup, links, hashtags, mentions,
# punctuation, digits and surplus spaces. The order matters: hashtags and
# mentions must be removed while their "#"/"@" marker is still present, i.e.
# before the punctuation pass.

caa$text <- as.character(caa$text)

# Anything between angle brackets (non-greedy).
caa$text <- gsub("<.*?>", "", caa$text)

# Links: "http" followed by any run of alphanumerics/punctuation.
caa$text <- gsub("http[[:alnum:][:punct:]]*", "", caa$text)

# Hashtags. The previous class `[a-z,A-Z]` stopped at the first digit,
# underscore or non-Latin letter (and wrongly accepted commas), so tags such as
# "#CAA_NRC" or Devanagari tags left their tail in the text. Match the whole
# tag instead: Unicode letters, combining marks, digits and underscores.
caa$text <- str_replace_all(caa$text, "#[\\p{L}\\p{M}\\p{N}_]+", "")

# Mentions: same problem as above for names containing digits or underscores.
caa$text <- str_replace_all(caa$text, "@[A-Za-z0-9_]+", "")

# Remaining punctuation (this also removes any lone "#" or "@") and digits.
caa$text <- gsub("[[:punct:]]", "", caa$text)
caa$text <- gsub("[[:digit:]]", "", caa$text)

# Collapse runs of spaces left behind by the removals; the previous call
# replaced a single space with a single space and therefore did nothing.
caa$text <- str_replace_all(caa$text, " {2,}", " ")

# Strip leading whitespace (argument spelled out instead of the partial "l").
caa$text <- trimws(caa$text, which = "left")
```


```{r}

library(tidyverse)
library(data.table)

# Reload the filtered, combined tweets and split them by language code.
caa <- fread("/Users/shreyaagarwal/Google Drive/Dissertation/Data/combined_news_citizen_filtered.csv")

caa_english <- caa %>% filter(lang == "en")
caa_hi <- caa %>% filter(lang == "hi")

# Author's note: Excel could not read the full Hindi set, so it was split into
# a first chunk of 20,000 rows and the rest.
# The remainder used to be taken with the hard-coded range 20001:129010, which
# silently drops rows (or adds all-NA rows) whenever the Hindi row count is not
# exactly 129,010. head()/tail() derive the split from the data instead and
# give the same result for the original row count.
excel_chunk_rows <- 20000
caa_hitext <- head(caa_hi, excel_chunk_rows)
caa_hi_rest <- tail(caa_hi, -excel_chunk_rows)

# The first chunk as re-exported from the spreadsheet.
caa_halftext <- fread("/Users/shreyaagarwal/Downloads/hi_tweets_subtext - hi_tweets_subtext.csv")

# Put the two parts back together (remainder first, spreadsheet chunk last).
caa_hindi <- rbind(caa_hi_rest, caa_halftext)

# Print the stance column for inspection.
(caa_hindi$CAA_stance)

# NOTE(review): both calls below write to the same english_tweets.csv path, so
# the write.csv() output is what remains on disk. Confirm which format is
# wanted and drop the other call.
write_excel_csv(caa_english, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/english_tweets.csv")

write.csv(caa_english, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/english_tweets.csv", fileEncoding = "UTF-8")

write_excel_csv(caa_hindi, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/hindi_tweets.csv")

write.csv(caa_hindi, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/hi_tweets.csv")

# Read the Hindi CSV back in to check that it round-trips.
s <- fread("/Users/shreyaagarwal/Google Drive/Dissertation/Data/hi_tweets.csv", header = TRUE)


```

```{r}


# Hindi tweets from accounts whose description matches the news vocabulary,
# one row per distinct tweet text.
Hinews_tweets <- caa_hindi %>%
  filter(str_detect(description, news_words)) %>%
  distinct(text, .keep_all = TRUE)

# All remaining Hindi tweets (text not seen among the news tweets), also
# de-duplicated on text.
hicitizen_tweets <- caa_hindi %>%
  filter(!(text %in% Hinews_tweets$text)) %>%
  distinct(text, .keep_all = TRUE)

# News rows first, citizen rows after.
hindi_tweets <- Hinews_tweets %>% rbind(hicitizen_tweets)

# Export the combined Hindi set as an Excel workbook.
library(openxlsx)
write.xlsx(
  hindi_tweets,
  "/Users/shreyaagarwal/Google Drive/Dissertation/Data/hi_tweets1.xlsx"
)
```

Read the English tweets and remove the leftover hashtag text from them.
```{r}
eng_tweets <- read.csv("/Users/shreyaagarwal/Google Drive/Dissertation/Data/english_tweets123 copy.csv", header = TRUE)

# Hashtag words (already stripped of "#" and "_") to delete from the text.
# They are removed in a single pass with one alternation regex. The regex
# engine tries alternatives left to right, so a tag that extends another one
# is listed before it.
#
# Changes relative to the previous one-call-per-tag version:
#   * "AntiHinduRiots" now precedes "AntiHinduRiot"; before, the shorter tag
#     was removed first and a stray "s" was left behind.
#   * "UN" is matched as a whole word only; the bare pattern also cut "UN" out
#     of longer uppercase words.
#   * Multi-word entries made only of tags removed earlier
#     ("CAA UNRejectsCAA", "DelhiRiots2020 AmitShahIstifaDo",
#     "UNRejectsCAA UNRejectsCAA", "DelhiBurning ArrestTerroristKapilMishra",
#     "mosque DelhiBurning ArrestTerroristKapilMishra") could never match and
#     are omitted, as is the repeated "UNRejectsCAA".
leftover_hashtags <- c(
  "IndiaAgainstCAANRC", "IndiaAgainstCAA",
  "AntiHinduRiots", "AntiHinduRiot",
  "AmitShahIstifaDo", "SaveDelhiHindus", "UNRejectsCAA",
  "Tahirhussainterrorist", "ArrestKapilMishra", "SCSTOBCAgainstHate",
  "हिंदुओकोमारनाबंदकरो", "ArrestTahirHussain", "DelhiRiots2020",
  "DelhiBurning", "ArrestTerroristKapilMishra", "WhyBJPBurningDelhi",
  "GenocideInDelhi", "ArrestSwaraBhaskar", "AmitShahResign",
  "DelhiAgainstJehadiViolence", "DelhiRiotTruth", "stopkillinghindus",
  "\\bUN\\b"
)

eng_tweets$text <- str_replace_all(
  eng_tweets$text,
  paste(leftover_hashtags, collapse = "|"),
  ""
)

# NOTE(review): this overwrites the file that was read above, and write.csv()
# adds a row-name column by default, so each run adds one more index column.
# Consider row.names = FALSE once downstream readers are checked.
write.csv(eng_tweets, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/english_tweets123 copy.csv", fileEncoding = "UTF-8")
```

Read the Hindi tweets and remove the leftover hashtag text from them.
```{r}
# Load the Hindi tweets workbook into `hindi_tweets`, the name every later
# chunk operates on. The result used to be stored only as `Hindi_tweets`
# (capital H), so the file that was read was never the data being cleaned.
hindi_tweets <- read.xlsx("/Users/shreyaagarwal/Google Drive/Dissertation/Data/hi_tweets1.xlsx")

# Keep the original capitalised name as an alias for anything that used it.
Hindi_tweets <- hindi_tweets
```

```{r}
# Hashtag words (already stripped of "#" and "_") to delete from the text.
# They are removed in a single pass with one alternation regex. The regex
# engine tries alternatives left to right, so a tag that extends another one
# is listed before it.
#
# Changes relative to the previous one-call-per-tag version:
#   * "AntiHinduRiots" now precedes "AntiHinduRiot"; before, the shorter tag
#     was removed first and a stray "s" was left behind.
#   * "UN" is matched as a whole word only; the bare pattern also cut "UN" out
#     of longer uppercase words.
#   * Multi-word entries made only of tags removed earlier
#     ("CAA UNRejectsCAA", "DelhiRiots2020 AmitShahIstifaDo",
#     "UNRejectsCAA UNRejectsCAA", "DelhiBurning ArrestTerroristKapilMishra",
#     "mosque DelhiBurning ArrestTerroristKapilMishra") could never match and
#     are omitted, as is the repeated "UNRejectsCAA".
leftover_hashtags <- c(
  "IndiaAgainstCAANRC", "IndiaAgainstCAA",
  "AntiHinduRiots", "AntiHinduRiot",
  "AmitShahIstifaDo", "SaveDelhiHindus", "UNRejectsCAA",
  "Tahirhussainterrorist", "ArrestKapilMishra", "SCSTOBCAgainstHate",
  "हिंदुओकोमारनाबंदकरो", "ArrestTahirHussain", "DelhiRiots2020",
  "DelhiBurning", "ArrestTerroristKapilMishra", "WhyBJPBurningDelhi",
  "GenocideInDelhi", "ArrestSwaraBhaskar", "AmitShahResign",
  "DelhiAgainstJehadiViolence", "DelhiRiotTruth", "stopkillinghindus",
  "\\bUN\\b"
)

hindi_tweets$text <- str_replace_all(
  hindi_tweets$text,
  paste(leftover_hashtags, collapse = "|"),
  ""
)


```

```{r}
# Scrub the Hindi tweet text - markup, links, hashtags, mentions, punctuation,
# digits and surplus spaces. Hashtags and mentions must be removed while their
# "#"/"@" marker is still present, i.e. before the punctuation pass.

# Anything between angle brackets (non-greedy).
hindi_tweets$text <- gsub("<.*?>", "", hindi_tweets$text)

# Links: "http" followed by any run of alphanumerics/punctuation.
hindi_tweets$text <- gsub("http[[:alnum:][:punct:]]*", "", hindi_tweets$text)

# Hashtags. The previous class `[a-z,A-Z]` matched no Devanagari letters,
# digits or underscores (and wrongly accepted commas), so for Hindi tags only
# the "#" itself was removed. Match the whole tag: Unicode letters, combining
# marks, digits and underscores.
hindi_tweets$text <- str_replace_all(hindi_tweets$text, "#[\\p{L}\\p{M}\\p{N}_]+", "")

# Mentions: same problem as above for names containing digits or underscores.
hindi_tweets$text <- str_replace_all(hindi_tweets$text, "@[A-Za-z0-9_]+", "")

# Remaining punctuation (this also removes any lone "#" or "@") and digits.
hindi_tweets$text <- gsub("[[:punct:]]", "", hindi_tweets$text)
hindi_tweets$text <- gsub("[[:digit:]]", "", hindi_tweets$text)

# Collapse runs of spaces left behind by the removals; the previous call
# replaced a single space with a single space and therefore did nothing.
hindi_tweets$text <- str_replace_all(hindi_tweets$text, " {2,}", " ")

# Strip leading whitespace (argument spelled out instead of the partial "l").
hindi_tweets$text <- trimws(hindi_tweets$text, which = "left")

```

```{r}
# Save the cleaned Hindi tweets as CSV (default write.csv settings).
write.csv(
  x = hindi_tweets,
  file = "/Users/shreyaagarwal/Google Drive/Dissertation/Data/hi_tweets1.csv"
)
```