-
Notifications
You must be signed in to change notification settings - Fork 0
/
Collate all tweets.Rmd
106 lines (56 loc) · 2.14 KB
/
Collate all tweets.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
---
title: "Step 2 - Collate all tweets and divide them into Hindi and English"
author: "SA"
date: "2/17/2020"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# knitr restores the working directory at the end of every chunk, so a
# setwd() call inside one chunk does not carry over to the next. Setting
# root.dir makes every later chunk evaluate relative to the project folder.
knitr::opts_knit$set(
  root.dir = "/Users/shreyaagarwal/Documents/Code/Machine Learning/Dissertation/Capstone"
)
```
```{r}
# Packages used throughout the document (dplyr verbs and the pipe come from
# tidyverse).
library(tidyverse)
library(readxl)
```
```{r}
library(data.table)
library(readr)
# Collect every CSV export in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored to the end of
# the file name; a bare ".csv" would also match names such as "notes_csv.txt".
files <- list.files(pattern = "\\.csv$")
# Fail loudly instead of writing an empty combined file when nothing matched
# (for example, when the working directory is not the folder of exports).
stopifnot(length(files) > 0)
# Read each file, then stack them into one table 'data'; fill = TRUE pads
# columns that are missing from some files with NA.
temp <- lapply(files, fread, sep = ",")
data <- rbindlist(temp, fill = TRUE)
write_excel_csv(data, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/allcombineddata.csv")
```
```{r}
# Reload the combined file and split out the English and Hindi tweets.
complete_dataset <- fread("/Users/shreyaagarwal/Google Drive/Dissertation/Data/allcombineddata.csv")
eng <- complete_dataset %>% filter(lang == "en")
# `hi` is needed by the expression below; define it the same way as `eng`.
hi <- complete_dataset %>% filter(lang == "hi")
# NOTE(review): as written this evaluates to
# (share of Hindi rows) - (English rows + Hindi rows). That looks like two
# separate checks fused into one expression - confirm the intended formula.
nrow(hi) / nrow(complete_dataset) - sum(nrow(eng), nrow(hi))
```
Check for duplication in the complete dataset (Hindi/English tweets only) - 540,973
```{r}
# Drop rows that are exact duplicates across every column.
data_unique <- unique(data)
# Keep only the Hindi and English tweets and reduce the timestamp to a
# calendar date.
data_unique_lang <- data_unique %>%
  filter(lang %in% c("hi", "en")) %>%
  mutate(created_at = lubridate::as_date(created_at))
# Optional diagnostics, left disabled:
# table(data_unique_lang$created_at)  # rows per date
# table(data_unique$lang)             # language breakdown
write_excel_csv(data_unique_lang, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/unique_rows_hi_eng.csv")
```
```{r}
# Reload the de-duplicated Hindi/English rows and split them by language.
d_dataset <- fread("/Users/shreyaagarwal/Google Drive/Dissertation/Data/unique_rows_hi_eng.csv")
hi_d <- d_dataset %>% filter(lang == "hi")
# `eng_d` is needed by the expression below; define it the same way as `hi_d`.
eng_d <- d_dataset %>% filter(lang == "en")
# NOTE(review): 623345 is a hard-coded row count whose origin is not shown
# here, and the expression evaluates to
# (Hindi rows / 623345) - (English rows + Hindi rows). That looks like two
# separate checks fused into one - confirm the intended formula.
nrow(hi_d) / 623345 - sum(nrow(eng_d), nrow(hi_d))
```
Check for duplication in the text column - 231,910 distinct tweets
```{r}
# Keep the first row for each distinct tweet text, across all languages.
data.text <- data_unique %>% distinct(text, .keep_all = TRUE)
table(data.text$lang)
# Restrict to Hindi and English tweets - 231,910 tweets. The table is already
# distinct on `text`, so a second distinct() pass would change nothing and is
# omitted.
data.text <- data.text %>% filter(lang %in% c("hi", "en"))
data.text$created_at <- lubridate::as_date(data.text$created_at)
table(data.text$created_at)
# NOTE(review): write.csv() adds a row-name column, unlike the
# write_excel_csv() calls used for the other outputs - confirm that downstream
# readers expect it before changing.
write.csv(data.text, "/Users/shreyaagarwal/Google Drive/Dissertation/Data/unique_text_rows_hi_eng.csv")
```