-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCreole Linguistics Analysis(Final Report).Rmd
113 lines (69 loc) · 2.66 KB
/
Creole Linguistics Analysis(Final Report).Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
title: "Creole Linguistics Analysis"
author: "Tariq Williams"
date: "April 2023"
output:
  github_document:
    toc: true
    toc_depth: 2
---
```{r setup, include = FALSE}
# Setup chunk: sets the global knitr chunk defaults and loads the packages
# used by the rest of the report.
# The chunk option include = FALSE (in the chunk header) still runs this chunk
# but keeps its code and any output out of the rendered report.
# echo = FALSE hides the source code of all chunks by default; set it to
# echo = TRUE to show the code in the report.
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(tm)
# readxl provides read_excel(), used to load the lyrics workbook
library(readxl)
```
# Read in data
This is all the code used to read in the datasets.
```{r}
# Load the two input datasets used by the report.
#
# english_words: data frame with a single column `words`, one row per entry
# of the word list. Quoting and comment parsing are switched off so that an
# entry containing an apostrophe or a "#" is read literally; with
# read.table()'s defaults (quote = "\"'", comment.char = "#") an unmatched
# apostrophe starts a quoted string that swallows the following lines and a
# "#" truncates the rest of the line.
english_words <- read.table(
  "english_words_patois_edit.txt",
  header = FALSE,
  col.names = "words",
  quote = "",
  comment.char = ""
)

# lyrics_final: the song table; later chunks read its Artist, Song, Year,
# Genre and Lyrics columns.
lyrics_final <- read_excel("lyrics_Final.xlsx")
```
# Data Cleaning
````{r}
# Delete every token that ends in "embed" (matched case-insensitively) from
# each entry of the Lyrics column by replacing it with an empty string.
lyrics_final$Lyrics <- gsub(
  pattern = "\\b\\w+embed\\b",
  replacement = "",
  x = lyrics_final$Lyrics,
  ignore.case = TRUE
)
````
# Data Analysis
This is all the code used to analyze the data
# Word Count Analysis
``` {r}
# Word-count analysis: for every song, count how many word tokens of its
# lyrics appear in the English word list; every other token is counted as
# Patois. The result is `word_counts`, one row per row of `lyrics_final`.

# read in the English words (one entry per line) and lowercase them so the
# comparison with the lowercased lyrics is case-insensitive
english_words <- tolower(readLines("english_words_patois_edit.txt"))

# Split one lyrics string into lowercase word tokens.
# strsplit() yields a leading "" when the text starts with a non-word
# character and NA when the lyrics are missing; both are dropped here so they
# are not miscounted as Patois words.
tokenize_lyrics <- function(text) {
  tokens <- unlist(strsplit(tolower(text), "\\W+"))
  tokens[!is.na(tokens) & nzchar(tokens)]
}

# one character vector of tokens per song (an empty list for an empty table,
# which the index loop over 1:nrow() did not handle)
tokens_per_song <- lapply(lyrics_final$Lyrics, tokenize_lyrics)

# count the number of English and non-English words per song
english_word_count <- vapply(
  tokens_per_song,
  function(tokens) as.numeric(sum(tokens %in% english_words)),
  numeric(1)
)
patois_word_count <- lengths(tokens_per_song) - english_word_count

# create a data frame with the results; check.names = FALSE keeps the
# column names that contain spaces exactly as written
word_counts <- data.frame(
  Artist = lyrics_final$Artist,
  Song = lyrics_final$Song,
  Year = lyrics_final$Year,
  Genre = lyrics_final$Genre,
  `English Words` = english_word_count,
  `Patois Words` = patois_word_count,
  check.names = FALSE
)
```
# Adding Variables and Filters
```{r}
# Calculate Patois and English percentages of each song's counted words,
# rounded to two decimals. A song with no counted words gives 0 / 0 = NaN.
total_words <- word_counts$`English Words` + word_counts$`Patois Words`
word_counts$Patois_Percentage <- round(
  100 * word_counts$`Patois Words` / total_words,
  2
)
word_counts$English_Percentage <- round(
  100 * word_counts$`English Words` / total_words,
  2
)

# Filters
# Filter for songs with genre "Dancehall". %in% is used instead of == because
# == returns NA for a missing Genre, and an NA in a logical row index adds an
# all-NA row to the result.
dancehall_songs <- word_counts[word_counts$Genre %in% "Dancehall", ]
```
# Data Visualization
This is all the code that is used to visualize the data.
```{r}
```