-
Notifications
You must be signed in to change notification settings - Fork 1
/
2-DataManipulation.R
115 lines (90 loc) · 4.12 KB
/
2-DataManipulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# #######################################################################
# File-Name: 2-DataManipulation.R
# Version: R 3.4.3
# Date: Sep 07, 2018
# Author: Sumitra Badrinathan <sumitra@sas.upenn.edu>
# Purpose: Reading in and manipulating data on Amnesty's
# assessment of state terror in 1994
# Machine: macOS 10.14
# #######################################################################
# ---- Session setup ----
# Start from an empty workspace and fix the random-number seed.
rm(list = ls())
set.seed(1221)

# Point R at the course folder (macOS path). On Windows, use the commented-out
# line below instead and replace "name" with your user name.
setwd("~/Dropbox/PSCI338")
# setwd("C:/Users/name/Dropbox/PSCI338")

# ---- Load the data ----
# First download pts1994.csv from this repository and save it in Data/Raw.
# The file holds assessments of state terror in 1994.
# Use read.csv() for .csv files; read.table() is the alternative for .txt
# files (and can also read this file, as in the commented-out call below).
democracy94 <- read.csv(file = "Data/Raw/pts1994.csv")
# democracy94 <- read.table("Data/Raw/pts1994.csv", header = TRUE,
#                           na.strings = "NA", sep = ",")

# What kind of object did we get? A data frame, which works like a spreadsheet.
class(democracy94)

# Peek at the first few rows of the full data.
head(democracy94, n = 6L)
# The data contain 6 variables and 179 rows, one row per country. The variables
# include the country name, the COW country code, the World Bank country code,
# and Amnesty's and the State Department's assessments of terror in 1994 on a
# scale of 1 to 5 (higher = more terror).
head(democracy94, 15) # first 15 rows instead of the default 6
names(democracy94) # names of the variables in the dataset

# See the data for one country. With a data frame, index as [rows, columns];
# leaving the column slot empty returns every column.
democracy94[1, ]

# Will this work? Without a comma, a single index selects COLUMNS, so 1:10 asks
# for more columns than the 6 described above and R signals an error.
# try() still reports that error but lets the rest of the script keep running.
try(democracy94[1:10])

# See the data for one variable, say the Amnesty score (two ways to do this).
democracy94[, 5] # by column position -- notice the missing data (NA)!
democracy94$Amnesty.1994 # by column name

# Summary of a variable
summary(democracy94$Amnesty.1994)

# Tabulate the Amnesty variable, ignoring missing values
table(democracy94$Amnesty.1994)

# Tabulate the Amnesty variable, including missing values
table(democracy94$Amnesty.1994, useNA = "ifany")

# Table of proportions
prop.table(table(democracy94$Amnesty.1994))

# Basic numerical stats
# max() returns NA when the variable has missing values -- how to remove them?
max(democracy94$Amnesty.1994)
# Answer: na.rm = TRUE drops the NAs before computing the statistic.
mean(democracy94$Amnesty.1994, na.rm = TRUE)
var(democracy94$Amnesty.1994, na.rm = TRUE)
sd(democracy94$Amnesty.1994, na.rm = TRUE)
# ---- Subsetting ----
head(democracy94, n = 6L)

# Keep only certain variables (columns).
reduced <- subset(democracy94, select = c(Country, Amnesty.1994))
head(reduced, n = 6L)
dim(reduced) # rows and columns of the reduced data

# Keep only certain rows: cases where the Amnesty score is 2 or lower.
# Inside subset() the columns can be named directly, without the `democracy94$`
# prefix. Rows where the condition is NA are dropped.
lowterror <- subset(democracy94, subset = Amnesty.1994 <= 2)
head(lowterror, n = 10)
table(lowterror$Amnesty.1994)
dim(lowterror)

# Logical tests in R evaluate to TRUE or FALSE.
is.data.frame(democracy94)
is.matrix(democracy94)

# Same row subset, but this time also keep the cases with a missing score.
lowterror2 <- subset(
  democracy94,
  subset = is.na(Amnesty.1994) | Amnesty.1994 <= 2
)
dim(lowterror2)

# Cases where the two sources differ by more than one unit, i.e. by 2 points
# or more (assuming whole-number scores -- TODO confirm against the data).
disagree <- subset(
  democracy94,
  subset = abs(Amnesty.1994 - StateDept.1994) >= 2
)
dim(disagree)
disagree
# Create an indicator for countries that both sources place at the top of the
# scale: 1 if the Amnesty score AND the State Department score are both 5,
# 0 otherwise. The indicator is NA when a missing score leaves the answer
# unknown (e.g. one score is 5 and the other is NA).
democracy94$highterror <- as.numeric(
  democracy94$Amnesty.1994 == 5 & democracy94$StateDept.1994 == 5
)
head(democracy94, 10) # we created a new variable!
table(democracy94$highterror)

# Which are those countries? which() keeps only the positions where the
# comparison is TRUE. Indexing with the raw logical vector would also return
# an NA entry for every country whose indicator is NA.
democracy94$Country[which(democracy94$highterror == 1)]
# exercise in spurious correlations
# what is the correlation b/w the divorce rate in Maine & per capita consumption of margarine?
# 1. go to http://tylervigen.com/old-version.html
# 2. create a variable "divorce" which is a vector of divorce rates in Maine from 2000-09
# 3. create a variable "margarine" which is a vector of margarine consumption from 2000-09
# 4. find the correlation b/w "divorce" and "margarine"
# 5. come up with a causal theory to relate these two :)
# ---- Read Stata 13 and 14 files in R ----
# Install the package only when it is missing, so re-running the script does
# not download and reinstall it every time.
if (!requireNamespace("readstata13", quietly = TRUE)) {
  install.packages("readstata13")
}
library(readstata13)
# "myStataFile.dta" is a placeholder: replace it with the path to your file.
data <- read.dta13("myStataFile.dta")

# ---- Read SPSS .sav and .por files in R ----
if (!requireNamespace("memisc", quietly = TRUE)) {
  install.packages("memisc")
}
library(memisc)
# Placeholder file names again. Both lines assign to `data`, so run only the
# one that matches your file type (the second would overwrite the first).
data <- as.data.set(spss.system.file("filename.sav"))
data <- as.data.set(spss.portable.file("filename.por"))