-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_cities_to_countries.R
147 lines (115 loc) · 8.34 KB
/
import_cities_to_countries.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
library(jsonlite)
library(dplyr)
rm(list = ls())
source("formulas.r")
# Import data -----------------------------------------------------------------
# original json file "https://github.com/David-Haim/CountriesToCitiesJSON/blob/master/countriesToCities.json"
json_countries_to_cities <- "Data\\datasets\\countriesToCities.json"
countries_to_cities <- fromJSON(json_countries_to_cities, flatten=TRUE)
# Conversion of countries_to_cities into a data.frame -------------------------
# The simple 'lapply' loses the name of the list which contains the country name
convert_countries_to_cities = function(list_countries_to_cities,
country_names) {
r <- cbind(as.character(unlist(list_countries_to_cities)), country_names)
r <- as.data.frame(r, stringsAsFactors = FALSE)
names(r) <- c("country", "city")
return (r)
}
df_countries_to_cities <- mapply(convert_countries_to_cities, names(countries_to_cities), countries_to_cities, SIMPLIFY = FALSE, USE.NAMES = TRUE)
df_countries_to_cities <- bind_rows(df_countries_to_cities, NULL)
df_countries_to_cities <- unique(df_countries_to_cities)
# get data frame of cities from tennis data -----------------------------------
allGamesWithoutRating <- getAllGamesWithoutRating()
city <- trimws(unique(allGamesWithoutRating$Location))
city <- as.data.frame(city, stringsAsFactors = FALSE)
city <- filter(city, city != "")
unique_cities <- count(distinct(city, city))
# join data city with countries_to_cities -------------------------------------
city <- left_join (city, df_countries_to_cities, by = c('city' = 'city'))
# Some city names occur in multiple countries
# Add Reference city - country list for cities in different countries
CityCountryReference <- data.frame("Melbourne","Australia", stringsAsFactors = FALSE)
names(CityCountryReference) <- c("city","country")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("San Jose","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("London","United Kingdom")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Copenhagen","Denmark")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Santiago","Chile")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Scottsdale","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Miami","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Barcelona","Spain")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Rome","Italy")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Hamburg","Germany")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Paris","France")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Halle","Germany")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Nottingham","United Kingdom")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Newport","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Amsterdam","Netherlands")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Stuttgart","Germany")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Los Angeles","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("San Marino","San Marino")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Toronto","Canada")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Washington","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Palermo","Italy")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Brighton","United Kingdom")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Stockholm","Sweden")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Lisbon","Portugal")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Houston","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Sopot","Poland")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Montreal","Canada")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Madrid","Spain")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Valencia","Spain")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Metz","France")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Las Vegas","United States")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Warsaw","Poland")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Brisbane","Australia")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Belgrade","Serbia")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Eastbourne","United Kingdom")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Moscow","Russia")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Lyon","France")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Basel","Switzerland")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Oeiras","Portugal")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Geneva","Switzerland")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Antwerp","Belgium")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Munich","Germany")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Vienna","Austria")
CityCountryReference[nrow(CityCountryReference) + 1,] = c("Sydney","Australia")
# Missing Reference city - country list for cities in different countries
MissingCityCountryReference <- data.frame("Mallorca","Spain", stringsAsFactors = FALSE)
names(MissingCityCountryReference) <- c("city","country")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("St. Polten","Austria")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Queens Club","United Kingdom")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Kitzbuhel","Austria")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("St. Petersburg","Russia")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Vina del Mar","Chile")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("'s-Hertogenbosch","Netherlands")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("s-Hertogenbosch","Netherlands")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Dubai","United Arab Emirates")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Estoril","Portugal")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Costa Do Sauipe","Brasil")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Ho Chi Min City","China")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Portschach","Austria")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Johannesburg","South Africa")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Shenzhen","China")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Marrakech","Morocco")
MissingCityCountryReference[nrow(MissingCityCountryReference) + 1,] = c("Los Cabos","Mexico")
# first remove reference rows with multiple countries
city <- filter(city, !city %in% CityCountryReference[[1]])
# add correct reference rows for cities with multiple countries
city <- rbind(city, CityCountryReference)
# remove known cities/places with missing countries (country = NA)
city <- filter(city, !city %in% MissingCityCountryReference[[1]])
# add missing countries for city/places
city <- rbind(city, MissingCityCountryReference)
# Validation check if there are still missing rows ----------------------------
allMatched <- as.integer(sum(is.na(city$country)))
if (allMatched > 0) {
"Not all countries are found for all cities, NOT GOOD"
} else if (nrow(city) == unique_cities) {
"All cities are matched with countries, GOOD"
} else {
"NOT All cities are matched with countries, NOT GOOD"
}
# Export result city country --------------------------------------------------
write.csv(file = "Data/datasets/citycountry.csv", city, row.names=FALSE)
# clean up variables ----------------------------------------------------------
rm(json_countries_to_cities, CityCountryReference, df_countries_to_cities, allMatched, countries_to_cities, MissingCityCountryReference, convert_countries_to_cities, unique_cities)