-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03-data_import.html
366 lines (323 loc) · 33.1 KB
/
03-data_import.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>02-importing and exporting data</title>
<meta name="description" content="02-importing and exporting data" />
<meta name="generator" content="bookdown 0.13 and GitBook 2.6.7" />
<meta property="og:title" content="02-importing and exporting data" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="02-importing and exporting data" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/pagedtable-1.1/css/pagedtable.css" rel="stylesheet" />
<script src="libs/pagedtable-1.1/js/pagedtable.js"></script>
<style type="text/css">
/* Layout for highlighted code listings: every line of a listing is an
   a.sourceLine element inside code.sourceCode. */
a.sourceLine { display: inline-block; line-height: 1.25; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
/* On screen, the div wrapper scrolls when a listing overflows. */
@media screen {
div.sourceCode { overflow: auto; }
}
/* In print, long lines wrap with a hanging indent. */
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
/* Numbered listings: the number shown before each line is taken from the
   line's data-line-number attribute and is excluded from text selection. */
pre.numberSource a.sourceLine
{ position: relative; left: -4em; }
pre.numberSource a.sourceLine::before
{ content: attr(data-line-number);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; pointer-events: all; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
/* Syntax-highlighting colours, one rule per token class; the trailing
   comment on each rule names the token class it styles. */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><strong><a href="./">MRDA 2019</a></strong></li>
<li class="divider"></li>
<li class="chapter" data-level="0.1" data-path=""><a href="#data-import-and-export"><i class="fa fa-check"></i><b>0.1</b> Data import and export</a><ul>
<li class="chapter" data-level="0.1.1" data-path=""><a href="#getting-data-for-this-course"><i class="fa fa-check"></i><b>0.1.1</b> Getting data for this course</a></li>
<li class="chapter" data-level="0.1.2" data-path=""><a href="#import-data-created-by-other-software-packages"><i class="fa fa-check"></i><b>0.1.2</b> Import data created by other software packages</a></li>
<li class="chapter" data-level="0.1.3" data-path=""><a href="#export-data"><i class="fa fa-check"></i><b>0.1.3</b> Export data</a></li>
<li class="chapter" data-level="0.1.4" data-path=""><a href="#import-data-from-the-web"><i class="fa fa-check"></i><b>0.1.4</b> Import data from the Web</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">02-importing and exporting data</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="header">
<h1 class="title">02-importing and exporting data</h1>
</div>
<div id="data-import-and-export" class="section level2">
<h2><span class="header-section-number">0.1</span> Data import and export</h2>
<p>Before you can start your analysis in R, you first need to import the data you wish to perform the analysis on. You will often be faced with different types of data formats (usually produced by some other statistical software like SPSS or Excel or a text editor). Fortunately, R is fairly flexible with respect to the sources from which data may be imported and you can import the most common data formats into R with the help of a few packages. R can, among others, handle data from the following sources:</p>
<p><img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/Graphics/data_import.JPG" alt="Overview of data sources that can be imported into R" /></p>
<p>In the previous chapter, we saw how we may use the keyboard to input data in R. In the following sections, we will learn how to import data from text files and other statistical software packages.</p>
<div id="getting-data-for-this-course" class="section level3">
<h3><span class="header-section-number">0.1.1</span> Getting data for this course</h3>
<p>Most of the data sets we will be working with in this course will be stored in text files (i.e., .dat, .txt, .csv). There are two ways for you to obtain access to the data sets:</p>
<p><a href="./Code/03-data_import%20(2).R">You can download the corresponding R-Code here</a></p>
<div id="directly-import-datasets-from-github-recommended" class="section level4">
<h4><span class="header-section-number">0.1.1.1</span> Directly import datasets from GitHub (recommended)</h4>
<p>All data sets we will be working with are stored in a repository on GitHub (similar to other cloud storage services such as Dropbox). If you know the location where the files are stored, you may conveniently load the data directly from GitHub into R using the <code>read.table()</code> function. The <code>header=TRUE</code> argument indicates that the first line of data represents the header, i.e., it contains the names of the columns. The <code>sep="\t"</code>-argument specifies the delimiter (the character used to separate the columns), which is a TAB in this case.</p>
</div>
<div id="download-and-import-datasets-from-learnwu" class="section level4">
<h4><span class="header-section-number">0.1.1.2</span> Download and import datasets from “Learn@WU”</h4>
<p>It is also possible to download the data from the respective folder on the “Learn@WU” platform, place it in the working directory and import it from there. However, this requires an additional step to download the file manually first. If you choose this option, please <strong>remember to put the data file in the working directory first</strong>. If the import is not working, check your working directory setting using <code>getwd()</code>. Once you have placed the file in the working directory, you can import it using the same command as above. Note that the file must be given as a character string (i.e., in quotation marks) and has to end with the file extension (e.g., .csv, .tsv, etc.).</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb1-1" data-line-number="1">music_data <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"music.data.extension"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a></code></pre></div>
</div>
</div>
<div id="import-data-created-by-other-software-packages" class="section level3">
<h3><span class="header-section-number">0.1.2</span> Import data created by other software packages</h3>
<p>Sometimes, you may need to import data files created by other software packages, such as Excel or SPSS. In this section we will use the <code>readxl</code> and <code>haven</code> packages to do this. To import a certain file you should first make sure that the file is stored in your current working directory. You can list all file names in your working directory using the <code>list.files()</code> function. If the file is not there, either copy it to your current working directory, or set your working directory to the folder where the file is located using <code>setwd("/path/to/file")</code>. This tells R the folder you are working in. Remember that you have to use <code>/</code> instead of <code>\</code> to specify the path (if you use Windows paths copied from the explorer they will not work). When your file is in your working directory you can simply enter the filename into the respective import command. The import commands offer various options. For more details enter <code>?read_excel</code>, <code>?read_spss</code> after loading the packages.</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb2-1" data-line-number="1"><span class="kw">list.files</span>() <span class="co">#lists all files in the current working directory</span></a>
<a class="sourceLine" id="cb2-2" data-line-number="2"><span class="co"># setwd('/path/to/file') #may be used to change the</span></a>
<a class="sourceLine" id="cb2-3" data-line-number="3"><span class="co"># working directory to the folder that contains the</span></a>
<a class="sourceLine" id="cb2-4" data-line-number="4"><span class="co"># desired file</span></a>
<a class="sourceLine" id="cb2-5" data-line-number="5"></a>
<a class="sourceLine" id="cb2-6" data-line-number="6"><span class="co"># import excel files</span></a>
<a class="sourceLine" id="cb2-7" data-line-number="7"><span class="kw">library</span>(readxl) <span class="co">#load package to import Excel files</span></a>
<a class="sourceLine" id="cb2-8" data-line-number="8"><span class="kw">excel_sheets</span>(<span class="st">"music_data.xlsx"</span>)</a>
<a class="sourceLine" id="cb2-9" data-line-number="9">music_data_excel <-<span class="st"> </span><span class="kw">read_excel</span>(<span class="st">"music_data.xlsx"</span>, <span class="dt">sheet =</span> <span class="st">"mrda_2016_survey"</span>) <span class="co"># 'sheet=x'' specifies which sheet to import</span></a>
<a class="sourceLine" id="cb2-10" data-line-number="10"><span class="kw">head</span>(music_data_excel)</a>
<a class="sourceLine" id="cb2-11" data-line-number="11"></a>
<a class="sourceLine" id="cb2-12" data-line-number="12"><span class="kw">library</span>(haven) <span class="co">#load package to import SPSS files</span></a>
<a class="sourceLine" id="cb2-13" data-line-number="13"><span class="co"># import SPSS files</span></a>
<a class="sourceLine" id="cb2-14" data-line-number="14">music_data_spss <-<span class="st"> </span><span class="kw">read_sav</span>(<span class="st">"music_data.sav"</span>)</a>
<a class="sourceLine" id="cb2-15" data-line-number="15"><span class="kw">head</span>(music_data_spss)</a></code></pre></div>
<p>The import of other file formats works in a very similar way (e.g., Stata, SAS). Please refer to the respective help-files (e.g., <code>?read_dta</code>, <code>?read_sas</code> …) if you wish to import data created by other software packages.</p>
</div>
<div id="export-data" class="section level3">
<h3><span class="header-section-number">0.1.3</span> Export data</h3>
<p>Exporting to different formats is also easy, as you can just replace “read” with “write” in many of the previously discussed functions (e.g., <code>write.table(object, "file_name")</code>). This will save the data file to the working directory. To check what the current working directory is you can use <code>getwd()</code>. By default, the <code>write.table(object, "file_name")</code> function includes the row number as the first variable. By specifying <code>row.names = FALSE</code>, you may exclude this variable since it doesn’t contain any useful information.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb3-1" data-line-number="1"><span class="kw">write.table</span>(music_data, <span class="st">"musicData.dat"</span>, <span class="dt">row.names =</span> <span class="ot">FALSE</span>, </a>
<a class="sourceLine" id="cb3-2" data-line-number="2"> <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>) <span class="co">#writes to a tab-delimited text file</span></a>
<a class="sourceLine" id="cb3-3" data-line-number="3"><span class="kw">write.table</span>(music_data, <span class="st">"musicData.csv"</span>, <span class="dt">row.names =</span> <span class="ot">FALSE</span>, </a>
<a class="sourceLine" id="cb3-4" data-line-number="4"> <span class="dt">sep =</span> <span class="st">","</span>) <span class="co">#writes to a comma-separated value file </span></a>
<a class="sourceLine" id="cb3-5" data-line-number="5"><span class="kw">write_sav</span>(music_data, <span class="st">"my_file.sav"</span>)</a></code></pre></div>
</div>
<div id="import-data-from-the-web" class="section level3">
<h3><span class="header-section-number">0.1.4</span> Import data from the Web</h3>
<div id="scraping-data-from-websites" class="section level4">
<h4><span class="header-section-number">0.1.4.1</span> Scraping data from websites</h4>
<p>Sometimes you may come across interesting data on websites that you would like to analyse. Reading data from websites is possible in R, e.g., using the <code>rvest</code> package. Let’s assume you would like to read a table that lists the population of different countries from <a href="https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population" target="_blank">this Wikipedia page</a>. It helps to first inspect the structure of the website (e.g., using tools like <a href="http://selectorgadget.com/" target="_blank">SelectorGadget</a>), so you know which elements you would like to extract. In this case it is fairly obvious that the data are stored in a table for which the associated html-tag is <code>&lt;table&gt;</code>. So let’s read the entire website using <code>read_html(url)</code> and then filter the relevant tables from the result using <code>html_nodes(..., "table.wikitable")</code>.</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb4-1" data-line-number="1"><span class="kw">library</span>(rvest)</a>
<a class="sourceLine" id="cb4-2" data-line-number="2">url <-<span class="st"> "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"</span></a>
<a class="sourceLine" id="cb4-3" data-line-number="3">population <-<span class="st"> </span><span class="kw">read_html</span>(url)</a>
<a class="sourceLine" id="cb4-4" data-line-number="4">population <-<span class="st"> </span><span class="kw">html_nodes</span>(population, <span class="st">"table.wikitable"</span>)</a>
<a class="sourceLine" id="cb4-5" data-line-number="5"><span class="kw">print</span>(population)</a></code></pre></div>
<pre><code>## {xml_nodeset (1)}
## [1] &lt;table class="wikitable sortable" style="text-align:right"&gt;&lt;tbody&gt;\n ...</code></pre>
<p>The output shows that the selector matches one table on the website (the node set has a single element), which appears to contain the relevant information. So let’s read this first table using the <code>html_table()</code> function. Note that <code>population</code> is of class “list”. A list is a vector that has other R objects (e.g., other vectors, data frames, matrices, etc.) as its elements. If we want to access the data of one of the elements, we have to use two square brackets on each side instead of just one (e.g., <code>population[[1]]</code> gets us the first table from the list of tables on the website; the argument <code>fill = TRUE</code> ensures that empty cells are replaced with missing values when reading the table).</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb6-1" data-line-number="1">population <-<span class="st"> </span>population[[<span class="dv">1</span>]] <span class="op">%>%</span><span class="st"> </span><span class="kw">html_table</span>(<span class="dt">fill =</span> <span class="ot">TRUE</span>)</a>
<a class="sourceLine" id="cb6-2" data-line-number="2"><span class="kw">head</span>(population) <span class="co">#checks if we scraped the desired data</span></a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["Rank"],"name":[1],"type":["chr"],"align":["left"]},{"label":["Country(or dependent territory)"],"name":[2],"type":["chr"],"align":["left"]},{"label":["Population"],"name":[3],"type":["chr"],"align":["left"]},{"label":["Date"],"name":[4],"type":["chr"],"align":["left"]},{"label":["% of worldpopulation"],"name":[5],"type":["chr"],"align":["left"]},{"label":["Source"],"name":[6],"type":["chr"],"align":["left"]}],"data":[{"1":"1","2":"China[Note 2]","3":"1,399,070,000","4":"September 11, 2019","5":"18.1%","6":"Official population clock"},{"1":"2","2":"India[Note 3]","3":"1,352,150,000","4":"September 11, 2019","5":"17.5%","6":"Official population clock"},{"1":"3","2":"United States[Note 4]","3":"329,878,000","4":"September 11, 2019","5":"4.27%","6":"Official population clock"},{"1":"4","2":"Indonesia","3":"268,074,600","4":"July 1, 2019","5":"3.47%","6":"Official annual projection"},{"1":"5","2":"Brazil","3":"210,424,000","4":"September 11, 2019","5":"2.72%","6":"Official population clock"},{"1":"6","2":"Pakistan","3":"205,975,000","4":"September 11, 2019","5":"2.66%","6":"Official population clock"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>You can see that population is read as a character variable because of the commas.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb7-1" data-line-number="1"><span class="kw">class</span>(population<span class="op">$</span>Population)</a></code></pre></div>
<pre><code>## [1] "character"</code></pre>
<p>If we wanted to use this variable for some kind of analysis, we would first need to convert it to numeric format using the <code>as.numeric()</code> function. However, before we can do this, we need to remove the commas. To this end, we can use the <code>str_replace_all()</code> function from the <code>stringr</code> package, which replaces all matches of a string. In our case, we would like to replace the commas (<code>","</code>) with nothing (<code>""</code>).</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb9-1" data-line-number="1"><span class="kw">library</span>(stringr)</a>
<a class="sourceLine" id="cb9-2" data-line-number="2">population<span class="op">$</span>Population <-<span class="st"> </span><span class="kw">as.numeric</span>(<span class="kw">str_replace_all</span>(population<span class="op">$</span>Population, </a>
<a class="sourceLine" id="cb9-3" data-line-number="3"> <span class="dt">pattern =</span> <span class="st">","</span>, <span class="dt">replacement =</span> <span class="st">""</span>)) <span class="co">#convert to numeric</span></a>
<a class="sourceLine" id="cb9-4" data-line-number="4"><span class="kw">head</span>(population) <span class="co">#checks if we scraped the desired data</span></a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["Rank"],"name":[1],"type":["chr"],"align":["left"]},{"label":["Country(or dependent territory)"],"name":[2],"type":["chr"],"align":["left"]},{"label":["Population"],"name":[3],"type":["dbl"],"align":["right"]},{"label":["Date"],"name":[4],"type":["chr"],"align":["left"]},{"label":["% of worldpopulation"],"name":[5],"type":["chr"],"align":["left"]},{"label":["Source"],"name":[6],"type":["chr"],"align":["left"]}],"data":[{"1":"1","2":"China[Note 2]","3":"1399070000","4":"September 11, 2019","5":"18.1%","6":"Official population clock"},{"1":"2","2":"India[Note 3]","3":"1352150000","4":"September 11, 2019","5":"17.5%","6":"Official population clock"},{"1":"3","2":"United States[Note 4]","3":"329878000","4":"September 11, 2019","5":"4.27%","6":"Official population clock"},{"1":"4","2":"Indonesia","3":"268074600","4":"July 1, 2019","5":"3.47%","6":"Official annual projection"},{"1":"5","2":"Brazil","3":"210424000","4":"September 11, 2019","5":"2.72%","6":"Official population clock"},{"1":"6","2":"Pakistan","3":"205975000","4":"September 11, 2019","5":"2.66%","6":"Official population clock"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>Now the variable is of type “numeric” and could be used for analysis.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb10-1" data-line-number="1"><span class="kw">class</span>(population<span class="op">$</span>Population)</a></code></pre></div>
<pre><code>## [1] "numeric"</code></pre>
</div>
<div id="scraping-data-from-apis" class="section level4">
<h4><span class="header-section-number">0.1.4.2</span> Scraping data from APIs</h4>
<div id="scraping-data-from-apis-directly" class="section level5">
<h5><span class="header-section-number">0.1.4.2.1</span> Scraping data from APIs directly</h5>
<p>Reading data from websites can be tricky since you need to analyze the page structure first. Many web-services (e.g., Facebook, Twitter, YouTube) actually have application programming interfaces (APIs), which you can use to obtain data in a pre-structured format. JSON (JavaScript Object Notation) is a popular lightweight data-interchange format in which data can be obtained. The process of obtaining data is visualized in the following graphic:</p>
<div class="figure">
<img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/API.JPG" alt="Obtaining data from APIs" />
<p class="caption">Obtaining data from APIs</p>
</div>
<p>The process of obtaining data from APIs consists of the following steps:</p>
<ul>
<li>Identify an API that has enough data to be relevant and reliable (e.g., <a href="http://www.programmableweb.com" target="_blank">www.programmableweb.com</a> has &gt;12,000 open web APIs in 63 categories).</li>
<li>Request information by calling (or, more technically speaking, creating a request to) the API (e.g., using R, Python, PHP or JavaScript).</li>
<li>Receive response messages, which are usually in JavaScript Object Notation (JSON) or Extensible Markup Language (XML) format.</li>
<li>Write a parser to pull out the elements you want and put them into a simpler format.</li>
<li>Store, process or analyze data according to the marketing research question.</li>
</ul>
<p>Let’s assume that you would like to obtain population data again. The World Bank has an API that allows you to easily obtain this kind of data. The details are usually provided in the API reference, e.g., <a href="https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-api-documentation" target="_blank">here</a>. You simply “call” the API for the desired information and get a structured JSON file with the desired key-value pairs in return. For example, the population for Austria from 1960 to 2016 can be obtained using <a href="http://api.worldbank.org/countries/AT/indicators/SP.POP.TOTL/?date=1960:2016&format=json&per_page=100" target="_blank">this call</a>. The file can be easily read into R using the <code>fromJSON()</code>-function from the <code>jsonlite</code>-package. Again, the result is a list and the second element <code>ctrydata[[2]]</code> contains the desired data, from which we select the “value” and “date” columns using the square brackets as usual: <code>[,c("value","date")]</code>.</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb12-1" data-line-number="1"><span class="kw">library</span>(jsonlite)</a>
<a class="sourceLine" id="cb12-2" data-line-number="2">url <-<span class="st"> "http://api.worldbank.org/countries/AT/indicators/SP.POP.TOTL/?date=1960:2016&format=json&per_page=100"</span> <span class="co">#specifies url</span></a>
<a class="sourceLine" id="cb12-3" data-line-number="3">ctrydata <-<span class="st"> </span><span class="kw">fromJSON</span>(url) <span class="co">#parses the data </span></a>
<a class="sourceLine" id="cb12-4" data-line-number="4"><span class="kw">str</span>(ctrydata)</a></code></pre></div>
<pre><code>## List of 2
## $ :List of 4
## ..$ page : int 1
## ..$ pages : int 1
## ..$ per_page: chr "100"
## ..$ total : int 57
## $ :'data.frame': 57 obs. of 5 variables:
## ..$ indicator:'data.frame': 57 obs. of 2 variables:
## .. ..$ id : chr [1:57] "SP.POP.TOTL" "SP.POP.TOTL" "SP.POP.TOTL" "SP.POP.TOTL" ...
## .. ..$ value: chr [1:57] "Population, total" "Population, total" "Population, total" "Population, total" ...
## ..$ country :'data.frame': 57 obs. of 2 variables:
## .. ..$ id : chr [1:57] "AT" "AT" "AT" "AT" ...
## .. ..$ value: chr [1:57] "Austria" "Austria" "Austria" "Austria" ...
## ..$ value : chr [1:57] "8736668" "8642699" "8546356" "8479823" ...
## ..$ decimal : chr [1:57] "0" "0" "0" "0" ...
## ..$ date : chr [1:57] "2016" "2015" "2014" "2013" ...</code></pre>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb14-1" data-line-number="1"><span class="kw">head</span>(ctrydata[[<span class="dv">2</span>]][, <span class="kw">c</span>(<span class="st">"value"</span>, <span class="st">"date"</span>)]) <span class="co">#checks if we scraped the desired data</span></a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["value"],"name":[1],"type":["chr"],"align":["left"]},{"label":["date"],"name":[2],"type":["chr"],"align":["left"]}],"data":[{"1":"8736668","2":"2016"},{"1":"8642699","2":"2015"},{"1":"8546356","2":"2014"},{"1":"8479823","2":"2013"},{"1":"8429991","2":"2012"},{"1":"8391643","2":"2011"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
</div>
<div id="scraping-data-from-apis-via-r-packages" class="section level5">
<h5><span class="header-section-number">0.1.4.2.2</span> Scraping data from APIs via R packages</h5>
<p>An even more convenient way to obtain data from web APIs is to use existing R packages that someone else has already created. There are R packages available for various web-services. For example, the <code>gtrendsR</code> package can be used to conveniently obtain data from the <a href="https://trends.google.at/trends/" target="_blank">Google Trends</a> page. The <code>gtrends()</code> function is easy to use and returns a list of elements (e.g., “interest over time”, “interest by city”, “related topics”), which can be inspected using the <code>ls()</code> function. The following example can be used to obtain data for the search term “data science” in the US between September 1 and October 6, 2017:</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb15-1" data-line-number="1"><span class="kw">library</span>(gtrendsR)</a>
<a class="sourceLine" id="cb15-2" data-line-number="2">index =<span class="st"> </span><span class="dv">1</span></a>
<a class="sourceLine" id="cb15-3" data-line-number="3">success =<span class="st"> </span><span class="ot">FALSE</span></a>
<a class="sourceLine" id="cb15-4" data-line-number="4"><span class="cf">while</span> (<span class="op">!</span>(success <span class="op">|</span><span class="st"> </span>index <span class="op">==</span><span class="st"> </span><span class="dv">10</span>)) {</a>
<a class="sourceLine" id="cb15-5" data-line-number="5"> google_trends <-<span class="st"> </span><span class="kw">try</span>(<span class="kw">gtrends</span>(<span class="st">"data science"</span>, <span class="dt">geo =</span> <span class="kw">c</span>(<span class="st">"US"</span>), </a>
<a class="sourceLine" id="cb15-6" data-line-number="6"> <span class="dt">gprop =</span> <span class="kw">c</span>(<span class="st">"web"</span>), <span class="dt">time =</span> <span class="st">"2017-09-01 2017-10-06"</span>), </a>
<a class="sourceLine" id="cb15-7" data-line-number="7"> <span class="dt">silent =</span> <span class="ot">TRUE</span>)</a>
<a class="sourceLine" id="cb15-8" data-line-number="8"> </a>
<a class="sourceLine" id="cb15-9" data-line-number="9"> <span class="cf">if</span> (<span class="op">!</span><span class="kw">is</span>(google_trends, <span class="st">"try-error"</span>)) {</a>
<a class="sourceLine" id="cb15-10" data-line-number="10"> <span class="kw">ls</span>(google_trends)</a>
<a class="sourceLine" id="cb15-11" data-line-number="11"> <span class="kw">head</span>(google_trends<span class="op">$</span>interest_over_time)</a>
<a class="sourceLine" id="cb15-12" data-line-number="12"> success =<span class="st"> </span><span class="ot">TRUE</span></a>
<a class="sourceLine" id="cb15-13" data-line-number="13"> } <span class="cf">else</span> {</a>
<a class="sourceLine" id="cb15-14" data-line-number="14"> index =<span class="st"> </span>index <span class="op">+</span><span class="st"> </span><span class="dv">1</span></a>
<a class="sourceLine" id="cb15-15" data-line-number="15"> <span class="kw">Sys.sleep</span>(<span class="kw">runif</span>(<span class="dv">1</span>, <span class="dv">0</span>, <span class="dv">3</span>))</a>
<a class="sourceLine" id="cb15-16" data-line-number="16"> }</a>
<a class="sourceLine" id="cb15-17" data-line-number="17">}</a>
<a class="sourceLine" id="cb15-18" data-line-number="18"><span class="cf">if</span> (success <span class="op">==</span><span class="st"> </span><span class="ot">FALSE</span>) {</a>
<a class="sourceLine" id="cb15-19" data-line-number="19"> <span class="kw">warning</span>(<span class="st">"Google Trends has exited unsuccessfully"</span>)</a>
<a class="sourceLine" id="cb15-20" data-line-number="20">}</a></code></pre></div>
</div>
</div>
</div>
</div>
</section>
</div>
</div>
</div>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Request the "gitbook" module and call its start() with this page's settings.
gitbook.require(["gitbook"], function (book) {
  // Flags for the individual sharing entries, plus the "all" list.
  var sharing = {
    "github": false,
    "facebook": true,
    "twitter": true,
    "google": false,
    "linkedin": false,
    "weibo": false,
    "instapaper": false,
    "vk": false,
    "all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
  };

  // Font settings passed under the "fontsettings" key.
  var fontsettings = {
    "theme": "white",
    "family": "sans",
    "size": 2
  };

  // "edit" and "history" each get their own object with null link/text.
  var settings = {
    "sharing": sharing,
    "fontsettings": fontsettings,
    "edit": { "link": null, "text": null },
    "history": { "link": null, "text": null },
    "download": null,
    "toc": { "collapse": "section" },
    "search": false
  };

  book.start(settings);
});
</script>
</body>
</html>