-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05-visualization.html
593 lines (550 loc) · 86 KB
/
05-visualization.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<title>05-visualization</title>
<meta name="description" content="05-visualization" />
<meta name="generator" content="bookdown 0.13 and GitBook 2.6.7" />
<meta property="og:title" content="05-visualization" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="05-visualization" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black" />
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-table.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<link href="libs/pagedtable-1.1/css/pagedtable.css" rel="stylesheet" />
<script src="libs/pagedtable-1.1/js/pagedtable.js"></script>
<style type="text/css">
a.sourceLine { display: inline-block; line-height: 1.25; }
a.sourceLine { pointer-events: none; color: inherit; text-decoration: inherit; }
a.sourceLine:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode { white-space: pre; position: relative; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
code.sourceCode { white-space: pre-wrap; }
a.sourceLine { text-indent: -1em; padding-left: 1em; }
}
pre.numberSource a.sourceLine
{ position: relative; left: -4em; }
pre.numberSource a.sourceLine::before
{ content: attr(data-line-number);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; pointer-events: all; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
a.sourceLine::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li><strong><a href="./">MRDA 2019</a></strong></li>
<li class="divider"></li>
<li class="chapter" data-level="0.1" data-path=""><a href="#data-visualization"><i class="fa fa-check"></i><b>0.1</b> Data visualization</a><ul>
<li class="chapter" data-level="0.1.1" data-path=""><a href="#categorical-variables"><i class="fa fa-check"></i><b>0.1.1</b> Categorical variables</a></li>
<li class="chapter" data-level="0.1.2" data-path=""><a href="#continuous-variables"><i class="fa fa-check"></i><b>0.1.2</b> Continuous variables</a></li>
<li class="chapter" data-level="0.1.3" data-path=""><a href="#saving-plots"><i class="fa fa-check"></i><b>0.1.3</b> Saving plots</a></li>
<li class="chapter" data-level="0.1.4" data-path=""><a href="#additional-options"><i class="fa fa-check"></i><b>0.1.4</b> Additional options</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">05-visualization</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="header">
<h1 class="title">05-visualization</h1>
</div>
<div id="data-visualization" class="section level2">
<h2><span class="header-section-number">0.1</span> Data visualization</h2>
<p>This section discusses how to produce appropriate graphics to describe our data visually. While R includes tools to build plots, we will be using the ggplot2 package by Hadley Wickham. It has the advantage of being fairly straightforward to learn but being very flexible when it comes to building more complex plots. For a more in-depth discussion you can refer to chapter 4 of the book “Discovering Statistics Using R” by Andy Field et al. or read the following chapter from the book <a href="http://r4ds.had.co.nz/data-visualisation.html" target="_blank">“R for Data Science”</a> by Hadley Wickham as well as <a href="https://r-graphics.org/" target="_blank">“R Graphics Cookbook”</a> by Winston Chang.</p>
<p><a href="./Code/05-visualization%20(1).R">You can download the corresponding R-Code here</a></p>
<p>ggplot2 is built around the idea of constructing plots by stacking layers on top of one another. Every plot starts with the <code>ggplot(data)</code> function, after which layers can be added with the “+” symbol. The following figures show the layered structure of creating plots with ggplot.</p>
<p style="text-align:center;">
<img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/Graphics/ggplot2.JPG" alt="Layered structure of a ggplot2 plot (figure 1 of 2)" height="250" />
<img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/Graphics/ggplot1.JPG" alt="Layered structure of a ggplot2 plot (figure 2 of 2)" height="250" />
</p>
<div id="categorical-variables" class="section level3">
<h3><span class="header-section-number">0.1.1</span> Categorical variables</h3>
<div id="bar-plot" class="section level4">
<h4><span class="header-section-number">0.1.1.1</span> Bar plot</h4>
<p>To give you an example of how the graphics are composed, let’s go back to the frequency table from the previous chapter, where we created this table:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb1-1" data-line-number="1"><span class="kw">readRDS</span>(<span class="st">"music_data.rds"</span>)</a></code></pre></div>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb2-1" data-line-number="1">s.genre <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"pop"</span>, <span class="st">"hip hop"</span>, <span class="st">"rock"</span>, <span class="st">"rap"</span>, <span class="st">"indie"</span>)</a>
<a class="sourceLine" id="cb2-2" data-line-number="2">music_data <-<span class="st"> </span><span class="kw">subset</span>(music_data, top.genre <span class="op">%in%</span><span class="st"> </span>s.genre)</a>
<a class="sourceLine" id="cb2-3" data-line-number="3"></a>
<a class="sourceLine" id="cb2-4" data-line-number="4">music_data<span class="op">$</span>genre_cat <-<span class="st"> </span><span class="kw">as.factor</span>(music_data<span class="op">$</span>top.genre)</a>
<a class="sourceLine" id="cb2-5" data-line-number="5">music_data<span class="op">$</span>explicit_cat <-<span class="st"> </span><span class="kw">factor</span>(music_data<span class="op">$</span>explicit, </a>
<a class="sourceLine" id="cb2-6" data-line-number="6"> <span class="dt">levels =</span> <span class="kw">c</span>(<span class="dv">0</span><span class="op">:</span><span class="dv">1</span>), <span class="dt">labels =</span> <span class="kw">c</span>(<span class="st">"not explicit"</span>, <span class="st">"explicit"</span>))</a>
<a class="sourceLine" id="cb2-7" data-line-number="7"></a>
<a class="sourceLine" id="cb2-8" data-line-number="8"><span class="kw">head</span>(music_data)</a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["id"],"name":[1],"type":["chr"],"align":["left"]},{"label":["trackName"],"name":[2],"type":["chr"],"align":["left"]},{"label":["isrc"],"name":[3],"type":["chr"],"align":["left"]},{"label":["explicit"],"name":[4],"type":["dbl"],"align":["right"]},{"label":["trackPopularity"],"name":[5],"type":["dbl"],"align":["right"]},{"label":["primary_artistName"],"name":[6],"type":["chr"],"align":["left"]},{"label":["primary_artistID"],"name":[7],"type":["chr"],"align":["left"]},{"label":["artistName"],"name":[8],"type":["chr"],"align":["left"]},{"label":["artistIDs"],"name":[9],"type":["chr"],"align":["left"]},{"label":["albumName"],"name":[10],"type":["chr"],"align":["left"]},{"label":["albumID"],"name":[11],"type":["chr"],"align":["left"]},{"label":["available_markets"],"name":[12],"type":["chr"],"align":["left"]},{"label":["n_available_markets"],"name":[13],"type":["dbl"],"align":["right"]},{"label":["releaseDate"],"name":[14],"type":["date"],"align":["right"]},{"label":["releaseDate_precision"],"name":[15],"type":["chr"],"align":["left"]},{"label":["danceability"],"name":[16],"type":["dbl"],"align":["right"]},{"label":["energy"],"name":[17],"type":["dbl"],"align":["right"]},{"label":["key"],"name":[18],"type":["dbl"],"align":["right"]},{"label":["loudness"],"name":[19],"type":["dbl"],"align":["right"]},{"label":["mode"],"name":[20],"type":["dbl"],"align":["right"]},{"label":["speechiness"],"name":[21],"type":["dbl"],"align":["right"]},{"label":["acousticness"],"name":[22],"type":["dbl"],"align":["right"]},{"label":["instrumentalness"],"name":[23],"type":["dbl"],"align":["right"]},{"label":["liveness"],"name":[24],"type":["dbl"],"align":["right"]},{"label":["valence"],"name":[25],"type":["dbl"],"align":["right"]},{"label":["tempo"],"name":[26],"type":["dbl"],"align":["right"]},{"label":["duration_ms"],"name":[27],"type":["dbl"],"align":["right"]},{"label":["time_signature"],"name":[28],"type":["dbl"],"align":["right"]},{"label":["uri"],"name":[29],"type"
:["chr"],"align":["left"]},{"label":["analysis_url"],"name":[30],"type":["chr"],"align":["left"]},{"label":["followers"],"name":[31],"type":["dbl"],"align":["right"]},{"label":["type"],"name":[32],"type":["chr"],"align":["left"]},{"label":["popularity"],"name":[33],"type":["dbl"],"align":["right"]},{"label":["primary_genre"],"name":[34],"type":["chr"],"align":["left"]},{"label":["genres"],"name":[35],"type":["chr"],"align":["left"]},{"label":["n_genres"],"name":[36],"type":["dbl"],"align":["right"]},{"label":["total_releases"],"name":[37],"type":["dbl"],"align":["right"]},{"label":["n_album"],"name":[38],"type":["dbl"],"align":["right"]},{"label":["n_single"],"name":[39],"type":["dbl"],"align":["right"]},{"label":["n_appears_on"],"name":[40],"type":["dbl"],"align":["right"]},{"label":["n_compilation"],"name":[41],"type":["dbl"],"align":["right"]},{"label":["mstreams"],"name":[42],"type":["dbl"],"align":["right"]},{"label":["region"],"name":[43],"type":["chr"],"align":["left"]},{"label":["adv_spending"],"name":[44],"type":["dbl"],"align":["right"]},{"label":["top.genre"],"name":[45],"type":["chr"],"align":["left"]},{"label":["genre_cat"],"name":[46],"type":["fctr"],"align":["left"]},{"label":["explicit_cat"],"name":[47],"type":["fctr"],"align":["left"]}],"data":[{"1":"5aAx2yezTd8zXrkmtKl66Z","2":"Starboy","3":"USUG11600976","4":"1","5":"22","6":"The Weeknd","7":"1Xyo4u8uXC1ZmMpatF05PJ","8":"The Weeknd feat. 
Daft Punk","9":"1Xyo4u8uXC1ZmMpatF05PJ,4tZwfgrHOc3mvqYlEYSvVi","10":"Starboy","11":"09fggMHib4YkOtwQNXEBII","12":"NA","13":"0","14":"2016-11-25","15":"day","16":"0.681","17":"0.594","18":"7","19":"-7.028","20":"1","21":"0.2820","22":"0.1650","23":"0.00000349","24":"0.134","25":"0.535","26":"186.054","27":"230453","28":"4","29":"spotify:track:5aAx2yezTd8zXrkmtKl66Z","30":"https://api.spotify.com/v1/audio-analysis/5aAx2yezTd8zXrkmtKl66Z","31":"15222808","32":"artist","33":"91","34":"canadian contemporary r&b","35":"canadian contemporary r&b, canadian pop, pop, rap","36":"4","37":"124","38":"13","39":"20","40":"91","41":"0","42":"357336.0","43":"us","44":"4538745582809720084335652681802192354751872961846816542356290991840842800362708869311675467455586767281244340224","45":"pop","46":"pop","47":"explicit"},{"1":"7BKLCZ1jbUBVqRi2FVlTVw","2":"Closer","3":"USQX91601347","4":"0","5":"86","6":"The Chainsmokers","7":"69GGBxA162lTqCwzJG5jLp","8":"The Chainsmokers feat. Halsey","9":"69GGBxA162lTqCwzJG5jLp,26VFTg2z8YR0cCuwLzESi2","10":"Closer","11":"0rSLgV8p5FzfnqlEk4GzxE","12":"AD, AE, AR, AT, AU, BE, BG, BH, BO, BR, CA, CH, CL, CO, CR, CY, CZ, DE, DK, DO, DZ, EC, EE, EG, ES, FI, FR, GB, GR, GT, HK, HN, HU, ID, IE, IL, IN, IS, IT, JO, JP, KW, LB, LI, LT, LU, LV, MA, MC, MT, MX, MY, NI, NL, NO, NZ, OM, PA, PE, PH, PL, PS, PT, PY, QA, RO, SA, SE, SG, SK, SV, TH, TN, TR, TW, US, UY, VN, ZA","13":"79","14":"2016-07-29","15":"day","16":"0.748","17":"0.524","18":"8","19":"-5.599","20":"1","21":"0.0338","22":"0.4140","23":"0.00000000","24":"0.111","25":"0.661","26":"95.010","27":"244960","28":"4","29":"spotify:track:7BKLCZ1jbUBVqRi2FVlTVw","30":"https://api.spotify.com/v1/audio-analysis/7BKLCZ1jbUBVqRi2FVlTVw","31":"11565104","32":"artist","33":"90","34":"dance pop","35":"dance pop, electropop, pop, tropical 
house","36":"4","37":"96","38":"2","39":"59","40":"35","41":"0","42":"323081.4","43":"us","44":"87296125002616694828879835742568772595201488756664067891719260959953504690764846096364552367775131489311653888","45":"pop","46":"pop","47":"not explicit"},{"1":"4pdPtRcBmOSQDlJ3Fk945m","2":"Let Me Love You","3":"QMZSY1600015","4":"0","5":"27","6":"DJ Snake","7":"540vIaP2JwjQb9dm3aArA4","8":"DJ Snake feat. Justin Bieber","9":"540vIaP2JwjQb9dm3aArA4,1uNFoZAHBGtllmzznpCI3s","10":"Encore","11":"55bbXORm6ZrVq52zfZnxBf","12":"NA","13":"0","14":"2016-08-05","15":"day","16":"0.476","17":"0.718","18":"8","19":"-5.309","20":"1","21":"0.0576","22":"0.0784","23":"0.00000000","24":"0.122","25":"0.142","26":"199.864","27":"205947","28":"4","29":"spotify:track:4pdPtRcBmOSQDlJ3Fk945m","30":"https://api.spotify.com/v1/audio-analysis/4pdPtRcBmOSQDlJ3Fk945m","31":"4345960","32":"artist","33":"86","34":"dance pop","35":"dance pop, edm, electronic trap, pop, tropical house","36":"5","37":"85","38":"2","39":"38","40":"45","41":"0","42":"264546.4","43":"us","44":"4052673829394860012092441543505898027168278466357604061531255984953722983872577825913849720641449881585582080","45":"pop","46":"pop","47":"not explicit"},{"1":"5knuzwU65gJK7IF5yJsuaW","2":"Rockabye (feat. Sean Paul & Anne-Marie)","3":"GBAHS1600363","4":"0","5":"81","6":"Clean Bandit","7":"6MDME20pz9RveH9rEXvrOM","8":"Clean Bandit feat. Sean Paul feat. Anne-Marie","9":"6MDME20pz9RveH9rEXvrOM,3Isy6kedDrgPYoTS1dazA9,1zNqDE7qDGCsyzJwohVaoX","10":"Rockabye (feat. 
Sean Paul & Anne-Marie)","11":"3meZFplbMmji648oWUNEfQ","12":"AD, AE, AR, AT, AU, BE, BG, BH, BO, BR, CA, CH, CL, CO, CR, CY, CZ, DE, DK, DO, DZ, EC, EE, EG, ES, FI, FR, GB, GR, GT, HK, HN, HU, ID, IE, IL, IS, IT, JO, JP, KW, LB, LI, LT, LU, LV, MA, MC, MT, MX, MY, NI, NL, NO, NZ, OM, PA, PE, PH, PL, PS, PT, PY, QA, RO, SA, SE, SG, SK, SV, TH, TN, TR, TW, US, UY, VN, ZA","13":"78","14":"2016-10-21","15":"day","16":"0.720","17":"0.763","18":"9","19":"-4.068","20":"0","21":"0.0523","22":"0.4060","23":"0.00000000","24":"0.180","25":"0.742","26":"101.965","27":"251088","28":"4","29":"spotify:track:5knuzwU65gJK7IF5yJsuaW","30":"https://api.spotify.com/v1/audio-analysis/5knuzwU65gJK7IF5yJsuaW","31":"2504254","32":"artist","33":"84","34":"dance pop","35":"dance pop, edm, pop, tropical house, uk funky","36":"5","37":"97","38":"2","39":"73","40":"22","41":"0","42":"299399.5","43":"us","44":"63076779883569398413334987328546550614768141860947245086927001088783215597781844192705652165114862194747506688","45":"pop","46":"pop","47":"not explicit"},{"1":"1xznGGDReH1oQq0xzbwXa3","2":"One Dance","3":"USCM51600028","4":"0","5":"25","6":"Drake","7":"3TVXtAsR1Inumwj472S9r4","8":"Drake feat. WizKid feat. 
Kyla","9":"3TVXtAsR1Inumwj472S9r4,3tVQdUvClmAT7URs9V3rsp,77DAFfvm3O9zT5dIoG0eIO","10":"Views","11":"3hARKC8cinq3mZLLAEaBh9","12":"NA","13":"0","14":"2016-05-06","15":"day","16":"0.791","17":"0.619","18":"1","19":"-5.886","20":"1","21":"0.0532","22":"0.0078","23":"0.00420000","24":"0.351","25":"0.371","26":"103.989","27":"173987","28":"4","29":"spotify:track:1xznGGDReH1oQq0xzbwXa3","30":"https://api.spotify.com/v1/audio-analysis/1xznGGDReH1oQq0xzbwXa3","31":"35200192","32":"artist","33":"98","34":"canadian hip hop","35":"canadian hip hop, canadian pop, hip hop, pop rap, rap","36":"5","37":"403","38":"27","39":"37","40":"339","41":"0","42":"265877.0","43":"us","44":"4668617894331640150713528913482931563888378169191399287770456451298974376421383853327796314422734196333084672","45":"pop","46":"pop","47":"not explicit"},{"1":"343YBumqHu19cGoGARUTsd","2":"Fake Love","3":"USCM51700084","4":"1","5":"76","6":"Drake","7":"3TVXtAsR1Inumwj472S9r4","8":"Drake","9":"3TVXtAsR1Inumwj472S9r4","10":"More Life","11":"1lXY618HWkwYKJWBRYR4MK","12":"AD, AE, AR, AT, AU, BE, BG, BH, BO, BR, CA, CH, CL, CO, CR, CY, CZ, DE, DK, DO, DZ, EC, EE, EG, ES, FI, FR, GB, GR, GT, HK, HN, HU, ID, IE, IL, IN, IS, IT, JO, JP, KW, LB, LI, LT, LU, LV, MA, MC, MT, MX, MY, NI, NL, NO, NZ, OM, PA, PE, PH, PL, PS, PT, PY, QA, RO, SA, SE, SG, SK, SV, TH, TN, TR, TW, US, UY, VN, ZA","13":"79","14":"2017-03-18","15":"day","16":"0.927","17":"0.488","18":"9","19":"-9.433","20":"0","21":"0.4200","22":"0.1080","23":"0.00000000","24":"0.196","25":"0.605","26":"133.987","27":"210937","28":"4","29":"spotify:track:343YBumqHu19cGoGARUTsd","30":"https://api.spotify.com/v1/audio-analysis/343YBumqHu19cGoGARUTsd","31":"35200192","32":"artist","33":"98","34":"canadian hip hop","35":"canadian hip hop, canadian pop, hip hop, pop rap, 
rap","36":"5","37":"403","38":"27","39":"37","40":"339","41":"0","42":"425740.1","43":"us","44":"34419285169424099063603169706763167298974920345850885001633591652364254906722960776146634654951625345242828374016","45":"pop","46":"pop","47":"explicit"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>How can we plot this kind of data? Since we have a categorical variable, we will use a bar plot. However, to be able to use the table for your plot, you first need to assign it to an object as a data frame using the <code>as.data.frame()</code>-function.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb3-1" data-line-number="1">table_plot_rel <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">prop.table</span>(<span class="kw">table</span>(music_data[, </a>
<a class="sourceLine" id="cb3-2" data-line-number="2"> <span class="kw">c</span>(<span class="st">"genre_cat"</span>)]))) <span class="co">#relative frequencies</span></a>
<a class="sourceLine" id="cb3-3" data-line-number="3"><span class="kw">head</span>(table_plot_rel)</a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["Var1"],"name":[1],"type":["fctr"],"align":["left"]},{"label":["Freq"],"name":[2],"type":["dbl"],"align":["right"]}],"data":[{"1":"hip hop","2":"0.23643263"},{"1":"indie","2":"0.01260083"},{"1":"pop","2":"0.67891747"},{"1":"rap","2":"0.02816095"},{"1":"rock","2":"0.04388812"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>Since <code>Var1</code> is not a very descriptive name, let’s rename the variable to something more meaningful.</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb4-1" data-line-number="1"><span class="kw">library</span>(plyr)</a>
<a class="sourceLine" id="cb4-2" data-line-number="2">table_plot_rel <-<span class="st"> </span>plyr<span class="op">::</span><span class="kw">rename</span>(table_plot_rel, <span class="kw">c</span>(<span class="dt">Var1 =</span> <span class="st">"Genre"</span>))</a>
<a class="sourceLine" id="cb4-3" data-line-number="3"><span class="kw">head</span>(table_plot_rel)</a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["Genre"],"name":[1],"type":["fctr"],"align":["left"]},{"label":["Freq"],"name":[2],"type":["dbl"],"align":["right"]}],"data":[{"1":"hip hop","2":"0.23643263"},{"1":"indie","2":"0.01260083"},{"1":"pop","2":"0.67891747"},{"1":"rap","2":"0.02816095"},{"1":"rock","2":"0.04388812"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>Once we have our data set we can begin constructing the plot. As mentioned previously, we start with the <code>ggplot()</code> function, with the argument specifying the data set to be used. Within the function, we further specify the scales to be used using the aesthetics argument, specifying which variable should be plotted on which axis. In our example, we would like to plot the categories on the x-axis (horizontal axis) and the relative frequencies on the y-axis (vertical axis).</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb5-1" data-line-number="1"><span class="kw">library</span>(ggplot2)</a>
<a class="sourceLine" id="cb5-2" data-line-number="2">bar_chart <-<span class="st"> </span><span class="kw">ggplot</span>(table_plot_rel, <span class="kw">aes</span>(<span class="dt">x =</span> Genre, </a>
<a class="sourceLine" id="cb5-3" data-line-number="3"> <span class="dt">y =</span> Freq))</a>
<a class="sourceLine" id="cb5-4" data-line-number="4">bar_chart</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-8"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-8-1.png" alt="Bar chart (step 1)" width="672" />
<p class="caption">
Figure 1: Bar chart (step 1)
</p>
</div>
<p>You can see that the coordinate system is empty. This is because so far, we have told R merely which variables we would like to plot but we haven’t specified which geometric figures (points, bars, lines, etc.) we would like to use. This is done using the <code>geom_xxx</code> function. ggplot includes many different geoms, for a wide range of plots (e.g., geom_line, geom_histogram, geom_boxplot, etc.). A good overview of the various geom functions can be found <a href="https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf" target="_blank">here</a>. In our case, we would like to use a bar chart for which <code>geom_col</code> is appropriate.</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb6-1" data-line-number="1">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-9"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-9-1.png" alt="Bar chart (step 2)" width="672" />
<p class="caption">
Figure 2: Bar chart (step 2)
</p>
</div>
<p>Note that the same could be achieved using <code>geom_bar</code>. However, by default <code>geom_bar</code> counts the number of observations within each category of a variable. This is not required in our case because we have already used the <code>prop.table()</code> function to compute the relative frequencies. The argument <code>stat = "identity"</code> prevents <code>geom_bar</code> from performing counting operations and makes it use the supplied values “as they are”.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb7-1" data-line-number="1">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_bar</span>(<span class="dt">stat =</span> <span class="st">"identity"</span>)</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-10"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-10-1.png" alt="Bar chart (alternative specification)" width="672" />
<p class="caption">
Figure 3: Bar chart (alternative specification)
</p>
</div>
<p>Now we have specified the data, the scales and the shape. Specifying this information is essential for plotting data using ggplot. Everything that follows now just serves the purpose of making the plot look nicer by modifying the appearance of the plot. How about some more meaningful axis labels? We can specify the axis labels using the <code>ylab()</code> and <code>xlab()</code> functions:</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb8-1" data-line-number="1">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb8-2" data-line-number="2"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>)</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-11"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-11-1.png" alt="Bar chart (step 3)" width="672" />
<p class="caption">
Figure 4: Bar chart (step 3)
</p>
</div>
<p>How about adding some value labels to the bars? This can be done using <code>geom_text()</code>. Note that the <code>sprintf()</code> function is not mandatory and is only added to format the numeric labels here. The function takes two arguments: the first specifies the format wrapped in two <code>%</code> signs. Thus, <code>%.0f</code> means to format the value as a fixed-point value with no digits after the decimal point, and <code>%%</code> is a literal that prints a “%” sign. The second argument is simply the numeric value to be used. In this case, the relative frequencies multiplied by 100 to obtain the percentage values. Using the <code>vjust =</code> argument, we can adjust the vertical alignment of the label. In this case, we would like to display the label slightly above the bars.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb9-1" data-line-number="1">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb9-2" data-line-number="2"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">geom_text</span>(<span class="kw">aes</span>(<span class="dt">label =</span> <span class="kw">sprintf</span>(<span class="st">"%.0f%%"</span>, </a>
<a class="sourceLine" id="cb9-3" data-line-number="3"> Freq<span class="op">/</span><span class="kw">sum</span>(Freq) <span class="op">*</span><span class="st"> </span><span class="dv">100</span>)), <span class="dt">vjust =</span> <span class="fl">-0.2</span>)</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-12"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-12-1.png" alt="Bar chart (step 4)" width="672" />
<p class="caption">
Figure 5: Bar chart (step 4)
</p>
</div>
<p>We could go ahead and specify the appearance of every single element of the plot now. However, there are also pre-specified themes that include various formatting steps in one single function. For example, <code>theme_bw()</code> would make the plot appear like this:</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb10-1" data-line-number="1">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb10-2" data-line-number="2"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">geom_text</span>(<span class="kw">aes</span>(<span class="dt">label =</span> <span class="kw">sprintf</span>(<span class="st">"%.0f%%"</span>, </a>
<a class="sourceLine" id="cb10-3" data-line-number="3"> Freq<span class="op">/</span><span class="kw">sum</span>(Freq) <span class="op">*</span><span class="st"> </span><span class="dv">100</span>)), <span class="dt">vjust =</span> <span class="fl">-0.2</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-13"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-13-1.png" alt="Bar chart (step 5)" width="672" />
<p class="caption">
Figure 6: Bar chart (step 5)
</p>
</div>
<p>and <code>theme_minimal()</code> looks like this:</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb11-1" data-line-number="1">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb11-2" data-line-number="2"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">geom_text</span>(<span class="kw">aes</span>(<span class="dt">label =</span> <span class="kw">sprintf</span>(<span class="st">"%.0f%%"</span>, </a>
<a class="sourceLine" id="cb11-3" data-line-number="3"> Freq<span class="op">/</span><span class="kw">sum</span>(Freq) <span class="op">*</span><span class="st"> </span><span class="dv">100</span>)), <span class="dt">vjust =</span> <span class="fl">-0.2</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_minimal</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-14"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-14-1.png" alt="Bar chart (options 1)" width="672" />
<p class="caption">
Figure 7: Bar chart (options 1)
</p>
</div>
<p>These were examples of built-in themes of <code>ggplot()</code>, where the default is <code>theme_gray()</code>. For even more options, check out the <code>ggthemes</code> package, which includes formats for specific publications. You can check out the different themes <a href="https://cran.r-project.org/web/packages/ggthemes/vignettes/ggthemes.html" target="_blank">here</a>. For example, <code>theme_economist()</code> uses the formatting of the journal “The Economist”:</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb12-1" data-line-number="1"><span class="kw">library</span>(ggthemes)</a>
<a class="sourceLine" id="cb12-2" data-line-number="2">bar_chart <span class="op">+</span><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb12-3" data-line-number="3"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_economist</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-15"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-15-1.png" alt="Bar chart (options 2)" width="672" />
<p class="caption">
Figure 8: Bar chart (options 2)
</p>
</div>
<p><strong>Summary</strong></p>
<p>To create a plot with ggplot we give it the appropriate data (in the <code>ggplot()</code> function), tell it which shape to use (via a function of the geom family), assign variables to the correct axes (by using the <code>aes()</code> function) and define the appearance of the plot.</p>
<p>Now we would like to investigate whether the distribution differs between explicit and non-explicit songs. For this purpose we first construct the conditional relative frequency table from the previous chapter again. Recall that the latter gives us the relative frequency within a group (in our case explicit and non-explicit), as compared to the relative frequency within the entire sample.</p>
<div class="sourceCode" id="cb13"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb13-1" data-line-number="1">table_plot_cond_rel <-<span class="st"> </span><span class="kw">as.data.frame</span>(<span class="kw">prop.table</span>(<span class="kw">table</span>(music_data[, </a>
<a class="sourceLine" id="cb13-2" data-line-number="2"> <span class="kw">c</span>(<span class="st">"genre_cat"</span>, <span class="st">"explicit_cat"</span>)]), <span class="dv">2</span>)) <span class="co">#conditional relative frequencies</span></a></code></pre></div>
<p>We can now take these tables to construct plots grouped by explicitness. To achieve this we simply need to add the <code>facet_wrap()</code> function, which replicates a plot multiple times, split by a specified grouping factor. Note that the grouping factor has to be supplied in R’s formula notation, hence it is preceded by a “~” symbol.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb14-1" data-line-number="1"><span class="kw">ggplot</span>(table_plot_cond_rel, <span class="kw">aes</span>(<span class="dt">x =</span> genre_cat, <span class="dt">y =</span> Freq)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb14-2" data-line-number="2"><span class="st"> </span><span class="kw">geom_col</span>() <span class="op">+</span><span class="st"> </span><span class="kw">facet_wrap</span>(<span class="op">~</span>explicit_cat) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Conditional relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb14-3" data-line-number="3"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-17"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-17-1.png" alt="Grouped bar chart (conditional relative frequencies)" width="672" />
<p class="caption">
Figure 9: Grouped bar chart (conditional relative frequencies)
</p>
</div>
<p>To plot the relative frequencies for each response category by group in a slightly different way, we can also use the <code>fill</code> argument, which tells ggplot to fill the bars by a specified variable (in our case <code>explicit_cat</code>). The <code>position = "dodge"</code> argument causes the bars to be displayed next to each other (as opposed to stacked on top of one another).</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb15-1" data-line-number="1"><span class="kw">ggplot</span>(table_plot_cond_rel, <span class="kw">aes</span>(<span class="dt">x =</span> genre_cat, <span class="dt">y =</span> Freq, <span class="dt">fill =</span> explicit_cat)) <span class="op">+</span><span class="st"> </span><span class="co">#use "fill" argument for different colors</span></a>
<a class="sourceLine" id="cb15-2" data-line-number="2"><span class="st"> </span><span class="kw">geom_col</span>(<span class="dt">position =</span> <span class="st">"dodge"</span>) <span class="op">+</span><span class="st"> </span><span class="co">#use "dodge" to display bars next to each other (instead of stacked on top)</span></a>
<a class="sourceLine" id="cb15-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_text</span>(<span class="kw">aes</span>(<span class="dt">label =</span> <span class="kw">sprintf</span>(<span class="st">"%.0f%%"</span>, Freq<span class="op">/</span><span class="kw">sum</span>(Freq) <span class="op">*</span><span class="st"> </span><span class="dv">100</span>)),<span class="dt">position=</span><span class="kw">position_dodge</span>(<span class="dt">width=</span><span class="fl">0.9</span>), <span class="dt">vjust=</span><span class="op">-</span><span class="fl">0.25</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb15-4" data-line-number="4"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Conditional relative frequency"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb15-5" data-line-number="5"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb15-6" data-line-number="6"><span class="st"> </span><span class="kw">theme_bw</span>() </a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-18"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-18-1.png" alt="Grouped bar chart (conditional relative frequencies) (2)" width="672" />
<p class="caption">
Figure 10: Grouped bar chart (conditional relative frequencies) (2)
</p>
</div>
</div>
<div id="covariation-plots" class="section level4">
<h4><span class="header-section-number">0.1.1.2</span> Covariation plots</h4>
<p>To visualize the covariation between categorical variables, you’ll need to count the number of observations for each combination stored in the frequency table. Say, we wanted to investigate the association between genre and popularity. First, we need to make sure that the respective variables are coded as factors.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb16-1" data-line-number="1">music_data<span class="op">$</span>genre_cat <-<span class="st"> </span><span class="kw">as.factor</span>(music_data<span class="op">$</span>top.genre)</a>
<a class="sourceLine" id="cb16-2" data-line-number="2"></a>
<a class="sourceLine" id="cb16-3" data-line-number="3">music_data<span class="op">$</span>popularity_factor <-<span class="st"> </span><span class="kw">cut</span>(music_data<span class="op">$</span>trackPopularity, </a>
<a class="sourceLine" id="cb16-4" data-line-number="4"> <span class="dt">breaks =</span> <span class="kw">c</span>(<span class="op">-</span><span class="ot">Inf</span>, <span class="dv">40</span>, <span class="dv">60</span>, <span class="ot">Inf</span>), <span class="dt">labels =</span> <span class="kw">c</span>(<span class="st">"low"</span>, </a>
<a class="sourceLine" id="cb16-5" data-line-number="5"> <span class="st">"middle"</span>, <span class="st">"high"</span>))</a></code></pre></div>
<p>There are multiple ways to visualize such a relationship with ggplot. One option would be to use a variation of the scatterplot which counts how many points overlap at any given point and increases the dot size accordingly. This can be achieved with <code>geom_count()</code>. From the bar charts above, we know that the categories in <code>genre</code> differ in size. To account for that we set the parameters <code>size</code> and <code>group</code> in <code>geom_count()</code>, which gives us the conditional relative frequencies. This is equivalent to the conditional relative frequency table from above, only now the <code>stat(prop)</code> argument ensures that we get relative frequencies and with the <code>group</code> argument we tell R to compute the relative frequencies by genre.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb17-1" data-line-number="1"><span class="kw">ggplot</span>(<span class="dt">data =</span> music_data) <span class="op">+</span><span class="st"> </span><span class="kw">geom_count</span>(<span class="kw">aes</span>(<span class="dt">x =</span> genre_cat, </a>
<a class="sourceLine" id="cb17-2" data-line-number="2"> <span class="dt">y =</span> popularity_factor, <span class="dt">size =</span> <span class="kw">stat</span>(prop), <span class="dt">group =</span> genre_cat)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb17-3" data-line-number="3"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Popularity"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">size =</span> <span class="st">"Proportion"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb17-4" data-line-number="4"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-20"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-20-1.png" alt="Covariation between categorical data (1)" width="672" />
<p class="caption">
Figure 11: Covariation between categorical data (1)
</p>
</div>
<p>Another option would be to use a tile plot that changes the color of the tile based on the frequency of the combination of factors. To achieve this we first have to create a dataframe that contains the relative frequencies of all combinations of factors. Then we can take this dataframe and supply it to <code>geom_tile()</code>, while specifying that the fill of each tile should be dependent on the observed frequency of the factor combination, which is done by specifying the fill in the <code>aes()</code> function.</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb18-1" data-line-number="1">table_plot_rel <-<span class="st"> </span><span class="kw">prop.table</span>(<span class="kw">table</span>(music_data[, <span class="kw">c</span>(<span class="st">"genre_cat"</span>, </a>
<a class="sourceLine" id="cb18-2" data-line-number="2"> <span class="st">"popularity_factor"</span>)]), <span class="dv">1</span>)</a>
<a class="sourceLine" id="cb18-3" data-line-number="3">table_plot_rel <-<span class="st"> </span><span class="kw">as.data.frame</span>(table_plot_rel)</a>
<a class="sourceLine" id="cb18-4" data-line-number="4"></a>
<a class="sourceLine" id="cb18-5" data-line-number="5"><span class="kw">ggplot</span>(table_plot_rel, <span class="kw">aes</span>(<span class="dt">x =</span> genre_cat, <span class="dt">y =</span> popularity_factor)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb18-6" data-line-number="6"><span class="st"> </span><span class="kw">geom_tile</span>(<span class="kw">aes</span>(<span class="dt">fill =</span> Freq)) <span class="op">+</span><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Popularity"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb18-7" data-line-number="7"><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Genre"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-21"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-21-1.png" alt="Covariation between categorical data (2)" width="672" />
<p class="caption">
Figure 12: Covariation between categorical data (2)
</p>
</div>
</div>
</div>
<div id="continuous-variables" class="section level3">
<h3><span class="header-section-number">0.1.2</span> Continuous variables</h3>
<div id="histogram" class="section level4">
<h4><span class="header-section-number">0.1.2.1</span> Histogram</h4>
<p>Histograms can be plotted for continuous data using the <code>geom_histogram()</code> function. Note that the <code>aes()</code> function only needs one argument here, since a histogram is a plot of the distribution of only one variable. As an example, let’s consider our music data set again, which contains (among other variables) the number of streams of each song in the variable <code>mstreams</code>:</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb19-1" data-line-number="1"><span class="kw">head</span>(music_data)</a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["id"],"name":[1],"type":["chr"],"align":["left"]},{"label":["trackName"],"name":[2],"type":["chr"],"align":["left"]},{"label":["isrc"],"name":[3],"type":["chr"],"align":["left"]},{"label":["explicit"],"name":[4],"type":["dbl"],"align":["right"]},{"label":["trackPopularity"],"name":[5],"type":["dbl"],"align":["right"]},{"label":["primary_artistName"],"name":[6],"type":["chr"],"align":["left"]},{"label":["primary_artistID"],"name":[7],"type":["chr"],"align":["left"]},{"label":["artistName"],"name":[8],"type":["chr"],"align":["left"]},{"label":["artistIDs"],"name":[9],"type":["chr"],"align":["left"]},{"label":["albumName"],"name":[10],"type":["chr"],"align":["left"]},{"label":["albumID"],"name":[11],"type":["chr"],"align":["left"]},{"label":["available_markets"],"name":[12],"type":["chr"],"align":["left"]},{"label":["n_available_markets"],"name":[13],"type":["dbl"],"align":["right"]},{"label":["releaseDate"],"name":[14],"type":["date"],"align":["right"]},{"label":["releaseDate_precision"],"name":[15],"type":["chr"],"align":["left"]},{"label":["danceability"],"name":[16],"type":["dbl"],"align":["right"]},{"label":["energy"],"name":[17],"type":["dbl"],"align":["right"]},{"label":["key"],"name":[18],"type":["dbl"],"align":["right"]},{"label":["loudness"],"name":[19],"type":["dbl"],"align":["right"]},{"label":["mode"],"name":[20],"type":["dbl"],"align":["right"]},{"label":["speechiness"],"name":[21],"type":["dbl"],"align":["right"]},{"label":["acousticness"],"name":[22],"type":["dbl"],"align":["right"]},{"label":["instrumentalness"],"name":[23],"type":["dbl"],"align":["right"]},{"label":["liveness"],"name":[24],"type":["dbl"],"align":["right"]},{"label":["valence"],"name":[25],"type":["dbl"],"align":["right"]},{"label":["tempo"],"name":[26],"type":["dbl"],"align":["right"]},{"label":["duration_ms"],"name":[27],"type":["dbl"],"align":["right"]},{"label":["time_signature"],"name":[28],"type":["dbl"],"align":["right"]},{"label":["uri"],"name":[29],"type"
:["chr"],"align":["left"]},{"label":["analysis_url"],"name":[30],"type":["chr"],"align":["left"]},{"label":["followers"],"name":[31],"type":["dbl"],"align":["right"]},{"label":["type"],"name":[32],"type":["chr"],"align":["left"]},{"label":["popularity"],"name":[33],"type":["dbl"],"align":["right"]},{"label":["primary_genre"],"name":[34],"type":["chr"],"align":["left"]},{"label":["genres"],"name":[35],"type":["chr"],"align":["left"]},{"label":["n_genres"],"name":[36],"type":["dbl"],"align":["right"]},{"label":["total_releases"],"name":[37],"type":["dbl"],"align":["right"]},{"label":["n_album"],"name":[38],"type":["dbl"],"align":["right"]},{"label":["n_single"],"name":[39],"type":["dbl"],"align":["right"]},{"label":["n_appears_on"],"name":[40],"type":["dbl"],"align":["right"]},{"label":["n_compilation"],"name":[41],"type":["dbl"],"align":["right"]},{"label":["mstreams"],"name":[42],"type":["dbl"],"align":["right"]},{"label":["region"],"name":[43],"type":["chr"],"align":["left"]},{"label":["adv_spending"],"name":[44],"type":["dbl"],"align":["right"]},{"label":["top.genre"],"name":[45],"type":["chr"],"align":["left"]},{"label":["genre_cat"],"name":[46],"type":["fctr"],"align":["left"]},{"label":["explicit_cat"],"name":[47],"type":["fctr"],"align":["left"]},{"label":["popularity_factor"],"name":[48],"type":["fctr"],"align":["left"]}],"data":[{"1":"5aAx2yezTd8zXrkmtKl66Z","2":"Starboy","3":"USUG11600976","4":"1","5":"22","6":"The Weeknd","7":"1Xyo4u8uXC1ZmMpatF05PJ","8":"The Weeknd feat. 
Daft Punk","9":"1Xyo4u8uXC1ZmMpatF05PJ,4tZwfgrHOc3mvqYlEYSvVi","10":"Starboy","11":"09fggMHib4YkOtwQNXEBII","12":"NA","13":"0","14":"2016-11-25","15":"day","16":"0.681","17":"0.594","18":"7","19":"-7.028","20":"1","21":"0.2820","22":"0.1650","23":"0.00000349","24":"0.134","25":"0.535","26":"186.054","27":"230453","28":"4","29":"spotify:track:5aAx2yezTd8zXrkmtKl66Z","30":"https://api.spotify.com/v1/audio-analysis/5aAx2yezTd8zXrkmtKl66Z","31":"15222808","32":"artist","33":"91","34":"canadian contemporary r&b","35":"canadian contemporary r&b, canadian pop, pop, rap","36":"4","37":"124","38":"13","39":"20","40":"91","41":"0","42":"357336.0","43":"us","44":"4538745582809720084335652681802192354751872961846816542356290991840842800362708869311675467455586767281244340224","45":"pop","46":"pop","47":"explicit","48":"low"},{"1":"7BKLCZ1jbUBVqRi2FVlTVw","2":"Closer","3":"USQX91601347","4":"0","5":"86","6":"The Chainsmokers","7":"69GGBxA162lTqCwzJG5jLp","8":"The Chainsmokers feat. Halsey","9":"69GGBxA162lTqCwzJG5jLp,26VFTg2z8YR0cCuwLzESi2","10":"Closer","11":"0rSLgV8p5FzfnqlEk4GzxE","12":"AD, AE, AR, AT, AU, BE, BG, BH, BO, BR, CA, CH, CL, CO, CR, CY, CZ, DE, DK, DO, DZ, EC, EE, EG, ES, FI, FR, GB, GR, GT, HK, HN, HU, ID, IE, IL, IN, IS, IT, JO, JP, KW, LB, LI, LT, LU, LV, MA, MC, MT, MX, MY, NI, NL, NO, NZ, OM, PA, PE, PH, PL, PS, PT, PY, QA, RO, SA, SE, SG, SK, SV, TH, TN, TR, TW, US, UY, VN, ZA","13":"79","14":"2016-07-29","15":"day","16":"0.748","17":"0.524","18":"8","19":"-5.599","20":"1","21":"0.0338","22":"0.4140","23":"0.00000000","24":"0.111","25":"0.661","26":"95.010","27":"244960","28":"4","29":"spotify:track:7BKLCZ1jbUBVqRi2FVlTVw","30":"https://api.spotify.com/v1/audio-analysis/7BKLCZ1jbUBVqRi2FVlTVw","31":"11565104","32":"artist","33":"90","34":"dance pop","35":"dance pop, electropop, pop, tropical 
house","36":"4","37":"96","38":"2","39":"59","40":"35","41":"0","42":"323081.4","43":"us","44":"87296125002616694828879835742568772595201488756664067891719260959953504690764846096364552367775131489311653888","45":"pop","46":"pop","47":"not explicit","48":"high"},{"1":"4pdPtRcBmOSQDlJ3Fk945m","2":"Let Me Love You","3":"QMZSY1600015","4":"0","5":"27","6":"DJ Snake","7":"540vIaP2JwjQb9dm3aArA4","8":"DJ Snake feat. Justin Bieber","9":"540vIaP2JwjQb9dm3aArA4,1uNFoZAHBGtllmzznpCI3s","10":"Encore","11":"55bbXORm6ZrVq52zfZnxBf","12":"NA","13":"0","14":"2016-08-05","15":"day","16":"0.476","17":"0.718","18":"8","19":"-5.309","20":"1","21":"0.0576","22":"0.0784","23":"0.00000000","24":"0.122","25":"0.142","26":"199.864","27":"205947","28":"4","29":"spotify:track:4pdPtRcBmOSQDlJ3Fk945m","30":"https://api.spotify.com/v1/audio-analysis/4pdPtRcBmOSQDlJ3Fk945m","31":"4345960","32":"artist","33":"86","34":"dance pop","35":"dance pop, edm, electronic trap, pop, tropical house","36":"5","37":"85","38":"2","39":"38","40":"45","41":"0","42":"264546.4","43":"us","44":"4052673829394860012092441543505898027168278466357604061531255984953722983872577825913849720641449881585582080","45":"pop","46":"pop","47":"not explicit","48":"low"},{"1":"5knuzwU65gJK7IF5yJsuaW","2":"Rockabye (feat. Sean Paul & Anne-Marie)","3":"GBAHS1600363","4":"0","5":"81","6":"Clean Bandit","7":"6MDME20pz9RveH9rEXvrOM","8":"Clean Bandit feat. Sean Paul feat. Anne-Marie","9":"6MDME20pz9RveH9rEXvrOM,3Isy6kedDrgPYoTS1dazA9,1zNqDE7qDGCsyzJwohVaoX","10":"Rockabye (feat. 
Sean Paul & Anne-Marie)","11":"3meZFplbMmji648oWUNEfQ","12":"AD, AE, AR, AT, AU, BE, BG, BH, BO, BR, CA, CH, CL, CO, CR, CY, CZ, DE, DK, DO, DZ, EC, EE, EG, ES, FI, FR, GB, GR, GT, HK, HN, HU, ID, IE, IL, IS, IT, JO, JP, KW, LB, LI, LT, LU, LV, MA, MC, MT, MX, MY, NI, NL, NO, NZ, OM, PA, PE, PH, PL, PS, PT, PY, QA, RO, SA, SE, SG, SK, SV, TH, TN, TR, TW, US, UY, VN, ZA","13":"78","14":"2016-10-21","15":"day","16":"0.720","17":"0.763","18":"9","19":"-4.068","20":"0","21":"0.0523","22":"0.4060","23":"0.00000000","24":"0.180","25":"0.742","26":"101.965","27":"251088","28":"4","29":"spotify:track:5knuzwU65gJK7IF5yJsuaW","30":"https://api.spotify.com/v1/audio-analysis/5knuzwU65gJK7IF5yJsuaW","31":"2504254","32":"artist","33":"84","34":"dance pop","35":"dance pop, edm, pop, tropical house, uk funky","36":"5","37":"97","38":"2","39":"73","40":"22","41":"0","42":"299399.5","43":"us","44":"63076779883569398413334987328546550614768141860947245086927001088783215597781844192705652165114862194747506688","45":"pop","46":"pop","47":"not explicit","48":"high"},{"1":"1xznGGDReH1oQq0xzbwXa3","2":"One Dance","3":"USCM51600028","4":"0","5":"25","6":"Drake","7":"3TVXtAsR1Inumwj472S9r4","8":"Drake feat. WizKid feat. 
Kyla","9":"3TVXtAsR1Inumwj472S9r4,3tVQdUvClmAT7URs9V3rsp,77DAFfvm3O9zT5dIoG0eIO","10":"Views","11":"3hARKC8cinq3mZLLAEaBh9","12":"NA","13":"0","14":"2016-05-06","15":"day","16":"0.791","17":"0.619","18":"1","19":"-5.886","20":"1","21":"0.0532","22":"0.0078","23":"0.00420000","24":"0.351","25":"0.371","26":"103.989","27":"173987","28":"4","29":"spotify:track:1xznGGDReH1oQq0xzbwXa3","30":"https://api.spotify.com/v1/audio-analysis/1xznGGDReH1oQq0xzbwXa3","31":"35200192","32":"artist","33":"98","34":"canadian hip hop","35":"canadian hip hop, canadian pop, hip hop, pop rap, rap","36":"5","37":"403","38":"27","39":"37","40":"339","41":"0","42":"265877.0","43":"us","44":"4668617894331640150713528913482931563888378169191399287770456451298974376421383853327796314422734196333084672","45":"pop","46":"pop","47":"not explicit","48":"low"},{"1":"343YBumqHu19cGoGARUTsd","2":"Fake Love","3":"USCM51700084","4":"1","5":"76","6":"Drake","7":"3TVXtAsR1Inumwj472S9r4","8":"Drake","9":"3TVXtAsR1Inumwj472S9r4","10":"More Life","11":"1lXY618HWkwYKJWBRYR4MK","12":"AD, AE, AR, AT, AU, BE, BG, BH, BO, BR, CA, CH, CL, CO, CR, CY, CZ, DE, DK, DO, DZ, EC, EE, EG, ES, FI, FR, GB, GR, GT, HK, HN, HU, ID, IE, IL, IN, IS, IT, JO, JP, KW, LB, LI, LT, LU, LV, MA, MC, MT, MX, MY, NI, NL, NO, NZ, OM, PA, PE, PH, PL, PS, PT, PY, QA, RO, SA, SE, SG, SK, SV, TH, TN, TR, TW, US, UY, VN, ZA","13":"79","14":"2017-03-18","15":"day","16":"0.927","17":"0.488","18":"9","19":"-9.433","20":"0","21":"0.4200","22":"0.1080","23":"0.00000000","24":"0.196","25":"0.605","26":"133.987","27":"210937","28":"4","29":"spotify:track:343YBumqHu19cGoGARUTsd","30":"https://api.spotify.com/v1/audio-analysis/343YBumqHu19cGoGARUTsd","31":"35200192","32":"artist","33":"98","34":"canadian hip hop","35":"canadian hip hop, canadian pop, hip hop, pop rap, 
rap","36":"5","37":"403","38":"27","39":"37","40":"339","41":"0","42":"425740.1","43":"us","44":"34419285169424099063603169706763167298974920345850885001633591652364254906722960776146634654951625345242828374016","45":"pop","46":"pop","47":"explicit","48":"high"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>Now we can create the histogram using <code>geom_histogram()</code>. The argument <code>binwidth</code> specifies the range that each bar spans, <code>col = "black"</code> specifies the border to be black and <code>fill = "darkblue"</code> sets the inner color of the bars to dark blue. For brevity, we have now also started naming the x and y axes with the single function <code>labs()</code>, instead of using the two distinct functions <code>xlab()</code> and <code>ylab()</code>.</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb20-1" data-line-number="1"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(mstreams)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">3000</span>, </a>
<a class="sourceLine" id="cb20-2" data-line-number="2"> <span class="dt">col =</span> <span class="st">"black"</span>, <span class="dt">fill =</span> <span class="st">"darkblue"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Number of streams"</span>, </a>
<a class="sourceLine" id="cb20-3" data-line-number="3"> <span class="dt">y =</span> <span class="st">"Frequency"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-23"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-23-1.png" alt="Histogram" width="672" />
<p class="caption">
Figure 13: Histogram
</p>
</div>
</div>
<div id="boxplot" class="section level4">
<h4><span class="header-section-number">0.1.2.2</span> Boxplot</h4>
<p>Another common way to display the distribution of continuous variables is through boxplots. ggplot will construct a boxplot if given the geom <code>geom_boxplot()</code>. In our case we want to show the difference in the distribution of streams between explicit and non-explicit songs in our sample, which is why the <code>aes()</code> function contains both an x and a y variable.</p>
<div class="sourceCode" id="cb21"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb21-1" data-line-number="1"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(<span class="dt">x =</span> explicit_cat, <span class="dt">y =</span> mstreams)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb21-2" data-line-number="2"><span class="st"> </span><span class="kw">geom_boxplot</span>(<span class="dt">coef =</span> <span class="dv">3</span>) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Explicit"</span>, <span class="dt">y =</span> <span class="st">"Number of streams"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb21-3" data-line-number="3"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-24"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-24-1.png" alt="Boxplot by group" width="672" />
<p class="caption">
Figure 14: Boxplot by group
</p>
</div>
<p>The following graphic shows you how to interpret the boxplot:</p>
<div class="figure">
<img src="https://github.com/IMSMWU/Teaching/raw/master/MRDA2017/boxplot.JPG" alt="Information contained in a Boxplot" />
<p class="caption">Information contained in a Boxplot</p>
</div>
<p>You may also augment the boxplot with the data points using <code>geom_jitter()</code>:</p>
<div class="sourceCode" id="cb22"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb22-1" data-line-number="1"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(<span class="dt">x =</span> explicit_cat, <span class="dt">y =</span> mstreams)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-2" data-line-number="2"><span class="st"> </span><span class="kw">geom_boxplot</span>(<span class="dt">coef =</span> <span class="dv">3</span>) <span class="op">+</span><span class="st"> </span><span class="kw">geom_jitter</span>(<span class="dt">colour =</span> <span class="st">"red"</span>, </a>
<a class="sourceLine" id="cb22-3" data-line-number="3"> <span class="dt">alpha =</span> <span class="fl">0.2</span>) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Explicit"</span>, <span class="dt">y =</span> <span class="st">"Number of streams"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb22-4" data-line-number="4"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-25"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-25-1.png" alt="Boxplot with augmented data points" width="672" />
<p class="caption">
Figure 15: Boxplot with augmented data points
</p>
</div>
<p>In case you would like to create the boxplot on the total data (i.e., not by group), just set the <code>x =</code> argument within the <code>aes()</code> function to an empty string (i.e., <code>x = ""</code>):</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb23-1" data-line-number="1"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(<span class="dt">x =</span> <span class="st">""</span>, <span class="dt">y =</span> mstreams)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_boxplot</span>(<span class="dt">coef =</span> <span class="dv">3</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb23-2" data-line-number="2"><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Total"</span>, <span class="dt">y =</span> <span class="st">"Number of streams"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-26"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-26-1.png" alt="Single Boxplot" width="672" />
<p class="caption">
Figure 16: Single Boxplot
</p>
</div>
</div>
<div id="plot-of-means" class="section level4">
<h4><span class="header-section-number">0.1.2.3</span> Plot of means</h4>
<p>Another quick way to get an overview of the difference between two groups is to plot their respective means with confidence intervals. Two things about this plot are new. First, there are now two geoms included in the same plot. This is one of the big advantages of ggplot’s layered approach to graphs, the fact that new elements can be drawn by simply adding a new line with a new geom function. In this case we want to add confidence bounds to our plot, which we achieve by adding a <code>geom_pointrange()</code> layer. Recall that if the interval is small, the sample must be very close to the population and when the interval is wide, the sample mean is likely very different from the population mean and therefore a bad representation of the population. Second, we are using an additional argument in <code>geom_bar()</code>, namely <code>stat =</code>, which is short for statistical transformation. Every geom uses such a transformation in the background to adapt the data to be able to create the desired plot. <code>geom_bar()</code> typically uses the <code>count</code> stat, which would create a similar plot to the one we saw at the very beginning, counting how often a certain value of a variable appears. By telling <code>geom_bar()</code> explicitly that we want to use a different stat we can override its behavior, forcing it to create a bar plot of the means.</p>
<div class="sourceCode" id="cb24"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb24-1" data-line-number="1">music_data2 <-<span class="st"> </span>music_data[music_data<span class="op">$</span>duration_ms, ]</a>
<a class="sourceLine" id="cb24-2" data-line-number="2"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(explicit_cat, duration_ms)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb24-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_bar</span>(<span class="dt">stat =</span> <span class="st">"summary"</span>, <span class="dt">color =</span> <span class="st">"black"</span>, <span class="dt">fill =</span> <span class="st">"white"</span>, </a>
<a class="sourceLine" id="cb24-4" data-line-number="4"> <span class="dt">width =</span> <span class="fl">0.7</span>, <span class="dt">na.rm =</span> T) <span class="op">+</span><span class="st"> </span><span class="kw">geom_pointrange</span>(<span class="dt">stat =</span> <span class="st">"summary"</span>, </a>
<a class="sourceLine" id="cb24-5" data-line-number="5"> <span class="dt">fun.ymin =</span> <span class="cf">function</span>(x) <span class="kw">mean</span>(x) <span class="op">-</span><span class="st"> </span><span class="kw">sd</span>(x), <span class="dt">fun.ymax =</span> <span class="cf">function</span>(x) <span class="kw">mean</span>(x) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb24-6" data-line-number="6"><span class="st"> </span><span class="kw">sd</span>(x), <span class="dt">fun.y =</span> mean, <span class="dt">na.rm =</span> T) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Explicit"</span>, </a>
<a class="sourceLine" id="cb24-7" data-line-number="7"> <span class="dt">y =</span> <span class="st">"Average number of streams"</span>) <span class="op">+</span><span class="st"> </span><span class="co"># coord_cartesian(ylim = c(100000, 150000)) +</span></a>
<a class="sourceLine" id="cb24-8" data-line-number="8"><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-27"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-27-1.png" alt="Plot of means" width="672" />
<p class="caption">
Figure 17: Plot of means
</p>
</div>
</div>
<div id="scatter-plot" class="section level4">
<h4><span class="header-section-number">0.1.2.4</span> Scatter plot</h4>
<p>The most common way to show the relationship between two continuous variables is a scatterplot. The following code creates a scatterplot with some additional components. The <code>geom_smooth()</code> function creates a smoothed line from the data provided. In this particular example we tell the function to draw the best possible straight line (i.e., minimizing the distance between the line and the points) through the data (via the argument <code>method = "lm"</code>). The “fill” and “alpha” arguments solely affect appearance, in our case the color and the opacity of the confidence interval, respectively.</p>
<div class="sourceCode" id="cb25"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb25-1" data-line-number="1"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(<span class="kw">log</span>(adv_spending), mstreams)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb25-2" data-line-number="2"><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span><span class="kw">geom_smooth</span>(<span class="dt">method =</span> <span class="st">"lm"</span>, <span class="dt">fill =</span> <span class="st">"blue"</span>, </a>
<a class="sourceLine" id="cb25-3" data-line-number="3"> <span class="dt">alpha =</span> <span class="fl">0.1</span>) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Advertising expenditures (EUR)"</span>, </a>
<a class="sourceLine" id="cb25-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"Number of streams"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-28"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-28-1.png" alt="Scatter plot" width="672" />
<p class="caption">
Figure 18: Scatter plot
</p>
</div>
<p>As you can see, there appears to be a positive relationship between advertising and the number of streams.</p>
<div id="grouped-scatter-plot" class="section level5">
<h5><span class="header-section-number">0.1.2.4.1</span> Grouped scatter plot</h5>
<p>It could be that explicit and non-explicit tracks respond differently to advertising. We can visually capture such differences with a grouped scatter plot. By adding the argument <code>colour = explicit_cat</code> to the aesthetic specification, ggplot automatically treats the categories of <code>explicit_cat</code> as distinct groups and plots accordingly.</p>
<div class="sourceCode" id="cb26"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb26-1" data-line-number="1"><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(<span class="kw">log</span>(adv_spending), mstreams, </a>
<a class="sourceLine" id="cb26-2" data-line-number="2"> <span class="dt">colour =</span> explicit_cat)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span><span class="kw">geom_smooth</span>(<span class="dt">method =</span> <span class="st">"lm"</span>, </a>
<a class="sourceLine" id="cb26-3" data-line-number="3"> <span class="dt">alpha =</span> <span class="fl">0.1</span>) <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Advertising expenditures (EUR)"</span>, </a>
<a class="sourceLine" id="cb26-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"Number of streams"</span>, <span class="dt">colour =</span> <span class="st">"Explicit"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb26-5" data-line-number="5"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-29"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-29-1.png" alt="Grouped scatter plot" width="672" />
<p class="caption">
Figure 19: Grouped scatter plot
</p>
</div>
<p>It appears from the plot that explicit tracks are more responsive to advertising.</p>
</div>
<div id="combination-of-scatter-plot-and-histogram" class="section level5">
<h5><span class="header-section-number">0.1.2.4.2</span> Combination of scatter plot and histogram</h5>
<p>Using the <code>ggExtra</code> package, you may also augment the scatterplot with a histogram:</p>
<div class="sourceCode" id="cb27"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb27-1" data-line-number="1"><span class="kw">library</span>(ggExtra)</a>
<a class="sourceLine" id="cb27-2" data-line-number="2">p <-<span class="st"> </span><span class="kw">ggplot</span>(music_data, <span class="kw">aes</span>(<span class="kw">log</span>(adv_spending), mstreams)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb27-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_point</span>() <span class="op">+</span><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Advertising expenditures (EUR)"</span>, </a>
<a class="sourceLine" id="cb27-4" data-line-number="4"> <span class="dt">y =</span> <span class="st">"Number of strams"</span>, <span class="dt">colour =</span> <span class="st">"store"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">theme_bw</span>()</a>
<a class="sourceLine" id="cb27-5" data-line-number="5">ggExtra<span class="op">::</span><span class="kw">ggMarginal</span>(p, <span class="dt">type =</span> <span class="st">"histogram"</span>)</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-30"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-30-1.png" alt="Scatter plot with histogram" width="672" />
<p class="caption">
Figure 20: Scatter plot with histogram
</p>
</div>
<p>In this case, the <code>type = "histogram"</code> argument specifies that we would like to plot a histogram. However, you could also opt for <code>type = "boxplot"</code> or <code>type = "density"</code> to use a boxplot or density plot instead.</p>
</div>
</div>
<div id="line-plot" class="section level4">
<h4><span class="header-section-number">0.1.2.5</span> Line plot</h4>
<p>Another important type of plot is the line plot used if, for example, you have a variable that changes over time and you want to plot how it develops over time. To demonstrate this, we first gather the population of Austria from the World Bank API (as we did previously).</p>
<div class="sourceCode" id="cb28"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb28-1" data-line-number="1"><span class="kw">library</span>(jsonlite)</a>
<a class="sourceLine" id="cb28-2" data-line-number="2"><span class="co"># specifies url</span></a>
<a class="sourceLine" id="cb28-3" data-line-number="3">url <-<span class="st"> "http://api.worldbank.org/countries/AT/indicators/SP.POP.TOTL/?date=1960:2016&format=json&per_page=100"</span></a>
<a class="sourceLine" id="cb28-4" data-line-number="4">ctrydata_at <-<span class="st"> </span><span class="kw">fromJSON</span>(url) <span class="co">#parses the data </span></a>
<a class="sourceLine" id="cb28-5" data-line-number="5"><span class="kw">head</span>(ctrydata_at[[<span class="dv">2</span>]][, <span class="kw">c</span>(<span class="st">"value"</span>, <span class="st">"date"</span>)]) <span class="co">#checks if we scraped the desired data</span></a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["value"],"name":[1],"type":["chr"],"align":["left"]},{"label":["date"],"name":[2],"type":["chr"],"align":["left"]}],"data":[{"1":"8736668","2":"2016"},{"1":"8642699","2":"2015"},{"1":"8546356","2":"2014"},{"1":"8479823","2":"2013"},{"1":"8429991","2":"2012"},{"1":"8391643","2":"2011"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<div class="sourceCode" id="cb29"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb29-1" data-line-number="1">ctrydata_at <-<span class="st"> </span>ctrydata_at[[<span class="dv">2</span>]][, <span class="kw">c</span>(<span class="st">"date"</span>, <span class="st">"value"</span>)]</a>
<a class="sourceLine" id="cb29-2" data-line-number="2">ctrydata_at<span class="op">$</span>value <-<span class="st"> </span><span class="kw">as.numeric</span>(ctrydata_at<span class="op">$</span>value)</a>
<a class="sourceLine" id="cb29-3" data-line-number="3">ctrydata_at<span class="op">$</span>date <-<span class="st"> </span><span class="kw">as.integer</span>(ctrydata_at<span class="op">$</span>date)</a>
<a class="sourceLine" id="cb29-4" data-line-number="4"><span class="kw">str</span>(ctrydata_at)</a></code></pre></div>
<pre><code>## 'data.frame': 57 obs. of 2 variables:
## $ date : int 2016 2015 2014 2013 2012 2011 2010 2009 2008 2007 ...
## $ value: num 8736668 8642699 8546356 8479823 8429991 ...</code></pre>
<p>As you can see, doing this is very straightforward. Given the correct <code>aes()</code> and geom specification, ggplot constructs the correct plot for us.</p>
<div class="sourceCode" id="cb31"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb31-1" data-line-number="1"><span class="kw">ggplot</span>(ctrydata_at, <span class="kw">aes</span>(<span class="dt">x =</span> date, <span class="dt">y =</span> value)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_line</span>() <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb31-2" data-line-number="2"><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">"Year"</span>, <span class="dt">y =</span> <span class="st">"Population of Austria"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb31-3" data-line-number="3"><span class="st"> </span><span class="kw">theme_bw</span>()</a></code></pre></div>
<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-32"></span>
<img src="05-visualization_files/figure-html/unnamed-chunk-32-1.png" alt="Line plot" width="672" />
<p class="caption">
Figure 21: Line plot
</p>
</div>
</div>
</div>
<div id="saving-plots" class="section level3">
<h3><span class="header-section-number">0.1.3</span> Saving plots</h3>
<p>To save the last displayed plot, simply use the function <code>ggsave()</code>, and it will save the plot to your working directory. Use the arguments <code>height</code> and <code>width</code> to specify the size of the saved plot. You may also choose the file format by adjusting the ending of the file name. E.g., <code>file_name.jpg</code> will create a file in JPG-format, whereas <code>file_name.png</code> saves the file in PNG-format, etc.</p>
<div class="sourceCode" id="cb32"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb32-1" data-line-number="1"><span class="kw">ggplot</span>(table_plot_abs_reg, <span class="kw">aes</span>(<span class="dt">x =</span> Theory_Regression_cat, </a>
<a class="sourceLine" id="cb32-2" data-line-number="2"> <span class="dt">y =</span> Practice_Regression_cat)) <span class="op">+</span><span class="st"> </span><span class="kw">geom_tile</span>(<span class="kw">aes</span>(<span class="dt">fill =</span> Freq)) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb32-3" data-line-number="3"><span class="st"> </span><span class="kw">ylab</span>(<span class="st">"Practical knowledge"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">xlab</span>(<span class="st">"Theoretical knowledge"</span>) <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb32-4" data-line-number="4"><span class="st"> </span><span class="kw">theme_bw</span>()</a>
<a class="sourceLine" id="cb32-5" data-line-number="5"></a>
<a class="sourceLine" id="cb32-6" data-line-number="6"><span class="kw">ggsave</span>(<span class="st">"theory_practice_regression.jpg"</span>, <span class="dt">height =</span> <span class="dv">5</span>, </a>
<a class="sourceLine" id="cb32-7" data-line-number="7"> <span class="dt">width =</span> <span class="fl">7.5</span>)</a></code></pre></div>
</div>
<div id="additional-options" class="section level3">
<h3><span class="header-section-number">0.1.4</span> Additional options</h3>
<p>Now that we have covered the most important plots, we can look at what other types of data you may come across. One type of data that is increasingly available is the geo-location of customers and users (e.g., from app usage data). The following data set contains the app usage data of Shazam users from Germany. The data contains the latitude and longitude information where a music track was “shazamed”.</p>
<div class="sourceCode" id="cb33"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb33-1" data-line-number="1"><span class="kw">library</span>(ggmap)</a>
<a class="sourceLine" id="cb33-2" data-line-number="2"><span class="kw">library</span>(dplyr)</a>
<a class="sourceLine" id="cb33-3" data-line-number="3">geo_data <-<span class="st"> </span><span class="kw">read.table</span>(<span class="st">"https://raw.githubusercontent.com/IMSMWU/Teaching/master/MRDA2017/geo_data.dat"</span>, </a>
<a class="sourceLine" id="cb33-4" data-line-number="4"> <span class="dt">sep =</span> <span class="st">"</span><span class="ch">\t</span><span class="st">"</span>, <span class="dt">header =</span> <span class="ot">TRUE</span>)</a>
<a class="sourceLine" id="cb33-5" data-line-number="5"><span class="kw">head</span>(geo_data)</a></code></pre></div>
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["lat"],"name":[1],"type":["dbl"],"align":["right"]},{"label":["lon"],"name":[2],"type":["dbl"],"align":["right"]},{"label":["latlon"],"name":[3],"type":["fctr"],"align":["left"]}],"data":[{"1":"50.7","2":"7.7","3":"50.7_7.7"},{"1":"51.0","2":"6.3","3":"51_6.3"},{"1":"52.2","2":"10.5","3":"52.2_10.5"},{"1":"50.9","2":"11.0","3":"50.9_11"},{"1":"49.9","2":"8.2","3":"49.9_8.2"},{"1":"53.3","2":"13.4","3":"53.3_13.4"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
</script>
</div>
<p>There is a package called “ggmap”, which is an extension of the ggplot2 package. It lets you load maps from different web services (e.g., Google Maps) and maps the user locations within the coordinate system of ggplot. With this information, you can create interesting plots like heat maps. We won’t go into detail here but you may go through the following code on your own if you are interested. However, please note that you need to register an API key with Google in order to make use of this package.</p>
<div class="sourceCode" id="cb34"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb34-1" data-line-number="1"><span class="co"># register_google(key = 'your_api_key')</span></a>
<a class="sourceLine" id="cb34-2" data-line-number="2"></a>
<a class="sourceLine" id="cb34-3" data-line-number="3"><span class="co"># Download the base map</span></a>
<a class="sourceLine" id="cb34-4" data-line-number="4">de_map_g_str <-<span class="st"> </span><span class="kw">get_map</span>(<span class="dt">location =</span> <span class="kw">c</span>(<span class="fl">10.018343</span>, <span class="fl">51.133481</span>), </a>
<a class="sourceLine" id="cb34-5" data-line-number="5"> <span class="dt">zoom =</span> <span class="dv">6</span>, <span class="dt">scale =</span> <span class="dv">2</span>) <span class="co"># results in below map (wohoo!)</span></a>
<a class="sourceLine" id="cb34-6" data-line-number="6"></a>
<a class="sourceLine" id="cb34-7" data-line-number="7"><span class="co"># Draw the heat map</span></a>
<a class="sourceLine" id="cb34-8" data-line-number="8"><span class="kw">ggmap</span>(de_map_g_str, <span class="dt">extent =</span> <span class="st">"device"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">geom_density2d</span>(<span class="dt">data =</span> geo_data, </a>
<a class="sourceLine" id="cb34-9" data-line-number="9"> <span class="kw">aes</span>(<span class="dt">x =</span> lon, <span class="dt">y =</span> lat), <span class="dt">size =</span> <span class="fl">0.3</span>) <span class="op">+</span><span class="st"> </span><span class="kw">stat_density2d</span>(<span class="dt">data =</span> geo_data, </a>
<a class="sourceLine" id="cb34-10" data-line-number="10"> <span class="kw">aes</span>(<span class="dt">x =</span> lon, <span class="dt">y =</span> lat, <span class="dt">fill =</span> ..level.., <span class="dt">alpha =</span> ..level..), </a>
<a class="sourceLine" id="cb34-11" data-line-number="11"> <span class="dt">size =</span> <span class="fl">0.01</span>, <span class="dt">bins =</span> <span class="dv">16</span>, <span class="dt">geom =</span> <span class="st">"polygon"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">scale_fill_gradient</span>(<span class="dt">low =</span> <span class="st">"green"</span>, </a>
<a class="sourceLine" id="cb34-12" data-line-number="12"> <span class="dt">high =</span> <span class="st">"red"</span>) <span class="op">+</span><span class="st"> </span><span class="kw">scale_alpha</span>(<span class="dt">range =</span> <span class="kw">c</span>(<span class="dv">0</span>, <span class="fl">0.3</span>), </a>
<a class="sourceLine" id="cb34-13" data-line-number="13"> <span class="dt">guide =</span> <span class="ot">FALSE</span>)</a></code></pre></div>
<p><img src="05-visualization_files/figure-html/ggmaps-1.png" width="672" /></p>
</div>
</div>
</section>
</div>
</div>
</div>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Boot the GitBook reader UI: request the "gitbook" module and hand it the
// per-page configuration below.
// NOTE(review): the meaning of each option is defined by the gitbook plugins
// loaded by the preceding <script> tags (sharing, fontsettings, bookdown,
// search) — confirm against their documentation before changing values.
gitbook.require(["gitbook"], function(reader) {
  // Options under the "sharing" key.
  var sharingOptions = {
    "github": false,
    "facebook": true,
    "twitter": true,
    "google": false,
    "linkedin": false,
    "weibo": false,
    "instapaper": false,
    "vk": false,
    "all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
  };

  // Options under the "fontsettings" key.
  var fontOptions = {
    "theme": "white",
    "family": "sans",
    "size": 2
  };

  // "edit" and "history" each carry a null link and null text.
  var editOptions = { "link": null, "text": null };
  var historyOptions = { "link": null, "text": null };

  // Options under the "toc" key.
  var tocOptions = { "collapse": "section" };

  reader.start({
    "sharing": sharingOptions,
    "fontsettings": fontOptions,
    "edit": editOptions,
    "history": historyOptions,
    "download": null,
    "toc": tocOptions,
    "search": false
  });
});
</script>
</body>
</html>