<!DOCTYPE html>
<html lang="en"><head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="">
<meta name="author" content="">
<title>Jennie Le</title>
<!-- Bootstrap Core CSS -->
<link href="vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<!-- Theme CSS -->
<link href="css/freelancer.css" rel="stylesheet">
<!-- Custom Fonts -->
<link href="vendor/font-awesome/css/font-awesome.min.css" rel="stylesheet" type="text/css">
<link href="https://fonts.googleapis.com/css?family=Montserrat:400,700" rel="stylesheet" type="text/css">
<link href="https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic" rel="stylesheet" type="text/css">
<!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
<script src="https://oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body id="page-top" class="index">
<div id="skipnav"><a href="#maincontent">Skip to main content</a></div>
<!-- Navigation -->
<nav id="mainNav" class="navbar navbar-default navbar-fixed-top navbar-custom affix-top">
<div class="container">
<!-- Brand and toggle get grouped for better mobile display -->
<div class="navbar-header page-scroll">
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
<span class="sr-only">Toggle navigation</span> Menu <i class="fa fa-bars"></i>
</button>
<a class="navbar-brand" href="#page-top">Jennie Le</a>
</div>
<!-- Collect the nav links, forms, and other content for toggling -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav navbar-right">
<li class="hidden active">
<a href="#page-top"></a>
</li>
<li class="page-scroll">
<a href="#about">ABOUT ME</a>
</li>
<li class="page-scroll">
<a href="#resume">RESUME</a>
</li>
<li class="page-scroll">
<a href="#portfolio">PORTFOLIO</a>
</li>
<li class="page-scroll">
<a href="#social">CONNECT</a>
</li>
</ul>
</div>
<!-- /.navbar-collapse -->
</div>
<!-- /.container-fluid -->
</nav>
<!-- Header -->
<header>
<div class="container" id="maincontent" tabindex="-1">
<div class="row">
<div class="col-lg-12">
<img class="img-responsive" id="profilePicture" src="img/profile.png" alt="">
<div class="intro-text">
<h1 class="name"><br>Hi, it's Jennie!</h1>
<!--hr class="star-light"-->
</div>
</div>
</div>
</div>
</header>
<!-- About Section -->
<section id="about">
<div class="container">
<div class="row">
<div class="col-lg-12 text-center">
<h2>ABOUT ME</h2>
<hr class="star-primary">
</div>
</div>
<div class="row">
<div class="col-lg-4 col-lg-offset-2">
<p>Two years of experience in the economics consulting and information services industries. Recently graduated with a Master's in Business Analytics from the Gabelli School of Business, Fordham University. Passionate about data visualization, machine learning, and big data. Experienced in Python, R, Tableau, SQL, and Google Analytics.</p>
</div>
<div class="col-lg-4">
<p>I am a data scientist. I build predictive models and craft stories through visualizations. I love taking on viz challenges and sharing stories with an analytical perspective. Whether it's G-Eazy or Mozart playing in the background, I like to channel the energy and creativity of music into my work and passion projects. <br><br>
When I'm not coding, I run, come up with new recipes, play chess, practice piano, or read books. My favourite author is Malcolm Gladwell. On a random weekend, you might find me at the farmers' market in Union Square or reading a book in Madison Square Park with my matcha latte.</p>
</div>
</div>
</div>
</section>
<!-- Resume Section -->
<section id="resume">
<div class="container">
<div class="row">
<div class="col-lg-12 text-center">
<h2>RESUME</h2>
<hr class="star-primary">
</div>
</div>
<div class="row">
<div class="col-lg-8 col-lg-offset-2">
<div class="portfolio-item">
<a href="#resumeModal" class="portfolio-link" data-toggle="modal">
<div class="caption">
<div class="caption-content">
<i class="fa fa-search-plus fa-3x"></i>
</div>
</div>
<iframe width="100%" height="500px" src="file/Resume.pdf"></iframe>
</a>
</div>
</div>
</div>
</div>
</section>
<!-- Portfolio Grid Section -->
<section id="portfolio">
<div class="container">
<div class="row">
<div class="col-lg-12 text-center">
<h2>Tableau Visualization</h2>
<hr class="star-primary">
<p>For more visualizations, visit <a href="https://public.tableau.com/profile/tram.ngoc.le#!/">my Tableau profile</a>.</p>
</div>
</div>
<div class="row">
<div class='tableauPlaceholder' id='viz1605473535909' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/CO/COVID-19NEWCASESTRENDINTHEUS312020-8242020/COVID-19NEWCASESTREND/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='COVID-19NEWCASESTRENDINTHEUS312020-8242020/COVID-19NEWCASESTREND' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/CO/COVID-19NEWCASESTRENDINTHEUS312020-8242020/COVID-19NEWCASESTREND/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1605473535909'); var vizElement = divElement.getElementsByTagName('object')[0]; if ( divElement.offsetWidth > 800 ) { vizElement.style.width='1250px';vizElement.style.height='827px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='1250px';vizElement.style.height='827px';} else { vizElement.style.width='100%';vizElement.style.height='727px';} var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
<br><br>
</div>
<div class="row">
<div class="col-lg-12 text-center">
<h2>My Projects</h2>
<hr class="star-primary">
<p>For more technical projects, visit <a href="https://github.com/bigforehead">my Github</a>.</p>
<br><br>
</div>
</div>
<div class="row">
<div class="col-sm-4 portfolio-item">
<a href="#portfolioModal1" class="portfolio-link" data-toggle="modal">
<div class="caption">
<div class="caption-content">
<i class="fa fa-search-plus fa-3x"></i>
</div>
</div>
<img src="img/bot.png" class="img-responsive crop" alt="Tweeter Projects">
</a>
</div>
<div class="col-sm-4 portfolio-item">
<a href="#portfolioModal2" class="portfolio-link" data-toggle="modal">
<div class="caption">
<div class="caption-content">
<i class="fa fa-search-plus fa-3x"></i>
</div>
</div>
<img src="img/song.png" class="img-responsive crop" alt="Song Projects">
</a>
</div>
<div class="col-sm-4 portfolio-item">
<a href="#portfolioModal3" class="portfolio-link" data-toggle="modal">
<div class="caption">
<div class="caption-content">
<i class="fa fa-search-plus fa-3x"></i>
</div>
</div>
<img src="img/marchmadness.png" class="img-responsive crop" alt="MarchMadness">
</a>
</div>
</div>
</div>
</div>
</section>
<!-- Footer -->
<footer class="text-center">
<section id="social">
<div class="footer-above">
<div class="container">
<div class="row">
<div class="footer-col col-md-4">
</div>
<div class="footer-col col-md-4">
<h3>Connect</h3>
<ul class="list-inline">
<li>
<a href="https://www.linkedin.com/in/jennie-le-6543b8b6/" target="_blank" class="btn-social btn-outline"><span class="sr-only">LinkedIn</span><i class="fa fa-fw fa-linkedin"></i></a>
</li>
<li>
<a href="https://github.com/bigforehead" target="_blank" class="btn-social btn-outline"><span class="sr-only">Github</span><i class="fa fa-fw fa-github"></i></a>
</li>
<li>
<a href="https://www.instagram.com/lengoctram/" target="_blank" class="btn-social btn-outline"><span class="sr-only">Instagram</span><i class="fa fa-fw fa-instagram"></i></a>
</li>
<li>
<a href="mailto:tle35@fordham.edu" target="_blank" class="btn-social btn-outline"><span class="sr-only">Email</span><i class="fa fa-fw fa-google"></i></a>
</li>
</ul>
</div>
<div class="footer-col col-md-4">
</div>
</div>
</div>
</div>
<div class="footer-below">
<div class="container">
<div class="row">
<div class="col-lg-12">
© 2021 Jennie Le
</div>
</div>
</div>
</div>
</section>
</footer>
<!-- Scroll to Top Button (Only visible on small and extra-small screen sizes) -->
<div class="scroll-top page-scroll hidden-sm hidden-xs hidden-lg hidden-md">
<a class="btn btn-primary" href="#page-top">
<i class="fa fa-chevron-up"></i>
</a>
</div>
<!-- Twitter Project -->
<div class="portfolio-modal modal fade" id="portfolioModal1" tabindex="-1" role="dialog" aria-hidden="true">
<div class="modal-content">
<div class="close-modal" data-dismiss="modal">
<div class="lr">
<div class="rl">
</div>
</div>
</div>
<div class="container">
<div class="row">
<div class="col-lg-8 col-lg-offset-2">
<div class="modal-body">
<h2>Twitter Malicious Bot Detection</h2>
<hr class="star-primary">
<div class="modal-text">
<p>
There are bots, or automated accounts, everywhere; this project focuses on Twitter's. Some bots can be fun, sharing interesting tweets with users every day. But all too often, high-volume fake social media accounts exist to deceive or spread disinformation. Those malicious bots indirectly hurt Twitter's image and business revenue as they gradually erode users' organic engagement with the platform. <br><br>
Excited by the possibilities of machine learning and text analytics, I formed a team of four and built a classification model to spot scam bots, spam bots, and fake followers. <br><br>
<i>Available on <a href="https://github.com/bigforehead/">Github</a>.</i>
<br><br>
<b>BIG PICTURE</b><br>
Coming into this project, we did some research and decided to divide the process into two phases. Before implementation, we used Python to scrape the account metadata and tweets of known Twitter bot IDs. In phase one, we applied supervised machine learning, training Decision Tree, Random Forest, and Logistic Regression models on behavioral bot features. In phase two, we engineered new features from the tweets using natural language processing and retrained the models. The result was amazing: a Random Forest classifier with 91.68% accuracy on the test data.<br>
<img src="img/bot_process.png" class="model-text" alt="Tweeter Projects">
<br><br>
<b>APPROACH</b><br>
Here is our pipeline:<br><br>
<img src="img/modelbaseline.png" class="model-text" alt="Tweeter Projects">
<br><br>
The raw dataset included 900 IDs for each kind of malicious bot from the Bot Repository, 2,700 IDs in total. After scraping the account metadata and tweets with the Twitter API and Tweepy, we visualized the number of tweets collected in the following graphic.
<br><br>
<img src="img/botgroupdistribution.png" class="model-text" alt="Tweeter Projects">
<br><br>
<b>PHASE ONE: SPOT A BAD BOT BY BEHAVIOR </b><br>
We used IBM SPSS to build an uncorrelated forest of trees to predict the type of malicious bot from 13 behavioral account features: average retweets per ID, average favorites per ID, bot type, bot group, number of followers, number of friends, number of tweets per ID, whether the account keeps the default profile (yes/no), whether it uses the default profile image, whether geo-location is turned on, average number of tweets posted daily, and the percentage of tweets containing a URL or hyperlink.<br>
<br>
We partitioned the data into a 70% training set and a 30% testing set. The Random Forest predicted bots correctly 99.43% of the time on training data and 91.05% on testing data. The classification also indicated that some features matter more than others: retweet frequency, number of followers, and daily tweet frequency were the top three most important features.<br>
<br>
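If you prefer code to screenshots, here is a minimal scikit-learn sketch of the phase-one classifier; we actually worked in SPSS, so the file name and column names below are hypothetical stand-ins:<br>
<pre><code># Hypothetical sketch of the phase-one Random Forest (scikit-learn stand-in for SPSS)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("bot_features.csv")      # assumed file: 13 behavioral features + label
X = df.drop(columns=["bot_type"])         # e.g. avg_retweets, followers, daily_tweets, ...
y = df["bot_type"]                        # fake follower / scam bot / spam bot

# 70/30 partition, as described above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)
print("test accuracy:", accuracy_score(y_test, rf.predict(X_test)))

# Feature importances surface the top indicators
print(sorted(zip(rf.feature_importances_, X.columns), reverse=True)[:3])
</code></pre>
<br>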
<b>PHASE TWO: SPOT A BAD BOT BY BEHAVIOR AND TWEET SEMANTICS</b><br>
We used the Natural Language Toolkit (NLTK) to build a keyword dictionary and keyword frequencies for 216,173 tweets, calculating every term's frequency for each group of malicious bots. With a 0.05% threshold on the term frequency rate, terms above the threshold were selected to create detection dictionaries for fake followers, scam bots, and spam bots. We then used Python and Excel to calculate the keywords' TF-IDF in each tweet against the three dictionaries, normalized the TF-IDF of each tweet so that every tweet carried the same weight, and calculated the average normalized TF-IDF for each account. We visualized the term frequencies with word clouds to check that the results were consistent with the TF-IDF calculation.<br><br>
<br>
In short, fake followers have the lowest TF-IDF scores across all three dictionaries, with a slightly better match in the fake-follower dictionary due to their small number of tweets. Scam bots have high TF-IDF across all three dictionaries, especially the spam dictionary. Spam bots post tons of tweets, and the result indicates that some words in the spam dictionary have discriminative power to distinguish spam bots from the other types. We added these three average TF-IDF scores as new features in our detection model and built a new model to see whether they would improve our overall accuracy.<br><br>
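To make the dictionary scoring concrete, here is a small hypothetical sketch of the per-dictionary TF-IDF feature (toy tweets and a toy spam dictionary stand in for the real data):<br>
<pre><code># Hypothetical sketch: average normalized TF-IDF of one account's tweets
# against one detection dictionary
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = ["free followers click here", "win cash now", "good morning world"]  # toy tweets
spam_dictionary = ["free", "win", "cash", "click"]                            # toy dictionary

# Restrict the TF-IDF vocabulary to the detection dictionary
tfidf = TfidfVectorizer(vocabulary=spam_dictionary).fit_transform(tweets).toarray()

# Normalize per tweet so every tweet has the same weight, then average per account
row_sums = tfidf.sum(axis=1, keepdims=True)
normalized = np.divide(tfidf, row_sums, out=np.zeros_like(tfidf), where=row_sums > 0)
print("account spam score:", normalized.mean())
</code></pre>
<br>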
<b> Spam bot's Tweets WordCloud </b><br><br>
<img src="img/type1.png" class="model-text" alt="Tweeter Projects">
<br><br>
<b> Scam bot's Tweets WordCloud</b><br><br>
<img src="img/type2.png" class="model-text" alt="Tweeter Projects">
<br><br>
The Random Forest classification used 12 inputs as predictors to retrain on 1,405 records, again partitioned into a 70% training set and a 30% testing set. With the three new inputs (the average TF-IDF scores against the fake-follower, scam-bot, and spam-bot dictionaries), the model produced a new list of important indicators; the top three were average retweet frequency, number of followers, and the average spam-bot TF-IDF score. According to this result, the TF-IDF score plays a significant role in classifying types of malicious bots. The new model classified malicious bots correctly 91.68% of the time on the testing set.<br><br>
<img src="img/P2indicator.png" class="model-text" alt="Tweeter Projects">
<br>
<img src="img/P2indicator2.png" class="model-text" alt="Tweeter Projects">
<br><br>
<b>MODEL EVALUATION</b><br>
The model in phase two performed slightly better than the model in phase one, so the TF-IDF features clearly contributed to the improvement.<br><br>
<img src="img/tweetcompare12.png" class="model-text" alt="Tweeter Projects">
<br><br>
<b>WRAPPING UP</b><br>
You made it here! To recap, we combined supervised machine learning and natural language processing to detect malicious bots on Twitter.<br><br>
Overall, fake followers are inactive accounts whose best dictionary match is the fake-follower dictionary. Scam bots are also inactive, but they retweet frequently and carry the highest TF-IDF scores overall. Spam bots are active accounts with high TF-IDF scores in the spam dictionary. <br><br>
Thanks for reading! <br><br>
<b> SPOT A BOT THROUGH VISUALIZATION </b><br>
If you want to understand how different bots behave on Twitter, check this Viz out! <br><br>
<div class='tableauPlaceholder' id='viz1585549357967' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/Tw/TwitterMaliciousBotDetection/Dashboard1/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='TwitterMaliciousBotDetection/Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Tw/TwitterMaliciousBotDetection/Dashboard1/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1585549357967'); var vizElement = divElement.getElementsByTagName('object')[0]; if ( divElement.offsetWidth > 800 ) { vizElement.style.width='1024px';vizElement.style.height='795px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='1024px';vizElement.style.height='795px';} else { vizElement.style.width='100%';vizElement.style.height='1477px';} var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
<b>REFERENCES</b><br>
[1] Stefano Cresci et al. The paradigm-shift of social spambots. arXiv:1701.03017v1 [cs.SI] 11 Jan 2017. Available at: https://arxiv.org/abs/1701.03017<br>
[2] Cresci-2017 and Pronbots-2019: https://botometer.iuni.iu.edu/bot-repository/datasets.html. Accessed 2019-11-26.<br>
[3] Lulwah Ahmad AlKulaib. Twitter Bots Multiclass Classification Using Bot-Like Behavior Features. B.S. in Computer Science, Gulf University for Science and Technology, June 2011.<br>
[4] Efthimion, Phillip George; Payne, Scott; and Proferes, Nicholas (2018) "Supervised Machine Learning Bot Detection Techniques to Identify Social Twitter Bots," SMU Data Science Review: Vol. 1: No. 2, Article 5. Available at: https://scholar.smu.edu/datasciencereview/vol1/iss2/5.
<br><br>
<iframe width="100%" height="500px" src="file/twitterbot.pdf#toolbar=0"></iframe><br><br>
</p>
</div>
<button type="button" class="btn btn-default" data-dismiss="modal"><i class="fa fa-times"></i> Close</button>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Song Project -->
<div class="portfolio-modal modal fade" id="portfolioModal2" tabindex="-1" role="dialog" aria-hidden="true">
<div class="modal-content">
<div class="close-modal" data-dismiss="modal">
<div class="lr">
<div class="rl">
</div>
</div>
</div>
<div class="container">
<div class="row">
<div class="col-lg-8 col-lg-offset-2">
<div class="modal-body">
<h2>A Song Recommender System on Million Song Dataset</h2>
<hr class="star-primary">
<div class="modal-text">
<p>
Recommendation systems are standard and popular all over the web. Companies like Netflix, Spotify, Apple, Pandora, and LinkedIn leverage recommender systems to help users find new items quickly, creating a delightful user experience while driving incremental revenue.
<br><br>
<i>Available on <a href="https://github.com/bigforehead/Big-Data-Million-Song-Music-Recommendation-System">Github</a>.</i><br><br>
<b>BIG PICTURE</b><br>
Spotify's Discover Weekly is one of its most popular features: a playlist generated each week from the user's listening habits. Inspired by it, three classmates and I geeked out about recommendation algorithms and decided to build three music recommendation systems for our Big Data capstone project.<br><br>
We employed three algorithms (a popularity benchmark, ALS, and lyric- and artist-based similarity filtering) to generate three different playlists: a Hot Songs playlist, a Personalized playlist, and a New User playlist. The ALS algorithm served the personalized recommendations, while the lyric- and artist-based similarity algorithm addressed the cold-start problem for new users.<br><br>
The whole system was built with Python and PySpark on Google Cloud Platform and AWS.
<br><br>
<b>ABOUT THE DATA</b><br>
The Million Song Dataset, a collaborative project between The Echo Nest and LabROSA, is a freely available collection of audio features and metadata for a million contemporary popular music tracks.<br><br>
The links are available here: <br>
<a href="http://millionsongdataset.com/tasteprofile/">Taste Profile</a>, <a href="http://millionsongdataset.com/musixmatch/">The musiXmatch dataset</a>, <a href="http://millionsongdataset.com/pages/getting-dataset/#subset ">The Million Song subset</a>
<br><br>
<b>APPROACH</b><br>
<img src="img/song_baselinemodel.png" class="model-text" alt="Song Projects">
<br><br>
<b>ALGORITHM DEVELOPMENT</b><br>
<b>1. Popularity Benchmark</b><br>
Popularity is the benchmark for recommending songs because it beats random guessing. The data were filtered by total frequency based on listening history, and the top 10 hottest tracks were selected. This serves as the fallback when there is no data at all for a new user (the cold-start problem). A minimal sketch follows.
<br><br>
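Here is a short PySpark sketch of this benchmark; the file name and schema are assumptions:<br>
<pre><code># Hypothetical sketch: top 10 most-played tracks as the cold-start fallback
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("popularity-benchmark").getOrCreate()
plays = spark.read.csv("triplets.csv", header=True, inferSchema=True)  # user_id, song_id, play_count

top10 = (plays.groupBy("song_id")
              .agg(F.sum("play_count").alias("total_plays"))
              .orderBy(F.desc("total_plays"))
              .limit(10))
top10.show()
</code></pre>
<br>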
<b>2. Collaborative Filtering</b><br>
Collaborative filtering is probably the most widely used approach to recommendation, or at least a necessary component of more advanced recommendation architectures. Surprisingly, the algorithm requires very simple data: nothing more than the users' historical preferences on a set of items. The main assumption is that users (listeners, in our case) who have agreed in the past are likely to agree in the future: if user A liked the same items as user B, A is likely to share B's taste for B's other favorites.<br><br>
In terms of ratings, the algorithm requires some metric of user preference, either explicit data such as 5-star ratings or thumbs-up, or an implicitly derived score such as clicks, purchases, or other data recorded in cookies. In this project we took the traditional route of using listening frequency as the rating: lacking explicit ratings, we assume a higher listening frequency equates to a higher rating.<br><br>
For ALS, we constructed a matrix of the frequency with which each song was played by each user. This matrix is, of course, sparse, with tons of missing values, since each user has listened to only a limited set of songs. The high-level idea is to approximate the matrix by factorizing it into the product of two matrices: a user matrix that represents each user and an item matrix that describes the properties of each track.<br><br>
<img src="img/ALS_matrix.png" class="model-text" alt="Song Projects"><br>
For a reasonable result, the two matrices are chosen so that the error on the known user-song pairs is minimized, where error means root mean squared error (RMSE). At a more technical level, ALS first fills the user matrix with random values and optimizes the song matrix to minimize the error. Then it "alternates": it holds the song matrix fixed and optimizes the values of the user matrix. Minimizing the two loss functions alternately should reach an optimum. <br><br>
During implementation, a Spark ML pipeline was used with the following settings (a sketch follows the list): <br>
- A large grid of parameters was defined, and grid search with cross-validation over that space was executed.<br>
- The most important parameters to tune were maxIter (the maximum number of iterations), rank (the number of latent factors, which determines the shape of the matrices), and regParam (the regularization parameter).
<br><br>
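Here is a sketch of those settings with the Spark ML API; the grid values are illustrative, and coldStartStrategy="drop" (added after Spark 2.0) stands in for the customized NaN handling described in the evaluation section:<br>
<pre><code># Hypothetical sketch: ALS with grid search and cross-validation in Spark ML
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

als = ALS(userCol="user_idx", itemCol="song_idx", ratingCol="play_count",
          implicitPrefs=True,         # listening counts are implicit feedback
          coldStartStrategy="drop")   # drop NaN predictions for unseen users

grid = (ParamGridBuilder()
        .addGrid(als.maxIter, [10, 20])
        .addGrid(als.rank, [10, 50, 100])
        .addGrid(als.regParam, [0.01, 0.1, 1.0])
        .build())

evaluator = RegressionEvaluator(metricName="rmse", labelCol="play_count",
                                predictionCol="prediction")
cv = CrossValidator(estimator=als, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
# model = cv.fit(train)   # 'train' is the 60% split described in the evaluation section
</code></pre>
<br>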
<b>3. Content-Based Filtering</b><br>
Content-based filtering generates song playlists from the cosine similarity computed over three lyrical feature sets: (1) TF-IDF, (2) Word2Vec, and (3) a topic model with Latent Dirichlet Allocation (LDA). <br><br>
In this case, the algorithm is designed for new users searching for a specific song. The user enters a favorite song or artist, the system matches the song's index against the cosine similarity matrix, and it returns the 10 songs with the most similar content. <br><br>
<img src="img/content_base_algorithm.png" class="model-text" alt="Song Projects"><br>
TF-IDF for lyrics: TF-IDF stands for "term frequency, inverse document frequency". We vectorized the lyrics and used TF-IDF to quantify each word in the song lyrics, computing a weight for each word that signifies its importance within the lyrics. Because the actual lyrics are protected by copyright and the Million Song Dataset did not have permission to redistribute them, the lyrics come as bag-of-words counts rather than original text, which limited this approach. <br><br>
Word2Vec for lyrics: Word2Vec is a two-layer neural net that processes text by "vectorizing" words, turning text into numerical form. It groups vectors of similar words together in vector space, detecting similarities mathematically. Because of the data volume, calculating cosine similarity on the high-dimensional sparse TF-IDF matrix was time-consuming and costly, so Word2Vec was used as the second method. However, the Word2Vec model was less accurate because the bag-of-words lyrics provide no context.<br><br>
LDA for lyrics: Topic modeling is an unsupervised approach for identifying the topics present in a text object and deriving hidden patterns from a text corpus. Latent Dirichlet Allocation (LDA) is a topic model that builds a topic-per-document model and a words-per-topic model. The big idea behind LDA is that each document can be described by a distribution of topics, and each topic by a distribution of words. We assumed there are common topics across all songs and limited the number of topics to save time and memory; LDA was used as the third method. Because LDA ignores syntactic information and treats documents as bags of words, the unigram format of the dataset did not matter much.<br><br>
Artist similarity: This dataset is also provided by the Million Song Dataset. We assumed that similar audiences have the same taste in artists. <br><br>
After engineering the new features and calculating the cosine similarity between them, we defined two key functions for our recommendation system: one returns the top 10 songs with the highest cosine similarity; the other finds the top 10 songs based on both artist similarity and cosine similarity.<br><br>
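A hypothetical sketch of those two lookup functions; the similarity matrix, titles, and artist tables are assumed inputs:<br>
<pre><code># Hypothetical sketch: the two top-10 lookups of the content-based recommender
import numpy as np

def top10_by_lyrics(song_idx, sim_matrix, titles):
    """Return the 10 songs whose lyric vectors are closest to the query song."""
    order = np.argsort(sim_matrix[song_idx])[::-1]   # descending cosine similarity
    order = order[order != song_idx][:10]            # drop the query song itself
    return [titles[i] for i in order]

def top10_by_artist_and_lyrics(song_idx, sim_matrix, titles, artist_of, similar_artists):
    """Keep only songs by artists similar to the query's artist, then rank by lyrics."""
    allowed = similar_artists[artist_of[song_idx]]
    masked = np.where(np.isin(artist_of, list(allowed)), sim_matrix[song_idx], -1.0)
    order = np.argsort(masked)[::-1]
    order = order[order != song_idx][:10]
    return [titles[i] for i in order]
</code></pre>
<br>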
<b>RESULT AND EVALUATION</b><br>
<b>1. Collaborative Filtering</b><br>
Here is the benchmark: our top 10 most-listened songs. <br>
<img src="img/CL_result.png" class="model-text" alt="Song Projects"><br>
The data were split into three parts: 60% train, 20% validation, and 20% test. Grid search with cross-validation on the validation set determined the best parameters of the recommendation system, and recommendations were generated for the users in the test set only once to avoid information leakage. The process was implemented from scratch and locked into a pipeline, all to avoid overfitting.<br><br>
As of Spark 2.0, when asked to provide ratings for users it has never seen, ALS can only yield NaN values, which made it impossible to use Spark ML's CrossValidator to check the RMSE directly. The algorithm was therefore set to drop NaN values by default, with a customized cross-validation process, before evaluating with RMSE.<br><br>
The following configuration steps were executed: (1) set appropriate parameters for users, items and rating; (2) fit ALS and transform the table to generate the prediction column; (3) run predictions against the validation set and check the error; (4) finalize the model with the best RMSE score. <br><br>
Exploratory analysis showed that more than half of the songs in the sample were listened to only once. Since songs played more than once should better represent users' tastes, the ALS was run on two versions of the dataset.<br><br>
The comparison between the model trained on all data and the model trained on frequency >= 2 data is shown below: <br><br>
<img src="img/CL_result_2.png" class="model-text" alt="Song Projects"><br><br>
Besides cross-validation, another common measurement is to compare the model against fake test data in which every rating is the average frequency from the training data. While the model excluding one-time listens returns a slightly higher RMSE on the test dataset, it behaves better against this average-frequency baseline, which may hint that its recommendations make more sense. The highlighted tracks, being the overlap between the two ALS models, are likely of the highest recommendation quality.<br>
<br>
<b>2. Content-Based Filtering</b><br>
Let's simulate a new user searching for a song and look at the recommendation results.<br><br>
<img src="img/CB_simulate.png" class="model-text" alt="Song Projects"><br><br>
The results showed four songs in common between the TF-IDF and Word2Vec methods, two between TF-IDF and LDA, two between Word2Vec and LDA, and one song shared by all three. Therefore, the final recommendation related to the Box Tops' "Soul Deep" was the Four Tops' "You Keep Running Away".<br><br>
<img src="img/CB_result.png" width="800" height="633" class="model-text" alt="Song Projects"><br><br>
<b>BEYOND THE LYRICS: THE CROSSROAD OF MUSIC AND DATA VISUALIZATION</b><br>
Always amazed by the power of data visualization, I wanted to bring the hidden trends in the meaning of songs to light. In addition to building the music recommendation systems, I used natural language processing, word clouds, and seaborn to investigate the popularity of lyrics from the 1950s to the 2000s, as well as the topics in lyrics.<br><br>
<b>1. Topic Modeling Visualization</b><br>
When modeling lyrical topics with LDA, the lyrics were not stemmed and the model was set to 20 topics; the original word forms were kept for more comprehensive analysis. I removed stop words, generated a token dictionary, and built a corpus, then fed the corpus and dictionary into the 20-topic LDA model. A sketch of the pipeline follows the figure. <br><br>
<img src="img/LDA_lyrics.png" class="model-text" alt="Song Projects"><br>
Clusters 5, 10, 18, and 16, as well as clusters 20, 19, and 14, were separated from the other clusters along different dimensions due to differences in language.<br><br>
Clusters 11, 17, 12, and 7 contained lyrics on violent or religious topics. For instance, cluster 11 represented a religious topic, with the most relevant words being "world", "life", "live", "god", "us", "heaven", "Jesus", "soul", "angel", and many more. The topic in cluster 12, by contrast, seemed related to violence, or rap, since the lyrics contained very negative words such as the "f" word, "kill", "dead", "hate", "hell", "gun", "shit", "bitch", "war", "sick", "shot", and "murder".<br><br>
The first dimension (upper right) included topics with a positive vibe, such as love or partying. For instance, cluster 15 seemed to have a "dance party" theme, dominated by "oh", "ooh", "ah", "yeah", "shake", "mama", "yes", "babi", "ohh", and so on.<br><br>
Multiple clusters with "love" topics blended together, intuitively because love brings out many emotions. For example, cluster 8 seemed to represent "happy love", with "love", "want", "need", "feel", "like", "kiss", "true", "touch", "give", and so on, while other clusters represented sadness.<br><br>
In general, the LDA model defined a good set of 20 topics, each reasonably well defined by a distinct theme such as religion, dance party, dark drama, happy love, or sad love.
<br><br>
<b>2. Lyrics Analysis Through The Time</b><br>
Lyrics changed in notable ways from the 1950s to the 2000s:<br><br>
"Love" was the most used word in songs from the 1950s through the 1990s; the topic of love seemed to be music's greatest inspiration over that span. In the 2000s, "know" became the most used word, followed by "love".<br><br>
The party vibe appeared more often in the 1990s and 2000s, when words such as "got", "na", "like", "oh", "la", "come", "let", "feel", "make", and "yeah" became very popular in lyrics.<br><br>
Songs of the 1950s and 1960s sounded more conservative yet were filled with a romantic vibe; the popular words were "girl", "time", "night", "make", "say", "day", "want", "heart", "man", and "said".<br><br>
The word "babi" was among the top three popular words of the 1950s; from the 1960s to the 2000s, however, it was displaced by either "like" or "got".<br><br>
Lyrics in the 2000s: <br>
<img src="img/lyrics_2000s.png" class="model-text" alt="Song Projects"><br>
Lyrics in the 1990s: <br>
<img src="img/lyrics_1990s.png" class="model-text" alt="Song Projects"><br>
Lyrics in the 1980s: <br>
<img src="img/lyrics_1980s.png" class="model-text" alt="Song Projects"><br>
Lyrics in the 1970s: <br>
<img src="img/lyrics_1970s.png" class="model-text" alt="Song Projects"><br>
Lyrics in the 1960s: <br>
<img src="img/lyrics_1960s.png" class="model-text" alt="Song Projects"><br>
Lyrics in the 1950s: <br>
<img src="img/lyrics_1950s.png" class="model-text" alt="Song Projects"><br>
Comparing the most popular words in the lyrics of the 60s and the 2000s, the 60s lyrics tended to carry a more subtle vibe. The 60s lyrics featured "babi", "know", "love", "yeah", "oh", and "got", putting love and the lover first, while the 2000s lyrics featured "love", "la", "know", "got", "de", and "come", portraying a bustling vibe that prioritized love and partying at the same time. <br>
Word Cloud for Lyrics in the 1960s: <br>
<img src="img/WC_1960s.png" class="model-text" alt="Song Projects"><br>
Word Cloud for Lyrics in the 2000s: <br>
<img src="img/WC_2000s.png" class="model-text" alt="Song Projects"><br>
<b>CONCLUSION</b><br>
In this project, three recommendation systems were built on a dataset of 1,019,318 unique users and 384,546 unique songs. The ALS algorithm, which combines user and item knowledge, powered the collaborative filtering recommender; it applies to existing users with enough listening history, generating personalized recommendations. Several song features (artist similarity, plus TF-IDF, Word2Vec, and LDA modeling of the lyrics) were combined to build the content-based recommender, which serves new users with only one or a few searches and recommends songs similar to the current one. With over a million unique users, the dataset approaches an industry-level monthly magnitude, yet plenty of obstacles emerged during implementation.<br><br>
There are many lessons to be learned when working with the cloud and PySpark. First, Spark depends heavily on memory: during the testing phase, our virtual instance with 30 GB of memory crashed from time to time, so choosing a high-memory instance on Google Cloud would save some cost. To run cross-validation on the 3 GB of listening history, the configuration below proved safe, and the Unix command "free -m" is very helpful for checking available memory. In addition, we learned to cache datasets whenever they are likely to be used more than once: <br><br>
<img src="img/lesson_learn_1.png" class="model-text" alt="Song Projects"><br>
<img src="img/lesson_learn_2.png" class="model-text" alt="Song Projects"><br>
<img src="img/lesson_learn_3.png" class="model-text" alt="Song Projects"><br>
However, overcoming memory issues on the cloud alone was not sufficient. Some default Spark settings needed changing so that Spark could use more executor memory and driver memory. <br><br>
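A hypothetical sketch of both memory lessons: raising the memory settings (driver memory is often set at spark-submit time instead) and caching a dataset that is reused across stages:<br>
<pre><code># Hypothetical sketch: memory settings plus caching a reused DataFrame
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("song-recommender")
         .config("spark.executor.memory", "16g")   # illustrative values; tune to the instance
         .config("spark.driver.memory", "16g")
         .getOrCreate())

plays = spark.read.csv("triplets.csv", header=True, inferSchema=True)  # assumed file
plays.cache()    # keep in memory: reused by the train/validation/test splits
plays.count()    # an action forces the cache to materialize
</code></pre>
<br>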
In addition, debugging Spark can be quite painful, as its error messages are not informative enough. A common mistake is a data type that is not fixed, or that switches during processing. Because of the Java development environment, Spark's methods are very strict about input data types, so casting data types should be a regular task. For example, the ALS function in Spark only accepts integer inputs and returns an error even for big integers. In the same vein, the raw IDs from the original data frame are no longer usable, so new integer ID pairs have to be generated. <br><br>
An example of changing the data type to int:<br><br>
<img src="img/lesson_learn_4.png" class="model-text" alt="Song Projects"><br>
An example of changing an ID's data type from bigint to int:<br><br>
<img src="img/lesson_learn_5.png" class="model-text" alt="Song Projects"><br>
Currently, there are still obstacles on the way to a final hybrid recommendation system; at this stage, the main difficulty is the lack of valid lyrics data. In the future, combining the collaborative filtering and content-based systems could yield hybrid recommenders. By drawing on the complementary advantages of the various algorithms, a hybrid solution, such as using their overlaps, would give users higher-quality suggestions. For example, a hybrid system based on both lyrics and listening history could ensure users like not only the explicit topics of the tracks but also the genre, from their own taste perspective.<br><br>
As a matter of fact, more dimensions of recommendation are always better. Google famously combined plenty of recommendation strategies in its Wide &amp; Deep recommendation system, with neural networks and ensemble methods. Though it is difficult to bring this project to that level, the practice undoubtedly laid a solid foundation for future industrial work involving recommendation systems or distributed systems.<br><br>
Besides, there are some new ideas for further exploration: <br><br>
1. User2Vec: <br>
Word2Vec was used in this project for NLP on song lyrics. What if songs are treated as words and users as documents, with a user's listening history as the document's text? Then Word2Vec and Doc2Vec could be applied to find similar songs and users.<br><br>
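A hypothetical gensim sketch of the idea, with toy listening histories as the "documents":<br>
<pre><code># Hypothetical sketch: "User2Vec" over listening histories
from gensim.models import Word2Vec

histories = [["song_a", "song_b", "song_c"],   # one user's listening history
             ["song_b", "song_c", "song_d"]]   # toy data

model = Word2Vec(sentences=histories, vector_size=64, window=5, min_count=1, sg=1)
print(model.wv.most_similar("song_b", topn=3))  # songs that co-occur in similar histories
</code></pre>
<br>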
2. Graph algorithm: <br>
Graph databases are another new trend in recommendation systems for online shopping. By modeling songs, artists, and users in a graph database, graph algorithms can find the relationships among these entities and reason about songs, artists, and users at the same time. <br><br>
3. Content-based filtering using music audio:<br>
In this project, as in most current business scenarios, content-based filtering in the music industry means lyric-based or text-based. However, machine learning and deep learning also show great results and steady improvements in audio processing, so analyzing the music audio directly could be a new direction for content-based filtering.
<br><br>
<b>REFERENCE</b><br>
1. A Beginner's Guide to Word2Vec and Neural Word Embeddings. (n.d.). Retrieved from https://pathmind.com/wiki/word2vec<br>
2. Content-based Filtering. (2012, January 24). Retrieved from http://recommender-systems.org/content-based-filtering/<br>
3. Karantyagi. (n.d.). karantyagi/Restaurant-Recommendations-with-Yelp. Retrieved from https://github.com/karantyagi/Restaurant-Recommendations-with-Yelp<br>
4. Li, S. (2018, June 1). Topic Modeling and Latent Dirichlet Allocation (LDA) in Python. Retrieved from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24<br>
5. MODELING. (n.d.). Retrieved from https://xindizhao19931.wixsite.com/spotify2/modeling <br>
6. Welcome! (n.d.). Retrieved from http://millionsongdataset.com/
<br><br>
<iframe width="100%" height="500px" src="file/recommendation.pdf#toolbar=0"></iframe><br><br>
</p>
</div>
<button type="button" class="btn btn-default" data-dismiss="modal"><i class="fa fa-times"></i> Close</button>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- March Madness Project -->
<div class="portfolio-modal modal fade" id="portfolioModal3" tabindex="-1" role="dialog" aria-hidden="true">
<div class="modal-content">
<div class="close-modal" data-dismiss="modal">
<div class="lr">
<div class="rl">
</div>
</div>
</div>
<div class="container">
<div class="row">
<div class="col-lg-8 col-lg-offset-2">
<div class="modal-body">
<h2>2020 Fordham NCAA March Madness Data Crunch Competition</h2>
<hr class="star-primary">
<div class="modal-text">
<p>
Every March, millions of basketball fans, celebrities, data scientists, and even presidents tune in to watch and predict the champion of the NCAA Division I Men's Basketball Tournament. Excited by the long odds of filling out a perfect bracket, I joined the annual March Data Crunch Madness competition hosted by Fordham University and Deloitte. Even though the NCAA cancelled the 2020 tournament because of COVID-19, the other participants and I decided to try our hand at modeling the tournament anyway and keep the Fordham tradition going.
<br><br>
<i>Available on <a href="https://github.com/bigforehead/NCAA-March-Madness-Bracket-Prediction-2020">Github</a>.</i><br><br>
<b>BIG PICTURE</b><br>
The goal of the competition is to use past tournament data (2002 to 2019) to build and test predictive models that forecast the outcomes of the Final Four in the 2020 NCAA Division I Men's Basketball Championship. Outcomes are expressed as probabilities: for instance, Team 1 has a 70% likelihood of beating Team 2. The models are evaluated by log loss; if you are not familiar with it, just know that the best model has the lowest log loss (a quick illustration follows the formula). <br>
<img src="img/logloss.png" class="model-text" alt="MarchMadness"><br>
<br><br>
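A quick illustration of why the lowest log loss wins, using scikit-learn's implementation:<br>
<pre><code># Log loss punishes confident wrong predictions the hardest
from sklearn.metrics import log_loss

y_true = [1, 0, 1, 1]                     # 1 = Team 1 won
calibrated = [0.9, 0.1, 0.8, 0.7]         # well-calibrated probabilities
overconfident = [0.99, 0.01, 0.99, 0.01]  # one confident miss on the last game

print(log_loss(y_true, calibrated))       # lower is better
print(log_loss(y_true, overconfident))    # blows up on the 0.01 for a game Team 1 won
</code></pre>
<br>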
<b>APPROACH</b><br>
Coming into this competition, I researched similar projects and came up with a general plan. Many of them used logistic regression as their primary algorithm because it is probabilistic and simple to implement, yet effective. With that prior knowledge, I took a similar approach while making a few interesting changes with novel features. <br>
<br>So, what is new about my approach? <br><br>
<img src="img/mm_newfeatures.png" class="model-text" alt="MarchMadness"><br><br>
Instead of analyzing each team's stats in isolation, I engineered new features by transforming every variable into the Difference and the Ratio (or Quotient) between the two teams in each of the 63 games.<br>
<br>
Then I calculated a Winning Rate and a Teamwork Score for each team and coach. The level of teamwork is reflected by the assist share of scoring (weighted 80%) and defensive efficiency (weighted 20%): a team is more stable if it relies on assists and strong defense to win games. A sketch of these features follows.<br>
<br>
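A minimal sketch of these engineered features, with hypothetical column names:<br>
<pre><code># Hypothetical sketch: Difference/Ratio matchup features and the Teamwork Score
import pandas as pd

def matchup_features(team1: pd.Series, team2: pd.Series, stats: list) -> dict:
    """Turn per-team stats into Difference and Ratio features for one game."""
    feats = {}
    for s in stats:
        feats[f"{s}_diff"] = team1[s] - team2[s]
        feats[f"{s}_ratio"] = team1[s] / team2[s]   # assumes non-zero denominators
    return feats

def teamwork_score(assist_share: float, defensive_efficiency: float) -> float:
    """80% assist share of scoring, 20% defensive efficiency."""
    return 0.8 * assist_share + 0.2 * defensive_efficiency
</code></pre>
<br>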
To select the most important features, I removed highly correlated variables (Pearson correlation > 0.9) and applied an embedded method with Random Forests, sketched below. Embedded methods offer higher accuracy, better generalization, and interpretability (via feature importance), and they work well on high-dimensional datasets.<br>
<br>
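A hypothetical sketch of both steps, the correlation filter and the embedded Random Forest selector:<br>
<pre><code># Hypothetical sketch: drop one of each highly correlated pair, then keep
# the features an embedded Random Forest ranks as important
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

def drop_correlated(X: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > threshold).any()]
    return X.drop(columns=to_drop)

# X, y assumed: the engineered matchup features and game outcomes
# X = drop_correlated(X)
# selector = SelectFromModel(RandomForestClassifier(n_estimators=500, random_state=42)).fit(X, y)
# X = X.loc[:, selector.get_support()]
</code></pre>
<br>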
After hyperparameter tuning the Random Forest classifier with grid search, I trained Logistic Regression, Gradient Boosting, a Support Vector Classifier, a Random Forest classifier, and Linear Discriminant Analysis (LDA), and selected the model with the lowest log loss (see the sketch below).<br>
<br><br>
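A hypothetical sketch of the model bake-off (synthetic data stands in for the tournament features):<br>
<pre><code># Hypothetical sketch: fit the five candidates and pick the lowest log loss
from sklearn.datasets import make_classification
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, n_features=20, random_state=42)  # stand-in data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

models = {
    "logistic regression": LogisticRegression(max_iter=1000),
    "gradient boosting": GradientBoostingClassifier(),
    "svc": SVC(probability=True),   # probability=True enables predict_proba
    "random forest": RandomForestClassifier(n_estimators=500, random_state=42),
    "lda": LinearDiscriminantAnalysis(),
}
for name, model in models.items():
    proba = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    print(name, round(log_loss(y_test, proba), 4))
</code></pre>
<br>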
<b>MODEL SELECTION</b><br>
The Random Forest classifier was selected as the best model for its top accuracy and lowest log loss, which was enough for the competition. The ideal scenario would have been applying the model to 2020 data; instead, I trained the models on 2002-2017 data, tested them on 2018 data, and then applied the best model to the 2019 data to generate predictions and probabilities.<br>
<img src="img/mm_modelresult.png" class="model-text" alt="MarchMadness"><br>
<br><br>
<b>PREDICTION RESULT</b><br>
The best model, the Random Forest classifier, achieved 77.61% accuracy with a log loss of 0.51. <br>
So which teams made the Final Four, and who is the champion? <br>
<img src="img/finalfour.png" class="model-text" alt="MarchMadness">
<br><br>
<b>WRAPPING UP</b><br><br>
I am happy to say that completing this competition made me a more competent data scientist, especially under such unprecedented circumstances. I am excited and hopeful that the 2021 tournament will not be cancelled, so I can make some fun bets with my friends using this model. Thanks for reading, and if you want to see the code, please visit my GitHub.<br>
<br>
<b>MARCH MADNESS VISUALIZATION</b><br>
If you are interested, here are some fun visualizations about March Madness. See if you want to place a bet on Virginia!
<div class='tableauPlaceholder' id='viz1611035060565' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/MS/MSHZN3GXB/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='path' value='shared/MSHZN3GXB' /> <param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/MS/MSHZN3GXB/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /><param name='filter' value='publish=yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1611035060565'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='800px';vizElement.style.height='827px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
</p>
</div>
<button type="button" class="btn btn-default" data-dismiss="modal"><i class="fa fa-times"></i> Close</button>
</div>
</div>
</div>
</div>
</div>
</div>
<!--Resume Modal-->
<div class="portfolio-modal modal fade" id="resumeModal" tabindex="-1" role="dialog" aria-hidden="true">
<div class="modal-content">
<div class="close-modal" data-dismiss="modal">
<div class="lr">
<div class="rl">
</div>
</div>
</div>
<div class="container">
<div class="row">
<div class="col-lg-8 col-lg-offset-2">
<div class="modal-body">
<div class="row">
<iframe width="100%" height="1000px" src="file/Resume.pdf"></iframe>
</div>
<button type="button" class="btn btn-default" data-dismiss="modal"><i class="fa fa-times"></i> Close</button>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- jQuery -->
<script src="vendor/jquery/jquery.min.js"></script>
<!-- Bootstrap Core JavaScript -->
<script src="vendor/bootstrap/js/bootstrap.min.js"></script>
<!-- Plugin JavaScript -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery-easing/1.3/jquery.easing.min.js"></script>
<!-- Contact Form JavaScript -->
<script src="js/jqBootstrapValidation.js"></script>
<script src="js/contact_me.js"></script>
<!-- Theme JavaScript -->
<script src="js/freelancer.js"></script>
</body></html>