-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
datawarehouse-datalake.html
785 lines (727 loc) · 59.7 KB
/
datawarehouse-datalake.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Portfolio Details</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
========================================================-->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<!-- <li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li> -->
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2></h2>
<ol>
<li><a href="content-page.html" class="clickable-box">Content section</a></li>
<li><a href="index.html" class="clickable-box">Home</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> QL</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquerry</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>Datawarehouse and data lake Architecture</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<figure style="text-align: center;">
<img src="assets/img/portfolio/datawarehouse-datalake1.png" alt="" style="max-width: 100%; max-height: 100%;">
<figcaption style="text-align: center;"><strong>Image credit:</strong><a href="https://www.qubole.com/data-lakes-vs-data-warehouses-the-co-existence-argument" target="_blank"> Qubole</a></figcaption>
</figure>
</div>
</div>
</div>
</div>
<!--------Data warehouse start here -->
<h1>Datawarehouse</h1>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>A data warehouse, or enterprise data warehouse (EDW), is a centralized and integrated repository of data that aggregates data from different sources into a single, central, consistent data store to support data analysis, data mining, artificial intelligence (AI), and machine learning.</li>
<li>It serves as a structured storage system for collecting and managing large volumes of data from various sources within an organization.</li>
<li>A data warehouse system enables an organization to run powerful analytics on huge volumes (petabytes) of historical data in ways that a standard database cannot.</li>
<li>A data warehouse is specially designed for data analytics, which involves reading large amounts of data to understand relationships and trends across the data. A database is used to capture and store data, such as recording details of a transaction.</li>
<div class="box">
<strong>Note: </strong>Unlike a data warehouse, a data lake is a centralized repository for all data, including structured, semi-structured, and unstructured. A data warehouse requires that the data be organized in a tabular format, which is where the schema comes into play.
The tabular format is needed so that SQL can be used to query the data. But not all applications require data to be in tabular format. Some applications, like big data analytics, full text search, and machine learning, can access data even if it is ‘semi-structured’ or completely unstructured.
</div>
<figure>
<img src="assets/img/portfolio/ware-house-struc.png" alt="" style="max-width: 100%; max-height: 100%;">
<figcaption style="text-align: center;">For OLTP and OLAP see below</figcaption>
</figure>
</ul>
<h4>Building Blocks of Data Warehouse</h4>
<p>The central and most fundamental building block of a data warehouse is the Data Warehouse itself. It serves as the primary repository for storing and managing data from various source systems. The data warehouse is specifically designed for efficient querying and analysis, making it the core component that enables organizations to consolidate and leverage their data for reporting, business intelligence, and analytics purposes.</p>
<p>Data warehousing involves several building blocks that work together to create an effective data management and analytics environment. Here are the key building blocks of a data warehouse:</p>
<img src="assets/img/portfolio/warehouse-buildingblock.png" alt="" style="max-width: 100%; max-height: 100%;">
<ol style="margin-left: 30px;">
<li>
<strong>Source Systems:</strong>
These are the systems or data repositories where data is generated or collected. Source systems can include databases, applications, spreadsheets, logs, and more.
</li>
<li>
<strong>Data Extraction (ETL - Extract, Transform, Load):</strong>
ETL processes extract data from source systems, transform it into a suitable format, and load it into the data warehouse. This step involves data cleansing, validation, and aggregation.
</li>
<li>
<strong>Staging Area:</strong>
The staging area is an intermediate storage location where data is temporarily held before being loaded into the data warehouse. It allows for data validation and reconciliation.
</li>
<li>
<strong>Data Warehouse:</strong>
The central repository where data from various sources is stored for reporting and analysis. It is structured for efficient querying and analysis.
</li>
<li>
<strong>Data Marts:</strong>
Data marts are subsets of the data warehouse designed for specific business units or departments. They store data relevant to particular business areas, making it easier for users to access and analyze the data they need.
</li>
<li>
<strong>Metadata Repository:</strong>
Metadata is data about data. A metadata repository stores information about the structure, source, and meaning of the data in the data warehouse. It helps users understand and locate the data they need.
</li>
<li>
<strong>Business Intelligence (BI) Tools:</strong>
BI tools are used to query, analyze, and visualize data stored in the data warehouse. They provide a user-friendly interface for creating reports, dashboards, and data visualizations.
</li>
</ol>
<p>While the other components, such as ETL processes, data marts, metadata repositories, and business intelligence tools, play crucial roles in the data warehousing environment, the data warehouse stands as the central hub where data is organized and made accessible to users for decision-making and analysis. It is the backbone of the entire data warehousing architecture.</p>
<h5>How does a data warehouse work?</h5>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>A data warehouse may contain multiple databases.</li>
<li>Within each database, data is organized into tables and columns.</li>
<li>Within each column, you can define a description of the data, such as integer, data field, or string.</li>
<li>Tables can be organized inside of schemas, which you can think of as folders.</li>
<li>When data is ingested, it is stored in various tables described by the schema.</li>
<li>Query tools use the schema to determine which data tables to access and analyze.</li>
</ul>
<figure style="text-align: center;">
<img src="assets/img/portfolio/datawarehouse-3.png" alt="" style="max-width: 100%; max-height: 100%;">
<figcaption style="text-align: center;"><strong>Datawarehouse</strong></figcaption>
</figure>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-lake3.png" alt="" style="max-width: 100%; max-height: 100%;">
<figcaption style="text-align: center;"><strong>Data lake</strong></figcaption>
</figure>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-mart-3.png" alt="" style="max-width: 100%; max-height: 100%;">
<figcaption style="text-align: center;"><strong>Data mart</strong></figcaption>
</figure>
<h5>Architecture of the datawarehouse</h5>
<figure style="text-align: center;">
<img src="assets/img/portfolio/datawarehouse-3tier.png" alt="" style="max-width: 90%; max-height: 90%;">
<figcaption style="text-align: center;"><strong>Image reference:</strong><a href="https://panoply.io/data-warehouse-guide/data-warehouse-architecture-traditional-vs-cloud/" target="_blank"> panoply</a> </figcaption>
</figure>
<ul style="list-style-type: disc; margin-left: 30px;">
<p>The three-tier architecture of a data warehouse involves three distinct layers, each serving specific functions in the data processing and reporting pipeline.
Here's an overview of the three-tier structure:</p>
<li><strong>Data Source Layer (Bottom tier):</strong> The bottom tier consists of a data warehouse server, usually a relational database system,
which collects, cleanses, and transforms data from multiple data sources through a process known as Extract, Transform, and Load (ETL) or a
process known as Extract, Load, and Transform (ELT).
<ul>
<li>The first tier, often referred to as the bottom tier, consists of various data sources.</li>
<li>Data is extracted from operational databases, external systems, flat files, or other sources.</li>
<li>Initial processing may occur at this stage, such as data cleansing and extraction.</li>
</ul><br>
</li>
<li><strong>Data Warehouse Layer (Middle tier):</strong>
<ul>
<li>The second tier, known as the middle tier, is the core data warehouse layer.</li>
<li>ETL (Extract, Transform, Load) processes operate in this layer to transform raw data into a format suitable for analysis.</li>
<li>The transformed data is loaded into the data warehouse database, following dimensional modeling principles (star schema or snowflake schema).</li>
<li>The data warehouse stores structured and organized data, including fact tables (containing metrics) and dimension tables (containing descriptive attributes).</li>
<li>This layer often includes a staging area for temporary storage before data undergoes transformations.</li>
</ul>
The middle tier consists of an OLAP (i.e. online analytical processing) server which enables fast query speeds. Three types of OLAP models can be used in this tier, which are known as:
<ul>
<li>ROLAP</li>
<li>MOLAP</li>
<li>HOLAP</li>
</ul>
</li>
<li><strong>Frontend/Client Layer (Top tier): </strong>The top tier is represented by some kind of front-end user interface or reporting tool, which enables end users to conduct
ad-hoc data analysis on their business data and presents results through reporting, analysis, and data mining tools.</li>
<ul>
<li>The third tier, also known as the top tier, is the frontend or client layer.</li>
<li>Business users interact with this layer to access and analyze the data stored in the data warehouse.</li>
<li>Business Intelligence (BI) tools, reporting tools, and analytical applications reside in this layer.</li>
<li>Users can run queries, generate reports, and visualize data to gain insights and make informed decisions.</li>
<li>OLAP (Online Analytical Processing) engines facilitate multidimensional analysis.</li>
</ul>
</ul>
<!---- DATA LAKE IS DISCUSSED BELOW -->
<blockquote style="border-left: 5px solid #f7f2f3; background-color: #31cbdf; padding: 20px; color: #f1f0f0;">
<h2>Datalake</h2></blockquote>
<img src="assets/img/portfolio/Data-lake-1.png" alt="Types of data" style="max-width: 60%; max-height: 60%;">
<p>A data lake is a centralized repository that stores all your structured and unstructured data at any scale. You can store your data as-is, without having to structure the data, and run different types of analytics—from dashboards and visualizations to big data processing, real-time analytics, and machine learning to guide better decisions.</p>
<p>Data lakes are often used in conjunction with data warehouses, but they are not the same thing. A data warehouse is a structured data store that is optimized for querying and reporting on historical data. A data lake, on the other hand, is a scalable and flexible data store that can store any type of data, regardless of its structure.</p>
<img src="assets/img/portfolio/data-lake-pipeline.png" alt="Types of data" style="max-width: 90%; max-height: 90%;">
<h3 id="Benefits">Benefits<a class="anchor-link" href="#Benefits">¶</a></h3>
<p>Some of the benefits of using a data lake include:</p>
<ul style="list-style-type: disc; margin-left: 30px;">
<li><strong>Scalability:</strong> Data lakes can scale to store any amount of data.</li>
<li><strong>Flexibility:</strong> Data lakes can store any type of data, regardless of its structure.</li>
<li><strong>Performance:</strong> Data lakes can be optimized for different types of analytics, including batch processing, real-time analytics, and machine learning.</li>
<li><strong>Cost-effectiveness:</strong> Data lakes can be more cost-effective than traditional data warehouses, especially for storing large amounts of data.</li>
</ul>
<h3 id="Chalanges">Chalanges<a class="anchor-link" href="#Chalanges">¶</a></h3><p>Some of the challenges of using a data lake include:</p>
<ul style="list-style-type: disc; margin-left: 30px;">
<li><strong>Complexity:</strong> Data lakes can be more complex to manage and maintain than traditional data warehouses.</li>
<li><strong>Security:</strong> Data lakes need to be properly secured to protect sensitive data.</li>
<li><strong>Governance:</strong> Data lakes need to be properly governed to ensure that data is used in a responsible and ethical manner.</li>
</ul>
<p>Overall, data lakes are a powerful tool for storing and analyzing large amounts of data. They can be used to support a wide range of business applications, including customer intelligence, fraud detection, and product development.</p>
<h2 id="Applications">Applications<a class="anchor-link" href="#Applications">¶</a></h2><p>Here are some examples of how data lakes are being used in the real world:</p>
<ul style="list-style-type: disc; margin-left: 30px;">
<li><strong>Retail:</strong> Retailers use data lakes to store and analyze customer data, product data, and sales data. This data can be used to improve customer targeting, product recommendations, and inventory management.</li>
<li><strong>Finance:</strong> Financial institutions use data lakes to store and analyze customer data, transaction data, and market data. This data can be used to improve risk management, fraud detection, and investment decisions.</li>
<li><strong>Healthcare:</strong> Healthcare organizations use data lakes to store and analyze patient data, medical research data, and clinical trial data. This data can be used to improve patient care, develop new treatments, and reduce costs.</li>
</ul>
<h3>Database vs Data warehouse vs data lake vs data mart</h3>
<p>Data lakes are a powerful tool for storing and analyzing large amounts of data. They can be used to support a wide range of business applications, and they are becoming increasingly important as the amount of data that businesses collect and store continues to grow.</p>
<table>
<tr>
<th>Aspect</th>
<th>Database</th>
<th>Data Warehouse</th>
<th>Data Lake</th>
<th>Data Mart</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Purpose</strong></td>
<td>Designed for transactional data management.</td>
<td>Designed for analytical processing.</td>
<td>Designed for storing raw, unstructured data.</td>
<td>Subset of a data warehouse focused on specific business areas.</td>
</tr>
<tr>
<td><strong>Data Type</strong></td>
<td>Typically contains structured, transactional data.</td>
<td>Contains integrated and historical data.</td>
<td>Stores raw and structured, semi-structured, or unstructured data.</td>
<td>Contains summarized and pre-aggregated data for specific use cases.</td>
</tr>
<tr>
<td><strong>Data Structure</strong></td>
<td>Follows a normalized schema (relational database model).</td>
<td>Typically follows a structured schema (Star Schema or Snowflake Schema).</td>
<td>Schema-on-read, allowing flexible data structures.</td>
<td>Follows a dimensional model (Star Schema) for ease of reporting.</td>
</tr>
<tr>
<td><strong>Schema Design</strong></td>
<td>Optimized for efficient data storage and retrieval.</td>
<td>Optimized for query performance.</td>
<td>Flexible schema, data can be ingested without predefined structure.</td>
<td>Designed for ease of reporting and analysis.</td>
</tr>
<tr>
<td><strong>Query and Analytics</strong></td>
<td>Supports simple queries and transactional operations.</td>
<td>Designed for complex data analysis and reporting.</td>
<td>Suitable for ad-hoc queries and big data analytics.</td>
<td>Optimized for specific business units or departments' analytics.</td>
</tr>
<tr>
<td><strong>Data Integration</strong></td>
<td>Typically focuses on data from a single application or system.</td>
<td>Integrates data from various sources for a unified view.</td>
<td>Supports data from various sources with varying structures.</td>
<td>Integrates data relevant to a specific business area.</td>
</tr>
<tr>
<td><strong>Performance Optimization</strong></td>
<td>Optimized for ACID (Atomicity, Consistency, Isolation, Durability) properties.</td>
<td>Utilizes indexing, caching, and parallel processing.</td>
<td>Performance tuning is performed during data processing.</td>
<td>Designed for fast query response times for specific use cases.</td>
</tr>
<tr>
<td><strong>Scalability</strong></td>
<td>Scalability depends on the database system; typically vertical scaling.</td>
<td>Scalable both vertically and horizontally to handle large data volumes.</td>
<td>Scales horizontally to accommodate growing data needs.</td>
<td>Scales according to the needs of the specific business area.</td>
</tr>
<tr>
<td><strong>Tools and Technologies</strong></td>
<td>Utilizes relational database management systems (e.g., Oracle, MySQL).</td>
<td>Uses specialized data warehousing solutions (e.g., Redshift, BigQuery).</td>
<td>Utilizes distributed storage systems (e.g., Hadoop, Azure Data Lake Storage).</td>
<td>May use BI tools for reporting and analytics.</td>
</tr>
<tr>
<td><strong>Use Cases</strong></td>
<td>Transactional applications, e-commerce, record-keeping.</td>
<td>Business intelligence, reporting, complex analytics.</td>
<td>Big data analytics, data science, machine learning.</td>
<td>Department-specific analytics, budgeting, and forecasting.</td>
</tr>
</tbody>
</table>
<h3 id="Tools-used-to-intergrate-the-warehouses,-databases,-and-data-lakes">Tools used to intergrate the warehouses, databases, and data lakes<a class="anchor-link" href="#Tools-used-to-intergrate-the-warehouses,-databases,-and-data-lakes">¶</a></h3><p>Integrating data warehouses, databases, and data lakes often involves a combination of tools and technologies to facilitate data movement, transformation, and synchronization across these components. Here are some commonly used tools and methods for integrating them:</p>
<ol style="list-style-type: disc; margin-left: 30px;">
<li><p><strong>ETL (Extract, Transform, Load) Tools:</strong></p>
<ul>
<li>ETL tools are designed to extract data from source systems, transform it into the desired format, and load it into a target system, which can be a data warehouse, database, or data lake.</li>
<li><p><strong>Popular ETL tools include</strong></p>
<ul>
<li>Apache NiFi, </li>
<li>Talend, </li>
<li>Informatica PowerCenter, </li>
<li>Microsoft SQL Server Integration Services (SSIS), and </li>
<li><p>Apache Spark</p>
<p>for batch and stream processing.</p>
</li>
</ul>
</li>
</ul>
</li>
<li><p><strong>Data Integration Platforms:</strong></p>
<ul>
<li>These platforms provide a comprehensive solution for data integration, including data profiling, cleansing, transformation, and loading.</li>
<li><p>Examples include</p>
<ul>
<li>Apache Nifi, </li>
<li>Apache Camel, and </li>
<li><p>Apache Kafka</p>
<p>for stream processing.</p>
</li>
</ul>
</li>
</ul>
</li>
<li><p><strong>Data Integration Middleware:</strong></p>
<ul>
<li>Middleware solutions facilitate communication and data transfer between different systems, including data warehouses, databases, and data lakes.</li>
<li><p><strong>Tools:</strong></p>
<ul>
<li>Apache Kafka, </li>
<li>RabbitMQ, and </li>
<li><p>Apache Pulsar</p>
<p>are often used for real-time data streaming and messaging.</p>
</li>
</ul>
</li>
</ul>
</li>
<li><p><strong>Data Replication Tools:</strong></p>
<ul>
<li>Data replication tools enable real-time or near-real-time data synchronization between databases and data warehouses.</li>
<li><p><strong>Solutions:</strong></p>
<ul>
<li>Oracle GoldenGate, </li>
<li>AWS Database Migration Service, and </li>
<li><p>Microsoft Azure Data Factory</p>
<p>offer data replication capabilities.</p>
</li>
</ul>
</li>
</ul>
</li>
<li><p><strong>Data Virtualization Tools:</strong></p>
<ul>
<li>Data virtualization allows organizations to create a unified, virtual view of data spread across different sources without physically moving it.</li>
<li><p><strong>Tools:</strong></p>
<ul>
<li>Denodo, </li>
<li>Informatica Enterprise Data Catalog, and </li>
<li><p>Red Hat JBoss Data Virtualization</p>
<p>offer data virtualization capabilities.</p>
</li>
</ul>
</li>
</ul>
</li>
<li><p><strong>APIs and Web Services:</strong></p>
<ul>
<li>APIs (Application Programming Interfaces) and web services can be used to exchange data between systems programmatically.</li>
<li>RESTful APIs, SOAP APIs, and GraphQL are common protocols for data integration via web services.</li>
</ul>
</li>
<li><p><strong>Custom Scripting and Coding:</strong></p>
<ul>
<li>For highly customized integration requirements, organizations may develop custom scripts or code using programming languages like Python, Java, or Ruby.</li>
</ul>
</li>
<li><p><strong>Data Lake Services:</strong></p>
<ul>
<li>Cloud-based data lake services like AWS Lake Formation, Azure Data Lake Storage, and Google Cloud Storage often provide built-in tools for integrating data with various data warehousing and database services.</li>
</ul>
</li>
<li><p><strong>Data Integration Platforms as a Service (iPaaS):</strong></p>
<ul>
<li>iPaaS solutions offer cloud-based platforms for connecting various applications and data sources, including data lakes, data warehouses, and databases.</li>
<li>Examples include Dell Boomi, MuleSoft Anypoint Platform, and SnapLogic.</li>
</ul>
</li>
<li><p><strong>Change Data Capture (CDC) Tools:</strong></p>
<ul>
<li>CDC tools monitor changes in source data and capture them in real-time or batch mode for integration with target systems.</li>
<li>Tools like Debezium and Apache Nifi support CDC functionality.</li>
</ul>
</li>
</ol>
<p>The choice of tools depends on your specific integration requirements, the technology stack you are using, and whether you are working with on-premises or cloud-based data systems. Many organizations use a combination
of these tools and methods to create a robust and efficient data integration strategy that connects data across data warehouses, databases, and data lakes.</p>
<h3>OLAP and OLTP systems</h3>
<p>A Data Warehouse is a central repository (Centralized Database) where all the enterprise data coming from different resources is stored, managed, and analyzed for gaining business insights.
The data which comes in different formats can be stored in a warehouse and is accessed using some business intelligence tools to query the business problems.</p>
<p>This data is ingested through ETL (Extraction Transformation and Loading) Operations and then is analyzed using OLAP or OLTP System, whichever is used. This process of querying the business
data for decisions using a centralized database is called Data Warehousing.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/OLTP-ETL-and-OLAP-layers1.png" alt="" style="max-width: 90%; max-height: 90%;">
<figcaption style="text-align: center;">For more information on OLAP, OLTP check: <a href="https://phoenixnap.com/kb/oltp-vs-olap" target="_blank">Phonixnap</a> (Architecture of datawarehouse)</figcaption>
</figure>
<ul>
<li><strong>What is OLAP?</strong>
OLAP stands for Online Analytical Processing System. This Data Processing System is used for multidimensional analysis of large volumes of Data. It is mostly preferred for Data Mining
(Extracting patterns in Large volumes of data), Business Intelligence, and Complex Analytical Calculations like Budget Forecasting, Sales Analysis, etc.
</li>
<li><strong>What is OLTP?</strong> OLTP or Online Transaction Processing System is used to enable real-time executions of huge numbers of transactions by a large number of users.
Here, the Transaction means a single unit of work that is done to modify the data in the database.<p></p>
<p><strong>For example</strong>, as soon as you withdraw money from the bank, your account balance reflects changes instantly. Or, when you change your account password, it is changed instantly
without a delay. These are the daily life examples of Transaction Processes in the Database handled using the OLTP System. This is the main difference between OLAP and OLTP.</p>
</li>
</ul>
<table>
<thead>
<tr>
<th>Aspect</th>
<th>OLAP (Online Analytical Processing)</th>
<th>OLTP (Online Transactional Processing)</th>
</tr>
</thead>
<tbody>
<tr>
<td></td>
<td>OLAP is used for complex, multi-dimensional analysis of historical data from data warehouses to support business intelligence and decision-making.</td>
<td>OLTP is used for managing and processing high volumes of real-time transactional data, such as order processing and inventory management, in databases.</td>
</tr>
<tr>
<td>Nature of Processing</td>
<td>Analytical: Focuses on complex data analysis.</td>
<td>Transactional: Focuses on real-time transactions.</td>
</tr>
<tr>
<td>Data Type</td>
<td>Typically processes historical and aggregated data.</td>
<td>Processes recent and detailed transactional data.</td>
</tr>
<tr>
<td>Usage</td>
<td>Data mining, business intelligence, reporting, complex calculations, forecasting.</td>
<td>E-commerce, ATM transactions, payment processing, reservations, record-keeping.</td>
</tr>
<tr>
<td>Data Structure</td>
<td>Multidimensional, often uses data warehousing.</td>
<td>Relational, follows a normalized schema.</td>
</tr>
<tr>
<td>Query Complexity</td>
<td>Handles complex queries involving aggregations and calculations.</td>
<td>Handles simple queries for transaction processing.</td>
</tr>
<tr>
<td>Performance</td>
<td>Query performance is optimized for large-scale data analysis.</td>
<td>Transaction processing is optimized for speed and concurrency.</td>
</tr>
<tr>
<td>Data Volume</td>
<td>Deals with large volumes of data.</td>
<td>Typically handles smaller, real-time datasets.</td>
</tr>
<tr>
<td>Concurrency</td>
<td>Supports a lower level of concurrent users.</td>
<td>Supports a higher level of concurrent users.</td>
</tr>
<tr>
<td>Database Size</td>
<td>Database size is usually larger.</td>
<td>Database size is comparatively smaller.</td>
</tr>
<tr>
<td>Backup and Recovery</td>
<td>Backup and recovery strategies may vary.</td>
<td>Requires robust backup and recovery mechanisms.</td>
</tr>
<tr>
<td>Schema Design</td>
<td>Often uses denormalized schema (Star Schema).</td>
<td>Follows a normalized schema (Third Normal Form).</td>
</tr>
<tr>
<td>Indexing</td>
<td>Indexing is critical for query performance.</td>
<td>Indexing is important for transaction speed.</td>
</tr>
<tr>
<td>Examples</td>
<td>Business intelligence tools, data warehouses.</td>
<td>E-commerce platforms, banking systems.</td>
</tr>
</tbody>
</table>
<ul>
<li><strong>OLTP (Online Transaction Processing):</strong> MYSQL, PostgreSQL, Oracle Database, Microsoft SQL Server.</li>
<li><strong>OLAP (Online Analytical Processing):</strong> Hive, Google BigQuery, Amazon Redshift, Snowflake, SAP HANA, Microsoft SQL Server Analysis Services (SSAS).</li>
<li><strong>Dual Purpose (OLAP and OLTP Capabilities): </strong>Microsoft SQL Server.</li>
</ul>
<p>Keep in mind that OLAP and OLTP serve different purposes within an organization, and the choice between them depends on the specific business requirements and the types of applications being used.</p>
<h3>Types of OLAP</h3>
<p>OLAP (Online Analytical Processing) is a category of software and technology used for multidimensional data analysis. There are two primary types of OLAP systems:
<ul style="list-style-type: disc; margin-left: 30px;">
<li>MOLAP (Multidimensional OLAP)</li>
<li>ROLAP (Relational OLAP)</li>
<li>HOLAP (Hybrid OLAP)</li>
</ul>
HOLAP (Hybrid OLAP) combines elements of both MOLAP and ROLAP. Here's an overview of each type:</p>
<ol style="margin-left: 30px;">
<li><strong>MOLAP (Multidimensional OLAP): </strong>
<ul>
<li><strong>Advantages: </strong>
<ul>
<li><strong>Fast Query Performance: </strong>MOLAP databases are optimized for multidimensional queries, making them exceptionally fast for analytical and ad-hoc queries.</li>
<li><strong>Aggregation Support: </strong>They support pre-aggregated data cubes, which are beneficial for summarizing and analyzing large datasets.</li>
<li><strong>Data Compression: </strong>MOLAP systems often employ efficient compression techniques to reduce storage requirements.</li>
</ul>
</li>
<li><strong>Disadvantages: </strong>
<ul>
<li><strong>Storage Overhead: </strong>The storage requirements for MOLAP can be significant due to the pre-aggregated data.</li>
<li><strong>Data Loading Time: </strong> Loading data into MOLAP databases can be time-consuming, especially for large datasets.</li>
<li><strong>Limited Scalability: </strong>Scalability can be an issue when dealing with very large datasets.</li>
</ul>
</li>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-MOLAP.png" alt="" style="max-width: 90%; max-height: 90%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/data-warehouse-types-of-olap" target="_blank">javapoint</a></figcaption>
</figure>
</ul>
</li>
<li><strong>ROLAP (Relational OLAP):</strong>
<ul>
<li><strong>Advantages: </strong>
<ul>
<li><strong>Scalability:</strong> ROLAP systems can handle very large datasets and are suitable for organizations with extensive data volumes.</li>
<li><strong>Flexibility:</strong> They work with existing relational database management systems (RDBMS), making them compatible with a wide range of data sources.</li>
<li><strong>Data Integration:</strong> ROLAP systems can seamlessly integrate with other business applications and processes.</li>
</ul>
</li>
<li><strong>Disadvantages: </strong>
<ul>
<li><strong>Query Performance:</strong> Analytical queries in ROLAP can be slower than MOLAP due to the relational nature of data storage.</li>
<li><strong>Complexity:</strong> Designing and maintaining ROLAP systems can be more complex, as they often require writing complex SQL queries.</li>
<li><strong>Lack of Aggregation:</strong> ROLAP systems may require more effort to create and manage aggregated data for performance optimization.</li>
</ul>
</li>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-ROLAP.png" alt="" style="max-width: 90%; max-height: 90%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/data-warehouse-types-of-olap" target="_blank">javapoint</a></figcaption>
</figure>
</ul>
</li>
<li><strong>HOLAP (Hybrid OLAP):</strong>
<p>HOLAP systems aim to combine the advantages of both MOLAP and ROLAP while mitigating their disadvantages. Here are some characteristics:</p>
<ul>
<li><strong>Advantages:</strong>
<ul>
<li><strong>Combines Speed and Scalability:</strong> HOLAP systems attempt to provide the query speed of MOLAP with the scalability of ROLAP.</li>
<li><strong>Flexible Storage:</strong> They allow for a mix of pre-aggregated and relational data storage, providing more flexibility in data management.</li>
<li><strong>Adaptable to Varying Data Needs:</strong> HOLAP can dynamically switch between storage modes based on query complexity.</li>
</ul>
</li>
<li><strong>Disavantages:</strong>
<ul>
<li><strong>Complex to Implement:</strong> Setting up HOLAP systems can be more complex than MOLAP or ROLAP alone.</li>
<li><p><strong>Potential Performance Trade-offs:</strong> While HOLAP aims to balance speed and scalability, it may not achieve the same query performance as pure MOLAP systems.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-HOLAP.png" alt="" style="max-width: 90%; max-height: 90%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/data-warehouse-types-of-olap" target="_blank">javapoint</a></figcaption>
</figure>
</li>
</ul>
</li>
</ul>
</li>
<p>The choice between MOLAP, ROLAP, or HOLAP depends on an organization's specific requirements, data volume, query performance needs, and existing technology infrastructure. In practice, some organizations use a combination of these OLAP types to address different aspects of their analytical needs.</p>
</ol>
<h3 id="OLAP-operation">OLAP operation<a class="anchor-link" href="#OLAP-operation">¶</a></h3>
<p>OLAP (Online Analytical Processing) operations are a set of data manipulation techniques used for querying and analyzing multidimensional data stored in OLAP databases or data cubes. These operations help users gain insights from large datasets quickly and effectively. There are several fundamental OLAP operations:</p>
<ol style="margin-left: 30px;">
<li><p><strong>Slice:</strong> Slicing involves selecting a single dimension from a multidimensional cube to view a "slice" of the data. This operation allows users to focus on a specific dimension's values while keeping all other dimensions constant. For example, you can slice a sales data cube to view sales figures for a particular product category across different time periods.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-slice-operation.png" alt="" style="max-width: 80%; max-height: 80%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/olap-operations" target="_blank">javapoint</a></figcaption>
</figure>
</li>
<li><p><strong>Dice:</strong> Dicing is similar to slicing but involves selecting specific values or ranges within multiple dimensions simultaneously. It allows users to create a subcube by specifying conditions for multiple dimensions. For instance, you could dice a data cube to examine sales data for a particular region, product category, and time frame.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-dice-operation.png" alt="" style="max-width: 80%; max-height: 80%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/olap-operations" target="_blank">javapoint</a></figcaption>
</figure>
</li>
<li><p><strong>Roll-up (Climb):</strong> Roll-up involves aggregating data from a lower-level dimension to a higher-level dimension. It moves up the hierarchy to see summarized or aggregated data. For instance, you can roll up sales data from daily to monthly or yearly totals to get a broader perspective.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-rollup-operation.png" alt="" style="max-width: 80%; max-height: 80%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/olap-operations" target="_blank">javapoint</a></figcaption>
</figure>
</li>
<li><p><strong>Drill-down (Dig):</strong> Drill-down is the opposite of roll-up. It involves breaking down aggregated data into more detailed levels. Users can explore data at finer granularity by drilling down. For example, you can drill down from yearly sales to monthly, weekly, or daily sales figures.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-drilldown-operation.png" alt="" style="max-width: 80%; max-height: 80%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/olap-operations" target="_blank">javapoint</a></figcaption>
</figure>
</li>
<li><p><strong>Pivot (Rotate):</strong> Pivoting reorients the data cube to view it from a different perspective. It involves swapping the dimensions, which can provide alternative views of the data. Pivoting can help users analyze data from various angles and dimensions.</p>
<figure style="text-align: center;">
<img src="assets/img/portfolio/data-warehouse-pivot-operation.png" alt="" style="max-width: 80%; max-height: 80%;">
<figcaption style="text-align: center;"><strong>Image credit: </strong><a href="https://www.javatpoint.com/olap-operations" target="_blank">javapoint</a></figcaption>
</figure>
</li>
</ol>
<p>These OLAP operations are essential for business intelligence, reporting, and decision support systems. They enable users to navigate and explore multidimensional data efficiently, making it easier to discover insights and make informed decisions. The specific operations and functionalities may vary depending on the OLAP system or tool being used.</p>
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li> Visit <a href="datawarehouse-dimnesion-modeling.html">Data warehouse dimension modeling</a></li>
<li>Visit the <a href="https://www.javatpoint.com/data-mining">Data mining tutorial</a></li>
<li>Visit my repository on <a href="https://github.com/arunp77/Database-datapipeline-ETL/tree/main/Database">GitHub for Bigdata, Databases, DBMS, Data modling, Data mining.</a></li>
<li>Visit my website on <a href="sql-basic-details.html">SQL.</a></li>
<li>Visit my website on <a href="sql-postgresql.html">PostgreSQL.</a></li>
<li>Visit my website on <a href="scd.html">Slowly changing variables.</a></li>
<li>Visit my website on <a href="snowflake.html">SNowflake.</a></li>
<li>Visit my website on <a href="sql-project.html">SQL project in postgresql.</a></li>
<li>Visit my website on <a href="snowflake-task-stream.html">Snowflake data streaming.</a></li>
</ul>
<div class="navigation">
<a href="index.html" class="clickable-box">
<span class="arrow-left">Go home</span>
</a>
<a href="content-page.html" class="clickable-box">
<span class="arrow-right">Content page</span>
</a>
</div>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main --
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
document.addEventListener("DOMContentLoaded", function () {
hljs.initHighlightingOnLoad();
});
</script>
</body>
</html>