<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Apache Kafka</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Data Engineering</h2>
<ol>
<li><a href="Data-engineering.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> QL</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquery</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>Apache Kafka</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<figure>
<img src="assets/img/data-engineering/Kafka-wall.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption></figcaption>
</figure>
</div>
<div class="swiper-pagination"></div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Table of Contents</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#components">Kafka components</a></li>
<li><a href="#configuration">Key broker configurations</a></li>
<li><a href="#kafka-apis">Apache Kafka APIs</a></li>
<ul>
<li><a href="#use-case">Kafka API use cases</a></li>
<li><a href="#kafka-connect">Kafka Connect</a></li>
<li><a href="#kafka-stream">Kafka Streams</a></li>
</ul>
<li><a href="#instalation">Kafka installation: setting up on a local machine</a></li>
<li><a href="#reference">References</a></li>
</ol>
</div>
</div>
</div>
<section>
<h3 id="introduction">Introduction to Kafka</h3>
<p><a href="https://kafka.apache.org/quickstart" target="_blank">Apache Kafka</a> is an open-source distributed event streaming platform initially developed by LinkedIn in 2011 and later open-sourced as an Apache Software Foundation project. It is designed to handle real-time data feeds, allowing for the building of robust and scalable streaming data pipelines. Kafka is widely used in various industries for applications such as real-time analytics, log aggregation, monitoring, and messaging.</p>
<div class="box-background1">
<h5>Apache Kafka in Layman Language</h5>
<p>Apache Kafka is like a super-efficient postal system for digital messages. Imagine you have a bunch of mailmen (producers) who are constantly picking up messages from people and dropping them into different mailboxes (topics). These mailboxes are organized by subject, so all related messages go into the same mailbox.</p>
<p>Now, there are other mailmen (consumers) who come to these mailboxes and pick up messages to deliver to the right people or systems. Kafka makes sure this process is fast, reliable, and can handle a huge number of messages without getting overwhelmed.</p>
<p>In short, Apache Kafka helps in moving lots of information from one place to another smoothly and quickly.</p>
</div>
<p> In Kafka, data communication revolves around the concepts of producers, topics, and consumers, which closely align with the pub/sub paradigm.</p>
<figure>
<img src="assets/img/data-engineering/kadka-producer-consumer.png" alt="" style="max-width: 50%; max-height: auto;">
<figcaption style="text-align: center;"><strong>© Image credit:</strong> <a href="index.html">Arun Kumar Pandey</a></figcaption>
</figure>
<ul>
<li><strong>Producers</strong> act as data publishers, generating records and sending them to Kafka topics. These records can represent any type of data, such as log events, sensor readings, or user interactions. Producers are responsible for specifying the topic to which each record should be published.</li>
<li><strong>Topics</strong> serve as logical channels or categories to which records are published. Each topic represents a stream of related data, organized based on a common theme or subject. Topics can have one or more partitions to enable parallel processing and scalability.</li>
<li><strong>Consumers</strong> subscribe to topics of interest and consume messages from them. They read records from the partitions of the subscribed topics and process them according to their application logic. Multiple consumers can subscribe to the same topic, forming consumer groups for parallel processing and load balancing.</li>
</ul>
<p>In the pub/sub model, a producer publishes (pushes) a message to the broker, the broker stores it, and a consumer then consumes the message from the broker.</p>
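<p>To make the producer-to-topic-to-consumer flow concrete, here is a minimal sketch using the <code>kafka-python</code> client. It assumes a broker running on <code>localhost:9092</code>; the topic name <code>orders</code> and the group name <code>order-processors</code> are illustrative only.</p>
<pre><code class="language-python">
# Minimal Kafka pub/sub sketch using the kafka-python client (pip install kafka-python).
# Assumes a broker on localhost:9092; topic and group names are hypothetical.
import json
from kafka import KafkaProducer, KafkaConsumer

# Producer: publishes records to the 'orders' topic.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("orders", {"order_id": 1, "amount": 49.99})
producer.flush()  # block until the message is actually sent

# Consumer: subscribes to the same topic and reads records from its partitions.
consumer = KafkaConsumer(
    "orders",
    bootstrap_servers="localhost:9092",
    group_id="order-processors",      # consumers sharing a group split the partitions
    auto_offset_reset="earliest",     # start from the beginning if no committed offset exists
    value_deserializer=lambda b: json.loads(b.decode("utf-8")),
    consumer_timeout_ms=5000,         # stop iterating if no message arrives for 5 s
)
for record in consumer:
    print(record.topic, record.partition, record.offset, record.value)
</code></pre>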
<br>
<p><strong>Example:</strong></p>
<figure>
<img src="assets/img/data-engineering/kafka-messaging.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"><strong>© Image credit:</strong> <a href="index.html">Arun Kumar Pandey</a></figcaption>
</figure>
<p>The diagram depicts a scenario where multiple services on the left side (Frontend, Hadoop, Database Slave, Chat Server) are communicating with various services on the right side (Database server, Security Systems, Real-time Monitoring, Other Services, Data Warehouse). Each service on the left side is connected to multiple services on the right side, indicating a complex network of communication channels.</p>
<p><strong>Major issues:</strong></p>
<ul>
<li><strong>Point-to-Point Communication: </strong> Without Kafka, each service on the left would directly communicate with multiple services on the right (in the first image). This results in a tightly coupled architecture where any change in one service may require changes in multiple other services, leading to complexity and maintenance challenges.</li>
<li><strong>Scalability: </strong>As the number of services increases (represented by 'n' services), the number of communication channels grows multiplicatively, on the order of n × m connections. Managing these point-to-point connections becomes increasingly difficult, leading to scalability issues.</li>
<li><strong>Fault Tolerance: </strong>Point-to-point communication lacks built-in fault tolerance mechanisms. If one service fails or becomes unavailable, it can disrupt the entire communication chain, leading to service outages and data loss.</li>
<li><strong>Data Loss and Inconsistency: </strong>In a direct communication setup, data loss or inconsistency may occur if a service fails to receive or process messages from another service. This can result in data discrepancies and integrity issues.</li>
</ul>
<p><strong>Kafka Solution:</strong> Introducing Kafka as a middleware layer resolves these issues by decoupling communication between services and providing a distributed event streaming platform. The addition of Kafka as an intermediary between the left and right sides of the diagram brings several benefits:</p>
<ul>
<li><strong>Message Queuing:</strong> Kafka acts as a message queue, allowing services to publish messages (producing) to topics and consume messages (subscribing) from topics asynchronously. This decouples producers and consumers, enabling asynchronous and distributed communication.</li>
<li><strong>Scalability and Flexibility:</strong> Kafka's distributed architecture scales horizontally, allowing for the addition of new producers and consumers without impacting existing services. It provides flexibility in adding or removing services without disrupting the overall communication flow.</li>
<li><strong>Fault Tolerance and Durability:</strong> Kafka replicates data across multiple brokers, ensuring fault tolerance and data durability. If a broker or service fails, Kafka can continue to serve messages from replicated partitions, preventing data loss and maintaining system availability.</li>
<li><strong>Stream Processing: </strong>Kafka supports stream processing capabilities, enabling real-time data processing and analytics on streaming data. Services can consume and process data in real time, leading to timely insights and actions.</li>
<li><strong>Integration with Ecosystem: </strong>Kafka integrates seamlessly with various data processing frameworks and tools, such as Apache Spark, Apache Flink, and Apache Storm, enabling a rich ecosystem of data processing capabilities.</li>
</ul>
<p>By introducing Kafka as a central messaging backbone, the communication architecture becomes more resilient, scalable, and flexible. It addresses major issues such as point-to-point communication, scalability, fault tolerance, and data consistency, making the overall system more robust and reliable.</p>
<h4 id="components">Kafka components</h4>
<p>Apache Kafka comprises several key components that work together to provide a distributed event streaming platform.</p>
<div class="grey-box">
<figure>
<img src="assets/img/data-engineering/kafka-broker.png" alt="" style="max-width: 50%; max-height: auto;">
<figcaption style="text-align: center;">Since Kafka is a distributed system, in such a scenario we can break Kakfa topic into multiple parts and distribute those parts into different machines. This is known as partioning and each part is called as partition. So whenever, a producer send a message then it can go and sit into any of the partions. So as soon as a message arrives in partion, a number is assigned to them and they are known as offset numbers. These messages will be consumed by consumer. If we have single consumer then he/she can consume from each and every partions. On such kind of systems, if just one user is consuming the messages/events then it can perform upto it's capacity. So what we can do, we can create n-number consumer instances. In this case, we can group some users with a group name. For example in a payment systems, we can create a group with 'payment_consumer_group' and 'transaction_group' etc. So in the case of 'payment_consumer_group', we can devide the work load to each and every consumer to achive better output. <strong>© Image credit:</strong> <a href="index.html">Arun Kumar Pandey</a></figcaption>
</figure>
</div>
<ol>
<li><strong>Publishers (Producers):</strong> Producers are responsible for publishing records to Kafka topics. They generate records containing a key, a value, and an optional timestamp and send them to Kafka brokers. Producers can choose the partition to which a record should be sent or rely on Kafka's default partitioning mechanism.</li>
<li><strong>Consumer:</strong> Consumers subscribe to Kafka topics to consume records from them. They read records from partitions in a topic and process them according to their application logic. Consumers can be grouped into consumer groups for parallel processing and load balancing.</li>
<li><strong>Broker:</strong> Kafka brokers are the fundamental building blocks of a Kafka cluster. They handle storage, replication (meaning that with a replication factor of 3, three copies of each partition are maintained), and serving of topic partitions. Brokers receive messages from producers and deliver them to consumers. Kafka clusters typically consist of multiple brokers distributed across physical or virtual machines.</li>
<li><strong>Cluster:</strong> In Kafka, a cluster refers to a group of Kafka brokers working together to form a distributed messaging system. A Kafka cluster typically consists of multiple brokers, each running on separate physical or virtual machines. These brokers collaborate to store and serve data, handle client requests, and maintain cluster metadata. Clusters are especially helpful when producers generate a huge amount of data.</li>
<figure>
<img src="assets/img/data-engineering/kafka-cluster.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"><strong>© Image credit:</strong> <a href="index.html">Arun Kumar Pandey</a></figcaption>
</figure>
<li><strong>Topic:</strong> Topics represent streams of records, which are organized and categorized based on a common theme or subject. Producers publish records to topics, specifying the topic name to which the record should be sent. Consumers subscribe to topics to consume records from them.</li>
<li><strong>Partitions:</strong> Topics are divided into partitions, which are individual ordered sequences of records. Each partition can be hosted on multiple brokers for fault tolerance and scalability. Partitioning allows Kafka to parallelize data ingestion and consumption, enabling high throughput and efficient data processing.</li>
<figure>
<img src="assets/img/data-engineering/kafka-partition.png" alt="" style="max-width: 70%; max-height: auto;">
<figcaption style="text-align: center;">So, here we have saved 2 more copies of a given partition on different systems. Therefore the replication factor is 3. Normally, we don't set a replication factor for a partition but we set it for a topic and it applies to all partitions within the topic. (<strong>© Image credit:</strong> <a href="index.html">Arun Kumar Pandey</a>)</figcaption>
</figure>
<li><strong>Replication factor: </strong>In Apache Kafka, topics are divided into partitions to enable parallel processing and scalability. Each partition is a separate log where messages are stored sequentially. The replication factor is the number of copies of a partition that are maintained in the cluster.
<ul>
<li>Kafka makes copies of partitions for fault tolerance through replication. Each partition is replicated across multiple Kafka brokers.</li>
<li>The primary copy of a partition is called the leader, while the others are followers.</li>
<li>The leader handles all read and write requests, and followers replicate the data from the leader.</li>
<li>If the leader fails, a follower takes over as the new leader to ensure data availability and durability.</li>
</ul>
This mechanism ensures that even if a broker fails, the topic's data remains accessible and consistent.
</li>
<li><strong>Offset:</strong> In Kafka, an offset represents the position of a consumer within a partition of a topic. It is a numeric value that uniquely identifies a message within the partition. Each message in a partition is assigned a sequential offset, starting from 0 for the first message and increasing incrementally for subsequent messages. Offsets allow consumers to keep track of which messages they have already consumed within a partition. This enables consumers to resume reading from where they left off in case of failures or restarts. Consumers use offsets to determine the next message they need to read from a partition. By maintaining the offset of the last processed message, consumers can retrieve subsequent messages for processing. Consumers can commit their current offset to Kafka to indicate that they have successfully processed messages up to that point. This commit is typically done atomically along with processing logic to ensure exactly-once processing semantics.</li>
<li><strong>Consumer Group: </strong> A consumer group is a logical grouping of consumers that jointly consume and process records from one or more partitions of a topic. Each partition within a topic is consumed by exactly one consumer within a consumer group, enabling parallel processing and load distribution.</li>
<li><strong>ZooKeeper:</strong> ZooKeeper is used by Kafka for managing cluster metadata, leader election, and synchronization. It maintains information about brokers, topics, partitions, and consumer group membership. While ZooKeeper was a critical component in earlier versions of Kafka, newer versions are gradually moving towards removing this dependency.</li>
<figure>
<img src="assets/img/data-engineering/kafka-zookeeper.png" alt="" style="max-width: 70%; max-height: 50%;">
<figcaption style="text-align: center;"><strong>© Image credit:</strong> <a href="index.html">Arun Kumar Pandey</a></figcaption>
</figure>
</ol>
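<p>To illustrate how consumer groups and offsets work together, the sketch below (again assuming the hypothetical <code>kafka-python</code> setup on <code>localhost:9092</code>) disables auto-commit and commits offsets manually after processing each batch, so a restarted consumer in the same group resumes from the last committed offset.</p>
<pre><code class="language-python">
# Consumer-group sketch with manual offset commits (kafka-python, assumed local broker).
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "payments",                          # hypothetical topic
    bootstrap_servers="localhost:9092",
    group_id="payment_consumer_group",   # all members of this group share the topic's partitions
    enable_auto_commit=False,            # we commit offsets ourselves after processing
    auto_offset_reset="earliest",
)

try:
    while True:
        # poll() returns a dict mapping TopicPartition objects to lists of records
        batch = consumer.poll(timeout_ms=1000)
        for tp, records in batch.items():
            for record in records:
                # ... application processing logic goes here ...
                print(f"partition={tp.partition} offset={record.offset} value={record.value}")
        if batch:
            consumer.commit()  # mark everything polled so far as processed
finally:
    consumer.close()
</code></pre>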
<h4 id="configuration">Key broker configurations</h4>
In Apache Kafka, brokers are critical components that handle message storage, retrieval, and distribution across the cluster. Configuring Kafka brokers properly is essential for performance, scalability, and fault tolerance. Here are some key Kafka broker configurations:
<ul>
<li><strong>General Settings: </strong>
<table>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
<tr>
<td><strong>broker.id</strong></td>
<td>Unique identifier for each broker in the Kafka cluster. It's crucial for differentiating brokers, especially in clusters with multiple brokers.</td>
<td><code>broker.id=1</code></td>
</tr>
<tr>
<td><strong>listeners</strong></td>
<td>Specifies the network interfaces and ports on which the broker listens for client requests (e.g., producers, consumers).</td>
<td><code>listeners=PLAINTEXT://localhost:9092</code></td>
</tr>
<tr>
<td><strong>log.dirs</strong></td>
<td>Directories where Kafka stores partition data. Multiple directories can be specified for load balancing across disks.</td>
<td><code>log.dirs=/var/lib/kafka/data</code></td>
</tr>
</table>
</li>
<li><strong> Topic and Log Management: </strong>
<table>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
<tr>
<td><strong>num.partitions</strong></td>
<td>Default number of partitions per topic when a new topic is created without specifying the number of partitions.</td>
<td><code>num.partitions=3</code></td>
</tr>
<tr>
<td><strong>log.retention.hours</strong></td>
<td>Time period for which Kafka retains messages. After this period, messages are eligible for deletion.</td>
<td><code>log.retention.hours=168</code> (7 days)</td>
</tr>
<tr>
<td><strong>log.segment.bytes</strong></td>
<td>Maximum size of a single log segment file. Once this size is reached, Kafka rolls over to a new segment file.</td>
<td><code>log.segment.bytes=1073741824</code> (1 GB)</td>
</tr>
<tr>
<td><strong>log.cleanup.policy</strong></td>
<td>Defines the cleanup policy for logs. Options are <code>delete</code> (removes old logs) and <code>compact</code> (log compaction for topics with keys).</td>
<td><code>log.cleanup.policy=delete</code></td>
</tr>
</table>
</li>
<li><strong>Replication and Fault Tolerance: </strong>
<table>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
<tr>
<td><strong>default.replication.factor</strong></td>
<td>Default replication factor for topics when created. Ensures redundancy and fault tolerance.</td>
<td><code>default.replication.factor=3</code></td>
</tr>
<tr>
<td><strong>min.insync.replicas</strong></td>
<td>Minimum number of in-sync replicas that must acknowledge a write for the write to be considered successful. Helps in ensuring data durability.</td>
<td><code>min.insync.replicas=2</code></td>
</tr>
<tr>
<td><strong>unclean.leader.election.enable</strong></td>
<td>Determines if Kafka can elect an out-of-sync replica as leader in case of failure. Disabling this ensures data consistency but might reduce availability.</td>
<td><code>unclean.leader.election.enable=false</code></td>
</tr>
</table>
</li>
<li><strong>Performance Tuning: </strong>
<table>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
<tr>
<td><strong>num.network.threads</strong></td>
<td>Number of threads handling network requests. Adjust based on the expected load.</td>
<td><code>num.network.threads=3</code></td>
</tr>
<tr>
<td><strong>num.io.threads</strong></td>
<td>Number of threads for disk I/O operations. Increasing this can improve performance if the broker handles heavy disk activity.</td>
<td><code>num.io.threads=8</code></td>
</tr>
<tr>
<td><strong>socket.send.buffer.bytes</strong> and <strong>socket.receive.buffer.bytes</strong></td>
<td>Size of the TCP send and receive buffer sizes. These can be tuned based on network bandwidth and latency.</td>
<td><code>socket.send.buffer.bytes=102400</code>, <code>socket.receive.buffer.bytes=102400</code></td>
</tr>
</table>
</li>
<li><strong>Security: </strong>
<table border="1">
<thead>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>ssl.keystore.location</td>
<td>Path to the SSL keystore used for secure communication</td>
<td>/var/private/ssl/kafka.server.keystore.jks</td>
</tr>
<tr>
<td>ssl.keystore.password</td>
<td>Password for the SSL keystore</td>
<td>yourpassword</td>
</tr>
<tr>
<td>ssl.truststore.location</td>
<td>Path to the SSL truststore used to verify client certificates</td>
<td>/var/private/ssl/kafka.server.truststore.jks</td>
</tr>
<tr>
<td>ssl.truststore.password</td>
<td>Password for the SSL truststore</td>
<td>yourpassword</td>
</tr>
</tbody>
</table>
</li>
<li><strong>Monitoring and Metrics: </strong>
<table border="1">
<thead>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>metric.reporters</td>
<td>A list of classes used to report Kafka metrics. These can integrate with external monitoring systems like Prometheus or JMX.</td>
<td>metric.reporters=com.example.MyMetricsReporter</td>
</tr>
<tr>
<td>log.flush.interval.messages</td>
<td>Controls how frequently data is flushed to disk in terms of the number of messages. Lower intervals reduce the risk of data loss but can impact performance.</td>
<td>log.flush.interval.messages=10000</td>
</tr>
<tr>
<td>log.flush.interval.ms</td>
<td>Controls how frequently data is flushed to disk in terms of time interval. Lower intervals reduce the risk of data loss but can impact performance.</td>
<td>log.flush.interval.ms=1000</td>
</tr>
</tbody>
</table>
</li>
<li><strong>Zookeeper Configuration: </strong>
<table border="1">
<thead>
<tr>
<th>Configuration</th>
<th>Description</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr>
<td>zookeeper.connect</td>
<td>Zookeeper connection string, which includes Zookeeper hosts and ports. Used by brokers to connect to the Zookeeper ensemble.</td>
<td>zookeeper.connect=localhost:2181</td>
</tr>
</tbody>
</table>
</li>
</ul>
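<p>Most of the settings above live in the broker's <code>server.properties</code> file, but partition count, replication factor, and retention can also be set per topic at creation time. Below is a sketch using the <code>kafka-python</code> admin client; the topic name and the single-broker replication factor of 1 are assumptions for a local setup.</p>
<pre><code class="language-python">
# Create a topic with explicit partitions, replication factor, and per-topic overrides
# (kafka-python admin client; assumes a single local broker, hence replication_factor=1).
from kafka.admin import KafkaAdminClient, NewTopic

admin = KafkaAdminClient(bootstrap_servers="localhost:9092", client_id="setup-script")

topic = NewTopic(
    name="payments",            # hypothetical topic name
    num_partitions=3,           # up to 3 consumers in one group can read in parallel
    replication_factor=1,       # use 3 on a real multi-broker cluster
    topic_configs={
        "retention.ms": "604800000",   # keep messages for 7 days (per-topic override of log.retention.hours)
        "cleanup.policy": "delete",    # delete old segments rather than compacting
    },
)

admin.create_topics(new_topics=[topic], validate_only=False)
admin.close()
</code></pre>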
<!------------------------>
<h3 id="kafka-apis">Apache Kafka APIs</h3>
<p>At the heart of Kafka are its APIs, which enable developers to interact with Kafka clusters programmatically. These APIs are used to build robust, real-time data processing applications. In the dynamic world of real-time data streaming, <strong>Application Programming Interfaces (APIs)</strong> serve as communication endpoints, facilitating seamless interactions between various software applications. Among these, Apache Kafka<sup>®</sup>’s API stands out, known for its exceptional capabilities in handling real-time data.</p>
<p>Here is a list of the Kafka APIs:</p>
<table>
<tr>
<th>API</th>
<th>Description</th>
</tr>
<tr>
<td>Producer API</td>
<td>Allows applications to send streams of data to topics in the Kafka cluster.</td>
</tr>
<tr>
<td>Consumer API </td>
<td>Permits applications to read data streams from topics in the Kafka cluster.</td>
</tr>
<tr>
<td>Streams API </td>
<td>Acts as a stream processor, transforming data streams from input to output topics.</td>
</tr>
<tr>
<td>Connect API</td>
<td>Enables the development and running of reusable producers or consumers that connect Kafka topics to existing data system applications.</td>
</tr>
<tr>
<td>Admin API </td>
<td>Supports administrative operations on a Kafka cluster, like creating or deleting topics.</td>
</tr>
<tr>
<td>Kafka API compatible alternative</td>
<td>Redpanda streaming data platform.</td>
</tr>
</table>
<p>Apache Kafka provides a rich set of APIs for building real-time data processing applications. Whether you're publishing data to Kafka, consuming messages from Kafka topics, processing data in Kafka streams, managing Kafka clusters, or integrating Kafka with external systems, there's an API available to meet your needs. By understanding and leveraging these APIs effectively, developers can harness the full power of Apache Kafka for building scalable, fault-tolerant, and real-time data pipelines.</p>
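<p>As one concrete use of the Producer API, the sketch below sends keyed records and inspects the delivery metadata; because Kafka's default partitioner hashes the key, records with the same key land in the same partition and therefore stay ordered relative to each other. The names are illustrative, assuming the same local <code>kafka-python</code> setup as before.</p>
<pre><code class="language-python">
# Producer API sketch: keyed records and delivery metadata (kafka-python, assumed local broker).
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    acks="all",   # wait for all in-sync replicas before considering a send successful
    retries=3,
)

for user_id in ["alice", "bob", "alice"]:
    future = producer.send(
        "user-events",                 # hypothetical topic
        key=user_id.encode("utf-8"),   # same key means same partition, preserving per-key order
        value=b"clicked_checkout",
    )
    metadata = future.get(timeout=10)  # block for the broker acknowledgement
    print(user_id, "partition:", metadata.partition, "offset:", metadata.offset)

producer.close()
</code></pre>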
<!------------------------->
<h5 id="use-case">Kafka API use cases</h5>
<p>Some use cases for Kafka API include:</p>
<ul>
<li><strong>Real-time analytics: </strong>Kafka API is instrumental in powering real-time analytics in various fields. By consuming data streams through the Consumer API and producing result streams through the Producer API, real-time analytics engines provide insights as events occur, aiding in prompt decision-making.</li>
<li><strong>Event sourcing: </strong>Event sourcing is a programming paradigm that saves all changes to the application state as a sequence of events. Kafka APIs enhance system traceability and debugging in event streaming. They handle, store, and process large event streams at scale.</li>
<li><strong>Log aggregation: </strong>Log aggregation involves collecting and storing logs from various sources in a centralized location. The Producer API sends logs to Kafka, while the Consumer API consumes these logs. Kafka is a centralized, scalable, and reliable log management solution that aids in monitoring and debugging.</li>
<li><strong>Message queuing: </strong>In a microservices architecture, services often need to communicate with each other. Kafka API facilitates such interactions by acting as a message broker. Messages produced by one service are consumed by another, allowing for effective inter-service communication.</li>
<li><strong>Stream processing: </strong>The Kafka Streams API is used in applications that require continuous computation and manipulation of data streams, such as real-time data transformation, aggregation, or join operations. You can use them in applications like real-time fraud detection, finance, and live leaderboards in gaming.</li>
</ul>
<h4 id="kafka-connect">Kafka Connect</h4>
<p>One advantage of Kafka is its connector system, <a href="https://kafka.apache.org/documentation.html#connect" target="_blank">Kafka Connect</a>. Connectors are reusable components which, as the name suggests, allow different systems to be connected to Kafka.</p>
<p>Thus, many technologies provide connectors to transfer data to or from Kafka. For example, if you want to ingest data from a MongoDB database, you can simply install and configure the <a href="https://www.mongodb.com/docs/kafka-connector/current/" target="_blank">official MongoDB connector</a> for Kafka. You can see a list of <a href="https://docs.confluent.io/platform/current/connect/kafka_connectors.html" target="_blank">Kafka connectors</a> on the Confluent website (the main contributor of Kafka).</p>
<p>So you can use Kafka Connect to transfer data between two databases through Kafka, for example to load data from an enterprise application using a relational database into a data lake, without having to code connectors for the source and target systems (sink).</p>
<figure>
<img src="assets/img/data-engineering/kafka-connect.png" alt="" style="max-width: 70%; max-height: 50%;">
<figcaption style="text-align: center;"><strong>© Image credit:</strong> <a href="https://datascientest.com/en/">Datascientest.com</a></figcaption>
</figure>
<p>However, when transferring this type of data, it is often necessary to carry out an intermediate transformation step (filters, aggregations, etc.). This is called an ETL (Extract Transform Load). There are several tools for performing ETLs (for example Talend). But Kafka can also be used as such, especially by transforming data using the Kafka Streams API.</p>
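<p>Connectors are usually deployed by posting a small JSON configuration to the Kafka Connect REST API (port 8083 by default). As a sketch, the snippet below registers the file source connector that ships with Kafka; the connector name, file path, and topic are placeholders, and the connector class must be available on the Connect worker's plugin path.</p>
<pre><code class="language-python">
# Register a source connector through the Kafka Connect REST API (assumed to run on localhost:8083).
# The FileStreamSource connector and all names/paths here are illustrative placeholders.
import requests

connector = {
    "name": "demo-file-source",
    "config": {
        "connector.class": "org.apache.kafka.connect.file.FileStreamSourceConnector",
        "tasks.max": "1",
        "file": "/tmp/demo-input.txt",   # lines appended to this file become Kafka records
        "topic": "connect-demo",         # target topic for the ingested lines
    },
}

resp = requests.post("http://localhost:8083/connectors", json=connector, timeout=10)
resp.raise_for_status()
print(resp.json())  # Connect echoes back the created connector configuration
</code></pre>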
<!------------------------>
<h4 id="kafka-stream">Kafka Streams</h4>
<p>The <a href="https://kafka.apache.org/documentation/streams/" target="_blank">Kafka Streams API</a> allows us to create derived topics from other topics, by performing transformations on them. This is known as stream processing.</p>
<p>In concrete terms, Kafka Streams allows you to perform operations such as filters, aggregations (averages, sums, etc.), but also joins between different streams. The resulting stream is persisted in a new topic.</p>
<p>Kafka Streams can be used in conjunction with Kafka Connect to perform ETL:</p>
<figure>
<img src="assets/img/data-engineering/kafka-stream.png" alt="" style="max-width: 70%; max-height: 50%;">
<figcaption style="text-align: center;"><strong>© Image credit:</strong> <a href="https://datascientest.com/en/">Datascientest.com</a></figcaption>
</figure>
<p>Note, however, that Kafka Streams applications must be developed in Java or Scala, which can be complex to learn. Alternative stream-processing solutions exist, such as <a href="https://spark.apache.org/docs/latest/streaming-programming-guide.html" target="_blank">Spark Streaming</a>, which offers a Python API.</p>
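<p>Since Spark is mentioned above as the Python-friendly alternative, here is a sketch of reading a Kafka topic with Spark Structured Streaming. It assumes PySpark launched with the <code>spark-sql-kafka</code> package and the same local broker; the topic name is illustrative.</p>
<pre><code class="language-python">
# Spark Structured Streaming sketch: consume a Kafka topic and count events by key.
# Assumes PySpark started with the spark-sql-kafka-0-10 package and a broker on localhost:9092.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("kafka-stream-demo").getOrCreate()

events = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "user-events")        # hypothetical topic
    .option("startingOffsets", "earliest")
    .load()
)

# Kafka rows expose binary key/value columns; cast them to strings before processing.
counts = (
    events.select(col("key").cast("string"), col("value").cast("string"))
    .groupBy("key")
    .count()
)

query = counts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination()
</code></pre>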
<!------------------------>
<h3 id="instalation">Kafka installation: set in local machine</h3>
<p>Installing Apache Kafka involves several steps, including downloading the Kafka distribution, configuring the environment, starting the Kafka server, and verifying the installation. Below is a guide to help you with the installation process:</p>
<ol>
<li><strong>Prerequisites:</strong> Kafka requires Java to be installed on your system. Make sure you have <a href="https://www.java.com/en/download/help/download_options.html" target="_blank">Java 8 or later</a> installed.</li>
<li><strong>Download Kafka:</strong>
<ul>
<li>Visit the Apache Kafka website (<a href="https://kafka.apache.org/downloads" target="_blank">Kafka download page</a> ) and download the latest stable version of Kafka.</li>
<li>Choose the appropriate binary distribution based on your operating system (e.g., Kafka for Scala 2.13 if you're using Scala 2.13).</li>
</ul>
<p>Once the download is complete, extract the Kafka archive to a directory of your choice using a file extraction tool (e.g., tar for Linux/Mac or 7-Zip for Windows).</p>
</li>
<li><strong>Start ZooKeeper:</strong> From the extracted Kafka directory, start ZooKeeper (required for Kafka versions that still depend on it): <code>bin/zookeeper-server-start.sh config/zookeeper.properties</code> (on Windows, use the equivalent scripts under <code>bin\windows\</code>).</li>
<li><strong>Start the Kafka broker:</strong> In a new terminal, start the Kafka server: <code>bin/kafka-server-start.sh config/server.properties</code>.</li>
<li><strong>Create a topic:</strong> <code>bin/kafka-topics.sh --create --topic test-topic --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1</code>.</li>
<li><strong>Produce messages:</strong> <code>bin/kafka-console-producer.sh --topic test-topic --bootstrap-server localhost:9092</code>, then type a few messages.</li>
<li><strong>Consume messages:</strong> <code>bin/kafka-console-consumer.sh --topic test-topic --from-beginning --bootstrap-server localhost:9092</code>.</li>
<li><strong>Verify the installation:</strong> If the messages typed into the console producer appear in the console consumer, the installation is working.</li>
</ol>
</section>
<!-------Reference ------->
<section id="reference">
<h3>References</h3>
<ol>
<li><a href="https://www.youtube.com/watch?v=dq-ZACSt_gA&t=28s" target="_blank">Youtube Kafka tutorial videos (I specially followed this one. Great playlist).</a></li>
<li><a href="https://kafka.apache.org/documentation/" target="_blank">Apache</a></li>
<li><a href="https://spark.apache.org/documentation.html" target="_blank"> Official Documentation</a></li>
<li><a href="https://www.databricks.com/learn/training/login" target="_blank">Databricks Learning Academy</a></li>
<li><a href="https://sparkbyexamples.com/" target="_blank">Spark by Examples</a></li>
<li><a href="https://www.datacamp.com/tutorial/pyspark-tutorial-getting-started-with-pyspark" target="_blank">Datacamp tutorial</a>.</li>
<li>For databricks, you can look at tutorial videos on youtube at <a href="https://www.youtube.com/watch?v=ChISx0-cMpU" target="_blank">youtube video by Bryan Cafferky</a>,
writer of the book "Master Azure Databricks". A great playlist for someone who just want to learn about the big data analytics at Databricks Azure cloud platform.</li>
<li>See the video for <a href="https://www.youtube.com/watch?v=_C8kWso4ne4" target="_blank">pyspark basics by Krish Naik</a>. Great video for starter.</li>
<li><a href="https://www.youtube.com/watch?v=QLGrLFOzMRw" target="_blank">Great youtube on Apache spark</a> one premise working.</li>
</ol>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">For Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="Data-engineering.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
// Code highlighting is handled by Prism (assets/js/prism.js); guard the highlight.js call so it does not throw if hljs is not loaded.
document.addEventListener("DOMContentLoaded", function () {
if (window.hljs) {
hljs.initHighlightingOnLoad();
}
});
</script>
</body>
</html>