Skip to content

Commit

Permalink
Insert instrumentation for perf analysis based on tsc measurements
Browse files Browse the repository at this point in the history
  • Loading branch information
Vladislav Abrosimov committed Dec 12, 2023
1 parent fd41278 commit dbdf8b9
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 5 deletions.
42 changes: 39 additions & 3 deletions dataplane/dataplane.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <arpa/inet.h>
#include <cstdint>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
Expand Down Expand Up @@ -30,6 +31,7 @@
#include "dataplane.h"
#include "report.h"
#include "sock_dev.h"
#include "tsc_deltas.h"
#include "worker.h"

common::log::LogPriority common::log::logPriority = common::log::TLOG_INFO;
Expand Down Expand Up @@ -1248,6 +1250,17 @@ eResult cDataPlane::allocateSharedMemory()
}
}

for (const auto& [socket_id, num] : number_of_workers_per_socket)
{
auto it = shm_size_per_socket.find(socket_id);
if (it == shm_size_per_socket.end())
{
it = shm_size_per_socket.emplace_hint(it, socket_id, 0);
}
it->second += sizeof(dataplane::perf::num_of_workers);
it->second += sizeof(dataplane::perf::tsc_deltas) * (num + ((int)socket_id == numa_node_of_cpu(config.controlPlaneCoreId)));
}

/// allocating IPC shared memory
key_t key = YANET_DEFAULT_IPC_SHMKEY;
for (const auto& [socket_id, size] : shm_size_per_socket)
Expand Down Expand Up @@ -1279,8 +1292,8 @@ eResult cDataPlane::allocateSharedMemory()
YADECAP_LOG_ERROR("shmat(%d, NULL, %d) = %d\n", shmid, 0, errno);
return eResult::errorInitSharedMemory;
}

shm_by_socket_id[socket_id] = std::make_tuple(key, shmaddr);
YADECAP_LOG_DEBUG("%p", shmaddr);
shm_by_socket_id[socket_id] = std::make_tuple(key, shmaddr, number_of_workers_per_socket[socket_id]);

key++;
}
Expand Down Expand Up @@ -1312,7 +1325,7 @@ eResult cDataPlane::splitSharedMemoryPerWorkers()
continue;
}

const auto& [key, shm] = it->second;
const auto& [key, shm, _] = it->second;

int ring_id = 0;
for (const auto& [tag, ring_cfg] : config.shared_memory)
Expand Down Expand Up @@ -1354,6 +1367,29 @@ eResult cDataPlane::splitSharedMemoryPerWorkers()
}
}

for (const auto& [socket_id, shm_info] : shm_by_socket_id)
{
const auto& [_, shm, number_of_workers] = shm_info;
auto num_of_workers = (dataplane::perf::num_of_workers*)((intptr_t)shm + offsets[shm]);
num_of_workers->number = number_of_workers;
offsets[shm] += sizeof(dataplane::perf::num_of_workers);
}

for (auto& [_, worker] : workers)
{
const auto& socket_id = worker->socketId;
const auto& it = shm_by_socket_id.find(socket_id);
if (it == shm_by_socket_id.end())
{
continue;
}
const auto& shm = std::get<1>(it->second);

worker->tsc_deltas = (dataplane::perf::tsc_deltas*)((intptr_t)shm + offsets[shm]);
memset(worker->tsc_deltas, 0, sizeof(dataplane::perf::tsc_deltas));
offsets[shm] += sizeof(dataplane::perf::tsc_deltas);
}

return eResult::success;
}

Expand Down
2 changes: 1 addition & 1 deletion dataplane/dataplane.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ class cDataPlane
// array instead of the table - how many coreIds can be there?
std::unordered_map<uint32_t, std::unordered_map<std::string, uint64_t*>> coreId_to_stats_tables;

std::map<tSocketId, std::tuple<key_t, void*>> shm_by_socket_id;
std::map<tSocketId, std::tuple<key_t, void*, uint64_t>> shm_by_socket_id;

std::mutex hugepage_pointers_mutex;
std::map<void*, hugepage_pointer> hugepage_pointers;
Expand Down
45 changes: 45 additions & 0 deletions dataplane/tsc_deltas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#pragma once

#include <cstdint>
#include <rte_build_config.h>
#include <sys/types.h>

#include "type.h"

namespace dataplane
{

namespace perf
{

/// Header record carrying a single 64-bit worker count.
/// The type is aligned to 64 bytes, so the record occupies 64 bytes
/// even though it stores only one field.
struct alignas(64) num_of_workers
{
	/// count value stored in the record
	uint64_t number;
};

/// Per-worker record of accumulated TSC deltas, one 64-bit counter per
/// packet-handling stage plus an iteration counter.
/// Sixteen 64-bit fields give 128 bytes, matching the 128-byte alignment
/// of the type. Field order is part of the layout and must not be changed.
struct alignas(128) tsc_deltas
{
	/// first eight counters (bytes 0..63)
	uint64_t iter_num;
	uint64_t logicalPort_ingress_handle;
	uint64_t acl_ingress_handle4;
	uint64_t acl_ingress_handle6;
	uint64_t tun64_ipv4_handle;
	uint64_t tun64_ipv6_handle;
	uint64_t route_handle4;
	uint64_t route_handle6;

	/// second eight counters (bytes 64..127)
	uint64_t decap_handle;
	uint64_t nat64stateful_lan_handle;
	uint64_t route_tunnel_handle4;
	uint64_t route_tunnel_handle6;
	uint64_t acl_egress_handle4;
	uint64_t acl_egress_handle6;
	uint64_t logicalPort_egress_handle;
	uint64_t controlPlane_handle;
};

/// Compile-time guard: the whole tsc_deltas record must fit in at most two
/// cache lines. RTE_CACHE_LINE_SIZE is presumably supplied by
/// <rte_build_config.h> -- TODO confirm. Adding counters to tsc_deltas
/// beyond this budget makes the build fail here.
static_assert(sizeof(tsc_deltas) <= 2 * RTE_CACHE_LINE_SIZE,
"too much deltas");
}

}
90 changes: 90 additions & 0 deletions dataplane/worker.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <cstdint>
#include <string>
#include <thread>

#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
Expand Down Expand Up @@ -509,10 +511,24 @@ inline void cWorker::handlePackets()
const auto& base = bases[localBaseId & 1];
const auto& globalbase = *base.globalBase;

auto tsc_start = rte_get_tsc_cycles();
uint64_t tsc_end;
tsc_deltas->iter_num++;

logicalPort_ingress_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->logicalPort_ingress_handle += tsc_end - tsc_start;
tsc_start = tsc_end;

acl_ingress_handle4();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle4 += tsc_end - tsc_start;
tsc_start = tsc_end;

acl_ingress_handle6();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;

if (globalbase.early_decap_enabled)
{
Expand All @@ -521,60 +537,129 @@ inline void cWorker::handlePackets()
acl_ingress_stack4 = after_early_decap_stack4;
after_early_decap_stack4.clear();
acl_ingress_handle4();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle4 += tsc_end - tsc_start;
tsc_start = tsc_end;
}

if (after_early_decap_stack6.mbufsCount > 0)
{
acl_ingress_stack6 = after_early_decap_stack6;
after_early_decap_stack6.clear();
acl_ingress_handle6();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;
}
}

if (globalbase.tun64_enabled)
{
tun64_ipv4_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->tun64_ipv4_handle += tsc_end - tsc_start;
tsc_start = tsc_end;

tun64_ipv6_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->tun64_ipv6_handle += tsc_end - tsc_start;
tsc_start = tsc_end;
}

if (globalbase.decap_enabled)
{
decap_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->decap_handle += tsc_end - tsc_start;
tsc_start = tsc_end;
}

/// nat64stateful stages: run LAN then WAN handler, accounting the TSC cycles
/// spent in each since the previous measurement point.
if (globalbase.nat64stateful_enabled)
{
	nat64stateful_lan_handle();
	tsc_end = rte_get_tsc_cycles();
	/// the LAN stage has its own counter in tsc_deltas; charge it there
	/// rather than to acl_ingress_handle6, which would inflate the ACL figures
	tsc_deltas->nat64stateful_lan_handle += tsc_end - tsc_start;
	tsc_start = tsc_end;

	nat64stateful_wan_handle();
	tsc_end = rte_get_tsc_cycles();
	/// NOTE(review): tsc_deltas declares no counter for the WAN stage, so its
	/// cycles are still folded into acl_ingress_handle6. A dedicated field is
	/// needed to separate them (the struct is currently capped at two cache lines).
	tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
	tsc_start = tsc_end;
}

/// nat64stateless stages: run the ingress and egress handlers and account the
/// TSC cycles elapsed since the previous measurement point.
/// NOTE(review): both stages are accumulated into acl_ingress_handle6, which is
/// the counter also used for acl_ingress_handle6(). tsc_deltas declares no
/// nat64stateless counter, so these cycles are indistinguishable from ACL
/// ingress time -- looks like a copy-paste placeholder; confirm intent.
if (globalbase.nat64stateless_enabled)
{
nat64stateless_ingress_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;

nat64stateless_egress_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;
}

/// balancer stages: run the three balancer handlers and account the TSC cycles
/// elapsed since the previous measurement point after each one.
/// NOTE(review): all three stages are accumulated into acl_ingress_handle6,
/// the counter also used for acl_ingress_handle6(). tsc_deltas declares no
/// balancer counters, so balancer time is reported as ACL ingress time --
/// looks like a copy-paste placeholder; confirm intent.
if (globalbase.balancer_enabled)
{
balancer_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;

balancer_icmp_reply_handle(); // balancer replies instead of real (when client pings VS)
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;

balancer_icmp_forward_handle(); // forward icmp message to other balancers (if not sent to one of this balancer's reals)
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_ingress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;
}

route_handle4();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->route_handle4 += tsc_end - tsc_start;
tsc_start = tsc_end;

route_handle6();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->route_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;

route_tunnel_handle4();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->route_tunnel_handle4 += tsc_end - tsc_start;
tsc_start = tsc_end;

route_tunnel_handle6();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->route_tunnel_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;

if (globalbase.acl_egress_enabled)
{
acl_egress_handle4();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_egress_handle4 += tsc_end - tsc_start;
tsc_start = tsc_end;

acl_egress_handle6();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->acl_egress_handle6 += tsc_end - tsc_start;
tsc_start = tsc_end;
}

logicalPort_egress_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->logicalPort_egress_handle += tsc_end - tsc_start;
tsc_start = tsc_end;

controlPlane_handle();
tsc_end = rte_get_tsc_cycles();
tsc_deltas->controlPlane_handle += tsc_end - tsc_start;

physicalPort_egress_handle();
}

Expand Down Expand Up @@ -2199,6 +2284,8 @@ inline void cWorker::route_tunnel_handle6()
return;
}

auto tsc_start = rte_get_tsc_cycles();

for (unsigned int mbuf_i = 0;
mbuf_i < route_tunnel_stack6.mbufsCount;
mbuf_i++)
Expand Down Expand Up @@ -2292,6 +2379,9 @@ inline void cWorker::route_tunnel_handle6()
}

route_tunnel_stack6.clear();

auto tsc_end = rte_get_tsc_cycles();
tsc_deltas->route_tunnel_handle6 += tsc_end - tsc_start;
}

inline void cWorker::route_tunnel_nexthop(rte_mbuf* mbuf,
Expand Down
3 changes: 2 additions & 1 deletion dataplane/worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "globalbase.h"
#include "samples.h"
#include "sharedmemory.h"
#include "tsc_deltas.h"

class cDataPlane;
class mControlPlane;
Expand Down Expand Up @@ -296,7 +297,7 @@ class cWorker
rte_ring* ring_highPriority;
rte_ring* ring_normalPriority;
rte_ring* ring_lowPriority;

dataplane::perf::tsc_deltas* tsc_deltas;
rte_ring* ring_toFreePackets;

rte_ring* ring_log;
Expand Down

0 comments on commit dbdf8b9

Please sign in to comment.