From b84647aa589b1411091c31ae3acb4919e7fa336e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Tue, 18 Jun 2024 18:25:23 +0100 Subject: [PATCH 01/19] refac: visualization module --- .../data_visualizer/analysis_csv_headers.json | 3 +- .../data_visualizer/binary_csv_headers.json | 29 ++ .../scripts/data_visualizations.py | 7 +- .../tools/performance/visualization.py | 294 +++++++++++------- .../ansible-filebeat-oss/defaults/main.yml | 2 +- 5 files changed, 213 insertions(+), 122 deletions(-) create mode 100644 deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json diff --git a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json index 1bb8bea5ed..1c3485ceda 100644 --- a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json +++ b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json @@ -34,7 +34,8 @@ "alerts_info": { "title": "Alerts and events info.", "columns": [ - "Events processed", "Events received", "Written alerts", "Written firewall", "Written fts" + "Events processed", "Events received", "Written alerts", "Written firewall", "Written fts", + "Written archives", "Written stats" ] } } diff --git a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json new file mode 100644 index 0000000000..7f8cc19db4 --- /dev/null +++ b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json @@ -0,0 +1,29 @@ +{ + "cput": { + "title": "CPU(%)", + "columns": [ + "Events" + ] + }, + "bytes_sent": { + "title": "Bytes sent", + "columns": [ + "Bytes" + ] + }, + "drops": { + "title": "Events dropped", + "columns": [ + "Target Drops" + ] + } +} + + + + + + + + +CPU(%) VMS(KB) RSS(KB) USS(KB) PSS(KB) SWAP(KB) FD Read_Ops Write_Ops Disk_Read(KB) Disk_Written(KB) Disk_Read_Speed(KB/s) Disk_Write_Speed(KB/s) diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py index f7c665dc4f..ab8fd04c7b 100644 --- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py +++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py @@ -3,7 +3,7 @@ from os.path import exists from tempfile import gettempdir -from wazuh_testing.tools.performance.visualization import DataVisualizer +from wazuh_testing.tools.performance.visualization import BinaryDatavisualizer, DaemonStatisticsVisualizer, LogcollectorStatisticsVisualizer def get_script_arguments(): @@ -30,9 +30,8 @@ def main(): if not exists(destination): makedirs(destination) - dv = DataVisualizer(dataframes=options.csv_list, target=options.visualization_target, - compare=False, store_path=options.destination, base_name=options.name, - columns_path=options.columns) + dv = LogcollectorStatisticsVisualizer(dataframes=options.csv_list, + compare=False, store_path=options.destination, base_name=options.name) dv.plot() diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py index 807d3ed5b5..2aeb1594f8 100644 --- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py +++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py @@ -8,9 +8,10 @@ import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from 
abc import ABC, abstractmethod -class DataVisualizer: +class DataVisualizer(ABC): """Class that allows to visualize the data collected using the wazuh_metrics tool. Args: @@ -31,8 +32,9 @@ class DataVisualizer: x_ticks_interval (int): interval of the x-label. base_name (str, optional): base name used to store the images. """ - def __init__(self, dataframes, target, compare=False, store_path=gettempdir(), x_ticks_granularity='minutes', - x_ticks_interval=1, base_name=None, columns_path=None): + def __init__(self, dataframes, target, compare=False, store_path=gettempdir(), + x_ticks_granularity='minutes', + x_ticks_interval=1, base_name=None): self.dataframes_paths = dataframes self.dataframe = None self.compare = compare @@ -43,10 +45,43 @@ def __init__(self, dataframes, target, compare=False, store_path=gettempdir(), x self.x_ticks_interval = x_ticks_interval self.base_name = base_name sns.set(rc={'figure.figsize': (26, 9)}) - self.columns_to_plot = None + # self.columns_to_plot = None + + # if target in ['binary', 'analysis', 'remote', 'agent', 'logcollector', 'wazuhdb']: + # self.columns_to_plot = self._load_columns_to_plot(columns_path) + + def _validate_dataframe(self) -> None: + self._check_missing_mandatory_fields() + self._check_no_duplicated() + self._check_unexpected_values() + + @abstractmethod + def _get_expected_fields(self): + pass + + @abstractmethod + def _get_mandatory_fields(self): + pass + + def _get_data_columns(self): + return list(self.dataframe.columns) + + def _check_no_duplicated(self): + if self.dataframe.columns.duplicated().any(): + raise ValueError('Duplicate column names found in the CSV file.') + + def _check_missing_mandatory_fields(self): + if not set(self._get_mandatory_fields()).issubset(self._get_data_columns()): + raise ValueError(f"Missing some of the mandatory values: {self._get_mandatory_fields()}") + + def _check_unexpected_values(self): + print(sorted(set(self._get_expected_fields()))) + + print(sorted(set(self._get_data_columns()))) + + if not set(self._get_data_columns()).issubset(set(self._get_expected_fields())): + raise ValueError('Column names do not match the expected metrics.') - if target in ['binary', 'analysis', 'remote', 'agent', 'logcollector', 'wazuhdb']: - self.columns_to_plot = self._load_columns_to_plot(columns_path) @staticmethod def _color_palette(size): @@ -157,123 +192,150 @@ def _save_custom_plot(self, ax, y_label, title, rotation=90, cluster_log=False, svg_name = f"{self.base_name}_{svg_name}" plt.savefig(join(self.store_path, f"{svg_name}.svg"), dpi=1200, format='svg') - def _plot_data(self, elements, title=None, generic_label=None): - """Function to plot the different types of dataframes. - Args: - elements (list, pandas.columns): columns to plot. - title (str, optional): title of the plot. - generic_label (str, optional): set a generic label to plot all the columns. 
- """ - if self.target == 'binary': - for element in elements: - fig, ax = plt.subplots() - daemons = self._get_daemons() - colors = self._color_palette(len(daemons)) - for daemon, color in zip(daemons, colors): - self._basic_plot(ax, self.dataframe[self.dataframe.Daemon == daemon][element], - label=daemon, color=color) - self._save_custom_plot(ax, element, f"{element} {title}") - - elif self.target == 'logcollector': - for element in elements: - fig, ax = plt.subplots() - targets = self._get_logcollector_targets() - colors = self._color_palette(len(targets)) - for target, color in zip(targets, colors): - self._basic_plot(ax, self.dataframe[self.dataframe.Target == target][element], - label=target, color=color) - self._save_custom_plot(ax, element, title) - - elif self.target == 'cluster': - for element in elements: - fig, ax = plt.subplots() - nodes = self.dataframe[self.dataframe.activity == element]['node_name'].unique() - current_df = self.dataframe[self.dataframe.activity == element] - current_df.reset_index(drop=True, inplace=True) - for node, color in zip(nodes, self._color_palette(len(nodes))): - self._basic_plot(ax=ax, dataframe=current_df[current_df.node_name == node]['time_spent(s)'], - label=node, color=color) - self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), cluster_log=True, - statistics=DataVisualizer._get_statistics( - current_df['time_spent(s)'], calculate_mean=True, calculate_median=True)) - - elif self.target == 'api': - for element in elements: - fig, ax = plt.subplots() - queries = self.dataframe.endpoint.unique() - colors = self._color_palette(len(queries)) - for endpoint, color in zip(queries, colors): - self._basic_plot(ax, self.dataframe[self.dataframe.endpoint == endpoint]['time_spent(s)'], - label=endpoint, color=color) - self._save_custom_plot(ax, element, 'API Response time') + @abstractmethod + def plot(self): + pass - else: - fig, ax = plt.subplots() - colors = self._color_palette(len(elements)) - for element, color in zip(elements, colors): - self._basic_plot(ax, self.dataframe[element], label=element, color=color) - self._save_custom_plot(ax, generic_label, title) - - def _plot_binaries_dataset(self): - """Function to plot the hardware data of the binary.""" - for element in self.columns_to_plot: - columns = self.dataframe.columns.drop(self.columns_to_plot[element]['columns']) - title = self.columns_to_plot[element]['title'] - self._plot_data(elements=columns, title=title) - - def _plot_generic_dataset(self): - """Function to plot the statistics from analysisd, remoted, logcollector and wazuhdb.""" - for element in self.columns_to_plot: - columns = self.columns_to_plot[element]['columns'] - title = self.columns_to_plot[element]['title'] - self._plot_data(elements=columns, title=title, generic_label=element) - - def _plot_agentd_dataset(self): - """Function to plot the statistics from wazuh-agentd.""" - if 'diff_seconds' not in self.dataframe.columns: - self.dataframe['diff_seconds'] = abs(pd.to_datetime(self.dataframe['last_keepalive']) - - pd.to_datetime(self.dataframe['last_ack'])) - self.dataframe['diff_seconds'] = self.dataframe.diff_seconds.dt.total_seconds() - - for element in self.columns_to_plot: - columns = self.columns_to_plot[element]['columns'] - title = self.columns_to_plot[element]['title'] - self._plot_data(elements=columns, title=title, generic_label=element) - - def _plot_cluster_dataset(self): - """Function to plot the information from the cluster.log file.""" - 
self._plot_data(elements=list(self.dataframe['activity'].unique()), generic_label='Managers') - - def _plot_api_dataset(self): - """Function to plot the information from the api.log file.""" - self._plot_data(elements=['endpoint'], generic_label='Queries') +class BinaryDatavisualizer(DataVisualizer): + binary_metrics = ["CPU", "VMS", "RSS", "USS", + "PSS", "SWAP", "FD", "Read_Ops", "Write_Ops", "Disk_Read", + "Disk_Written", "Disk_Read_Speed", "Disk_Write_Speed"] + expected_binary_fields = ["Daemon", "Version", "PID"] + binary_metrics + mandatory_fields = ["Daemon"] + + def __init__(self, dataframes, target, compare=False, store_path=gettempdir(), + x_ticks_granularity='minutes', + x_ticks_interval=1, base_name=None): + + super().__init__(dataframes, target, compare, store_path, x_ticks_granularity, x_ticks_interval) + self._validate_dataframe() + + def _get_mandatory_fields(self): + return self.mandatory_fields + + def _get_expected_fields(self): + return self.expected_binary_fields + + def _normalize_column_name(self, column_name: str): + if '(' in column_name: + return column_name.split('(')[0].strip() + return column_name + + def _get_data_columns(self): + column_names = self.dataframe.columns + normalized_columns = [self._normalize_column_name(col) for col in column_names.tolist()] + + return normalized_columns + + def _get_fields_to_plot(self): + column_names = self.dataframe.columns + fields_to_plot = [] + + for field_to_plot in column_names: + if self._normalize_column_name(field_to_plot) in self.binary_metrics: + fields_to_plot.append(field_to_plot) + + return fields_to_plot def plot(self): - """Public function to plot the dataset.""" - if self.target == 'binary': - self._plot_binaries_dataset() - elif self.target == 'analysis': - self._plot_generic_dataset() - elif self.target == 'remote': - self._plot_generic_dataset() - elif self.target == 'agent': - self._plot_agentd_dataset() - elif self.target == 'logcollector': - self._plot_generic_dataset() - elif self.target == 'cluster': - self._plot_cluster_dataset() - elif self.target == 'api': - self._plot_api_dataset() - elif self.target == 'wazuhdb': - self._plot_generic_dataset() - else: - raise AttributeError(f"Invalid target {self.target}") + columns_to_plot = self._get_fields_to_plot() + for element in columns_to_plot: + _, ax = plt.subplots() + daemons = self._get_daemons() + colors = self._color_palette(len(daemons)) + for daemon, color in zip(daemons, colors): + self._basic_plot(ax, self.dataframe[self.dataframe.Daemon == daemon][element], + label=daemon, color=color) + + self._save_custom_plot(ax, element, f"{element} {element}") def _get_daemons(self): """Get the list of Wazuh Daemons in the dataset.""" return self.dataframe.Daemon.unique() - def _get_logcollector_targets(self): + +class DaemonStatisticsVisualizer(DataVisualizer): + mandatory_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)', 'Events processed', 'Events received'] + + def __init__(self, dataframes, daemon, compare=False, store_path=gettempdir(), + x_ticks_granularity='minutes', + x_ticks_interval=1, base_name=None): + self.daemon = daemon + super().__init__(dataframes, daemon, compare, store_path, x_ticks_granularity, x_ticks_interval) + self.plots_data = self._load_columns_to_plot() + self.expected_fields = [] + for graph in self.plots_data.values(): + for column in graph['columns']: + self.expected_fields.append(column) + self.expected_fields.extend(self.mandatory_fields) + self._validate_dataframe() + + + def _load_columns_to_plot(self): + 
filename = self.daemon + '_csv_headers.json' + full_path = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer', filename) + + with open(full_path, 'r') as columns_file: + full_data = json.load(columns_file) + + return full_data + + def _get_mandatory_fields(self): + return self.mandatory_fields + + def _get_expected_fields(self): + return self.expected_fields + + def plot(self): + for element in self.plots_data.values(): + columns = element['columns'] + title = element['title'] + colors = self._color_palette(len(columns)) + + fig, ax = plt.subplots() + for element, color in zip(columns, colors): + self._basic_plot(ax, self.dataframe[element], label=element, color=color) + + self._save_custom_plot(ax, element, title) + + +class LogcollectorStatisticsVisualizer(DaemonStatisticsVisualizer): + mandatory_fields = ['Location', 'Target'] + + def __init__(self, dataframes, compare=False, store_path=gettempdir(), + x_ticks_granularity='minutes', + x_ticks_interval=1, base_name=None): + super().__init__(dataframes, 'logcollector', compare, store_path, x_ticks_granularity, x_ticks_interval) + + def _get_logcollector_location(self): """Get the list of unique logcollector targets (sockets) in the dataset.""" - return self.dataframe.Target.unique() + return self.dataframe.Location.unique() + + def plot(self, filter_by_target): + for element in self.plots_data.values(): + fig, ax = plt.subplots() + targets = self._get_logcollector_location() + colors = self._color_palette(len(targets)) + for target, color in zip(targets, colors): + self._basic_plot(ax, self.dataframe[self.dataframe.Location == target][element['columns']], + label=target, color=color) + + self._save_custom_plot(ax, element['title'], element['title']) + +# class ClusterStatisticsVisualizer(DaemonStatisticsVisualizer): +# def plot(self): +# elements = list(self.dataframe['activity'].unique()) +# self._plot_data(elements=elements, generic_label='Managers') + +# for element in elements: +# fig, ax = plt.subplots() +# nodes = self.dataframe[self.dataframe.activity == element]['node_name'].unique() +# current_df = self.dataframe[self.dataframe.activity == element] +# current_df.reset_index(drop=True, inplace=True) +# for node, color in zip(nodes, self._color_palette(len(nodes))): +# self._basic_plot(ax=ax, dataframe=current_df[current_df.node_name == node]['time_spent(s)'], +# label=node, color=color) +# self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), cluster_log=True, +# statistics=DataVisualizer._get_statistics( +# current_df['time_spent(s)'], calculate_mean=True, calculate_median=True)) diff --git a/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml b/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml index 4af68ae478..1c04a3a139 100644 --- a/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml +++ b/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml @@ -1,7 +1,7 @@ --- filebeat_version: 7.10.2 -wazuh_template_branch: 4.8.0 +wazuh_template_branch: 4.9.0 filebeat_node_name: node-1 From 94364c27ae929249f9dc54b1b9f0a30868fa6013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Wed, 19 Jun 2024 18:27:43 +0100 Subject: [PATCH 02/19] refac: DataVisualization class --- .../scripts/data_visualizations.py | 8 +- .../tools/performance/visualization.py | 244 ++++++++---------- 2 files changed, 118 insertions(+), 134 deletions(-) diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py 
b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py index ab8fd04c7b..164ab3e19f 100644 --- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py +++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py @@ -3,7 +3,9 @@ from os.path import exists from tempfile import gettempdir -from wazuh_testing.tools.performance.visualization import BinaryDatavisualizer, DaemonStatisticsVisualizer, LogcollectorStatisticsVisualizer +from wazuh_testing.tools.performance.visualization import BinaryDatavisualizer, DaemonStatisticsVisualizer, \ + LogcollectorStatisticsVisualizer, \ + ClusterStatisticsVisualizer def get_script_arguments(): @@ -30,8 +32,8 @@ def main(): if not exists(destination): makedirs(destination) - dv = LogcollectorStatisticsVisualizer(dataframes=options.csv_list, - compare=False, store_path=options.destination, base_name=options.name) + dv = ClusterStatisticsVisualizer(dataframes_paths=options.csv_list, store_path=options.destination, + base_name=options.name) dv.plot() diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py index 2aeb1594f8..8e024e27e5 100644 --- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py +++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py @@ -1,87 +1,68 @@ +import json +from abc import ABC, abstractmethod from os.path import dirname, join, realpath from re import sub from tempfile import gettempdir -from matplotlib.ticker import LinearLocator -import json import matplotlib.dates as mdates import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -from abc import ABC, abstractmethod +from matplotlib.ticker import LinearLocator class DataVisualizer(ABC): """Class that allows to visualize the data collected using the wazuh_metrics tool. Args: - dataframes (list): list containing the paths. - target (str): string to set the visualization type. - compare (bool): boolean to compare the different datasets. + dataframes_paths (list): list containing the paths. store_path (str): path to store the CSV images. Defaults to the temp directory. - x_ticks_granularity (string): granularity of the Timestamp. It is set by default to minutes. - x_ticks_interval (int): interval of the x-label. base_name (str, optional): base name used to store the images. Attributes: dataframes_paths (list): paths of the CSVs. dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs. - compare (bool): boolean to compare the different datasets. - target (str): string to set the visualization type. store_path (str): path to store the CSV images. Defaults to the temp directory. - x_ticks_granularity (string): granularity of the Timestamp. It is set by default to minutes. - x_ticks_interval (int): interval of the x-label. base_name (str, optional): base name used to store the images. 
""" - def __init__(self, dataframes, target, compare=False, store_path=gettempdir(), - x_ticks_granularity='minutes', - x_ticks_interval=1, base_name=None): - self.dataframes_paths = dataframes - self.dataframe = None - self.compare = compare - self.target = target + def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): + self.dataframes_paths = dataframes_paths self.store_path = store_path - self._load_dataframes() - self.x_ticks_granularity = x_ticks_granularity - self.x_ticks_interval = x_ticks_interval self.base_name = base_name - sns.set(rc={'figure.figsize': (26, 9)}) - # self.columns_to_plot = None + self.dataframe = pd.DataFrame() - # if target in ['binary', 'analysis', 'remote', 'agent', 'logcollector', 'wazuhdb']: - # self.columns_to_plot = self._load_columns_to_plot(columns_path) - - def _validate_dataframe(self) -> None: - self._check_missing_mandatory_fields() - self._check_no_duplicated() - self._check_unexpected_values() + self._load_dataframes() + sns.set_theme(rc={'figure.figsize': (26, 9)}) @abstractmethod - def _get_expected_fields(self): + def _get_expected_fields(self) -> list: pass @abstractmethod - def _get_mandatory_fields(self): + def plot(self) -> None: pass - def _get_data_columns(self): - return list(self.dataframe.columns) + def _validate_dataframe(self) -> None: + self._check_missing_mandatory_fields() + self._check_no_duplicated() + self._check_unexpected_values() def _check_no_duplicated(self): if self.dataframe.columns.duplicated().any(): raise ValueError('Duplicate column names found in the CSV file.') def _check_missing_mandatory_fields(self): - if not set(self._get_mandatory_fields()).issubset(self._get_data_columns()): - raise ValueError(f"Missing some of the mandatory values: {self._get_mandatory_fields()}") + if not (self._get_expected_fields() == self._get_data_columns()): + raise ValueError(f"Missing some of the mandatory values. Expected values: {self._get_expected_fields()}") def _check_unexpected_values(self): - print(sorted(set(self._get_expected_fields()))) - - print(sorted(set(self._get_data_columns()))) - if not set(self._get_data_columns()).issubset(set(self._get_expected_fields())): raise ValueError('Column names do not match the expected metrics.') + def _get_data_columns(self) -> list: + try: + return list(self.dataframe.columns) + except StopIteration: + return [] @staticmethod def _color_palette(size): @@ -95,30 +76,9 @@ def _color_palette(size): """ return sns.hls_palette(size if size > 1 else 1, h=.5) - def _load_columns_to_plot(self, columns_path): - full_path = columns_path - - if full_path is None: - filename = None - - if self.target != 'binary': - filename = self.target + '_csv_headers.json' - else: - filename = self.target + '_non_printable_headers.json' - - full_path = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer', filename) - - with open(full_path, 'r') as columns_file: - full_data = json.load(columns_file) - - return full_data - def _load_dataframes(self): """Load the dataframes from dataframes_paths.""" for df_path in self.dataframes_paths: - if self.dataframe is None and self.target != 'cluster': - self.dataframe = pd.read_csv(df_path, index_col="Timestamp", parse_dates=True) - else: new_csv = pd.read_csv(df_path, index_col="Timestamp", parse_dates=True) self.dataframe = pd.concat([self.dataframe, new_csv]) @@ -128,10 +88,7 @@ def _set_x_ticks_interval(self, ax): Args: ax (axes.SubplotBase): subplot base where the data will be printed. 
""" - if self.x_ticks_granularity == 'seconds': - ax.xaxis.set_major_locator(LinearLocator(30)) - elif self.x_ticks_granularity == 'minutes': - ax.xaxis.set_major_locator(LinearLocator(30)) + ax.xaxis.set_major_locator(LinearLocator(30)) ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S')) @staticmethod @@ -193,29 +150,20 @@ def _save_custom_plot(self, ax, y_label, title, rotation=90, cluster_log=False, plt.savefig(join(self.store_path, f"{svg_name}.svg"), dpi=1200, format='svg') - @abstractmethod - def plot(self): - pass class BinaryDatavisualizer(DataVisualizer): - binary_metrics = ["CPU", "VMS", "RSS", "USS", - "PSS", "SWAP", "FD", "Read_Ops", "Write_Ops", "Disk_Read", - "Disk_Written", "Disk_Read_Speed", "Disk_Write_Speed"] - expected_binary_fields = ["Daemon", "Version", "PID"] + binary_metrics - mandatory_fields = ["Daemon"] - - def __init__(self, dataframes, target, compare=False, store_path=gettempdir(), - x_ticks_granularity='minutes', - x_ticks_interval=1, base_name=None): - - super().__init__(dataframes, target, compare, store_path, x_ticks_granularity, x_ticks_interval) + binary_metrics_fields = ["Daemon", "Version", "PID", + "CPU", "VMS", "RSS", "USS", + "PSS", "SWAP", "FD", "Read_Ops", + "Write_Ops", "Disk_Read", "Disk_Written", + "Disk_Read_Speed", "Disk_Write_Speed"] + + def __init__(self, dataframes, store_path=gettempdir(), base_name=None): + super().__init__(dataframes, store_path, base_name) self._validate_dataframe() - def _get_mandatory_fields(self): - return self.mandatory_fields - - def _get_expected_fields(self): - return self.expected_binary_fields + def _get_expected_fields(self) -> list: + return self.binary_metrics_fields def _normalize_column_name(self, column_name: str): if '(' in column_name: @@ -228,12 +176,16 @@ def _get_data_columns(self): return normalized_columns + def _get_daemons(self): + """Get the list of Wazuh Daemons in the dataset.""" + return self.dataframe.Daemon.unique() + def _get_fields_to_plot(self): column_names = self.dataframe.columns fields_to_plot = [] for field_to_plot in column_names: - if self._normalize_column_name(field_to_plot) in self.binary_metrics: + if self._normalize_column_name(field_to_plot) in self.binary_metrics_fields: fields_to_plot.append(field_to_plot) return fields_to_plot @@ -250,69 +202,57 @@ def plot(self): self._save_custom_plot(ax, element, f"{element} {element}") - def _get_daemons(self): - """Get the list of Wazuh Daemons in the dataset.""" - return self.dataframe.Daemon.unique() class DaemonStatisticsVisualizer(DataVisualizer): - mandatory_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)', 'Events processed', 'Events received'] + general_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)', 'Events processed', 'Events received'] + statistics_plot_data_directory = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer') + statistics_filename_suffix = '_csv_headers.json' - def __init__(self, dataframes, daemon, compare=False, store_path=gettempdir(), - x_ticks_granularity='minutes', - x_ticks_interval=1, base_name=None): + def __init__(self, dataframes, daemon, store_path=gettempdir(), base_name=None): self.daemon = daemon - super().__init__(dataframes, daemon, compare, store_path, x_ticks_granularity, x_ticks_interval) - self.plots_data = self._load_columns_to_plot() + super().__init__(dataframes, daemon, store_path) + self.plots_data = self._load_plot_data() self.expected_fields = [] for graph in self.plots_data.values(): for column in graph['columns']: 
self.expected_fields.append(column) - self.expected_fields.extend(self.mandatory_fields) + self.expected_fields.extend(self.general_fields) self._validate_dataframe() + def _get_statistic_plot_data_file(self): + return join(self.statistics_plot_data_directory, self.daemon + self.statistics_filename_suffix) - def _load_columns_to_plot(self): - filename = self.daemon + '_csv_headers.json' - full_path = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer', filename) - - with open(full_path, 'r') as columns_file: + def _load_plot_data(self): + statistic_plot_data = self._get_statistic_plot_data_file() + with open(statistic_plot_data, 'r') as columns_file: full_data = json.load(columns_file) return full_data - def _get_mandatory_fields(self): - return self.mandatory_fields - - def _get_expected_fields(self): - return self.expected_fields - def plot(self): for element in self.plots_data.values(): columns = element['columns'] title = element['title'] colors = self._color_palette(len(columns)) - fig, ax = plt.subplots() + _, ax = plt.subplots() for element, color in zip(columns, colors): self._basic_plot(ax, self.dataframe[element], label=element, color=color) self._save_custom_plot(ax, element, title) - class LogcollectorStatisticsVisualizer(DaemonStatisticsVisualizer): - mandatory_fields = ['Location', 'Target'] + general_fields = ['Location', 'Target'] - def __init__(self, dataframes, compare=False, store_path=gettempdir(), - x_ticks_granularity='minutes', - x_ticks_interval=1, base_name=None): - super().__init__(dataframes, 'logcollector', compare, store_path, x_ticks_granularity, x_ticks_interval) + def __init__(self, dataframes, store_path=gettempdir(), base_name=None): + super().__init__(dataframes, 'logcollector', store_path) def _get_logcollector_location(self): """Get the list of unique logcollector targets (sockets) in the dataset.""" return self.dataframe.Location.unique() - def plot(self, filter_by_target): + def plot(self): for element in self.plots_data.values(): fig, ax = plt.subplots() targets = self._get_logcollector_location() @@ -323,19 +263,61 @@ def plot(self, filter_by_target): self._save_custom_plot(ax, element['title'], element['title']) -# class ClusterStatisticsVisualizer(DaemonStatisticsVisualizer): -# def plot(self): -# elements = list(self.dataframe['activity'].unique()) -# self._plot_data(elements=elements, generic_label='Managers') - -# for element in elements: -# fig, ax = plt.subplots() -# nodes = self.dataframe[self.dataframe.activity == element]['node_name'].unique() -# current_df = self.dataframe[self.dataframe.activity == element] -# current_df.reset_index(drop=True, inplace=True) -# for node, color in zip(nodes, self._color_palette(len(nodes))): -# self._basic_plot(ax=ax, dataframe=current_df[current_df.node_name == node]['time_spent(s)'], -# label=node, color=color) -# self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), cluster_log=True, -# statistics=DataVisualizer._get_statistics( -# current_df['time_spent(s)'], calculate_mean=True, calculate_median=True)) +class ClusterStatisticsVisualizer(DataVisualizer): + expected_cluster_fields= ['node_name', 'activity', 'time_spent(s)'] + + def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): + super().__init__(dataframes_paths, store_path, base_name) + self._validate_dataframe() + + def _get_expected_fields(self) -> list: + return self.expected_cluster_fields + + def plot(self): + elements = list(self.dataframe['activity'].unique()) + + for 
element in elements: + _, ax = plt.subplots() + nodes = self.dataframe[self.dataframe.activity == element]['node_name'].unique() + current_df = self.dataframe[self.dataframe.activity == element] + current_df.reset_index(drop=True, inplace=True) + for node, color in zip(nodes, self._color_palette(len(nodes))): + self._basic_plot(ax=ax, dataframe=current_df[current_df.node_name == node]['time_spent(s)'], + label=node, color=color) + self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), cluster_log=True, + statistics=DataVisualizer._get_statistics( + current_df['time_spent(s)'], calculate_mean=True, calculate_median=True)) + + +class IndexerAlerts(DataVisualizer): + expected_fields = ['Total alerts'] + + def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): + super().__init__(dataframes_paths, store_path, base_name) + self._validate_dataframe() + + def _plot_agregated_alerts(self): + _, ax = plt.subplots() + df['Difference'] = df['Total alerts'].diff() + + + def _plot_plain_alerts(self): + _, ax = plt.subplots() + self._basic_plot(ax=ax, dataframe=self.dataframe, label='alerts', self._color_palette(1)) + + + def plot(self): + self._plot_plain_alerts() + self._plot_agregated_alerts() + + +class IndexerVulnerabilities(DataVisualizer): + expected_fields = ['Vulnerabilities'] + + def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): + super().__init__(dataframes_paths, store_path, base_name) + self._validate_dataframe() + + def plot(self): + pass + From c294f2febf5ff015d5c3b20c81e85c0c7f97c395 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Thu, 20 Jun 2024 17:46:18 +0100 Subject: [PATCH 03/19] feat: adapt data_visualization script --- .../scripts/data_visualizations.py | 46 ++++++++++++---- .../tools/performance/visualization.py | 52 ++++++++++++++----- 2 files changed, 75 insertions(+), 23 deletions(-) diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py index 164ab3e19f..264da0d808 100644 --- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py +++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py @@ -3,9 +3,30 @@ from os.path import exists from tempfile import gettempdir -from wazuh_testing.tools.performance.visualization import BinaryDatavisualizer, DaemonStatisticsVisualizer, \ - LogcollectorStatisticsVisualizer, \ - ClusterStatisticsVisualizer +from wazuh_testing.tools.performance.visualization import ( + BinaryDatavisualizer, + ClusterStatisticsVisualizer, + DaemonStatisticsVisualizer, + IndexerAlerts, + IndexerVulnerabilities, + LogcollectorStatisticsVisualizer, +) + + +supported_targets = ['binary', 'analysis', 'remote', 'wazuhdb', 'logcollector', + 'cluster', 'indexer-alerts', + 'indexer-vulnerabilities'] +strategy_plot_by_target = { + 'binary': BinaryDatavisualizer, + 'cluster': ClusterStatisticsVisualizer, + 'logcollector': LogcollectorStatisticsVisualizer, + 'indexer-alerts': IndexerAlerts, + 'indexer-vulnerabilities': IndexerVulnerabilities +} + +def create_destination_directory(destination_directory): + if not exists(destination_directory): + makedirs(destination_directory) def get_script_arguments(): @@ -14,7 +35,7 @@ def get_script_arguments(): parser.add_argument('-s', '--sources', dest='csv_list', required=True, type=str, nargs='+', action='store', help='Paths to the CSV files separated by whitespace.') parser.add_argument('-t', '--target', 
dest='visualization_target', default='binary', - choices=['binary', 'analysis', 'remote', 'agent', 'logcollector', 'cluster', 'api', 'wazuhdb'], + choices=supported_targets, help='Generate data visualizations for a specific target. Default binary.') parser.add_argument('-d', '--destination', dest='destination', default=gettempdir(), help=f'Directory to store the images. Default {gettempdir()}') @@ -28,12 +49,19 @@ def get_script_arguments(): def main(): options = get_script_arguments() - destination = options.destination + create_destination_directory(options.destination) + + target = options.visualization_target + + if target in ['analysis', 'remote', 'wazuhdb']: + dv = DaemonStatisticsVisualizer(options.csv_list, daemon=options.target, + store_path=options.destination, + base_name=options.name) + else: + dv = strategy_plot_by_target[target](options.csv_list, + store_path=options.destination, + base_name=options.name) - if not exists(destination): - makedirs(destination) - dv = ClusterStatisticsVisualizer(dataframes_paths=options.csv_list, store_path=options.destination, - base_name=options.name) dv.plot() diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py index 8e024e27e5..642e215606 100644 --- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py +++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py @@ -79,8 +79,8 @@ def _color_palette(size): def _load_dataframes(self): """Load the dataframes from dataframes_paths.""" for df_path in self.dataframes_paths: - new_csv = pd.read_csv(df_path, index_col="Timestamp", parse_dates=True) - self.dataframe = pd.concat([self.dataframe, new_csv]) + new_csv = pd.read_csv(df_path, index_col="Timestamp", parse_dates=True) + self.dataframe = pd.concat([self.dataframe, new_csv]) def _set_x_ticks_interval(self, ax): """Set the number of labels that will appear in the X axis and their format. @@ -120,7 +120,7 @@ def _basic_plot(ax, dataframe, label=None, color=None): """ ax.plot(dataframe, label=label, color=color) - def _save_custom_plot(self, ax, y_label, title, rotation=90, cluster_log=False, statistics=None): + def _save_custom_plot(self, ax, y_label, title, rotation=90, disable_x_labels=False, statistics=None): """Function to add info to the plot, the legend and save the SVG image. 
Args: @@ -138,7 +138,7 @@ def _save_custom_plot(self, ax, y_label, title, rotation=90, cluster_log=False, ax.set_ylabel(y_label) ax.set_title(title) - if not cluster_log: + if not disable_x_labels: self._set_x_ticks_interval(ax) plt.xticks(rotation=rotation) svg_name = sub(pattern=r'\(.*\)', string=y_label, repl='') @@ -147,8 +147,8 @@ def _save_custom_plot(self, ax, y_label, title, rotation=90, cluster_log=False, if self.base_name is not None: svg_name = f"{self.base_name}_{svg_name}" - plt.savefig(join(self.store_path, f"{svg_name}.svg"), dpi=1200, format='svg') + plt.savefig(join(self.store_path, f"{svg_name}.svg"), dpi=1200, format='svg') class BinaryDatavisualizer(DataVisualizer): @@ -203,7 +203,6 @@ def plot(self): self._save_custom_plot(ax, element, f"{element} {element}") - class DaemonStatisticsVisualizer(DataVisualizer): general_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)', 'Events processed', 'Events received'] statistics_plot_data_directory = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer') @@ -211,7 +210,7 @@ class DaemonStatisticsVisualizer(DataVisualizer): def __init__(self, dataframes, daemon, store_path=gettempdir(), base_name=None): self.daemon = daemon - super().__init__(dataframes, daemon, store_path) + super().__init__(dataframes, store_path, base_name) self.plots_data = self._load_plot_data() self.expected_fields = [] for graph in self.plots_data.values(): @@ -220,6 +219,9 @@ def __init__(self, dataframes, daemon, store_path=gettempdir(), base_name=None): self.expected_fields.extend(self.general_fields) self._validate_dataframe() + def _get_expected_fields(self): + return self.expected_fields + def _get_statistic_plot_data_file(self): return join(self.statistics_plot_data_directory, self.daemon + self.statistics_filename_suffix) @@ -243,10 +245,13 @@ def plot(self): self._save_custom_plot(ax, element, title) class LogcollectorStatisticsVisualizer(DaemonStatisticsVisualizer): - general_fields = ['Location', 'Target'] + expected_fields = ['Location', 'Target'] def __init__(self, dataframes, store_path=gettempdir(), base_name=None): - super().__init__(dataframes, 'logcollector', store_path) + super().__init__(dataframes, 'logcollector', store_path, base_name) + + def _get_expected_fields(self): + return self.expected_fields def _get_logcollector_location(self): """Get the list of unique logcollector targets (sockets) in the dataset.""" @@ -254,7 +259,7 @@ def _get_logcollector_location(self): def plot(self): for element in self.plots_data.values(): - fig, ax = plt.subplots() + _, ax = plt.subplots() targets = self._get_logcollector_location() colors = self._color_palette(len(targets)) for target, color in zip(targets, colors): @@ -296,14 +301,27 @@ def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): super().__init__(dataframes_paths, store_path, base_name) self._validate_dataframe() + def _get_expected_fields(self): + return self.expected_fields + + def _calculate_timestamp_interval(self): + interval = self.dataframe.index[1] - self.dataframe.index[0] + return interval.total_seconds() + def _plot_agregated_alerts(self): _, ax = plt.subplots() - df['Difference'] = df['Total alerts'].diff() + self.dataframe['Difference'] = self.dataframe['Total alerts'].diff() + self.dataframe['Difference'] = self.dataframe['Difference'] / self._calculate_timestamp_interval() + self._basic_plot(ax=ax, dataframe=self.dataframe['Difference'], label='Alerts per timestamp', + color=self._color_palette(1)[0]) + 
self._save_custom_plot(ax, 'Different alerts', 'Difference alerts')
 
     def _plot_plain_alerts(self):
         _, ax = plt.subplots()
-        self._basic_plot(ax=ax, dataframe=self.dataframe, label='alerts', self._color_palette(1))
+        self._basic_plot(ax=ax, dataframe=self.dataframe['Total alerts'], label='Total alerts',
+                         color=self._color_palette(1)[0])
 
+        self._save_custom_plot(ax, 'Total alerts', 'Total alerts')
 
     def plot(self):
@@ -312,12 +330,18 @@ def plot(self):
 
 
 class IndexerVulnerabilities(DataVisualizer):
-    expected_fields = ['Vulnerabilities']
+    expected_fields = ['Total vulnerabilities']
+
+    def _get_expected_fields(self):
+        return self.expected_fields
 
     def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
         super().__init__(dataframes_paths, store_path, base_name)
         self._validate_dataframe()
 
     def plot(self):
-        pass
+        _, ax = plt.subplots()
+        self._basic_plot(ax=ax, dataframe=self.dataframe['Total vulnerabilities'], label='Indexed Vulnerabilities',
+                         color=self._color_palette(1)[0])
+        self._save_custom_plot(ax, 'Total Vulnerabilities', 'Total vulnerabilities')

From 38584b5eb92afa9b4f05e158920d485fd7c4a4d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Thu, 20 Jun 2024 17:50:30 +0100
Subject: [PATCH 04/19] refac: remove unused binary csv headers file

---
 .../data_visualizer/binary_csv_headers.json   | 29 -------------------
 .../ansible-filebeat-oss/defaults/main.yml    |  2 +-
 2 files changed, 1 insertion(+), 30 deletions(-)
 delete mode 100644 deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json

diff --git a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json
deleted file mode 100644
index 7f8cc19db4..0000000000
--- a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/binary_csv_headers.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-    "cput": {
-        "title": "CPU(%)",
-        "columns": [
-            "Events"
-        ]
-    },
-    "bytes_sent": {
-        "title": "Bytes sent",
-        "columns": [
-            "Bytes"
-        ]
-    },
-    "drops": {
-        "title": "Events dropped",
-        "columns": [
-            "Target Drops"
-        ]
-    }
-}
-
-
-
-
-
-
-
-
-CPU(%) VMS(KB) RSS(KB) USS(KB) PSS(KB) SWAP(KB) FD Read_Ops Write_Ops Disk_Read(KB) Disk_Written(KB) Disk_Read_Speed(KB/s) Disk_Write_Speed(KB/s)
diff --git a/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml b/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml
index 1c04a3a139..4af68ae478 100644
--- a/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml
+++ b/provisioning/roles/wazuh/ansible-filebeat-oss/defaults/main.yml
@@ -1,7 +1,7 @@
 ---
 filebeat_version: 7.10.2
 
-wazuh_template_branch: 4.9.0
+wazuh_template_branch: 4.8.0
 
 filebeat_node_name: node-1
 
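At this point in the series the refactored classes are already usable end to end. A minimal sketch of the logcollector flow introduced above, assuming a statistics CSV with a Timestamp index and the logcollector fields (both paths below are hypothetical):

    from wazuh_testing.tools.performance.visualization import LogcollectorStatisticsVisualizer

    # Loads logcollector_csv_headers.json and draws one figure per plot group,
    # with one line per unique Location (socket) found in the CSV.
    dv = LogcollectorStatisticsVisualizer(['/tmp/logcollector_stats.csv'],
                                          store_path='/tmp/plots')
    dv.plot()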
From b6615d0bfa32f7e8ad65744032bede4e3bbe0643 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Thu, 20 Jun 2024 18:07:55 +0100
Subject: [PATCH 05/19] fix: replace cluster_log by disable_x_labels

---
 .../wazuh_testing/tools/performance/visualization.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
index 642e215606..1ca5dd9531 100644
--- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
+++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
@@ -289,7 +289,7 @@ def plot(self):
             for node, color in zip(nodes, self._color_palette(len(nodes))):
                 self._basic_plot(ax=ax, dataframe=current_df[current_df.node_name == node]['time_spent(s)'],
                                  label=node, color=color)
-            self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), cluster_log=True,
+            self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), disable_x_labels=True,
                                    statistics=DataVisualizer._get_statistics(
                                        current_df['time_spent(s)'], calculate_mean=True, calculate_median=True))
 
@@ -344,4 +344,3 @@ def plot(self):
         self._basic_plot(ax=ax, dataframe=self.dataframe['Total vulnerabilities'], label='Indexed Vulnerabilities',
                          color=self._color_palette(1)[0])
         self._save_custom_plot(ax, 'Total Vulnerabilities', 'Total vulnerabilities')
-

From 4f39ed9cc106e16d80e578d346e73c7615c4ca33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Fri, 21 Jun 2024 11:05:53 +0100
Subject: [PATCH 06/19] fix: wrong daemon argument in data_visualizations script

---
 deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
index 264da0d808..d3f3a8076e 100644
--- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
+++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
@@ -54,7 +54,7 @@ def main():
     target = options.visualization_target
 
     if target in ['analysis', 'remote', 'wazuhdb']:
-        dv = DaemonStatisticsVisualizer(options.csv_list, daemon=options.target,
+        dv = DaemonStatisticsVisualizer(options.csv_list, daemon=target,
                                         store_path=options.destination,
                                         base_name=options.name)
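With the daemon argument fixed, the statistics path can be exercised through the script (-t analysis) or by driving the class directly. A minimal sketch, assuming a statistics CSV produced by the wazuh_metrics tool (the CSV path and output directory are hypothetical):

    from wazuh_testing.tools.performance.visualization import DaemonStatisticsVisualizer

    # daemon='analysis' resolves analysis_csv_headers.json under data/data_visualizer,
    # so each figure corresponds to one plot group defined in that file.
    dv = DaemonStatisticsVisualizer(['/tmp/analysisd_stats.csv'],
                                    daemon='analysis',
                                    store_path='/tmp/plots',
                                    base_name='analysisd')
    dv.plot()  # writes one SVG per plot group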
From 27e4dce4baac705dc4b5728c1c976067cbce6bf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Fri, 21 Jun 2024 11:06:48 +0100
Subject: [PATCH 07/19] fix: extra . in statistics title

---
 .../data/data_visualizer/analysis_csv_headers.json    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json
index 1c3485ceda..208bbc1f3e 100644
--- a/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json
+++ b/deps/wazuh_testing/wazuh_testing/data/data_visualizer/analysis_csv_headers.json
@@ -32,7 +32,7 @@
     ]
   },
   "alerts_info": {
-    "title": "Alerts and events info.",
+    "title": "Alerts and events info",
     "columns": [
       "Events processed", "Events received", "Written alerts", "Written firewall", "Written fts",
       "Written archives", "Written stats"

From 46816f547773df27bc11e8c57a5e328bc92bec55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Fri, 21 Jun 2024 11:07:08 +0100
Subject: [PATCH 08/19] fix: bug in statistics method

---
 .../tools/performance/visualization.py       | 45 ++++++++++---------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
index 1ca5dd9531..ad28897daf 100644
--- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
+++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from abc import ABC, abstractmethod
 from os.path import dirname, join, realpath
 from re import sub
@@ -51,12 +52,14 @@ def _check_no_duplicated(self):
             raise ValueError('Duplicate column names found in the CSV file.')
 
     def _check_missing_mandatory_fields(self):
-        if not (self._get_expected_fields() == self._get_data_columns()):
-            raise ValueError(f"Missing some of the mandatory values. Expected values: {self._get_expected_fields()}")
+        if not (set(self._get_expected_fields()).issubset(set(self._get_data_columns()))):
+            missing_fields = (set(self._get_expected_fields()) - set(self._get_data_columns()))
+            raise ValueError(f"Missing some of the mandatory values: {missing_fields}")
 
     def _check_unexpected_values(self):
-        if not (set(self._get_data_columns()).issubset(set(self._get_expected_fields()))):
-            raise ValueError('Column names do not match the expected metrics.')
+        if not (set(self._get_data_columns()).issubset(set(self._get_expected_fields()))):
+            missing_fields = (set(self._get_data_columns()) - set(self._get_expected_fields()))
+            logging.warning(f"Unexpected fields provided. These will not be plotted: {missing_fields}")
 
     def _get_data_columns(self) -> list:
         try:
@@ -104,10 +107,11 @@ def _get_statistics(df, calculate_mean=True, calculate_median=False):
             calculate_median (bool, optional): specify whether or not the median will be calculated. 
""" statistics = str() + if calculate_mean: - statistics += f"Mean: {round(pd.DataFrame.mean(df), 3)}\n" + statistics += f"Mean: {round(pd.Series.mean(df), 3)}\n" if calculate_median: - statistics += f"Median: {round(pd.DataFrame.median(df), 3)}\n" + statistics += f"Median: {round(pd.Series.median(df), 3)}\n" return statistics @@ -152,11 +156,12 @@ def _save_custom_plot(self, ax, y_label, title, rotation=90, disable_x_labels=Fa class BinaryDatavisualizer(DataVisualizer): - binary_metrics_fields = ["Daemon", "Version", "PID", - "CPU", "VMS", "RSS", "USS", - "PSS", "SWAP", "FD", "Read_Ops", - "Write_Ops", "Disk_Read", "Disk_Written", - "Disk_Read_Speed", "Disk_Write_Speed"] + binary_metrics_fields_to_plot = ["CPU", "VMS", "RSS", "USS", + "PSS", "SWAP", "FD", "Read_Ops", + "Write_Ops", "Disk_Read", "Disk_Written", + "Disk_Read_Speed", "Disk_Write_Speed"] + binary_metrics_extra_fields = ["Daemon", "Version", "PID"] + binary_metrics_fields = binary_metrics_fields_to_plot + binary_metrics_extra_fields def __init__(self, dataframes, store_path=gettempdir(), base_name=None): super().__init__(dataframes, store_path, base_name) @@ -185,7 +190,7 @@ def _get_fields_to_plot(self): fields_to_plot = [] for field_to_plot in column_names: - if self._normalize_column_name(field_to_plot) in self.binary_metrics_fields: + if self._normalize_column_name(field_to_plot) in self.binary_metrics_fields_to_plot: fields_to_plot.append(field_to_plot) return fields_to_plot @@ -200,11 +205,11 @@ def plot(self): self._basic_plot(ax, self.dataframe[self.dataframe.Daemon == daemon][element], label=daemon, color=color) - self._save_custom_plot(ax, element, f"{element} {element}") + self._save_custom_plot(ax, element, element) class DaemonStatisticsVisualizer(DataVisualizer): - general_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)', 'Events processed', 'Events received'] + general_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)'] statistics_plot_data_directory = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer') statistics_filename_suffix = '_csv_headers.json' @@ -239,19 +244,19 @@ def plot(self): colors = self._color_palette(len(columns)) _, ax = plt.subplots() - for element, color in zip(columns, colors): - self._basic_plot(ax, self.dataframe[element], label=element, color=color) - - self._save_custom_plot(ax, element, title) + for column, color in zip(columns, colors): + self._basic_plot(ax, self.dataframe[column], label=column, color=color) + print(title) + self._save_custom_plot(ax, title, title) class LogcollectorStatisticsVisualizer(DaemonStatisticsVisualizer): - expected_fields = ['Location', 'Target'] + general_fields = ['Location', 'Target'] def __init__(self, dataframes, store_path=gettempdir(), base_name=None): super().__init__(dataframes, 'logcollector', store_path, base_name) def _get_expected_fields(self): - return self.expected_fields + return self.general_fields def _get_logcollector_location(self): """Get the list of unique logcollector targets (sockets) in the dataset.""" From 1a7e0d770748cd20db6b666da1ec5d6e2687ed33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Fri, 21 Jun 2024 15:01:23 +0100 Subject: [PATCH 09/19] docs: include 5518 changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a13add7cc0..e7ed3d5bd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. 

 ### Added
 
+- Added the capability to plot indexer alerts and vulnerability data. ([#5518](https://github.com/wazuh/wazuh-qa/pull/5518)) \- (Framework)
- Add functionality to unify data of the binary processes with their subprocesses to plot ([#5500](https://github.com/wazuh/wazuh-qa/pull/5500)) \- (Framework)
 
 ### Changed

From 9782a68b42248b5955018b4dcbb20449c830a841 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Fri, 21 Jun 2024 15:02:04 +0100
Subject: [PATCH 10/19] docs: improve 5518 changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7ed3d5bd0..50f124969d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file.
 
 ### Added
 
-- Added the capability to plot indexer alerts and vulnerability data. ([#5518](https://github.com/wazuh/wazuh-qa/pull/5518)) \- (Framework)
+- Added the capability to plot indexed alerts and vulnerabilities. ([#5518](https://github.com/wazuh/wazuh-qa/pull/5518)) \- (Framework)
- Add functionality to unify data of the binary processes with their subprocesses to plot ([#5500](https://github.com/wazuh/wazuh-qa/pull/5500)) \- (Framework)
 
 ### Changed

From 25de9a1366338167844014168ab5ba8d3219260c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Fri, 21 Jun 2024 15:03:19 +0100
Subject: [PATCH 11/19] style: fix pep8 to data visualization

---
 deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
index 8879495d7b..5a6de1fce3 100644
--- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
+++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
@@ -47,7 +47,7 @@ def get_script_arguments():
     parser.add_argument('-c', '--columns', dest='columns', default=None,
                         help=f'Path to Json with Columns to Plot. 
Default {None}.') parser.add_argument('-u', '--unify', dest='unify', action='store_true', - help=f'Unify data of the binary processes with their subprocesses to plot.') + help='Unify data of the binary processes with their subprocesses to plot.') return parser.parse_args() From 3bd3ce28b8d9a00cc6f6e95cc604f02292796e40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Fri, 21 Jun 2024 15:05:19 +0100 Subject: [PATCH 12/19] style: replace " by ' --- .../wazuh_testing/scripts/data_visualizations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py index 5a6de1fce3..bc00822efa 100644 --- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py +++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py @@ -30,10 +30,10 @@ def create_destination_directory(destination_directory): def validate_arguments(options): if options.visualization_target != 'binary' and options.unify: - raise ValueError("Unify option is not allowed for non binary data plotting") + raise ValueError('Unify option is not allowed for non binary data plotting') def get_script_arguments(): - parser = argparse.ArgumentParser(usage="%(prog)s [options]", description="Script to generate data visualizations", + parser = argparse.ArgumentParser(usage='%(prog)s [options]', description='Script to generate data visualizations', formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-s', '--sources', dest='csv_list', required=True, type=str, nargs='+', action='store', help='Paths to the CSV files separated by whitespace.') From 7f904767e3867934c7601dcde820417d8e8ff3ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Fri, 21 Jun 2024 16:05:40 +0100 Subject: [PATCH 13/19] style: pep8 fix --- deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py index bc00822efa..4866fdaace 100644 --- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py +++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py @@ -28,10 +28,12 @@ def create_destination_directory(destination_directory): if not exists(destination_directory): makedirs(destination_directory) + def validate_arguments(options): if options.visualization_target != 'binary' and options.unify: raise ValueError('Unify option is not allowed for non binary data plotting') + def get_script_arguments(): parser = argparse.ArgumentParser(usage='%(prog)s [options]', description='Script to generate data visualizations', formatter_class=argparse.RawTextHelpFormatter) From e36d0a19469c1c260e5c31b75bb101b906a8a1c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?= Date: Fri, 21 Jun 2024 16:17:42 +0100 Subject: [PATCH 14/19] docs: include datavisualization class docstrings --- .../tools/performance/visualization.py | 281 ++++++++++++++++-- 1 file changed, 259 insertions(+), 22 deletions(-) diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py index a75b109543..bb8127a260 100644 --- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py +++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py @@ -15,10 
From e36d0a19469c1c260e5c31b75bb101b906a8a1c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Fri, 21 Jun 2024 16:17:42 +0100
Subject: [PATCH 14/19] docs: include data visualization class docstrings

---
 .../tools/performance/visualization.py | 281 ++++++++++++++++--
 1 file changed, 259 insertions(+), 22 deletions(-)

diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
index a75b109543..bb8127a260 100644
--- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
+++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
@@ -15,10 +15,6 @@ class DataVisualizer(ABC):
     """Class that allows to visualize the data collected using the wazuh_metrics tool.

-    Args:
-        dataframes_paths (list): list containing the paths.
-        store_path (str): path to store the CSV images. Defaults to the temp directory.
-        base_name (str, optional): base name used to store the images.
     Attributes:
         dataframes_paths (list): paths of the CSVs.
         dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs.
@@ -26,6 +22,13 @@ class DataVisualizer(ABC):
         base_name (str, optional): base name used to store the images.
     """
     def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
+        """Initializes the DataVisualizer.
+
+        Args:
+            dataframes_paths (list): List of paths to CSV files.
+            store_path (str, optional): Path to store the CSV images. Defaults to the temp directory.
+            base_name (str, optional): Base name used to store the images.
+        """
         self.dataframes_paths = dataframes_paths
         self.store_path = store_path
         self.base_name = base_name
@@ -35,33 +38,64 @@ def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
         sns.set_theme(rc={'figure.figsize': (26, 9)})

     @abstractmethod
-    def _get_expected_fields(self) -> list:
+    def _get_expected_fields(self):
+        """Abstract method to define expected fields in the data.
+
+        Returns:
+            list: List of expected field names.
+        """
         pass

     @abstractmethod
-    def plot(self) -> None:
+    def plot(self):
+        """Abstract method to create data visualizations."""
         pass

     def _validate_dataframe(self) -> None:
+        """Validates the loaded dataframe.
+
+        Raises:
+            ValueError: If there are missing mandatory fields or duplicated column names.
+        """
         self._check_missing_mandatory_fields()
         self._check_no_duplicated()
         self._check_unexpected_values()

     def _check_no_duplicated(self):
+        """Checks for duplicated column names in the dataframe.
+
+        Raises:
+            ValueError: If duplicate column names are found.
+        """
         if self.dataframe.columns.duplicated().any():
             raise ValueError('Duplicate column names found in the CSV file.')

     def _check_missing_mandatory_fields(self):
+        """Checks if mandatory fields are present in the dataframe.
+
+        Raises:
+            ValueError: If mandatory fields are missing.
+        """
         if not (set(self._get_expected_fields()).issubset(set(self._get_data_columns()))):
             missing_fields = (set(self._get_expected_fields()) - set(self._get_data_columns()))
             raise ValueError(f"Missing some of the mandatory values: {missing_fields}")

     def _check_unexpected_values(self):
+        """Checks for unexpected values in the dataframe.
+
+        Raises:
+            ValueError: If unexpected values are found.
+        """
         if not (set(self._get_data_columns()).issubset(set(self._get_expected_fields()))):
             missing_fields = (set(self._get_data_columns()) - set(self._get_expected_fields()))
             logging.warning(f"Unexpected fields provided. These will not be plotted: {missing_fields}")

     def _get_data_columns(self) -> list:
+        """Retrieves the list of column names from the loaded dataframe.
+
+        Returns:
+            list: List of column names.
+        """
         try:
             return list(self.dataframe.columns)
         except StopIteration:
@@ -96,14 +130,14 @@ def _set_x_ticks_interval(self, ax):

     @staticmethod
     def _get_statistics(df, calculate_mean=True, calculate_median=False):
-        """Function for calculating statistics.
+        """Calculate data statistics.

         Args:
             df (pandas.DataFrame): dataframe on which the operations will be applied.
             calculate_mean (bool, optional): specify whether or not the mean will be calculated.
             calculate_median (bool, optional): specify whether or not the median will be calculated.
""" - statistics = str() + statistics = '' if calculate_mean: statistics += f"Mean: {round(pd.Series.mean(df), 3)}\n" @@ -114,7 +148,7 @@ def _get_statistics(df, calculate_mean=True, calculate_median=False): @staticmethod def _basic_plot(ax, dataframe, label=None, color=None): - """Basic function to visualize a dataframe. + """Visualize simple dataframe. Args: ax (axes.SubplotBase): subplot base where the data will be printed. @@ -125,14 +159,14 @@ def _basic_plot(ax, dataframe, label=None, color=None): ax.plot(dataframe, label=label, color=color) def _save_custom_plot(self, ax, y_label, title, rotation=90, disable_x_labels=False, statistics=None): - """Function to add info to the plot, the legend and save the SVG image. + """Add info to the plot, the legend and save the SVG image. Args: ax (axes.SubplotBase): subplot base where the data will be printed. y_label (str): label for the Y axis. title (str): title of the plot. rotation (int, optional): optional int to set the rotation of the X-axis labels. - cluster_log (bool, optional): optional flag used to plot specific graphics for the cluster. + disable_x_labels (bool, optional): If True, the plot will not display the x-axis labels (timestamps). statistics (str, optional): optional statistics measures. """ if statistics: @@ -156,6 +190,17 @@ def _save_custom_plot(self, ax, y_label, title, rotation=90, disable_x_labels=Fa class BinaryDatavisualizer(DataVisualizer): + """A class for visualizing binary metrics data. + + Attributes: + dataframes_paths (list): paths of the CSVs. + dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs. + store_path (str): path to store the CSV images. Defaults to the temp directory. + base_name (str, optional): base name used to store the images. + binary_metrics_fields_to_plot (list): List of binary metrics fields to plot. + binary_metrics_extra_fields (list): List of additional binary metrics fields. + binary_metrics_fields (list): Combined list of binary metrics fields. + """ binary_metrics_fields_to_plot = ["CPU", "VMS", "RSS", "USS", "PSS", "SWAP", "FD", "Read_Ops", "Write_Ops", "Disk_Read", "Disk_Written", @@ -164,31 +209,66 @@ class BinaryDatavisualizer(DataVisualizer): binary_metrics_fields = binary_metrics_fields_to_plot + binary_metrics_extra_fields def __init__(self, dataframes, store_path=gettempdir(), base_name=None, unify_child_daemon_metrics=False): + """Initialize the BinaryDatavisualizer. + + Args: + dataframes (list): List of dataframes containing binary metrics data. + store_path (str, optional): Path to store visualizations. Defaults to system temp directory. + base_name (str, optional): Base name for saved visualizations. Defaults to None. + unify_child_daemon_metrics (bool, optional): Whether to unify child daemon metrics. Defaults to False. + """ super().__init__(dataframes, store_path, base_name) self._validate_dataframe() if unify_child_daemon_metrics: self.dataframe = self.dataframe.reset_index(drop=False) self._unify_dataframes() - def _get_expected_fields(self) -> list: + def _get_expected_fields(self): + """Get the list of expected fields for binary metrics. + + Returns: + list: List of expected binary metrics fields. + """ return self.binary_metrics_fields def _normalize_column_name(self, column_name: str): + """Normalize column names by removing units within parentheses. + + Args: + column_name (str): The column name to normalize. + + Returns: + str: The normalized column name. 
+ """ if '(' in column_name: return column_name.split('(')[0].strip() return column_name def _get_data_columns(self): + """Get the list of data columns in the dataframe after normalization. + + Returns: + list: List of normalized data column names. + """ column_names = self.dataframe.columns normalized_columns = [self._normalize_column_name(col) for col in column_names.tolist()] return normalized_columns def _get_daemons(self): - """Get the list of Wazuh Daemons in the dataset.""" + """Get the list of unique Wazuh Daemons in the dataset. + + Returns: + list: List of unique Daemon names. + """ return self.dataframe.Daemon.unique() def _get_fields_to_plot(self): + """Get the list of fields to plot from the dataframe. + + Returns: + list: List of fields to plot. + """ column_names = self.dataframe.columns fields_to_plot = [] @@ -199,8 +279,7 @@ def _get_fields_to_plot(self): return fields_to_plot def _unify_dataframes(self): - """Unify the data of each process with their respective sub-processes. - """ + """Unify the data of each process with their respective sub-processes.""" pids = self.dataframe[['Daemon', 'PID']].drop_duplicates() versions = self.dataframe[['Daemon', 'Version']].drop_duplicates() @@ -218,6 +297,10 @@ def _unify_dataframes(self): self.dataframe = self.dataframe.merge(versions[['Daemon', 'Version']], on='Daemon', how='left') def plot(self): + """Plot the binary metrics data for each field to be plotted. + + This method creates and saves plots for each binary metric field. + """ columns_to_plot = self._get_fields_to_plot() for element in columns_to_plot: _, ax = plt.subplots() @@ -231,11 +314,31 @@ def plot(self): class DaemonStatisticsVisualizer(DataVisualizer): + """A class for visualizing daemon statistics data. + + Attributes: + dataframes_paths (list): paths of the CSVs. + dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs. + store_path (str): path to store the CSV images. Defaults to the temp directory. + base_name (str, optional): base name used to store the images. + daemon (str): Name of the daemon for which statistics are visualized. + plots_data (dict): Data required for plotting statistics. + expected_fields (list): List of expected fields for the daemon statistics. + """ + general_fields = ['API Timestamp', 'Interval (Timestamp-Uptime)'] statistics_plot_data_directory = join(dirname(realpath(__file__)), '..', '..', 'data', 'data_visualizer') statistics_filename_suffix = '_csv_headers.json' def __init__(self, dataframes, daemon, store_path=gettempdir(), base_name=None): + """Initialize the DaemonStatisticsVisualizer. + + Args: + dataframes (list): List of dataframes containing daemon statistics data. + daemon (str): Name of the daemon for which statistics are visualized. + store_path (str, optional): Path to store visualizations. Defaults to system temp directory. + base_name (str, optional): Base name for saved visualizations. Defaults to None. + """ self.daemon = daemon super().__init__(dataframes, store_path, base_name) self.plots_data = self._load_plot_data() @@ -247,19 +350,38 @@ def __init__(self, dataframes, daemon, store_path=gettempdir(), base_name=None): self._validate_dataframe() def _get_expected_fields(self): + """Get the list of expected fields for the daemon statistics. + + Returns: + list: List of expected fields. + """ return self.expected_fields def _get_statistic_plot_data_file(self): + """Get the file path for the statistics plot data file. + + Returns: + str: Path to the statistics plot data file. 
+ """ return join(self.statistics_plot_data_directory, self.daemon + self.statistics_filename_suffix) def _load_plot_data(self): + """Load the plot data from the statistics plot data file. + + Returns: + dict: Data required for plotting statistics. + """ statistic_plot_data = self._get_statistic_plot_data_file() - with open(statistic_plot_data, 'r') as columns_file: + with open(statistic_plot_data) as columns_file: full_data = json.load(columns_file) return full_data def plot(self): + """Plot the daemon statistics data for each field to be plotted. + + This method creates and saves plots for each statistic field. + """ for element in self.plots_data.values(): columns = element['columns'] title = element['title'] @@ -268,23 +390,51 @@ def plot(self): _, ax = plt.subplots() for column, color in zip(columns, colors): self._basic_plot(ax, self.dataframe[column], label=column, color=color) - print(title) self._save_custom_plot(ax, title, title) class LogcollectorStatisticsVisualizer(DaemonStatisticsVisualizer): + """A class for visualizing logcollector statistics data. + + Attributes: + dataframes_paths (list): paths of the CSVs. + dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs. + store_path (str): path to store the CSV images. Defaults to the temp directory. + base_name (str, optional): base name used to store the images. + general_fields (list): List of general fields for logcollector statistics. + """ general_fields = ['Location', 'Target'] def __init__(self, dataframes, store_path=gettempdir(), base_name=None): + """Initialize the LogcollectorStatisticsVisualizer. + + Args: + dataframes (list): List of dataframes containing logcollector statistics data. + store_path (str, optional): Path to store visualizations. Defaults to system temp directory. + base_name (str, optional): Base name for saved visualizations. Defaults to None. + """ super().__init__(dataframes, 'logcollector', store_path, base_name) def _get_expected_fields(self): + """Get the list of expected fields for logcollector statistics. + + Returns: + list: List of expected fields. + """ return self.general_fields def _get_logcollector_location(self): - """Get the list of unique logcollector targets (sockets) in the dataset.""" + """Get the list of unique logcollector targets (sockets) in the dataset. + + Returns: + numpy.ndarray: Array of unique logcollector targets. + """ return self.dataframe.Location.unique() def plot(self): + """Plot the logcollector statistics data for each target. + + This method creates and saves plots for each logcollector target. + """ for element in self.plots_data.values(): _, ax = plt.subplots() targets = self._get_logcollector_location() @@ -296,16 +446,41 @@ def plot(self): self._save_custom_plot(ax, element['title'], element['title']) class ClusterStatisticsVisualizer(DataVisualizer): + """A class for visualizing cluster statistics data. + + Attributes: + dataframes_paths (list): paths of the CSVs. + dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs. + store_path (str): path to store the CSV images. Defaults to the temp directory. + base_name (str, optional): base name used to store the images. + expected_cluster_fields (list): List of expected fields for cluster statistics. + """ expected_cluster_fields= ['node_name', 'activity', 'time_spent(s)'] def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): + """Initialize the ClusterStatisticsVisualizer. 
+
+        Args:
+            dataframes_paths (list): List of paths to dataframes containing cluster statistics data.
+            store_path (str, optional): Path to store visualizations. Defaults to system temp directory.
+            base_name (str, optional): Base name for saved visualizations. Defaults to None.
+        """
         super().__init__(dataframes_paths, store_path, base_name)
         self._validate_dataframe()

     def _get_expected_fields(self) -> list:
+        """Get the list of expected fields for cluster statistics.
+
+        Returns:
+            list: List of expected cluster fields.
+        """
         return self.expected_cluster_fields

     def plot(self):
+        """Plot the cluster statistics data for each activity.
+
+        This method creates and saves plots for each cluster activity.
+        """
         elements = list(self.dataframe['activity'].unique())

         for element in elements:
@@ -322,20 +497,49 @@ def plot(self):


 class IndexerAlerts(DataVisualizer):
+    """A class for visualizing indexer alerts data.
+
+    Attributes:
+        dataframes_paths (list): paths of the CSVs.
+        dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs.
+        store_path (str): path to store the CSV images. Defaults to the temp directory.
+        expected_fields (list): List of expected fields for indexer alerts.
+    """
     expected_fields = ['Total alerts']

     def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
+        """Initialize the IndexerAlerts visualizer.
+
+        Args:
+            dataframes_paths (list): List of paths to dataframes containing indexer alerts data.
+            store_path (str, optional): Path to store visualizations. Defaults to system temp directory.
+            base_name (str, optional): Base name for saved visualizations. Defaults to None.
+        """
         super().__init__(dataframes_paths, store_path, base_name)
         self._validate_dataframe()

     def _get_expected_fields(self):
+        """Get the list of expected fields for indexer alerts.
+
+        Returns:
+            list: List of expected fields.
+        """
         return self.expected_fields

     def _calculate_timestamp_interval(self):
+        """Calculate the interval between timestamps in seconds.
+
+        Returns:
+            float: Interval between timestamps in seconds.
+        """
         interval = self.dataframe.index[1] - self.dataframe.index[0]
         return interval.total_seconds()

     def _plot_agregated_alerts(self):
+        """Plot the aggregated alerts per timestamp.
+
+        This method creates and saves a plot for the aggregated alerts.
+        """
         _, ax = plt.subplots()
         self.dataframe['Difference'] = self.dataframe['Total alerts'].diff()
         self.dataframe['Difference'] = self.dataframe['Difference'] / self._calculate_timestamp_interval()
@@ -345,28 +549,61 @@ def _plot_agregated_alerts(self):

         self._save_custom_plot(ax, 'Different alerts', 'Difference alerts')

     def _plot_plain_alerts(self):
+        """Plot the total alerts.
+
+        This method creates and saves a plot for the total alerts.
+        """
         _, ax = plt.subplots()
         self._basic_plot(ax=ax, dataframe=self.dataframe['Total alerts'], label='Total alerts',
                          color=self._color_palette(1)[0])

         self._save_custom_plot(ax, 'Total alerts', 'Total alerts')

-
     def plot(self):
+        """Plot the indexer alerts data.
+
+        This method creates and saves plots for both total alerts and aggregated alerts.
+        """
         self._plot_plain_alerts()
         self._plot_agregated_alerts()


 class IndexerVulnerabilities(DataVisualizer):
-    expected_fields = ['Total vulnerabilities']
+    """
+    A class for visualizing indexer vulnerabilities data.

-    def _get_expected_fields(self):
-        return self.expected_fields
+    Attributes:
+        dataframes_paths (list): paths of the CSVs.
+        dataframe (pandas.Dataframe): dataframe containing the info from all the CSVs.
+        store_path (str): path to store the CSV images. Defaults to the temp directory.
+        expected_fields (list): List of expected fields for indexer vulnerabilities.
+    """
+    expected_fields = ['Total vulnerabilities']

     def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
+        """Initialize the IndexerVulnerabilities visualizer.
+
+        Args:
+            dataframes_paths (list): List of paths to dataframes containing indexer vulnerabilities data.
+            store_path (str, optional): Path to store visualizations. Defaults to system temp directory.
+            base_name (str, optional): Base name for saved visualizations. Defaults to None.
+        """
         super().__init__(dataframes_paths, store_path, base_name)
         self._validate_dataframe()

+    def _get_expected_fields(self):
+        """Get the list of expected fields for indexer vulnerabilities.
+
+        Returns:
+            list: List of expected fields.
+        """
+        return self.expected_fields
+
+    def plot(self):
+        """Plot the indexer vulnerabilities data.
+
+        This method creates and saves a plot for the total vulnerabilities.
+        """
         _, ax = plt.subplots()
         self._basic_plot(ax=ax, dataframe=self.dataframe['Total vulnerabilities'], label='Indexed Vulnerabilities',
                          color=self._color_palette(1)[0])
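The diff above fixes the shape every visualizer now shares: subclasses implement _get_expected_fields() and plot(), and run _validate_dataframe() after loading their CSVs. A hypothetical minimal subclass (not part of the patch; the 'EPS' column name is invented for illustration) shows the contract:

    from tempfile import gettempdir

    import matplotlib.pyplot as plt

    from wazuh_testing.tools.performance.visualization import DataVisualizer

    class EventsPerSecondVisualizer(DataVisualizer):
        """Hypothetical example subclass accepting a single 'EPS' column."""
        expected_fields = ['EPS']

        def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
            super().__init__(dataframes_paths, store_path, base_name)
            # Raises ValueError on missing mandatory columns, logs a warning on extras.
            self._validate_dataframe()

        def _get_expected_fields(self):
            return self.expected_fields

        def plot(self):
            _, ax = plt.subplots()
            self._basic_plot(ax, self.dataframe['EPS'], label='EPS',
                             color=self._color_palette(1)[0])
            self._save_custom_plot(ax, 'EPS', 'Events per second')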
From 0312c7e3d43a15190f56f9a75f2cfba1ad84c1bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Tue, 25 Jun 2024 09:59:49 +0100
Subject: [PATCH 15/19] refac: refactor strategy selection logic

---
 .../scripts/data_visualizations.py | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
index 4866fdaace..df8025c395 100644
--- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
+++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
@@ -18,6 +18,7 @@
                      'indexer-vulnerabilities']
 strategy_plot_by_target = {
     'binary': BinaryDatavisualizer,
+    'daemon-statistics': DaemonStatisticsVisualizer,
     'cluster': ClusterStatisticsVisualizer,
     'logcollector': LogcollectorStatisticsVisualizer,
     'indexer-alerts': IndexerAlerts,
@@ -61,19 +62,20 @@ def main():
     target = options.visualization_target
     validate_arguments(options)

+    visualization_options = {
+        'dataframes': options.csv_list,
+        'store_path': options.destination,
+        'base_name': options.name
+    }
+
+    strategy = target
     if target in ['analysis', 'remote', 'wazuhdb']:
-        dv = DaemonStatisticsVisualizer(options.csv_list, daemon=target,
-                                        store_path=options.destination,
-                                        base_name=options.name)
+        visualization_options['daemon'] = target
+        strategy = 'daemon-statistics'
     elif target == 'binary':
-        dv = BinaryDatavisualizer(options.csv_list,
-                                  store_path=options.destination,
-                                  base_name=options.name,
-                                  unify_child_daemon_metrics=options.unify)
-    else:
-        dv = strategy_plot_by_target[target](options.csv_list,
-                                             store_path=options.destination,
-                                             base_name=options.name)
+        visualization_options['unify_child_daemon_metrics'] = options.unify
+
+    dv = strategy_plot_by_target[strategy](**visualization_options)

     dv.plot()
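For context, the refactor above reduces main() to a table lookup: each target maps to a visualizer class in strategy_plot_by_target, and target-specific keyword arguments are folded into a single options dict before instantiation. A condensed sketch of the resulting flow (the helper name and CSV paths are illustrative, not from the patch):

    from wazuh_testing.tools.performance.visualization import (
        BinaryDatavisualizer,
        DaemonStatisticsVisualizer,
    )

    strategy_plot_by_target = {
        'binary': BinaryDatavisualizer,
        'daemon-statistics': DaemonStatisticsVisualizer,
    }

    def build_visualizer(target, csv_list, destination, name=None, unify=False):
        # Common kwargs first, then the target-specific extras.
        options = {'dataframes': csv_list, 'store_path': destination, 'base_name': name}
        strategy = target
        if target in ('analysis', 'remote', 'wazuhdb'):
            options['daemon'] = target  # one visualizer, parametrized by daemon name
            strategy = 'daemon-statistics'
        elif target == 'binary':
            options['unify_child_daemon_metrics'] = unify
        return strategy_plot_by_target[strategy](**options)

    # e.g. build_visualizer('analysis', ['analysisd_stats.csv'], '/tmp/plots').plot()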
From 6c405dcc7fb97b261d4077134678f8561144524e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Tue, 25 Jun 2024 10:00:03 +0100
Subject: [PATCH 16/19] fix: docstring indentation

---
 .../wazuh_testing/tools/performance/visualization.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
index bb8127a260..85842c8ed4 100644
--- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
+++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
@@ -458,13 +458,14 @@ class ClusterStatisticsVisualizer(DataVisualizer):
     expected_cluster_fields= ['node_name', 'activity', 'time_spent(s)']

     def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
-        """Initialize the ClusterStatisticsVisualizer.
+        """Initialize the ClusterStatisticsVisualizer.

         Args:
             dataframes_paths (list): List of paths to dataframes containing cluster statistics data.
             store_path (str, optional): Path to store visualizations. Defaults to system temp directory.
             base_name (str, optional): Base name for saved visualizations. Defaults to None.
         """
+
         super().__init__(dataframes_paths, store_path, base_name)
         self._validate_dataframe()

From be0afc8fab0a27f458b6a607d85797c31ac7a4fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Tue, 25 Jun 2024 10:02:43 +0100
Subject: [PATCH 17/19] docs: fix docstrings styling

---
 .../wazuh_testing/tools/performance/visualization.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
index 85842c8ed4..0611ce262f 100644
--- a/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
+++ b/deps/wazuh_testing/wazuh_testing/tools/performance/visualization.py
@@ -465,7 +465,6 @@ def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None):
             store_path (str, optional): Path to store visualizations. Defaults to system temp directory.
             base_name (str, optional): Base name for saved visualizations. Defaults to None.
         """
-
         super().__init__(dataframes_paths, store_path, base_name)
         self._validate_dataframe()

@@ -569,8 +568,7 @@ def plot(self):


 class IndexerVulnerabilities(DataVisualizer):
-    """
-    A class for visualizing indexer vulnerabilities data.
+    """A class for visualizing indexer vulnerabilities data.

     Attributes:
         dataframes_paths (list): paths of the CSVs.
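As background for the IndexerAlerts code documented earlier in this series: _plot_agregated_alerts() converts the cumulative 'Total alerts' counter into a rate by differencing consecutive samples and dividing by the sampling interval, which _calculate_timestamp_interval() derives from the first two index entries. A standalone sketch of that arithmetic on synthetic data (pandas only; the numbers are made up):

    import pandas as pd

    # Synthetic cumulative alert counts, one sample every 10 seconds.
    index = pd.date_range('2024-06-25 10:00:00', periods=5, freq='10s')
    df = pd.DataFrame({'Total alerts': [0, 120, 300, 450, 450]}, index=index)

    # Same computation as IndexerAlerts: per-sample increase over the interval.
    interval = (df.index[1] - df.index[0]).total_seconds()
    df['Difference'] = df['Total alerts'].diff() / interval

    print(df['Difference'])  # alerts per second; the first row is NaN by construction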
From 9430c86dec64776e1c6ef140b30ca8a555b42a0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rebollo=20P=C3=A9rez?=
Date: Tue, 25 Jun 2024 10:04:48 +0100
Subject: [PATCH 18/19] refac: supported daemon statistics list

---
 deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
index df8025c395..0c1c81015a 100644
--- a/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
+++ b/deps/wazuh_testing/wazuh_testing/scripts/data_visualizations.py
@@ -16,6 +16,7 @@
 supported_targets = ['binary', 'analysis', 'remote', 'wazuhdb', 'logcollector', 'cluster', 'indexer-alerts',
                      'indexer-vulnerabilities']
+daemon_supported_statistics = ['analysis', 'remote', 'wazuhdb']
 strategy_plot_by_target = {
     'binary': BinaryDatavisualizer,
     'daemon-statistics': DaemonStatisticsVisualizer,
@@ -69,7 +70,7 @@ def main():
     strategy = target
-    if target in ['analysis', 'remote', 'wazuhdb']:
+    if target in daemon_supported_statistics:
         visualization_options['daemon'] = target
         strategy = 'daemon-statistics'
     elif target == 'binary':
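Each entry in daemon_supported_statistics above corresponds to a <daemon>_csv_headers.json file that DaemonStatisticsVisualizer resolves through _get_statistic_plot_data_file(). A small sketch of that resolution (the data directory is abbreviated here; the real path is computed relative to visualization.py):

    from os.path import join

    statistics_filename_suffix = '_csv_headers.json'
    data_dir = 'wazuh_testing/data/data_visualizer'  # abbreviated for the example

    for daemon in ['analysis', 'remote', 'wazuhdb']:
        # Mirrors DaemonStatisticsVisualizer._get_statistic_plot_data_file()
        print(join(data_dir, daemon + statistics_filename_suffix))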
""" binary_metrics_fields_to_plot = ["CPU", "VMS", "RSS", "USS", - "PSS", "SWAP", "FD", "Read_Ops", - "Write_Ops", "Disk_Read", "Disk_Written", - "Disk_Read_Speed", "Disk_Write_Speed"] + "PSS", "SWAP", "FD", "Read_Ops", + "Write_Ops", "Disk_Read", "Disk_Written", + "Disk_Read_Speed", "Disk_Write_Speed"] binary_metrics_extra_fields = ["Daemon", "Version", "PID"] binary_metrics_fields = binary_metrics_fields_to_plot + binary_metrics_extra_fields @@ -308,7 +309,7 @@ def plot(self): colors = self._color_palette(len(daemons)) for daemon, color in zip(daemons, colors): self._basic_plot(ax, self.dataframe[self.dataframe.Daemon == daemon][element], - label=daemon, color=color) + label=daemon, color=color) self._save_custom_plot(ax, element, element) @@ -392,6 +393,7 @@ def plot(self): self._basic_plot(ax, self.dataframe[column], label=column, color=color) self._save_custom_plot(ax, title, title) + class LogcollectorStatisticsVisualizer(DaemonStatisticsVisualizer): """A class for visualizing logcollector statistics data. @@ -441,10 +443,11 @@ def plot(self): colors = self._color_palette(len(targets)) for target, color in zip(targets, colors): self._basic_plot(ax, self.dataframe[self.dataframe.Location == target][element['columns']], - label=target, color=color) + label=target, color=color) self._save_custom_plot(ax, element['title'], element['title']) + class ClusterStatisticsVisualizer(DataVisualizer): """A class for visualizing cluster statistics data. @@ -455,7 +458,7 @@ class ClusterStatisticsVisualizer(DataVisualizer): base_name (str, optional): base name used to store the images. expected_cluster_fields (list): List of expected fields for cluster statistics. """ - expected_cluster_fields= ['node_name', 'activity', 'time_spent(s)'] + expected_cluster_fields = ['node_name', 'activity', 'time_spent(s)'] def __init__(self, dataframes_paths, store_path=gettempdir(), base_name=None): """Initialize the ClusterStatisticsVisualizer. @@ -490,9 +493,9 @@ def plot(self): current_df.reset_index(drop=True, inplace=True) for node, color in zip(nodes, self._color_palette(len(nodes))): self._basic_plot(ax=ax, dataframe=current_df[current_df.node_name == node]['time_spent(s)'], - label=node, color=color) + label=node, color=color) self._save_custom_plot(ax, 'time_spent(s)', element.replace(' ', '_').lower(), disable_x_labels=True, - statistics=DataVisualizer._get_statistics( + statistics=DataVisualizer._get_statistics( current_df['time_spent(s)'], calculate_mean=True, calculate_median=True)) @@ -597,7 +600,6 @@ def _get_expected_fields(self): """ return self.expected_fields - def plot(self): """Plot the indexer vulnerabilities data.