Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance Benchmark Report Generation #2248

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ jobs:
run: |
invoke benchmark-dtypes
continue-on-error: true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a timeout-minutes to the run_dtypes_benchmark job?

Copy link
Member Author

@pvk-developer pvk-developer Oct 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can set it up to 20 minutes. These are the actual times for it:
image

PS:
The reason for the continue-on-error is to continue and generate the report even if pytest fails. That way we can then send a message on Slack and update the reports, marking what failed during the execution.


# Upload the json files as artifacts
- name: Upload artifacts
uses: actions/upload-artifact@v3
Expand Down
27 changes: 25 additions & 2 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,23 @@ def _set_column_width(writer, results, sheet_name):
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2)


def save_to_gdrive(output_folder, results, output_filename=None):
def _set_color_fields(worksheet, data, marked_data, writer, color_code):
for _, row in marked_data.iterrows():
dtype = row['dtype']
sdtype = row['sdtype']
method = row['method']

format_code = writer.book.add_format({'bg_color': color_code})

for data_row in range(len(data)):
if data.loc[data_row, 'dtype'] == dtype and data.loc[data_row, 'sdtype'] == sdtype:
method_col = data.columns.get_loc(method)
worksheet.write(
data_row + 1, method_col, bool(data.loc[data_row, method]), format_code
)


def save_to_gdrive(output_folder, results, output_filename=None, mark_results=None):
"""Save a ``DataFrame`` to google drive folder as ``xlsx`` (spreadsheet).

Given the output folder id (google drive folder id), store the given ``results`` as
Expand All @@ -117,6 +133,8 @@ def save_to_gdrive(output_folder, results, output_filename=None):
output_filename (str, optional):
String representing the filename to be used for the results spreadsheet. If None,
uses to the current date and commit as the name. Defaults to None.
mark_results (dict, optional):
A dict mapping to mark the results.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mapping what to mark? Can the docstring describe the expected keys and values?


Returns:
str:
Expand All @@ -126,11 +144,16 @@ def save_to_gdrive(output_folder, results, output_filename=None):
output_filename = _generate_filename()

output = io.BytesIO()

with pd.ExcelWriter(output, engine='xlsxwriter') as writer: # pylint: disable=E0110
for sheet_name, data in results.items():
data.to_excel(writer, sheet_name=sheet_name, index=False)
_set_column_width(writer, data, sheet_name)
if mark_results:
for color_code, marked_results in mark_results.items():
marked_data = marked_results[marked_results['python_version'] == sheet_name]
if not marked_data.empty:
worksheet = writer.sheets[sheet_name]
_set_color_fields(worksheet, data, marked_data, writer, color_code)

file_config = {'title': output_filename, 'parents': [{'id': output_folder}]}
drive = _get_drive_client()
Expand Down
62 changes: 52 additions & 10 deletions tests/benchmark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,20 @@ def _get_output_filename():

def compare_previous_result_with_current():
"""Compare the previous result with the current and post a message on slack."""

new_supported_dtypes = []
unsupported_dtypes = []
previously_unseen_dtypes = []
for result in Path('results/').rglob('*.json'):
python_version = result.stem
current_results = _load_temp_results(result)
csv_output = Path(f'results/{python_version}.csv')
current_results.to_csv(csv_output, index=False)

new_supported_dtypes = []
unsupported_dtypes = []
previously_unseen_dtypes = []

for index, row in current_results.iterrows():
dtype = row['dtype']
sdtype = row['sdtype']
for col in current_results.columns[1:]:
for col in current_results.columns[2:]:
current_value = row[col]
stored_value, previously_seen = get_previous_dtype_result(
dtype,
Expand Down Expand Up @@ -135,7 +135,6 @@ def save_results_to_json(results, filename=None):
Defaults to `None`.
"""
filename = filename or TEMPRESULTS

if os.path.exists(filename):
with open(filename, 'r') as file:
try:
Expand All @@ -150,41 +149,84 @@ def save_results_to_json(results, filename=None):

def calculate_support_percentage(df):
    """Calculate the percentage of supported features (True) for each dtype in a DataFrame.

    Args:
        df (pd.DataFrame):
            Results table with ``dtype`` and ``sdtype`` identifier columns; every
            other column is treated as a boolean "is this method supported" flag.

    Returns:
        pd.DataFrame:
            One row per input row with ``dtype``, ``sdtype`` and the
            ``percentage_supported`` value rounded to two decimals.
    """
    feature_columns = df.drop(columns=['dtype', 'sdtype'])
    # Calculate percentage of TRUE values for each row (dtype): the row-wise
    # mean of booleans is the supported fraction.
    percentage_support = feature_columns.mean(axis=1) * 100
    df = pd.DataFrame({
        'dtype': df['dtype'],
        'sdtype': df['sdtype'],
        'percentage_supported': percentage_support,
    })
    df['percentage_supported'] = df['percentage_supported'].round(2)
    return df


def _version_tuple(version_str):
return tuple(int(part) for part in version_str.split('.'))


def compare_and_store_results_in_gdrive():
    """Compare results, store them in Google Drive, and post a message to Slack.

    This function compares the current results with previous ones and processes the
    results to create a summary. The process exits with code 1 when newly
    unsupported dtypes are detected, so CI surfaces the regression.
    """
    # Highlight colors used in the spreadsheet: red marks regressions
    # (newly unsupported dtypes), green marks newly supported ones.
    red_hex = '#EB9999'
    green_hex = '#B7D7A8'

    csv_handler = CSVHandler()
    comparison_results = compare_previous_result_with_current()

    results = csv_handler.read('results/')
    sorted_results = {}
    # Sort the per-python-version sheets by numeric version order so that
    # e.g. '3.9' precedes '3.10'.
    results = {
        key: value
        for key, value in sorted(results.items(), key=lambda item: _version_tuple(item[0]))
    }

    # Compute the summary sheet: one percentage-supported column per version.
    summary = pd.DataFrame()
    for name, df in results.items():
        support_percentages = calculate_support_percentage(df)
        if summary.empty:
            # First version seeds the dtype/sdtype identifier columns too.
            summary = support_percentages.rename(columns={'percentage_supported': name})
        else:
            summary[name] = support_percentages['percentage_supported']

    summary['average'] = summary[list(results)].mean(axis=1).round(2)
    # Render all numeric columns as percentage strings for the spreadsheet.
    for col in summary.columns:
        if col not in ('sdtype', 'dtype'):
            summary[col] = summary[col].apply(lambda x: f'{x}%')

    sorted_results['Summary'] = summary

    slack_messages = []
    mark_results = {}
    exit_code = 0
    for key, value in comparison_results.items():
        if not value.empty:
            sorted_results[key] = value
            if key == 'unsupported_dtypes':
                slack_messages.append(':fire: New unsupported DTypes!')
                mark_results[red_hex] = value
                # A regression should fail the workflow.
                exit_code = 1

            elif key == 'new_supported_dtypes':
                slack_messages.append(':party_blob: New DTypes supported!')
                mark_results[green_hex] = value

    if len(slack_messages) == 0:
        slack_messages.append(':dealwithit: No new changes to the DTypes in SDV.')

    # Append the per-version detail sheets after the Summary sheet.
    for key, value in results.items():
        sorted_results[key] = value

    file_id = save_to_gdrive(GDRIVE_OUTPUT_FOLDER, sorted_results, mark_results=mark_results)
    slack_messages.append(
        f'See <https://docs.google.com/spreadsheets/d/{file_id}|dtypes summary and details>'
    )
    slack_message = '\n'.join(slack_messages)
    post_slack_message('sdv-alerts', slack_message)
    sys.exit(exit_code)


if __name__ == '__main__':
Expand Down
Loading