diff --git a/.github/workflows/dtypes_benchmark.yml b/.github/workflows/dtypes_benchmark.yml index 18355116c..fc4451791 100644 --- a/.github/workflows/dtypes_benchmark.yml +++ b/.github/workflows/dtypes_benchmark.yml @@ -39,6 +39,8 @@ jobs: run: | invoke benchmark-dtypes + continue-on-error: true + # Upload the json files as artifacts - name: Upload artifacts uses: actions/upload-artifact@v3 diff --git a/tests/_external/gdrive_utils.py b/tests/_external/gdrive_utils.py index 332787b77..5d7b6290b 100644 --- a/tests/_external/gdrive_utils.py +++ b/tests/_external/gdrive_utils.py @@ -101,7 +101,23 @@ def _set_column_width(writer, results, sheet_name): writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width + 2) -def save_to_gdrive(output_folder, results, output_filename=None): +def _set_color_fields(worksheet, data, marked_data, writer, color_code): + for _, row in marked_data.iterrows(): + dtype = row['dtype'] + sdtype = row['sdtype'] + method = row['method'] + + format_code = writer.book.add_format({'bg_color': color_code}) + + for data_row in range(len(data)): + if data.loc[data_row, 'dtype'] == dtype and data.loc[data_row, 'sdtype'] == sdtype: + method_col = data.columns.get_loc(method) + worksheet.write( + data_row + 1, method_col, bool(data.loc[data_row, method]), format_code + ) + + +def save_to_gdrive(output_folder, results, output_filename=None, mark_results=None): """Save a ``DataFrame`` to google drive folder as ``xlsx`` (spreadsheet). Given the output folder id (google drive folder id), store the given ``results`` as @@ -117,6 +133,8 @@ def save_to_gdrive(output_folder, results, output_filename=None): output_filename (str, optional): String representing the filename to be used for the results spreadsheet. If None, uses to the current date and commit as the name. Defaults to None. + mark_results (dict, optional): + A dict mapping to mark the results. Returns: str: @@ -126,11 +144,16 @@ def save_to_gdrive(output_folder, results, output_filename=None): output_filename = _generate_filename() output = io.BytesIO() - with pd.ExcelWriter(output, engine='xlsxwriter') as writer: # pylint: disable=E0110 for sheet_name, data in results.items(): data.to_excel(writer, sheet_name=sheet_name, index=False) _set_column_width(writer, data, sheet_name) + if mark_results: + for color_code, marked_results in mark_results.items(): + marked_data = marked_results[marked_results['python_version'] == sheet_name] + if not marked_data.empty: + worksheet = writer.sheets[sheet_name] + _set_color_fields(worksheet, data, marked_data, writer, color_code) file_config = {'title': output_filename, 'parents': [{'id': output_folder}]} drive = _get_drive_client() diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py index 082ff58d1..d6b871a5b 100644 --- a/tests/benchmark/utils.py +++ b/tests/benchmark/utils.py @@ -62,20 +62,20 @@ def _get_output_filename(): def compare_previous_result_with_current(): """Compare the previous result with the current and post a message on slack.""" + + new_supported_dtypes = [] + unsupported_dtypes = [] + previously_unseen_dtypes = [] for result in Path('results/').rglob('*.json'): python_version = result.stem current_results = _load_temp_results(result) csv_output = Path(f'results/{python_version}.csv') current_results.to_csv(csv_output, index=False) - new_supported_dtypes = [] - unsupported_dtypes = [] - previously_unseen_dtypes = [] - for index, row in current_results.iterrows(): dtype = row['dtype'] sdtype = row['sdtype'] - for col in current_results.columns[1:]: + for col in current_results.columns[2:]: current_value = row[col] stored_value, previously_seen = get_previous_dtype_result( dtype, @@ -135,7 +135,6 @@ def save_results_to_json(results, filename=None): Defaults to `None`. """ filename = filename or TEMPRESULTS - if os.path.exists(filename): with open(filename, 'r') as file: try: @@ -150,27 +149,70 @@ def save_results_to_json(results, filename=None): def calculate_support_percentage(df): """Calculate the percentage of supported features (True) for each dtype in a DataFrame.""" - feature_columns = df.drop(columns=['dtype']) + feature_columns = df.drop(columns=['dtype', 'sdtype']) # Calculate percentage of TRUE values for each row (dtype) percentage_support = feature_columns.mean(axis=1) * 100 - return pd.DataFrame({'dtype': df['dtype'], 'percentage_supported': percentage_support}) + + df = pd.DataFrame({ + 'dtype': df['dtype'], + 'sdtype': df['sdtype'], + 'percentage_supported': percentage_support, + }) + df['percentage_supported'] = df['percentage_supported'].round(2) + return df + + +def _version_tuple(version_str): + return tuple(int(part) for part in version_str.split('.')) def compare_and_store_results_in_gdrive(): + """Compare results, store them in Google Drive, and post a message to Slack. + + This function compares the current results with previous ones and processes the + results to create a summary. + """ csv_handler = CSVHandler() comparison_results = compare_previous_result_with_current() results = csv_handler.read('results/') sorted_results = {} + # Sort dictionary by version keys + results = { + key: value + for key, value in sorted(results.items(), key=lambda item: _version_tuple(item[0])) + } + + # Compute the summary + summary = pd.DataFrame() + for name, df in results.items(): + df = calculate_support_percentage(df) + if summary.empty: + summary = df.rename(columns={'percentage_supported': name}) + else: + summary[name] = df['percentage_supported'] + + summary['average'] = summary[list(results)].mean(axis=1).round(2) + for col in summary.columns: + if col not in ('sdtype', 'dtype'): + summary[col] = summary[col].apply(lambda x: f'{x}%') + + sorted_results['Summary'] = summary slack_messages = [] + mark_results = {} + exit_code = 0 for key, value in comparison_results.items(): if not value.empty: sorted_results[key] = value if key == 'unsupported_dtypes': slack_messages.append(':fire: New unsupported DTypes!') + mark_results['#EB9999'] = value + exit_code = 1 + elif key == 'new_supported_dtypes': slack_messages.append(':party_blob: New DTypes supported!') + mark_results['#B7D7A8'] = value if len(slack_messages) == 0: slack_messages.append(':dealwithit: No new changes to the DTypes in SDV.') @@ -178,13 +220,13 @@ def compare_and_store_results_in_gdrive(): for key, value in results.items(): sorted_results[key] = value - file_id = save_to_gdrive(GDRIVE_OUTPUT_FOLDER, sorted_results) - + file_id = save_to_gdrive(GDRIVE_OUTPUT_FOLDER, sorted_results, mark_results=mark_results) slack_messages.append( f'See ' ) slack_message = '\n'.join(slack_messages) post_slack_message('sdv-alerts', slack_message) + sys.exit(exit_code) if __name__ == '__main__':