From 407b95733081aaa1eccddb9ed4ae711de48d388b Mon Sep 17 00:00:00 2001 From: Marcel Zwiers Date: Mon, 4 Nov 2024 17:58:20 +0100 Subject: [PATCH] Bugfixes to deal with duplicate column names --- bidscoin/bids.py | 44 ++++++++++++++++++--------------- bidscoin/plugins/events2bids.py | 44 +++++++++++++++++++++++---------- 2 files changed, 55 insertions(+), 33 deletions(-) diff --git a/bidscoin/bids.py b/bidscoin/bids.py index c614513e..041badc5 100644 --- a/bidscoin/bids.py +++ b/bidscoin/bids.py @@ -102,15 +102,14 @@ def eventstable(self) -> pd.DataFrame: if not self.isvalid: return pd.DataFrame() - # Take the columns of interest from the logtable and rename them df = copy.deepcopy(self.logtable) # Convert the timing values to seconds (with maximally 4 digits after the decimal point) df[self.time['cols']] = (df[self.time['cols']].apply(pd.to_numeric, errors='coerce') / self.time['unit']).round(4) - # Take the columns of interest and from now on use the BIDS column names - df = df.loc[:, [name for item in self.columns for name in item.values()]] - df.columns = [name for item in self.columns for name in item.keys()] + # Take the logtable columns of interest and from now on use the BIDS column names + df = df.loc[:, [sourcecol for item in self.columns for sourcecol in item.values() if sourcecol]] + df.columns = [eventscol for item in self.columns for eventscol, sourcecol in item.items() if sourcecol] # Set the clock at zero at the start of the experiment if self.time.get('start'): @@ -121,22 +120,22 @@ def eventstable(self) -> pd.DataFrame: df['onset'] = df['onset'] - df['onset'][start].iloc[0] # Take the time of the first occurrence as zero # Loop over the row groups to filter/edit the rows - rows = pd.Series([len(self.rows) == 0] * len(df)).astype(bool) # Series with True values if no row expressions were specified + rows = pd.Series([len(self.rows) == 0] * len(df)).astype(bool) # Boolean series with True values if no row expressions were specified for group in self.rows: for column, regex in group['include'].items(): - # Get the rows that match the expression + # Get the rows that match the expression, i.e. make them True rowgroup = self.logtable[column].astype(str).str.fullmatch(str(regex)) # Add the matching rows to the grand rows group - rows |= rowgroup + rows |= rowgroup.values # Write the value(s) of the matching rows - for newcolumn, newvalue in (group.get('cast') or {}).items(): - df.loc[rowgroup, newcolumn] = newvalue + for colname, values in (group.get('cast') or {}).items(): + df.loc[rowgroup, colname] = values - return df.loc[rows].sort_values(by='onset') + return df.loc[rows.values].sort_values(by='onset') @property def columns(self) -> List[dict]: @@ -177,30 +176,35 @@ def is_float(s): return False if not (valid := len(self.columns) >= 2): - LOGGER.warning(f"Events table must have at least two columns, got {len(self.columns)} instead") + LOGGER.warning(f"Events table must have at least two columns, got {len(self.columns)} instead\n{self}") return False if (key := [*self.columns[0].keys()][0]) != 'onset': - LOGGER.warning(f"First events column must be named 'onset', got '{key}' instead") + LOGGER.warning(f"First events column must be named 'onset', got '{key}' instead\n{self}") valid = False if (key := [*self.columns[1].keys()][0]) != 'duration': - LOGGER.warning(f"Second events column must be named 'duration', got '{key}' instead") + LOGGER.warning(f"Second events column must be named 'duration', got '{key}' instead\n{self}") valid = False if len(self.time.get('cols',[])) < 2: - LOGGER.warning(f"Events table must have at least two timecol items, got {len(self.time.get('cols',[]))} instead") + LOGGER.warning(f"Events table must have at least two timecol items, got {len(self.time.get('cols',[]))} instead\n{self}") return False elif not is_float(self.time.get('unit')): - LOGGER.warning(f"Time conversion factor must be a float, got '{self.time.get('unit')}' instead") + LOGGER.warning(f"Time conversion factor must be a float, got '{self.time.get('unit')}' instead\n{self}") valid = False + # Check if the logtable has existing and unique column names + df = self.logtable for name in set([name for item in self.columns for name in item.values()] + [name for item in self.rows for name in item['include'].keys()] + [*self.time.get('start',{}).keys()] + self.time.get('cols',[])): - if name not in self.logtable: - LOGGER.warning(f"Column '{name}' not found in the event table of {self.sourcefile}") + if name and name not in df: + LOGGER.warning(f"Column '{name}' not found in the event table of {self}") valid = False + if not df.columns[df.columns.duplicated()].empty: + LOGGER.warning(f"Duplicate columns found in: {df.columns}\n{self}") + valid = False return valid @@ -677,7 +681,7 @@ def check(self, checks: Tuple[bool, bool, bool]=(False, False, False)) -> Tuple[ for ext in ('.tsv', '.tsv.gz'): # NB: `ext` used to be '.json', which is more generic (but see https://github.com/bids-standard/bids-validator/issues/2113) if run_suffixok := bids_validator.BIDSValidator().is_bids(f"/sub-unknown/{datatype}/{bidsname}{ext}"): break # NB: Using the BIDSValidator sounds nice but doesn't give any control over the BIDS-version run_valsok = run_suffixok - LOGGER.bcdebug(f"bidsname={run_suffixok}: /sub-unknown/{datatype}/{bidsname}.*") + LOGGER.bcdebug(f"bidsname (suffixok={run_suffixok}): /sub-unknown/{datatype}/{bidsname}.*") if checks[0] and run_keysok in (None, False): LOGGER.bcdebug(f'Invalid "{run_keysok}" key-checks in run-item: "{bids["suffix"]}" ({datatype} -> {provenance})\nRun["bids"]:\t{bids}') @@ -1100,8 +1104,8 @@ def __init__(self, yamlfile: Path, folder: Path=templatefolder, plugins: Iterabl module = bcoin.import_plugin(plugin) if not self.plugins.get(plugin): LOGGER.info(f"Adding default bidsmap options from the {plugin} plugin") - self.plugins[plugin] = module.OPTIONS if 'OPTIONS' in dir(module) else {} - if 'BIDSMAP' in dir(module) and yamlfile.parent == templatefolder: + self.plugins[plugin] = module.OPTIONS if hasattr(module, 'OPTIONS') else {} + if hasattr(module, 'BIDSMAP') and yamlfile.parent == templatefolder: for dataformat, datasection in module.BIDSMAP.items(): if dataformat not in bidsmap_data: LOGGER.info(f"Adding default bidsmappings from the {plugin} plugin") diff --git a/bidscoin/plugins/events2bids.py b/bidscoin/plugins/events2bids.py index c7b4cc99..e0f8e0f8 100644 --- a/bidscoin/plugins/events2bids.py +++ b/bidscoin/plugins/events2bids.py @@ -249,26 +249,27 @@ def __init__(self, sourcefile: Path, _data: dict, options: dict): def logtable(self) -> pd.DataFrame: """Returns a Presentation log-table""" - nrows = len(self._sourcetable) - stimulus_header = (self._sourcetable.iloc[:, 0] == 'Event Type').idxmax() or nrows - video_header = (self._sourcetable.iloc[:, 0] == 'filename').idxmax() or nrows - survey_header = (self._sourcetable.iloc[:, 0] == 'Time').idxmax() or nrows - - # Keep only the event, stimulus, video or survey table - self._sourcetable.columns = self._columns + df = self._sourcetable + nrows = len(df) + stimulus_header = (df.iloc[:, 0] == 'Event Type').idxmax() or nrows + video_header = (df.iloc[:, 0] == 'filename').idxmax() or nrows + survey_header = (df.iloc[:, 0] == 'Time').idxmax() or nrows + + # Get the row indices to slice the event, stimulus, video or survey table + df.columns = self._columns if self.options['table'] == 'event': begin = 0 end = min(stimulus_header, video_header, survey_header) elif self.options['table'] == 'stimulus': - self._sourcetable.columns = self._sourcetable.iloc[stimulus_header] + df.columns = df.iloc[stimulus_header] begin = stimulus_header + 1 end = min(video_header, survey_header) elif self.options['table'] == 'video': - self._sourcetable.columns = self._sourcetable.iloc[video_header] + df.columns = df.iloc[video_header] begin = video_header + 1 end = survey_header elif self.options['table'] == 'survey': - self._sourcetable.columns = self._sourcetable.iloc[survey_header] + df.columns = df.iloc[survey_header] begin = survey_header + 1 end = nrows else: @@ -276,6 +277,23 @@ def logtable(self) -> pd.DataFrame: end = nrows LOGGER.error(f"NOT IMPLEMENTED TABLE: {self.options['table']}") - LOGGER.bcdebug(f"Slicing '{self.options['table']}' sourcetable[{begin}:{end}]") - - return self._sourcetable.iloc[begin:end] + LOGGER.bcdebug(f"Slicing '{self.options['table']}{df.shape}' sourcetable[{begin}:{end}]") + + # Ensure unique column names by renaming columns with NaN or empty names and by appending suffixes to duplicate names + cols = [] # The new column names + dupl = {} # The duplicate index number + for i, col in enumerate(df.columns): + if pd.isna(col) or col == '': # Check if the column name is NaN or an empty string + cols.append(new_col := f"unknown_{i}") + LOGGER.info(f"Renaming empty column name at index {i}: {col} -> {new_col}") + elif col in dupl: # If duplicate, append the index number + dupl[col] += 1 + cols.append(new_col := f"{col}_{dupl[col]}") + LOGGER.info(f"Renaming duplicate column name: {col} -> {new_col}") + else: # First occurrence of the column name, add it to dupl + dupl[col] = 0 + cols.append(col) + df.columns = cols + + # Return the sliced the table + return df.iloc[begin:end]