
Commit

Merge pull request #18 from VectorInstitute/add_docstrings_data
Add more docstrings for lib/data.py
amrit110 authored Mar 22, 2024
2 parents 95d5811 + cb0d318 commit 5b8576f
Showing 1 changed file with 109 additions and 19 deletions.
128 changes: 109 additions & 19 deletions lib/data.py
@@ -1,20 +1,40 @@
"""
data.py.
Create custom pretrain and finetune PyTorch Dataset objects for MIMIC-IV FHIR dataset.
"""
"""Data module for pretraining and finetuning the model."""

from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import torch
from torch.utils.data import Dataset

from .tokenizer import ConceptTokenizer
from lib.tokenizer import ConceptTokenizer


class PretrainDataset(Dataset):
"""Dataset for pretraining the model."""
"""Dataset for pretraining the model.
Parameters
----------
data : pd.DataFrame
The input data containing sequences to be tokenized and masked.
tokenizer : ConceptTokenizer
An instance of the ConceptTokenizer class used for tokenizing sequences.
max_len : int, optional
The maximum length of the tokenized sequences, by default 2048.
mask_prob : float, optional
The probability of masking a token in the sequence, by default 0.15.
Attributes
----------
data : pd.DataFrame
Stores the input data.
tokenizer : ConceptTokenizer
Tokenizer used for tokenizing sequences.
max_len : int
Maximum length of the tokenized sequences.
mask_prob : float
Probability of masking a token in the sequence.
"""

def __init__(
self,
@@ -25,7 +45,6 @@ def __init__(
):
"""Initiate the class."""
super(PretrainDataset, self).__init__()

self.data = data
self.tokenizer = tokenizer
self.max_len = max_len
@@ -36,13 +55,36 @@ def __len__(self) -> int:
return len(self.data)

def tokenize_data(self, sequence: Union[str, List[str]]) -> Any:
"""Tokenize the sequence and return input_ids and attention mask."""
"""Tokenize the sequence and return input_ids and attention mask.
Parameters
----------
sequence : Union[str, List[str]]
The sequence to be tokenized.
Returns
-------
Any
A dictionary containing input_ids and attention_mask.
"""
return self.tokenizer(sequence, max_length=self.max_len)

def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""Mask the tokens in the sequence using vectorized operations."""
mask_token_id = self.tokenizer.get_mask_token_id()
"""Mask the tokens in the sequence using vectorized operations.
Parameters
----------
sequence : torch.Tensor
The sequence of tokens to be masked.
Returns
-------
Tuple[torch.Tensor, torch.Tensor]
A tuple containing masked sequence and labels.
"""
mask_token_id = self.tokenizer.get_mask_token_id()
masked_sequence = sequence.clone()

# Ignore [PAD], [UNK], [MASK] tokens
@@ -67,16 +109,24 @@ def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
dtype=torch.long,
)
masked_sequence[randomized] = random_idx[randomized]

labels = torch.where(selected, sequence, -100)

return masked_sequence, labels
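
For reference, a minimal, self-contained sketch of the vectorized masking strategy that mask_tokens implements, assuming a standard BERT-style 80/10/10 split (mask / random token / keep unchanged). The exact probabilities and the special-token handling live in the collapsed part of this hunk, so the helper below is illustrative rather than the project's implementation.

from typing import Tuple

import torch


def mask_tokens_sketch(
    sequence: torch.Tensor,
    mask_token_id: int,
    vocab_size: int,
    special_token_ids: torch.Tensor,
    mask_prob: float = 0.15,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Illustrative BERT-style masking; not the exact implementation above."""
    masked_sequence = sequence.clone()

    # Never select special tokens such as [PAD], [UNK], [MASK] for masking.
    is_special = torch.isin(sequence, special_token_ids)

    # Choose ~mask_prob of the remaining positions as prediction targets.
    selected = (torch.rand(sequence.shape) < mask_prob) & ~is_special

    # 80% of selected positions are replaced with the [MASK] token id.
    replaced = (torch.rand(sequence.shape) < 0.8) & selected
    masked_sequence[replaced] = mask_token_id

    # 10% of selected positions are replaced with a random vocabulary token.
    randomized = (torch.rand(sequence.shape) < 0.5) & selected & ~replaced
    random_idx = torch.randint(0, vocab_size, sequence.shape, dtype=torch.long)
    masked_sequence[randomized] = random_idx[randomized]

    # The remaining 10% stay unchanged; labels are -100 everywhere except the
    # selected positions, so the loss ignores tokens that were not selected.
    labels = torch.where(selected, sequence, -100)
    return masked_sequence, labels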

def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
"""Get data at corresponding index.
Return it as a dictionary including
all different token sequences along with attention mask and labels.
Parameters
----------
idx : int
The index of the data to be retrieved.
Returns
-------
Dict[str, torch.Tensor]
A dictionary containing all different token sequences along with
attention mask and labels.
"""
data = self.data.iloc[idx]
tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"])
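
As a rough end-to-end usage sketch for this class (the tokenizer construction and the parquet file name below are assumptions for illustration; only the event_tokens_{max_len} column name comes from the line above):

import pandas as pd
from torch.utils.data import DataLoader

from lib.data import PretrainDataset
from lib.tokenizer import ConceptTokenizer

# Hypothetical setup; the real tokenizer fitting and data loading are project-specific.
tokenizer = ConceptTokenizer()  # assumed default construction
data = pd.read_parquet("patient_sequences.parquet")  # assumed file; needs an "event_tokens_2048" column

pretrain_dataset = PretrainDataset(data=data, tokenizer=tokenizer, max_len=2048, mask_prob=0.15)
sample = pretrain_dataset[0]  # dict of tensors: token sequences, attention mask, MLM labels
loader = DataLoader(pretrain_dataset, batch_size=32, shuffle=True)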
@@ -110,7 +160,27 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:


class FinetuneDataset(Dataset):
"""Dataset for finetuning the model."""
"""Dataset for finetuning the model.
Parameters
----------
data : pd.DataFrame
The input data containing sequences to be tokenized.
tokenizer : ConceptTokenizer
An instance of the ConceptTokenizer class used for tokenizing sequences.
max_len : int, optional
The maximum length of the tokenized sequences, by default 2048.
Attributes
----------
data : pd.DataFrame
Stores the input data.
tokenizer : ConceptTokenizer
Tokenizer used for tokenizing sequences.
max_len : int
Maximum length of the tokenized sequences.
"""

def __init__(
self,
@@ -120,7 +190,6 @@ def __init__(
):
"""Initiate the class."""
super(FinetuneDataset, self).__init__()

self.data = data
self.tokenizer = tokenizer
self.max_len = max_len
@@ -130,14 +199,35 @@ def __len__(self) -> int:
return len(self.data)

def tokenize_data(self, sequence: Union[str, List[str]]) -> Any:
"""Tokenize the sequence and return input_ids and attention mask."""
"""Tokenize the sequence and return input_ids and attention mask.
Parameters
----------
sequence : Union[str, List[str]]
The sequence to be tokenized.
Returns
-------
Any
A dictionary containing input_ids and attention_mask.
"""
return self.tokenizer(sequence, max_length=self.max_len)

def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
"""Get data at corresponding index.
Return it as a dictionary including
all different token sequences along with attention mask and labels.
Parameters
----------
idx : int
The index of the data to be retrieved.
Returns
-------
Dict[str, torch.Tensor]
A dictionary containing all different token sequences along with
attention mask and labels.
"""
data = self.data.iloc[idx]
tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"])
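
And a correspondingly minimal sketch for finetuning, reusing the data and tokenizer objects from the pretraining sketch above; the label handling inside __getitem__ is collapsed in this diff, so only the documented interface is shown:

from torch.utils.data import DataLoader

from lib.data import FinetuneDataset

# Reuses `data` and `tokenizer` from the PretrainDataset sketch above.
finetune_dataset = FinetuneDataset(data=data, tokenizer=tokenizer, max_len=2048)

for batch in DataLoader(finetune_dataset, batch_size=16, shuffle=True):
    # Each batch is a dict of stacked tensors (token sequences, attention mask, labels).
    break  # inspect the first batch only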
