
Commit

Merge pull request #18 from VectorInstitute/add_docstrings_data
Add more docstrings for lib/data.py
amrit110 authored Mar 22, 2024
2 parents 95d5811 + cb0d318 commit 5b8576f
Showing 1 changed file with 109 additions and 19 deletions.
128 changes: 109 additions & 19 deletions lib/data.py
@@ -1,20 +1,40 @@
"""
data.py.
Create custom pretrain and finetune PyTorch Dataset objects for MIMIC-IV FHIR dataset.
"""
"""Data module for pretraining and finetuning the model."""

from typing import Any, Dict, List, Tuple, Union

import pandas as pd
import torch
from torch.utils.data import Dataset

from .tokenizer import ConceptTokenizer
from lib.tokenizer import ConceptTokenizer


class PretrainDataset(Dataset):
"""Dataset for pretraining the model."""
"""Dataset for pretraining the model.
Parameters
----------
data : pd.DataFrame
The input data containing sequences to be tokenized and masked.
tokenizer : ConceptTokenizer
An instance of the ConceptTokenizer class used for tokenizing sequences.
max_len : int, optional
The maximum length of the tokenized sequences, by default 2048.
mask_prob : float, optional
The probability of masking a token in the sequence, by default 0.15.
Attributes
----------
data : pd.DataFrame
Stores the input data.
tokenizer : ConceptTokenizer
Tokenizer used for tokenizing sequences.
max_len : int
Maximum length of the tokenized sequences.
mask_prob : float
Probability of masking a token in the sequence.
"""

def __init__(
self,
@@ -25,7 +45,6 @@ def __init__(
):
"""Initiate the class."""
super(PretrainDataset, self).__init__()

self.data = data
self.tokenizer = tokenizer
self.max_len = max_len
@@ -36,13 +55,36 @@ def __len__(self) -> int:
return len(self.data)

def tokenize_data(self, sequence: Union[str, List[str]]) -> Any:
"""Tokenize the sequence and return input_ids and attention mask."""
"""Tokenize the sequence and return input_ids and attention mask.
Parameters
----------
sequence : Union[str, List[str]]
The sequence to be tokenized.
Returns
-------
Any
A dictionary containing input_ids and attention_mask.
"""
return self.tokenizer(sequence, max_length=self.max_len)

def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""Mask the tokens in the sequence using vectorized operations."""
mask_token_id = self.tokenizer.get_mask_token_id()
"""Mask the tokens in the sequence using vectorized operations.
Parameters
----------
sequence : torch.Tensor
The sequence of tokens to be masked.
Returns
-------
Tuple[torch.Tensor, torch.Tensor]
A tuple containing masked sequence and labels.
"""
mask_token_id = self.tokenizer.get_mask_token_id()
masked_sequence = sequence.clone()

# Ignore [PAD], [UNK], [MASK] tokens
@@ -67,16 +109,24 @@ def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
dtype=torch.long,
)
masked_sequence[randomized] = random_idx[randomized]

labels = torch.where(selected, sequence, -100)

return masked_sequence, labels
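
For reference, a minimal, self-contained sketch of the vectorized masking strategy that mask_tokens implements, assuming a standard BERT-style 80/10/10 split (mask / random token / keep unchanged). The exact probabilities and the special-token handling live in the collapsed part of this hunk, so the helper below is illustrative rather than the project's implementation.

from typing import Tuple

import torch


def mask_tokens_sketch(
    sequence: torch.Tensor,
    mask_token_id: int,
    vocab_size: int,
    special_token_ids: torch.Tensor,
    mask_prob: float = 0.15,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Illustrative BERT-style masking; not the exact implementation above."""
    masked_sequence = sequence.clone()

    # Never select special tokens such as [PAD], [UNK], [MASK] for masking.
    is_special = torch.isin(sequence, special_token_ids)

    # Choose ~mask_prob of the remaining positions as prediction targets.
    selected = (torch.rand(sequence.shape) < mask_prob) & ~is_special

    # 80% of selected positions are replaced with the [MASK] token id.
    replaced = (torch.rand(sequence.shape) < 0.8) & selected
    masked_sequence[replaced] = mask_token_id

    # 10% of selected positions are replaced with a random vocabulary token.
    randomized = (torch.rand(sequence.shape) < 0.5) & selected & ~replaced
    random_idx = torch.randint(0, vocab_size, sequence.shape, dtype=torch.long)
    masked_sequence[randomized] = random_idx[randomized]

    # The remaining 10% stay unchanged; labels are -100 everywhere except the
    # selected positions, so the loss ignores tokens that were not selected.
    labels = torch.where(selected, sequence, -100)
    return masked_sequence, labels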

def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
"""Get data at corresponding index.
Return it as a dictionary including
all different token sequences along with attention mask and labels.
Parameters
----------
idx : int
The index of the data to be retrieved.
Returns
-------
Dict[str, torch.Tensor]
A dictionary containing all different token sequences along with
attention mask and labels.
"""
data = self.data.iloc[idx]
tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"])
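
As a rough end-to-end usage sketch for this class (the tokenizer construction and the parquet file name below are assumptions for illustration; only the event_tokens_{max_len} column name comes from the line above):

import pandas as pd
from torch.utils.data import DataLoader

from lib.data import PretrainDataset
from lib.tokenizer import ConceptTokenizer

# Hypothetical setup; the real tokenizer fitting and data loading are project-specific.
tokenizer = ConceptTokenizer()  # assumed default construction
data = pd.read_parquet("patient_sequences.parquet")  # assumed file; needs an "event_tokens_2048" column

pretrain_dataset = PretrainDataset(data=data, tokenizer=tokenizer, max_len=2048, mask_prob=0.15)
sample = pretrain_dataset[0]  # dict of tensors: token sequences, attention mask, MLM labels
loader = DataLoader(pretrain_dataset, batch_size=32, shuffle=True)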
@@ -110,7 +160,27 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:


class FinetuneDataset(Dataset):
"""Dataset for finetuning the model."""
"""Dataset for finetuning the model.
Parameters
----------
data : pd.DataFrame
The input data containing sequences to be tokenized.
tokenizer : ConceptTokenizer
An instance of the ConceptTokenizer class used for tokenizing sequences.
max_len : int, optional
The maximum length of the tokenized sequences, by default 2048.
Attributes
----------
data : pd.DataFrame
Stores the input data.
tokenizer : ConceptTokenizer
Tokenizer used for tokenizing sequences.
max_len : int
Maximum length of the tokenized sequences.
"""

def __init__(
self,
@@ -120,7 +190,6 @@ def __init__(
):
"""Initiate the class."""
super(FinetuneDataset, self).__init__()

self.data = data
self.tokenizer = tokenizer
self.max_len = max_len
@@ -130,14 +199,35 @@ def __len__(self) -> int:
return len(self.data)

def tokenize_data(self, sequence: Union[str, List[str]]) -> Any:
"""Tokenize the sequence and return input_ids and attention mask."""
"""Tokenize the sequence and return input_ids and attention mask.
Parameters
----------
sequence : Union[str, List[str]]
The sequence to be tokenized.
Returns
-------
Any
A dictionary containing input_ids and attention_mask.
"""
return self.tokenizer(sequence, max_length=self.max_len)

def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
"""Get data at corresponding index.
Return it as a dictionary including
all different token sequences along with attention mask and labels.
Parameters
----------
idx : int
The index of the data to be retrieved.
Returns
-------
Dict[str, torch.Tensor]
A dictionary containing all different token sequences along with
attention mask and labels.
"""
data = self.data.iloc[idx]
tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"])
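
And a correspondingly minimal sketch for finetuning, reusing the data and tokenizer objects from the pretraining sketch above; the label handling inside __getitem__ is collapsed in this diff, so only the documented interface is shown:

from torch.utils.data import DataLoader

from lib.data import FinetuneDataset

# Reuses `data` and `tokenizer` from the PretrainDataset sketch above.
finetune_dataset = FinetuneDataset(data=data, tokenizer=tokenizer, max_len=2048)

for batch in DataLoader(finetune_dataset, batch_size=16, shuffle=True):
    # Each batch is a dict of stacked tensors (token sequences, attention mask, labels).
    break  # inspect the first batch only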
