-
Notifications
You must be signed in to change notification settings - Fork 345
/
criteo_dataset.py
50 lines (40 loc) · 1.74 KB
/
criteo_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torch
import pandas as pd
import click
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from loguru import logger
class CriteoParquetDataset(Dataset):
    """Map-style dataset backed by a single parquet file held in memory.

    The file is read once in the constructor. The ``labels`` column becomes
    a float32 tensor; every column whose name starts with ``DENSE`` is
    stacked into one tensor and every column whose name starts with
    ``SPARSE`` into another. The dense and sparse tensors keep whatever
    dtype pandas/numpy produce for those columns.
    """

    def __init__(self, file_name: str):
        """Read ``file_name`` and materialise the label/dense/sparse tensors."""
        frame = pd.read_parquet(file_name)
        self.total_rows = len(frame)

        raw_labels = torch.from_numpy(frame["labels"].values)
        self.label_tensor = raw_labels.to(torch.float32)

        # Split the column names by prefix, preserving the file's column order.
        dense_columns = []
        sparse_columns = []
        for column in frame.columns:
            if column.startswith("DENSE"):
                dense_columns.append(column)
            if column.startswith("SPARSE"):
                sparse_columns.append(column)

        dense_values = frame[dense_columns].values
        sparse_values = frame[sparse_columns].values
        self.dense_tensor = torch.from_numpy(dense_values)
        self.sparse_tensor = torch.from_numpy(sparse_values)

    def __len__(self):
        """Return the number of rows read from the parquet file."""
        return self.total_rows

    def __getitem__(self, idx):
        """Return the ``(label, dense, sparse)`` entries selected by ``idx``."""
        label = self.label_tensor[idx]
        dense = self.dense_tensor[idx]
        sparse = self.sparse_tensor[idx]
        return label, dense, sparse
@click.command()
@click.option('--file_path', type=click.Path(exists=True), required=True,
              help='Path to the parquet file')
def process_file(file_path):
    """
    Process the file specified by --file_path.

    Loads the parquet file into a CriteoParquetDataset and logs the first
    batch (values, shapes and dtypes) as a quick sanity check.
    """
    # NOTE: click shows this docstring as the command's --help text, so it is
    # kept short on purpose.
    logger.info("Reading the parquet file {}...", file_path)
    dataset = CriteoParquetDataset(file_path)
    data_loader = DataLoader(dataset, batch_size=32, shuffle=False)

    # Only the first batch is inspected; fetch it explicitly so that an empty
    # file is reported instead of silently producing no output.
    first_batch = next(iter(data_loader), None)
    if first_batch is None:
        logger.warning("The parquet file {} contains no rows.", file_path)
        return

    labels, dense, sparse = first_batch
    logger.info("Labels: {}", labels)
    logger.info("Dense: {}", dense)
    logger.info("Sparse: {}", sparse)
    logger.info("Labels size and dtype: {}, {}", labels.size(), labels.dtype)
    logger.info("Dense size and dtype: {}, {}", dense.size(), dense.dtype)
    logger.info("Sparse size and dtype: {}, {}", sparse.size(), sparse.dtype)
if __name__ == "__main__":
    # Run the click command only when executed as a script, not on import.
    process_file()