This repository has been archived by the owner on Aug 10, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
split.py
189 lines (165 loc) · 7.48 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from typing import List, Dict, Tuple
from pandas import DataFrame
"""
Video Face Manipulation Detection Through Ensemble of CNNs
Image and Sound Processing Lab - Politecnico di Milano
Nicolò Bonettini
Edoardo Daniele Cannas
Sara Mandelli
Luca Bondi
Paolo Bestagini
"""
import numpy as np
import pandas as pd
# Names of the dataset splits this module knows how to build.
# Naming scheme: the trailing numbers encode the train/val/test partition
# (e.g. 'dfdc-35-5-10' = 35 train folders, 5 val, 10 test; 'subject-85-10-5'
# = 85%/10%/5% of frames). The 'Nfpv' suffix limits frames per video.
available_datasets: List[str] = [
'dfdc-35-5-10',
'ff-c23-720-140-140',
'ff-c23-720-140-140-5fpv',
'ff-c23-720-140-140-10fpv',
'ff-c23-720-140-140-15fpv',
'ff-c23-720-140-140-20fpv',
'ff-c23-720-140-140-25fpv',
'celebdf', # just for convenience, not used in the original paper
'subject-85-10-5' # Your own dataset, where 85-10-5 stands for the distribution of train-val-test
]
def load_df(
        dfdc_df_path: str, ffpp_df_path: str, subject_df_path: str,
        dfdc_faces_dir: str, ffpp_faces_dir: str, subject_root_dir: str,
        dataset: str
) -> Tuple[pd.DataFrame, str]:
    """
    Load the full faces DataFrame for a dataset together with its faces root dir.

    Only the path pair matching ``dataset`` is actually read; the other
    arguments may be empty strings.

    :param dfdc_df_path: path to the pickled DFDC faces DataFrame
    :param ffpp_df_path: path to the pickled FF++ faces DataFrame
    :param subject_df_path: path to the pickled subject faces DataFrame
    :param dfdc_faces_dir: directory containing the extracted DFDC faces
    :param ffpp_faces_dir: directory containing the extracted FF++ faces
    :param subject_root_dir: directory containing the extracted subject faces
    :param dataset: one of ``available_datasets``
    :return: (faces DataFrame, faces root directory)
    :raises NotImplementedError: if ``dataset`` is not recognized
    """
    # BUGFIX: the return annotation was the tuple literal ``(pd.DataFrame, str)``,
    # which is not a valid type; use ``Tuple[...]`` (already imported).
    if dataset.startswith('dfdc'):
        df = pd.read_pickle(dfdc_df_path)
        root = dfdc_faces_dir
    elif dataset.startswith('ff-'):
        df = pd.read_pickle(ffpp_df_path)
        root = ffpp_faces_dir
    elif dataset.startswith('subject'):
        df = pd.read_pickle(subject_df_path)
        # The subject dataset may be class-imbalanced; equalize label counts.
        df = balance_dataframe(df)
        root = subject_root_dir
    else:
        raise NotImplementedError('Unknown dataset: {}'.format(dataset))
    return df, root
def get_split_df(df: pd.DataFrame, dataset: str, split: str) -> pd.DataFrame:
split_df = None
if dataset == 'dfdc-35-5-10':
if split == 'train':
split_df = df[df['folder'].isin(range(35))]
elif split == 'val':
split_df = df[df['folder'].isin(range(35, 40))]
elif split == 'test':
split_df = df[df['folder'].isin(range(40, 50))]
else:
raise NotImplementedError('Unknown split: {}'.format(split))
elif dataset.startswith('ff-c23-720-140-140'):
# Save random state
st0 = np.random.get_state()
# Set seed for this selection only
np.random.seed(41)
# Split on original videos
crf = dataset.split('-')[1]
random_youtube_videos = np.random.permutation(
df[(df['source'] == 'youtube') & (df['quality'] == crf)]['video'].unique())
train_orig = random_youtube_videos[:720]
val_orig = random_youtube_videos[720:720 + 140]
test_orig = random_youtube_videos[720 + 140:]
if split == 'train':
split_df = pd.concat((df[df['original'].isin(train_orig)], df[df['video'].isin(train_orig)]), axis=0)
elif split == 'val':
split_df = pd.concat((df[df['original'].isin(val_orig)], df[df['video'].isin(val_orig)]), axis=0)
elif split == 'test':
split_df = pd.concat((df[df['original'].isin(test_orig)], df[df['video'].isin(test_orig)]), axis=0)
else:
raise NotImplementedError('Unknown split: {}'.format(split))
if dataset.endswith('fpv'):
fpv = int(dataset.rsplit('-', 1)[1][:-3])
idxs = []
for video in split_df['video'].unique():
idxs.append(np.random.choice(split_df[split_df['video'] == video].index, fpv, replace=False))
idxs = np.concatenate(idxs)
split_df = split_df.loc[idxs]
# Restore random state
np.random.set_state(st0)
elif dataset == 'celebdf':
seed = 41
num_real_train = 600
# Save random state
st0 = np.random.get_state()
# Set seed for this selection only
np.random.seed(seed)
# Split on original videos
random_train_val_real_videos = np.random.permutation(
df[(df['label'] == False) & (df['test'] == False)]['video'].unique())
train_orig = random_train_val_real_videos[:num_real_train]
val_orig = random_train_val_real_videos[num_real_train:]
if split == 'train':
split_df = pd.concat((df[df['original'].isin(train_orig)], df[df['video'].isin(train_orig)]), axis=0)
elif split == 'val':
split_df = pd.concat((df[df['original'].isin(val_orig)], df[df['video'].isin(val_orig)]), axis=0)
elif split == 'test':
split_df = df[df['test'] == True]
else:
raise NotImplementedError('Unknown split: {}'.format(split))
# Restore random state
np.random.set_state(st0)
elif dataset.startswith("subject"):
num_frames = len(df)
split_amount = dataset.split("-")[1:]
train_frames = int(num_frames * (int(split_amount[0]) / 100))
val_frames = int(num_frames * (int(split_amount[1]) / 100))
test_frames = int(num_frames * (int(split_amount[2]) / 100))
split_df_val = df.sample(n=val_frames, random_state=41)
df = df.drop(split_df_val.index)
split_df_test = df.sample(n=test_frames, random_state=41)
df = df.drop(split_df_test.index)
split_df_train = df.sample(n=train_frames, random_state=41)
if split == "train":
split_df = split_df_train.copy()
elif split == "val":
split_df = split_df_val.copy()
elif split == "test":
split_df = split_df_test.copy()
else:
raise NotImplementedError('Unknown dataset: {}'.format(dataset))
return split_df
def balance_dataframe(dataframe: DataFrame) -> DataFrame:
    """
    Balance a binary-labelled DataFrame by downsampling the majority class.

    The larger 'label' class is randomly subsampled (fixed random_state for
    reproducibility) to the size of the smaller class; the smaller class is
    kept in full.

    :param dataframe: DataFrame with a 'label' column
    :return: DataFrame with equal counts per label value
    """
    # BUGFIX (naming only, behavior unchanged): the original bound
    # ``idxmax()`` — the MAJORITY label — to a variable called
    # ``minority_class`` and vice versa; the swapped names misled readers
    # even though the downsampling itself was correct.
    print("Balancing dataframe...")
    class_counts = dataframe['label'].value_counts()
    majority_class = class_counts.idxmax()
    minority_class = class_counts.idxmin()
    target_count = class_counts.min()
    # NOTE(review): if both classes are exactly tied, idxmax() and idxmin()
    # return the same label (pre-existing edge case, preserved as-is).
    minority_class_data = dataframe[dataframe['label'] == minority_class]
    majority_class_data = dataframe[dataframe['label'] == majority_class]
    downsampled_majority = majority_class_data.sample(n=target_count, random_state=41)
    balanced_dataframe = pd.concat([minority_class_data, downsampled_majority])
    return balanced_dataframe
def make_splits(
        dfdc_df: str, ffpp_df: str, subject_df: str,
        dfdc_dir: str, ffpp_dir: str, subject_dir: str,
        dbs: Dict[str, List[str]]
) -> Dict[str, Dict[str, Tuple[pd.DataFrame, str]]]:
    """
    Build the requested splits and return, per split and per dataset, the
    split DataFrame together with its faces root directory.

    :param dfdc_df: path to the pickled DFDC faces DataFrame (from extract_faces.py)
    :param ffpp_df: path to the pickled FF++ faces DataFrame (from extract_faces.py)
    :param subject_df: path to the pickled subject faces DataFrame
    :param dfdc_dir: directory of the extracted DFDC faces
    :param ffpp_dir: directory of the extracted FF++ faces
    :param subject_dir: directory of the extracted subject faces
    :param dbs: mapping {split_name: [dataset_name, ...]},
                e.g. {'train': ['dfdc-35-5-10'], 'val': ['dfdc-35-5-10']}
    :return: {split_name: {dataset_name: (split DataFrame, faces root dir)}}
    """
    loaded = {}

    def _full_df(db_name):
        # Load each dataset's full DataFrame at most once, no matter how
        # many splits reference it.
        if db_name not in loaded:
            loaded[db_name] = load_df(dfdc_df, ffpp_df, subject_df,
                                      dfdc_dir, ffpp_dir, subject_dir, db_name)
        return loaded[db_name]

    splits = {}
    for split_name, split_dbs in dbs.items():
        per_db = {}
        for db_name in split_dbs:
            full_df, root = _full_df(db_name)
            per_db[db_name] = (get_split_df(df=full_df, dataset=db_name, split=split_name), root)
        splits[split_name] = per_db
    return splits