# index_celebdf.py

"""
Index Celeb-DF v2
Image and Sound Processing Lab - Politecnico di Milano
Nicolò Bonettini
Edoardo Daniele Cannas
Sara Mandelli
Luca Bondi
Paolo Bestagini
"""
import argparse
from multiprocessing import Pool
from pathlib import Path

import numpy as np
import pandas as pd

from isplutils.utils import extract_meta_av, extract_meta_cv


def _read_test_video_paths(source_dir: Path) -> set:
    """Return the relative video paths listed in the test split file.

    :param source_dir: dataset root, expected to contain "List_of_testing_videos.txt"
    :return: set of paths exactly as written in the list file
    :raises FileNotFoundError: if the split file is not present in ``source_dir``
    """
    split_file = source_dir.joinpath('List_of_testing_videos.txt')
    if not split_file.exists():
        raise FileNotFoundError('Unable to find "List_of_testing_videos.txt" in {}'.format(source_dir))
    # Every line is expected to be "<label> <relative/path.mp4>" with no header row, so the
    # file is read with header=None: header=0 would consume the first video as column names
    # and silently drop it from the test split.
    test_videos_df = pd.read_csv(split_file, delimiter=' ', header=None, names=['label', 'path'])
    return set(test_videos_df['path'])


def _find_original_indices(df_videos: pd.DataFrame) -> pd.Series:
    """Map every fake video to the index of the video it was generated from.

    A fake named "<a>_<b>_<c>" is linked to the first row whose name is "<a>_<c>".

    :param df_videos: DataFrame with a boolean 'label' column (True is FAKE) and a 'name' column
    :return: Series indexed like the fake rows, holding the index label of the matching row
    :raises IndexError: if a fake name has fewer than three '_'-separated tokens or no row
        carries the derived original name
    """
    # Single pass name -> first index label, instead of scanning the whole column per fake.
    first_idx_by_name = {}
    for idx, name in zip(df_videos.index, df_videos['name']):
        first_idx_by_name.setdefault(name, idx)

    def original_of(fake_name: str) -> int:
        tokens = fake_name.split('_')
        if len(tokens) < 3:
            raise IndexError('Unexpected fake video name "{}"'.format(fake_name))
        original_name = '_'.join([tokens[0], tokens[2]])
        if original_name not in first_idx_by_name:
            raise IndexError(
                'Unable to find original video "{}" for fake video "{}"'.format(original_name, fake_name))
        return first_idx_by_name[original_name]

    return df_videos.loc[df_videos['label'], 'name'].map(original_of)


def main():
    """Build (or load) the Celeb-DF videos DataFrame and print the real/fake counts.

    Command line arguments:
        --source: dataset root directory (required)
        --videodataset: path of the pickled DataFrame (default 'data/celebdf_videos.pkl')

    If the pickle already exists it is loaded as is. Otherwise every "*.mp4" under the source
    directory is indexed with the columns path, height, width, frames, class, label, name,
    original and test, and the result is saved to the pickle.

    :raises FileNotFoundError: if "List_of_testing_videos.txt" is missing from the source dir
    :raises ValueError: if no "*.mp4" file is found under the source dir
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', type=Path, help='Source dir',
                        required=True)
    parser.add_argument('--videodataset', type=Path, default='data/celebdf_videos.pkl',
                        help='Path to save the videos DataFrame')

    args = parser.parse_args()

    ## Parameters parsing
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset

    # Create output folder (if doesn't exist)
    videodataset_path.parent.mkdir(parents=True, exist_ok=True)

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        # NOTE: unpickling executes arbitrary code; only point --videodataset at trusted files.
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        test_video_paths = _read_test_video_paths(source_dir)

        video_paths = [f.relative_to(source_dir) for f in source_dir.rglob('*.mp4')]
        if not video_paths:
            # Fail early with a clear message instead of an opaque np.stack error later on.
            raise ValueError('No ".mp4" videos found in {}'.format(source_dir))
        df_videos = pd.DataFrame({'path': video_paths})

        for column in ('height', 'width', 'frames'):
            df_videos[column] = np.zeros(len(df_videos), dtype=np.uint16)
        with Pool() as p:
            meta = p.map(extract_meta_av, df_videos['path'].map(lambda x: str(source_dir.joinpath(x))))
        meta = np.stack(meta)
        df_videos.loc[:, ['height', 'width', 'frames']] = meta

        # Fix for videos that av cannot decode properly
        for idx, record in df_videos[df_videos['frames'] == 0].iterrows():
            meta = extract_meta_cv(str(source_dir.joinpath(record['path'])))
            df_videos.loc[idx, ['height', 'width', 'frames']] = meta

        # The class is the first folder of the relative path
        class_names = df_videos['path'].map(lambda x: x.parts[0])
        df_videos['class'] = class_names.astype('category')
        # Compare the plain strings so the label is always a bool column, whatever
        # set of class folders is present.  True is FAKE, False is REAL
        df_videos['label'] = class_names == 'Celeb-synthesis'
        df_videos['name'] = df_videos['path'].map(lambda x: x.stem)

        # -1 marks videos without an original (i.e. the real ones)
        df_videos['original'] = -1 * np.ones(len(df_videos), dtype=np.int16)
        df_videos.loc[df_videos['label'], 'original'] = _find_original_indices(df_videos)

        # The split file uses '/' separators: compare POSIX-style paths so the match
        # also works on platforms whose native separator is different.
        df_videos['test'] = df_videos['path'].map(lambda x: x.as_posix()).isin(test_video_paths)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(int((df_videos['label'] == 0).sum())))
    print('Fake videos: {:d}'.format(int((df_videos['label'] == 1).sum())))


# Run the indexing only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()