forked from mindspore-lab/mindone
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add clip_score_frame and clip_score_text for video evaluation metrics * update readme * update * update * update * refactor video metrics * update * update * update * update * update * update * update * update * update --------- Co-authored-by: songyuanwei <song.yuanwei@huawei.com>
- Loading branch information
1 parent
3c64917
commit 468be76
Showing
8 changed files
with
292 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .video_metrics import ClipScoreFrame, ClipScoreText |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
|
||
from mindspore import Tensor, ops | ||
|
||
VIDEO_EXTENSIONS = {".mp4"} | ||
|
||
|
||
class ClipScoreText:
    """CLIP score for textual alignment.

    Averages the CLIP logits between a text prompt and every frame of a
    video, using a CLIP model/processor pair supplied by the caller.
    """

    def __init__(self, model, processor):
        super().__init__()
        self.model = model
        self.processor = processor

    def score(self, frames, prompt):
        """Return the mean CLIP logit between `prompt` and all `frames`."""
        processed = self.processor(text=[prompt], images=frames)
        # Processor emits numpy arrays; wrap each as a MindSpore Tensor.
        tensor_inputs = {key: Tensor(value) for key, value in processed.items()}
        model_outputs = self.model(**tensor_inputs)
        # outputs[0] holds the text-image logits; average over frames.
        return model_outputs[0].asnumpy().mean()
|
||
|
||
class ClipScoreFrame:
    """CLIP score for frame consistency.

    Computes the average pairwise cosine similarity between the CLIP image
    features of all frames of a video (diagonal self-similarities excluded).
    """

    def __init__(self, model, processor):
        super().__init__()
        self.model = model
        self.processor = processor
        # Used to zero the diagonal so a frame's similarity to itself is not counted.
        self.fill_diagonal = ops.FillDiagonal(0.0)

    def score(self, frames):
        """Return the mean pairwise cosine similarity across `frames`.

        Raises:
            ValueError: if fewer than two frames are given — a pairwise
                score is undefined (previously this raised an opaque
                ZeroDivisionError).
        """
        n = len(frames)
        if n < 2:
            raise ValueError(f"ClipScoreFrame.score requires at least 2 frames, got {n}.")
        inputs = self.processor(images=frames)
        inputs = {k: Tensor(v) for k, v in inputs.items()}
        image_features = self.model.get_image_features(**inputs).asnumpy()
        cosine_sim_matrix = cosine_similarity(image_features)
        cosine_sim_matrix = self.fill_diagonal(Tensor(cosine_sim_matrix))  # set diagonal elements to 0
        # n * (n - 1) off-diagonal entries remain after zeroing the diagonal.
        score = cosine_sim_matrix.sum() / (n * (n - 1))
        return score
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
from .clip import CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection | ||
from .clip import CLIPModel, CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection | ||
from .t5 import T5EncoderModel, T5Model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,7 @@ | ||
from .modeling_ms_clip import CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection | ||
from .modeling_ms_clip import ( | ||
CLIPModel, | ||
CLIPTextModel, | ||
CLIPTextModelWithProjection, | ||
CLIPVisionModel, | ||
CLIPVisionModelWithProjection, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Utility Scripts | ||
|
||
This folder is a collection of utility scripts, listed and explained below. | ||
|
||
> All scripts need to be run in the root path of project, unless otherwise noted. | ||
## eval_videos_metrics.py | ||
|
||
This script contains code and scripts for diffusion model evaluation, e.g., | ||
|
||
- CLIP Score for Frame Consistency | ||
- CLIP Score for Textual Alignment | ||
|
||
|
||
Note that all the above metrics are computed based on neural network models. | ||
|
||
> A convincing evaluation for diffusion models requires both visually qualitative comparison and quantitative measure. A higher CLIP score does not necessarily show one model is better than another. | ||
|
||
#### CLIP Score for Frame Consistency | ||
|
||
To compute the CLIP score on all frames of output video and report the average cosine similarity between all video frame pairs, please run | ||
|
||
```shell | ||
python ./scripts/eval_videos_metrics.py --video_data_dir <path-to-video-dir> --video_caption_path <path-to-video-caption-path> --model_name <HF-model-name> --metric clip_score_frame | ||
``` | ||
|
||
#### CLIP Score for Textual Alignment | ||
|
||
To compute the average CLIP score between all frames of the output video and the corresponding editing prompts, please run | ||
|
||
```shell | ||
python ./scripts/eval_videos_metrics.py --video_data_dir <path-to-video-dir> --video_caption_path <path-to-video-caption-path> --model_name <HF-model-name> --metric clip_score_text | ||
``` | ||
|
||
Format of `.csv`: | ||
``` | ||
video,caption | ||
video_name1.mp4,"an airliner is taxiing on the tarmac at Dubai Airport" | ||
video_name2.mp4,"a pigeon sitting on the street near the house" | ||
... | ||
``` | ||
|
||
## Reference | ||
|
||
[1] https://github.com/showlab/loveu-tgve-2023/tree/main |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import argparse | ||
import os | ||
|
||
import cv2 | ||
import pandas as pd | ||
from PIL import Image | ||
from tqdm import tqdm | ||
from transformers import CLIPProcessor | ||
|
||
from mindone.metrics import ClipScoreFrame, ClipScoreText | ||
from mindone.transformers import CLIPModel | ||
|
||
VIDEO_EXTENSIONS = {".mp4"} | ||
|
||
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_name",
    type=str,
    default="openai/clip-vit-base-patch32/",
    # Fixed missing space between the two adjacent string literals
    # (previously rendered as "...HuggingFace.Default: ...").
    help="the name of a (Open/)CLIP model as shown in HuggingFace. " "Default: openai/clip-vit-base-patch32/",
)
parser.add_argument("--video_data_dir", type=str, default=None, help="path to data folder. " "Default: None")
parser.add_argument("--video_caption_path", type=str, default=None, help="path to video caption path. " "Default: None")
parser.add_argument("--metric", type=str, default="clip_score_text", choices=["clip_score_text", "clip_score_frame"])
args = parser.parse_args()

# Both paths are mandatory even though argparse defaults them to None.
# (Previously only video_data_dir was checked, so a missing caption path
# surfaced as a confusing pandas error.)
assert args.video_data_dir is not None
assert args.video_caption_path is not None

model = CLIPModel.from_pretrained(args.model_name)
processor = CLIPProcessor.from_pretrained(args.model_name)
clip_score_text = ClipScoreText(model, processor)
clip_score_frame = ClipScoreFrame(model, processor)

scores = []
df = pd.read_csv(args.video_caption_path)
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    video_name = row["video"]
    edited_prompt = row["caption"]
    if os.path.splitext(video_name)[1] in VIDEO_EXTENSIONS:
        video_path = f"{args.video_data_dir}/{video_name}"
    else:
        print(f"Not support format: {video_name}. ")
        continue
    if not os.path.exists(video_path):
        raise FileNotFoundError(video_path)

    # Decode every frame of the video as an RGB PIL image.
    # (The previous version kept a frame counter named `index` that shadowed
    # the DataFrame index and was never used; removed.)
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    cap.release()
    # CLIP vision models expect 224x224 inputs.
    frames = [img.resize((224, 224)) for img in frames]

    if args.metric == "clip_score_text":
        scores.append(clip_score_text.score(frames, edited_prompt))
    elif args.metric == "clip_score_frame":
        scores.append(clip_score_frame.score(frames))
    else:
        raise NotImplementedError(args.metric)

# Guard against ZeroDivisionError when no row had a supported video format.
if not scores:
    raise RuntimeError("No videos were scored; check --video_data_dir and --video_caption_path.")
print("{}: {}".format(args.metric, sum(scores) / len(scores)))