Video metric (mindspore-lab#362)
* add clip_score_frame and clip_score_text for video evaluation metrics

* update readme

* update

* update

* update

* refactor video metrics

* update

* update

* update

* update

* update

* update

* update

* update

* update

---------

Co-authored-by: songyuanwei <song.yuanwei@huawei.com>
Songyuanwei and songyuanwei authored Apr 23, 2024
1 parent 3c64917 commit 468be76
Showing 8 changed files with 292 additions and 3 deletions.
1 change: 1 addition & 0 deletions mindone/metrics/__init__.py
@@ -0,0 +1 @@
from .video_metrics import ClipScoreFrame, ClipScoreText
37 changes: 37 additions & 0 deletions mindone/metrics/video_metrics.py
@@ -0,0 +1,37 @@
from sklearn.metrics.pairwise import cosine_similarity

from mindspore import Tensor, ops

VIDEO_EXTENSIONS = {".mp4"}


class ClipScoreText:
def __init__(self, model, processor):
super().__init__()
self.model = model
self.processor = processor

def score(self, frames, prompt):
inputs = self.processor(text=[prompt], images=frames)
inputs = {k: Tensor(v) for k, v in inputs.items()}
outputs = self.model(**inputs)
score = outputs[0].asnumpy().mean()

return score


class ClipScoreFrame:
def __init__(self, model, processor):
super().__init__()
self.model = model
self.processor = processor
self.fill_diagonal = ops.FillDiagonal(0.0)

def score(self, frames):
inputs = self.processor(images=frames)
inputs = {k: Tensor(v) for k, v in inputs.items()}
image_features = self.model.get_image_features(**inputs).asnumpy()
cosine_sim_matrix = cosine_similarity(image_features)
cosine_sim_matrix = self.fill_diagonal(Tensor(cosine_sim_matrix)) # set diagonal elements to 0
score = cosine_sim_matrix.sum() / (len(frames) * (len(frames) - 1))
return score
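
For orientation: `ClipScoreText.score` averages the CLIP image-text logits between every frame and the prompt, while `ClipScoreFrame.score` averages the pairwise cosine similarity between the CLIP image embeddings of all distinct frame pairs. A minimal NumPy sketch of the frame-consistency computation (illustrative only, not part of the commit):

```python
import numpy as np


def mean_pairwise_cosine(features: np.ndarray) -> float:
    """Average cosine similarity over all ordered pairs of distinct frames.

    `features` is a (num_frames, dim) array of CLIP image embeddings, num_frames >= 2.
    """
    normed = features / np.linalg.norm(features, axis=1, keepdims=True)
    sim = normed @ normed.T      # (num_frames, num_frames) cosine-similarity matrix
    np.fill_diagonal(sim, 0.0)   # drop self-similarity, as ops.FillDiagonal(0.0) does above
    n = features.shape[0]
    return float(sim.sum() / (n * (n - 1)))
```
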
1 change: 1 addition & 0 deletions mindone/transformers/__init__.py
@@ -1,5 +1,6 @@
from .modeling_ms_utils import MSPreTrainedModel
from .models import (
CLIPModel,
CLIPTextModel,
CLIPTextModelWithProjection,
CLIPVisionModel,
2 changes: 1 addition & 1 deletion mindone/transformers/models/__init__.py
@@ -1,2 +1,2 @@
from .clip import CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection
from .clip import CLIPModel, CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection
from .t5 import T5EncoderModel, T5Model
8 changes: 7 additions & 1 deletion mindone/transformers/models/clip/__init__.py
@@ -1 +1,7 @@
from .modeling_ms_clip import CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection
from .modeling_ms_clip import (
CLIPModel,
CLIPTextModel,
CLIPTextModelWithProjection,
CLIPVisionModel,
CLIPVisionModelWithProjection,
)
132 changes: 131 additions & 1 deletion mindone/transformers/models/clip/modeling_ms_clip.py
@@ -94,7 +94,7 @@ def _make_causal_mask(
Make causal mask used for bi-directional self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = ops.full((tgt_len, tgt_len), float("-inf"))
mask = ops.full((tgt_len, tgt_len), float("-inf"), dtype=dtype)
mask_cond = ops.arange(mask.shape[-1])
mask = mask.masked_fill(mask_cond < (mask_cond + 1).view(mask.shape[-1], 1), 0)

@@ -685,6 +685,136 @@ def construct(
)


class CLIPModel(CLIPPreTrainedModel):
config_class = CLIPConfig
_no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

def __init__(self, config: CLIPConfig):
super().__init__(config)

if not isinstance(config.text_config, CLIPTextConfig):
raise ValueError(
"config.text_config is expected to be of type CLIPTextConfig but is of type"
f" {type(config.text_config)}."
)

if not isinstance(config.vision_config, CLIPVisionConfig):
raise ValueError(
"config.vision_config is expected to be of type CLIPVisionConfig but is of type"
f" {type(config.vision_config)}."
)

text_config = config.text_config
vision_config = config.vision_config
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.logit_scale_init_value = config.logit_scale_init_value

self.projection_dim = config.projection_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size

self.text_model = CLIPTextTransformer(text_config)
self.vision_model = CLIPVisionTransformer(vision_config)

self.visual_projection = nn.Dense(self.vision_embed_dim, self.projection_dim, has_bias=False)
self.text_projection = nn.Dense(self.text_embed_dim, self.projection_dim, has_bias=False)
self.logit_scale = ms.Parameter(ms.Tensor(self.logit_scale_init_value))

# Initialize weights and apply final processing
self.post_init()

def get_text_features(
self,
input_ids: Optional[ms.Tensor] = None,
attention_mask: Optional[ms.Tensor] = None,
position_ids: Optional[ms.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> ms.Tensor:
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states

text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)

pooled_output = text_outputs[1]
text_features = self.text_projection(pooled_output)

return text_features

def get_image_features(
self,
pixel_values: Optional[ms.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> ms.Tensor:
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states

vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)

pooled_output = vision_outputs[1] # pooled_output
image_features = self.visual_projection(pooled_output)

return image_features

def construct(
self,
input_ids: Optional[ms.Tensor] = None,
pixel_values: Optional[ms.Tensor] = None,
attention_mask: Optional[ms.Tensor] = None,
position_ids: Optional[ms.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Tuple:
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states

vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)

text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
image_embeds = vision_outputs[1]
image_embeds = self.visual_projection(image_embeds)

text_embeds = text_outputs[1]
text_embeds = self.text_projection(text_embeds)

# normalized features
image_embeds = image_embeds / image_embeds.norm(ord=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(ord=2, dim=-1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_text = ops.matmul(text_embeds, image_embeds.t()) * logit_scale
logits_per_image = logits_per_text.t()

output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
return output


class CLIPTextModelWithProjection(CLIPPreTrainedModel):
config_class = CLIPTextConfig

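The tuple returned by `CLIPModel.construct` leads with `logits_per_image` and `logits_per_text`. A hedged usage sketch, mirroring the preprocessing pattern used by the metrics above (the checkpoint name, the dummy image, and `padding=True` are assumptions, not part of the commit):

```python
import numpy as np
from PIL import Image
from transformers import CLIPProcessor

from mindspore import Tensor, ops
from mindone.transformers import CLIPModel

# assumed checkpoint; any CLIP checkpoint compatible with CLIPProcessor should work
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # placeholder image
inputs = processor(text=["a cat", "a dog"], images=image, padding=True)
inputs = {k: Tensor(v) for k, v in inputs.items()}

logits_per_image, logits_per_text, *_ = model(**inputs)
probs = ops.softmax(logits_per_image, axis=-1)  # per-image probabilities over the candidate captions
```
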
46 changes: 46 additions & 0 deletions scripts/README.md
@@ -0,0 +1,46 @@
# Utility Scripts

This folder is a collection of utility scripts, listed and explained below.

> All scripts need to be run from the root path of the project, unless otherwise noted.

## eval_videos_metrics.py

This script computes evaluation metrics for videos generated by diffusion models, e.g.,

- CLIP Score for Frame Consistency
- CLIP Score for Textual Alignment


Note that all of the above metrics are computed with neural network models.

> A convincing evaluation of diffusion models requires both qualitative visual comparison and quantitative measures. A higher CLIP score does not necessarily mean that one model is better than another.

#### CLIP Score for Frame Consistency

To compute the CLIP score over all frames of the output video, i.e. the average cosine similarity between the CLIP embeddings of all video frame pairs, please run

```shell
python ./scripts/eval_videos_metrics.py --video_data_dir <path-to-video-dir> --video_caption_path <path-to-video-caption-path> --model_name <HF-model-name> --metric clip_score_frame
```

#### CLIP Score for Textual Alignment

To compute the average CLIP score between all frames of the output video and the corresponding editing prompt, please run

```shell
python ./scripts/eval_videos_metrics.py --video_data_dir <path-to-video-dir> --video_caption_path <path-to-video-caption-path> --model_name <HF-model-name> --metric clip_score_text
```

Format of the caption `.csv` file (passed via `--video_caption_path`):
```
video,caption
video_name1.mp4,"an airliner is taxiing on the tarmac at Dubai Airport"
video_name2.mp4,"a pigeon sitting on the street near the house"
...
```

## Reference

[1] https://github.com/showlab/loveu-tgve-2023/tree/main
68 changes: 68 additions & 0 deletions scripts/eval_videos_metrics.py
@@ -0,0 +1,68 @@
import argparse
import os

import cv2
import pandas as pd
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor

from mindone.metrics import ClipScoreFrame, ClipScoreText
from mindone.transformers import CLIPModel

VIDEO_EXTENSIONS = {".mp4"}

parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="openai/clip-vit-base-patch32/",
help="the name of a CLIP (or OpenCLIP) model hosted on HuggingFace. Default: openai/clip-vit-base-patch32/",
)
parser.add_argument("--video_data_dir", type=str, default=None, help="path to the folder containing the videos to evaluate. Default: None")
parser.add_argument("--video_caption_path", type=str, default=None, help="path to the csv file with video captions. Default: None")
parser.add_argument("--metric", type=str, default="clip_score_text", choices=["clip_score_text", "clip_score_frame"])
args = parser.parse_args()

assert args.video_data_dir is not None and args.video_caption_path is not None, "both --video_data_dir and --video_caption_path must be provided"

model = CLIPModel.from_pretrained(args.model_name)
processor = CLIPProcessor.from_pretrained(args.model_name)
clip_score_text = ClipScoreText(model, processor)
clip_score_frame = ClipScoreFrame(model, processor)

scores = []
df = pd.read_csv(args.video_caption_path)
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
video_name = row["video"]
edited_prompt = row["caption"]
if os.path.splitext(video_name)[1] in VIDEO_EXTENSIONS:
video_path = f"{args.video_data_dir}/{video_name}"
else:
print(f"Unsupported format, skipping: {video_name}")
continue
if not os.path.exists(video_path):
raise FileNotFoundError(video_path)
cap = cv2.VideoCapture(video_path)
frames = []
while cap.isOpened():
ret, frame = cap.read()
if ret:
frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
frames.append(frame)
else:
break
cap.release()
frames = [i.resize((224, 224)) for i in frames]
if args.metric == "clip_score_text":
score = clip_score_text.score(frames, edited_prompt)
scores.append(score)
elif args.metric == "clip_score_frame":
score = clip_score_frame.score(frames)
scores.append(score)
else:
raise NotImplementedError(args.metric)

print("{}: {}".format(args.metric, sum(scores) / len(scores)))
