@String(ICLR = {ICLR})
@String(ICML = {ICML})
@String(NIPS = {NeurIPS})
@String(CVPR = {CVPR})
@String(ICCV = {ICCV})
@String(ECCV = {ECCV})
@String(IJCV = {IJCV})
% methods
@article{wu2021godiva,
title={{GODIVA}: Generating Open-DomaIn Videos from nAtural Descriptions},
author={Wu, Chenfei and Huang, Lun and Zhang, Qianxi and Li, Binyang and Ji, Lei and Yang, Fan and Sapiro, Guillermo and Duan, Nan},
journal={arXiv preprint arXiv:2104.14806},
year={2021}
}
@inproceedings{wu2022nuwa,
title={{N{\"U}WA}: Visual Synthesis Pre-training for Neural visUal World creAtion},
author={Wu, Chenfei and Liang, Jian and Ji, Lei and Yang, Fan and Fang, Yuejian and Jiang, Daxin and Duan, Nan},
booktitle=ECCV,
pages={720--736},
year={2022}
}
@inproceedings{ho2022video,
title={Video Diffusion Models},
author={Ho, Jonathan and Salimans, Tim and Gritsenko, Alexey and Chan, William and Norouzi, Mohammad and Fleet, David J},
booktitle=NIPS,
pages={8633--8646},
year={2022},
}
@article{ho2022imagen,
title={Imagen Video: High Definition Video Generation with Diffusion Models},
author={Ho, Jonathan and Chan, William and Saharia, Chitwan and Whang, Jay and Gao, Ruiqi and Gritsenko, Alexey and Kingma, Diederik P and Poole, Ben and Norouzi, Mohammad and Fleet, David J and others},
journal={arXiv preprint arXiv:2210.02303},
year={2022}
}
@article{zhou2022magicvideo,
title={{MagicVideo}: Efficient Video Generation With Latent Diffusion Models},
author={Zhou, Daquan and Wang, Weimin and Yan, Hanshu and Lv, Weiwei and Zhu, Yizhe and Feng, Jiashi},
journal={arXiv preprint arXiv:2211.11018},
year={2022}
}
%% LVDM
@article{he2023latent,
title={Latent Video Diffusion Models for High-Fidelity Long Video Generation},
author={He, Yingqing and Yang, Tianyu and Zhang, Yong and Shan, Ying and Chen, Qifeng},
journal={arXiv preprint arXiv:2211.13221},
year={2023}
}
@inproceedings{singer2023make,
title={{Make-A-Video}: Text-to-Video Generation without Text-Video Data},
author={Singer, Uriel and Polyak, Adam and Hayes, Thomas and Yin, Xi and An, Jie and Zhang, Songyang and Hu, Qiyuan and Yang, Harry and Ashual, Oron and Gafni, Oran and others},
booktitle=ICLR,
year={2023}
}
@inproceedings{villegas2023phenaki,
title={Phenaki: Variable Length Video Generation from Open Domain Textual Descriptions},
author={Villegas, Ruben and Babaeizadeh, Mohammad and Kindermans, Pieter-Jan and Moraldo, Hernan and Zhang, Han and Saffar, Mohammad Taghi and Castro, Santiago and Kunze, Julius and Erhan, Dumitru},
booktitle=ICLR,
year={2023}
}
@inproceedings{hong2023cogvideo,
title={{CogVideo}: Large-scale Pretraining for Text-to-Video Generation via Transformers},
author={Hong, Wenyi and Ding, Ming and Zheng, Wendi and Liu, Xinghan and Tang, Jie},
booktitle=ICLR,
year={2023}
}
%% Video LDM
@inproceedings{blattmann2023align,
title={Align your Latents: High-Resolution Video Synthesis with Latent Diffusion Models},
author={Blattmann, Andreas and Rombach, Robin and Ling, Huan and Dockhorn, Tim and Kim, Seung Wook and Fidler, Sanja and Kreis, Karsten},
booktitle=CVPR,
pages={22563--22575},
year={2023}
}
%% Gen1
@inproceedings{esser2023structure,
title={Structure and Content-Guided Video Synthesis with Diffusion Models},
author={Esser, Patrick and Chiu, Johnathan and Atighehchian, Parmida and Granskog, Jonathan and Germanidis, Anastasis},
booktitle=ICCV,
pages={7346--7356},
year={2023}
}
%% PYoCo
@inproceedings{ge2023preserve,
title={Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models},
author={Ge, Songwei and Nah, Seungjun and Liu, Guilin and Poon, Tyler and Tao, Andrew and Catanzaro, Bryan and Jacobs, David and Huang, Jia-Bin and Liu, Ming-Yu and Balaji, Yogesh},
booktitle=ICCV,
pages={22930--22941},
year={2023}
}
@inproceedings{wang2023videocomposer,
title={{VideoComposer}: Compositional Video Synthesis with Motion Controllability},
author={Wang, Xiang and Yuan, Hangjie and Zhang, Shiwei and Chen, Dayou and Wang, Jiuniu and Zhang, Yingya and Shen, Yujun and Zhao, Deli and Zhou, Jingren},
booktitle=NIPS,
year={2023}
}
@inproceedings{sun2023glober,
title={{GLOBER}: Coherent Non-autoregressive Video Generation via GLOBal Guided Video DecodER},
author={Sun, Mingzhen and Wang, Weining and Qin, Zihan and Sun, Jiahui and Chen, Sihan and Liu, Jing},
booktitle=NIPS,
year={2023}
}
@article{luo2023videofusion,
title={{VideoFusion}: Decomposed Diffusion Models for High-Quality Video Generation},
author={Luo, Zhengxiong and Chen, Dayou and Zhang, Yingya and Huang, Yan and Wang, Liang and Shen, Yujun and Zhao, Deli and Zhou, Jingren and Tan, Tieniu},
journal={arXiv preprint arXiv:2303.08320},
year={2023}
}
@article{an2023latent,
title={{Latent-Shift}: Latent Diffusion with Temporal Shift for Efficient Text-to-Video Generation},
author={An, Jie and Zhang, Songyang and Yang, Harry and Gupta, Sonal and Huang, Jia-Bin and Luo, Jiebo and Yin, Xi},
journal={arXiv preprint arXiv:2304.08477},
year={2023}
}
@article{wang2023videofactory,
title={{VideoFactory}: Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation},
author={Wang, Wenjing and Yang, Huan and Tuo, Zixi and He, Huiguo and Zhu, Junchen and Fu, Jianlong and Liu, Jiaying},
journal={arXiv preprint arXiv:2305.10874},
year={2023}
}
@article{xing2023make,
title={{Make-Your-Video}: Customized Video Generation Using Textual and Structural Guidance},
author={Xing, Jinbo and Xia, Menghan and Liu, Yuxin and Zhang, Yuechen and Zhang, Yong and He, Yingqing and Liu, Hanyuan and Chen, Haoxin and Cun, Xiaodong and Wang, Xintao and others},
journal={arXiv preprint arXiv:2306.00943},
year={2023}
}
@article{he2023animate,
title={{Animate-A-Story}: Storytelling with Retrieval-Augmented Video Generation},
author={He, Yingqing and Xia, Menghan and Chen, Haoxin and Cun, Xiaodong and Gong, Yuan and Xing, Jinbo and Zhang, Yong and Wang, Xintao and Weng, Chao and Shan, Ying and others},
journal={arXiv preprint arXiv:2307.06940},
year={2023}
}
@article{wang2023internvid,
title={{InternVid}: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation},
author={Wang, Yi and He, Yinan and Li, Yizhuo and Li, Kunchang and Yu, Jiashuo and Ma, Xin and Chen, Xinyuan and Wang, Yaohui and Luo, Ping and Liu, Ziwei and others},
journal={arXiv preprint arXiv:2307.06942},
year={2023}
}
@article{wang2023modelscope,
title={{ModelScope} Text-to-Video Technical Report},
author={Wang, Jiuniu and Yuan, Hangjie and Chen, Dayou and Zhang, Yingya and Wang, Xiang and Zhang, Shiwei},
journal={arXiv preprint arXiv:2308.06571},
year={2023}
}
%% Dysen-VDM
@article{fei2023empowering,
title={Empowering Dynamics-aware Text-to-Video Diffusion with Large Language Models},
author={Fei, Hao and Wu, Shengqiong and Ji, Wei and Zhang, Hanwang and Chua, Tat-Seng},
journal={arXiv preprint arXiv:2308.13812},
year={2023}
}
%% VidRD
@article{gu2023reuse,
title={Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation},
author={Gu, Jiaxi and Wang, Shicong and Zhao, Haoyu and Lu, Tianyi and Zhang, Xing and Wu, Zuxuan and Xu, Songcen and Zhang, Wei and Jiang, Yu-Gang and Xu, Hang},
journal={arXiv preprint arXiv:2309.03549},
year={2023}
}
@article{wang2023lavie,
title={{LaVie}: High-Quality Video Generation with Cascaded Latent Diffusion Models},
author={Wang, Yaohui and Chen, Xinyuan and Ma, Xin and Zhou, Shangchen and Huang, Ziqi and Wang, Yi and Yang, Ceyuan and He, Yinan and Yu, Jiashuo and Yang, Peiqing and others},
journal={arXiv preprint arXiv:2309.15103},
year={2023}
}
@article{zhang2023show,
title={{Show-1}: Marrying Pixel and Latent Diffusion Models for Text-to-Video Generation},
author={Zhang, David Junhao and Wu, Jay Zhangjie and Liu, Jia-Wei and Zhao, Rui and Ran, Lingmin and Gu, Yuchao and Gao, Difei and Shou, Mike Zheng},
journal={arXiv preprint arXiv:2309.15818},
year={2023}
}
@article{chen2023videocrafter1,
title={{VideoCrafter1}: Open Diffusion Models for High-Quality Video Generation},
author={Chen, Haoxin and Xia, Menghan and He, Yingqing and Zhang, Yong and Cun, Xiaodong and Yang, Shaoshu and Xing, Jinbo and Liu, Yaofang and Chen, Qifeng and Wang, Xintao and others},
journal={arXiv preprint arXiv:2310.19512},
year={2023}
}
@article{girdhar2023emu,
title={Emu Video: Factorizing Text-to-Video Generation by Explicit Image Conditioning},
author={Girdhar, Rohit and Singh, Mannat and Brown, Andrew and Duval, Quentin and Azadi, Samaneh and Rambhatla, Sai Saketh and Shah, Akbar and Yin, Xi and Parikh, Devi and Misra, Ishan},
journal={arXiv preprint arXiv:2311.10709},
year={2023}
}
@article{blattmann2023stable,
title={Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets},
author={Blattmann, Andreas and Dockhorn, Tim and Kulal, Sumith and Mendelevitch, Daniel and Kilian, Maciej and Lorenz, Dominik and Levi, Yam and English, Zion and Voleti, Vikram and Letts, Adam and others},
journal={arXiv preprint arXiv:2311.15127},
year={2023}
}
%% PixelDance
@article{zeng2023make,
title={Make Pixels Dance: High-Dynamic Video Generation},
author={Zeng, Yan and Wei, Guoqiang and Zheng, Jiani and Zou, Jiaxin and Wei, Yang and Zhang, Yuchen and Li, Hang},
journal={arXiv preprint arXiv:2311.10982},
year={2023}
}
@article{kondratyuk2023videopoet,
title={{VideoPoet}: A Large Language Model for Zero-Shot Video Generation},
author={Kondratyuk, Dan and Yu, Lijun and Gu, Xiuye and Lezama, Jos{\'e} and Huang, Jonathan and Hornung, Rachel and Adam, Hartwig and Akbari, Hassan and Alon, Yair and Birodkar, Vighnesh and others},
journal={arXiv preprint arXiv:2312.14125},
year={2023}
}
% datasets
%% UCF-101
@article{soomro2012dataset,
title={{UCF101}: A Dataset of 101 Human Action Classes From Videos in the Wild},
author={Soomro, Khurram and Zamir, Amir Roshan and Shah, Mubarak},
journal={Center for Research in Computer Vision},
volume={2},
number={11},
year={2012}
}
@inproceedings{xu2016msr,
title={{MSR-VTT}: A Large Video Description Dataset for Bridging Video and Language},
author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong},
booktitle=CVPR,
pages={5288--5296},
year={2016}
}
@article{kay2017kinetics,
title={The {Kinetics} Human Action Video Dataset},
author={Kay, Will and Carreira, Joao and Simonyan, Karen and Zhang, Brian and Hillier, Chloe and Vijayanarasimhan, Sudheendra and Viola, Fabio and Green, Tim and Back, Trevor and Natsev, Paul and others},
journal={arXiv preprint arXiv:1705.06950},
year={2017}
}
@inproceedings{miech2019howto100m,
title={{HowTo100M}: Learning a Text-Video Embedding by Watching Hundred Million Narrated Video Clips},
author={Miech, Antoine and Zhukov, Dimitri and Alayrac, Jean-Baptiste and Tapaswi, Makarand and Laptev, Ivan and Sivic, Josef},
booktitle=ICCV,
pages={2630--2640},
year={2019}
}
%% WebVid-10M
@inproceedings{bain2021frozen,
title={Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval},
author={Bain, Max and Nagrani, Arsha and Varol, G{\"u}l and Zisserman, Andrew},
booktitle=ICCV,
pages={1728--1738},
year={2021}
}
%% HD-VILA-100M
@inproceedings{xue2022advancing,
title={Advancing High-Resolution Video-Language Representation with Large-Scale Video Transcriptions},
author={Xue, Hongwei and Hang, Tiankai and Zeng, Yanhong and Sun, Yuchong and Liu, Bei and Yang, Huan and Fu, Jianlong and Guo, Baining},
booktitle=CVPR,
pages={5036--5045},
year={2022}
}
% metrics
%% FID
@inproceedings{heusel2017gans,
title={{GAN}s Trained by a Two Time-Scale Update Rule Converge to a Local {Nash} Equilibrium},
author={Heusel, Martin and Ramsauer, Hubert and Unterthiner, Thomas and Nessler, Bernhard and Hochreiter, Sepp},
booktitle=NIPS,
pages={6629--6640},
year={2017}
}
%% FVD
@article{unterthiner2018towards,
title={Towards Accurate Generative Models of Video: A New Metric \& Challenges},
author={Unterthiner, Thomas and Van Steenkiste, Sjoerd and Kurach, Karol and Marinier, Raphael and Michalski, Marcin and Gelly, Sylvain},
journal={arXiv preprint arXiv:1812.01717},
year={2018}
}
%% IS
@article{saito2020train,
title={Train Sparsely, Generate Densely: Memory-efficient Unsupervised Training of High-resolution Temporal {GAN}},
author={Saito, Masaki and Saito, Shunta and Koyama, Masanori and Kobayashi, Sosuke},
journal=IJCV,
volume={128},
number={10-11},
pages={2586--2606},
year={2020}
}
%% CLIPSIM (proposed in wu2021godiva, listed under methods above)