% VIMI -- EMNLP 2024 conference paper; the url field points at the arXiv preprint.
@inproceedings{fang24emnlp,
  author    = {Fang, Yuwei and Menapace, Willi and Siarohin, Aliaksandr and Chen, Tsai-Shien and Wang, Kuan-Chieh and Skorokhodov, Ivan and Neubig, Graham and Tulyakov, Sergey},
  title     = {{VIMI}: Grounding Video Generation through Multi-modal Instruction},
  booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  address   = {Miami, USA},
  month     = nov,
  year      = {2024},
  url       = {https://www.arxiv.org/abs/2407.06304},
}