Please remember to report your frame rate and tokens per frame with each submission.
We use LLaMA-3.1-8B as the LLM evaluation assistant. "# F" denotes the number of frames sampled from the input video, and "TPF" denotes the number of visual tokens per frame.
% AuroraCap paper (arXiv preprint 2410.03051). Authors are separated by " and " and the model name is brace-protected against style recasing.
@article{auroracap,
  author  = {Chai, Wenhao and Song, Enxin and Du, Yilun and Meng, Chenlin and Madhavan, Vashisht and Bar-Tal, Omer and Hwang, Jenq-Neng and Xie, Saining and Manning, Christopher D.},
  title   = {{AuroraCap}: Efficient, Performant Video Detailed Captioning and a New Benchmark},
  journal = {arXiv preprint arXiv:2410.03051},
  year    = {2024},
}