Real-time novel view synthesis (NVS) and depth estimation are pivotal for immersive applications, particularly in augmented telepresence. While state-of-the-art monocular depth estimation methods could be employed to predict depth maps for novel views, their independent processing of novel views often leads to temporal inconsistencies, such as flickering artifacts in depth maps. To address this, we present a unified multimodal framework that generates both novel view images and their corresponding depth maps, ensuring geometric and visual consistency.
@article{11348070,
  author   = {Gond, Manu and Zerman, Emin and Knorr, Sebastian and Sj{\"o}str{\"o}m, M{\aa}rten},
  journal  = {IEEE Access},
  title    = {{PVSDNet}: Joint Depth Prediction and View Synthesis Via Shared Latent Spaces in Real-Time},
  year     = {2026},
  volume   = {14},
  pages    = {9021--9037},
  keywords = {Real-time systems;Depth measurement;Training;Geometry;Rendering (computer graphics);Accuracy;Three-dimensional displays;Telepresence;Neural radiance field;Visualization;Augmented reality;depth image;low-rank adaptation fine-tuning;monocular depth estimation;telepresence;view synthesis},
  doi      = {10.1109/ACCESS.2026.3653905},
}