ViT-Up: Faithful Feature Upsampling for Vision Transformers
Paper • 2606.14024 • Published • 9
ViT-Up is an implicit feature upsampler for Vision Transformers that predicts backbone-aligned features at arbitrary continuous image coordinates.
This repository provides pretrained ViT-Up weights for DINOv3-S+ and DINOv3-B.
ViT-Up models can be loaded directly with `torch.hub.load`. The Hub entry points download the ViT-Up weights from Hugging Face and load the matching DINOv3 backbone.
import torch

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Available entry points:
# - vit_up_dinov3_splus
# - vit_up_dinov3_base
# The Hub entry point downloads the ViT-Up weights from Hugging Face and loads
# the matching DINOv3 backbone.
model = torch.hub.load(
    "krispinwandel/vit-up",
    "vit_up_dinov3_splus",
    pretrained=True,
    trust_repo=True,
    device=device,
).eval()

# Dummy inputs: one random image batch and 100 random query points.
images = torch.randn(1, 3, 448, 448, device=device)
query_coords = torch.rand(1, 100, 2, device=device)  # normalized (x, y) in [0, 1]

# Inference only, so skip gradient tracking.
with torch.no_grad():
    features = model(images, query_coords)
print(features.shape)  # (B, N_queries, D)
# Alternative API: pass the images once via set_images(), then call the model
# with query coordinates only, processing the queries in fixed-size chunks.
query_chunk_size = 10
chunk_features = []

# Inference only: disable gradient tracking so no autograd state is retained
# while the per-chunk outputs accumulate.
with torch.no_grad():
    model.set_images(images)
    for start in range(0, query_coords.shape[1], query_chunk_size):
        # Slice along the query dimension (dim=1); the last chunk may be shorter.
        chunk_coords = query_coords[:, start : start + query_chunk_size]
        chunk_features.append(model(query_coords=chunk_coords))

# Stitch the per-chunk outputs back together along the query dimension.
features = torch.cat(chunk_features, dim=1)
print(features.shape)  # (B, N_queries, D)
@misc{wandel2026vitupfaithfulfeatureupsampling,
  title={ViT-Up: Faithful Feature Upsampling for Vision Transformers},
  author={Krispin Wandel and Jingchuan Wang and Hesheng Wang},
  year={2026},
  eprint={2606.14024},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2606.14024},
}