---
license: apache-2.0
---

Perception Encoder Audio-Visual (PE-AV)

PE-AV is a state-of-the-art multimodal model that embeds audio, video, audio-video, and text into a joint embedding space, enabling cross-modal retrieval and understanding across these modalities.

Model Description

PE-AV is trained using contrastive learning to align audio, video, and text representations in a shared embedding space. The model can encode:

  • Audio only: Extract audio embeddings from audio waveforms
  • Video only: Extract visual embeddings from video frames
  • Audio-Video: Extract joint audio-visual embeddings
  • Text: Extract text embeddings optimized for different modality pairs

Model Variants

We release 6 model checkpoints with varying sizes and capabilities:

| Model | Avg Retrieval | Video frames used |
|---|---|---|
| pe-av-small-16-frame | 45.2 | 16 frames |
| pe-av-base-16-frame | 47.0 | 16 frames |
| pe-av-large-16-frame | 48.2 | 16 frames |
| pe-av-small | 48.1 | all frames |
| pe-av-base | 50.2 | all frames |
| pe-av-large | 51.6 | all frames |

The -16-frame variants sample exactly 16 evenly spaced frames from each video, while the other variants use all frames and support variable-length videos.
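Evenly spaced sampling amounts to picking 16 indices spread uniformly across the video, for example (a minimal illustration, not necessarily the exact sampling code used by the models):

import torch

def sample_frame_indices(num_video_frames: int, num_samples: int = 16) -> torch.Tensor:
    # num_samples indices spread uniformly over [0, num_video_frames - 1]
    return torch.linspace(0, num_video_frames - 1, num_samples).round().long()

indices = sample_frame_indices(300)  # e.g. 0, 20, 40, ..., 279, 299 for a 300-frame video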

Quick Start

The model is available in both the transformers and perception_models libraries.

perception_models Usage

import torch
from core.audio_visual_encoder import PEAudioVisual, PEAudioVisualTransform

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and transform
model = PEAudioVisual.from_config("pe-av-large", pretrained=True).to(device)
transform = PEAudioVisualTransform.from_config("pe-av-large")

video_files = ["video1.mp4", "video2.mp4"]
descriptions = ["description1", "description2"]
audio_files = ["audio1.wav", "audio2.wav"]

# Process inputs and get embeddings
inputs = transform(videos=video_files, text=descriptions, audio=audio_files).to(device)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs)

# Access different embeddings
audio_embeds = outputs.audio_embeds  # Audio-only embeddings
visual_embeds = outputs.visual_embeds  # Video-only embeddings
audio_visual_embeds = outputs.audio_visual_embeds  # Joint audio-visual embeddings
audio_text_embeds = outputs.audio_text_embeds  # Text embeddings aligned to audio
visual_text_embeds = outputs.visual_text_embeds  # Text embeddings aligned to video
audio_visual_text_embeds = outputs.audio_visual_text_embeds  # Text embeddings aligned to audio-visual
audio_plus_text_embeds = outputs.audio_plus_text_embeds  # Joint audio and text embedding
visual_plus_text_embeds = outputs.visual_plus_text_embeds  # Joint video and text embedding

# Compute the dot product to get their similarities
audio_visual_similarity = audio_embeds @ visual_embeds.T
# When computing similarity against text embeddings, use the
# appropriate text embedding based on the other modality
audio_text_similarity = audio_embeds @ audio_text_embeds.T
video_text_similarity = visual_embeds @ visual_text_embeds.T
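
For retrieval, these similarity scores can be ranked directly. A minimal sketch (rows and columns are assumed to follow the order of the input lists):

# Rank videos for each description by video-text similarity
scores = visual_text_embeds @ visual_embeds.T  # shape: [num_texts, num_videos]
best_video_per_text = scores.argmax(dim=-1)
for text_idx, video_idx in enumerate(best_video_per_text.tolist()):
    print(f"{descriptions[text_idx]!r} -> {video_files[video_idx]}")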

Note that you can omit any of the modalities and use the same forward method; the corresponding output embeddings will be None. For example:

inputs = transform(videos=video_files, text=descriptions).to(device)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs)

audio_embeds = outputs.audio_embeds  # None
visual_embeds = outputs.visual_embeds  # available
audio_visual_embeds = outputs.audio_visual_embeds # None
audio_visual_text_embeds = outputs.audio_visual_text_embeds # None
audio_text_embeds = outputs.audio_text_embeds  # None
visual_text_embeds = outputs.visual_text_embeds  # available
audio_plus_text_embeds = outputs.audio_plus_text_embeds  # None
visual_plus_text_embeds = outputs.visual_plus_text_embeds  # available

We also provide methods for directly encoding an individual modality:

def encode_video_text(self, input_ids, attention_mask=None)
def encode_audio_text(self, input_ids, attention_mask=None)
def encode_audio_video_text(self, input_ids, attention_mask=None)
def encode_audio(self, input_values, padding_mask=None, input_features=None)
def encode_video(self, pixel_values_videos, padding_mask_videos=None, pe_features=None)
def encode_audio_video(
    self,
    input_values,
    pixel_values_videos,
    padding_mask=None,
    padding_mask_videos=None,
    pe_features=None,  # optionally re-use pre-computed PE features
    input_features=None,  # Optionally re-use pre-computed audio codec features
)
def encode_audio_plus_text(
    self,
    input_ids,
    input_values,
    attention_mask=None,
    padding_mask=None,
    input_features=None  # Optionally re-use pre-computed audio codec features
)
def encode_video_plus_text(
    self,
    input_ids,
    pixel_values_videos,
    attention_mask=None,
    padding_mask_videos=None,
    pe_features=None,  # optionally re-use pre-computed PE features
)
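
As a rough sketch (assuming the transform output is dict-like, with keys matching the parameter names above), audio-only encoding could look like:

inputs = transform(audio=audio_files).to(device)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    audio_embeds = model.encode_audio(
        input_values=inputs["input_values"],      # assumed key name, matching the signature above
        padding_mask=inputs.get("padding_mask"),  # assumed key name
    )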

transformers Usage 🤗

Install transformers from source

pip install git+https://github.com/huggingface/transformers

For more information, check the transformers documentation.

from transformers import PeAudioVideoModel, PeAudioVideoProcessor
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioVideoModel.from_pretrained("facebook/pe-av-small", device_map=device, dtype=torch.bfloat16)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-small")

video_files = ["video1.mp4", "video2.mp4"]
descriptions = ["description1", "description2"]
audio_files = ["audio1.wav", "audio2.wav"]

# Process inputs and get embeddings
inputs = processor(
    videos=video_files, text=descriptions, audio=audio_files, return_tensors="pt", padding=True
)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs.to(device, dtype=model.dtype))

audio_embeds = outputs.audio_embeds  # Audio-only embeddings
video_embeds = outputs.video_embeds  # Video-only embeddings
audio_video_embeds = outputs.audio_video_embeds  # Joint audio-video embeddings
text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
text_audio_video_embeds = outputs.text_audio_video_embeds  # Text embeddings aligned to audio-video
audio_plus_text_embeds = outputs.audio_plus_text_embeds  # Joint audio and text embedding
video_plus_text_embeds = outputs.video_plus_text_embeds  # Joint video and text embedding
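
Similarities are computed as dot products, just as in the perception_models example; pair each modality with its matching text embedding:

audio_video_similarity = audio_embeds @ video_embeds.T
audio_text_similarity = audio_embeds @ text_audio_embeds.T
video_text_similarity = video_embeds @ text_video_embeds.T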

Note that you can omit any of the modalities and use the same forward method; the corresponding output embeddings will be None.
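
For example, with video and text only (mirroring the perception_models example above):

inputs = processor(videos=video_files, text=descriptions, return_tensors="pt", padding=True)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs.to(device, dtype=model.dtype))

audio_embeds = outputs.audio_embeds  # None
video_embeds = outputs.video_embeds  # available
audio_video_embeds = outputs.audio_video_embeds  # None
text_video_embeds = outputs.text_video_embeds  # available
video_plus_text_embeds = outputs.video_plus_text_embeds  # available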

Moreover, with transformers you can load only the sub-model of interest and avoid loading the full model, for example when you only need audio embeddings:

from transformers import PeAudioModel, PeAudioProcessor
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeAudioModel.from_pretrained("facebook/pe-av-small", device_map=device, dtype=torch.bfloat16)
processor = PeAudioProcessor.from_pretrained("facebook/pe-av-small")

descriptions = ["description1", "description2"]
audio_files = ["audio1.wav", "audio2.wav"]

# Process inputs and get embeddings
inputs = processor(
    text=descriptions, audio=audio_files, return_tensors="pt", padding=True
)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs.to(device, dtype=model.dtype))

audio_embeds = outputs.audio_embeds  # Audio-only embeddings
text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio

Likewise, for video embeddings:

from transformers import PeVideoModel, PeVideoProcessor
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PeVideoModel.from_pretrained("facebook/pe-av-small", device_map=device, dtype=torch.bfloat16)
processor = PeVideoProcessor.from_pretrained("facebook/pe-av-small")

descriptions = ["description1", "description2"]
video_files = ["video1.mp4", "video2.mp4"]

# Process inputs and get embeddings
inputs = processor(
    text=descriptions, videos=video_files, return_tensors="pt", padding=True
)

with torch.inference_mode(), torch.autocast(device.type, dtype=torch.bfloat16):
    outputs = model(**inputs.to(device, dtype=model.dtype))

video_embeds = outputs.video_embeds  # Video-only embeddings
text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video

Citation

@article{pe-av2025,
  title={PEAV: An Audiovisual Perception Encoder via Large-Scale Multimodal Correspondence Learning},
  author={Apoorv Vyas and Heng-Jui Chang and Cheng-Fu Yang and Po-Yao Huang and Luya Gao and Julius Richter and Sanyuan Chen and Matt Le and Piotr Dollár and Christoph Feichtenhofer and Ann Lee and Wei-Ning Hsu},
  url={arxiv link coming soon},
  year={2025}
}

License

This model is released under the Apache 2.0 license.