This video shows how to install Qwen2.5-Omni locally in a Jupyter notebook. You can use the same commands in Google Colab, Kaggle, etc.
Code:
conda create -n qwen_omni python=3.10 -y && conda activate qwen_omni
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install pillow accelerate soundfile
pip install git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356
pip install "qwen-omni-utils[decord]" librosa ipywidgets notebook
pip install -U flash-attn --no-build-isolation
jupyter notebook
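Optionally, before loading the model, you can run a quick sanity-check cell to confirm that PyTorch sees the GPU and that the pinned transformers commit exposes the Omni classes (this check is an extra convenience, not part of the video):
# Optional sanity check: GPU visibility and availability of the Omni classes.
import torch
import transformers

print("transformers version:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Raises ImportError if the installed transformers commit does not provide these classes.
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor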
Run the following commands in notebook cells:
import torch
import soundfile as sf
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
    enable_audio_output=True
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
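Note: flash-attn sometimes fails to build. If that happens, a fallback (my suggestion, not what the video shows) is to load the model with PyTorch's built-in SDPA attention instead:
# Fallback load if flash-attn is unavailable: use PyTorch's scaled-dot-product attention.
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",  # instead of "flash_attention_2"
    enable_audio_output=True
)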
conversation = [
    {
        "role": "system",
        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech. Please always respond in English."
    },
    {
        "role": "user",
        "content": [{"type": "video", "video": "./fahdvideo.mp4"}]
    }
]
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
inputs = processor(text=text, audios=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True).to(model.device).to(model.dtype)
with torch.no_grad():
    text_ids, audio = model.generate(**inputs, use_audio_in_video=True, spk="Chelsie")
output_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print("\n๐ Qwen Omni Response:\n", output_text)
sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
print("\n✅ Audio saved to output.wav")
For audio input:
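The snippet below calls an audio_inference helper that is not defined earlier in this post. Here is a minimal sketch of such a helper, reusing the model, processor, and process_mm_info loaded above; the function name, signature, and prompt handling are assumptions, not the exact code from the video:
def audio_inference(audio_path, prompt, sys_prompt):
    # Build a conversation with a system prompt plus an audio clip and a text instruction.
    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "audio", "audio": audio_path},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    inputs = processor(text=text, audios=audios, images=images, videos=videos,
                       return_tensors="pt", padding=True).to(model.device).to(model.dtype)
    with torch.no_grad():
        # return_audio=False skips speech synthesis and returns only the text token ids.
        text_ids = model.generate(**inputs, use_audio_in_video=False, return_audio=False)
    return processor.batch_decode(text_ids, skip_special_tokens=True,
                                  clean_up_tokenization_spaces=False)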
import librosa
from IPython.display import Audio, display
audio_path = "/home/Ubuntu/audio/cough.wav"
prompt = "Classify the given human vocal sound in English."
audio, sr = librosa.load(audio_path, sr=16000)
display(Audio(audio, rate=16000))
response = audio_inference(audio_path,
                           prompt=prompt,
                           sys_prompt="You are a vocal sound classification model.")
print("\n๐ Model Response:\n", response[0])