import torch
import transformers
import datasets
# Available adapters:
# ['lug', 'lug+eng', 'ach', 'ach+eng', 'lgg', 'lgg+eng',
# 'nyn', 'nyn+eng', 'teo', 'teo+eng']
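# Choose the language code for the audio you want to transcribe,
# e.g. 'lug' for Luganda.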
language = 'lug'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the fine-tuned MMS checkpoint and activate the adapter weights
# for the chosen language.
model = transformers.Wav2Vec2ForCTC.from_pretrained(
    'Sunbird/asr-mms-salt').to(device)
model.load_adapter(language)
# The tokenizer must also be switched to the matching language vocabulary.
processor = transformers.Wav2Vec2Processor.from_pretrained(
    'Sunbird/asr-mms-salt')
processor.tokenizer.set_target_lang(language)
# Get some test audio
ds = datasets.load_dataset('Sunbird/salt', 'multispeaker-lug', split='test')
audio = ds[0]['audio']
sample_rate = ds[0]['sample_rate']
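# MMS models expect 16 kHz input. The SALT recordings are assumed to already
# be 16 kHz; this optional sketch (using torchaudio, not part of the original
# snippet) resamples only if the clip uses a different rate.
if sample_rate != 16_000:
    import torchaudio
    audio = torchaudio.functional.resample(
        torch.as_tensor(audio, dtype=torch.float32),
        orig_freq=sample_rate, new_freq=16_000).numpy()
    sample_rate = 16_000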
# Apply the model
inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs.to(device)).logits
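# Greedy CTC decoding: take the most likely token at each frame;
# processor.decode() then collapses repeats and blank tokens into text.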
ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)
print(transcription)
# Expected output (Luganda transcription):
# ekikola ky'akasooli kyakyenvu wabula langi yakyo etera okuba eyaakitaka wansi