Last active
June 27, 2023 23:13
-
-
Save kleysonr/be7e870fa372875a8b55ed5e901d02e0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert the .mkv video into 16 kHz mono .mp3 audio:
# ffmpeg -i video.mkv -acodec libmp3lame -ac 1 -ar 16000 output.mp3
#
# Split the .mp3 audio into chunks of 30 seconds each:
# ffmpeg -i output.mp3 -f segment -segment_time 30 -c copy samples/sample%05d.mp3
from huggingsound import SpeechRecognitionModel, KenshoLMDecoder | |
import torch | |
import os | |
def listar_arquivos_mp3(diretorio):
    """Return the sorted paths of the ``.mp3`` files located directly in *diretorio*.

    Args:
        diretorio: Path of the directory to scan. Sub-directories are not
            searched.

    Returns:
        A list of paths, each built by joining *diretorio* with a matching
        entry name, in ascending lexicographic order. The list is empty when
        nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when the
            directory does not exist).
    """
    candidatos = (
        os.path.join(diretorio, nome)
        for nome in os.listdir(diretorio)
        if nome.endswith('.mp3')
    )
    # os.listdir also yields sub-directories; one that merely happens to be
    # named "*.mp3" is not an audio file, so keep regular files only.
    return sorted(caminho for caminho in candidatos if os.path.isfile(caminho))
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Speech-recognition model, selected by its model identifier string.
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-portuguese", device=device)
# Sorted list of the .mp3 chunks found in ./samples (see the ffmpeg commands
# at the top of the file for how they are produced).
input_files = listar_arquivos_mp3('./samples')
# The LM format used by the LM decoders is the KenLM format (arpa or binary file).
# You can download some example LM files from here: https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-portuguese/tree/main/language_model
# lm_path = "lm_files/lm.binary"
# unigrams_path = "lm_files/unigrams.txt"
# Three different decoders are available for LM-boosted decoding: KenshoLMDecoder, ParlanceLMDecoder, and FlashlightLMDecoder.
# In this example, we'll use the KenshoLMDecoder.
# To use this decoder you'll need to install Kensho's pyctcdecode first (https://github.com/kensho-technologies/pyctcdecode)
# decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path, unigrams_path=unigrams_path)
# transcription = model.transcribe(input_files, decoder=decoder)
# Transcribe every input file; no LM decoder is passed to this call.
transcription = model.transcribe(input_files)

# Write the 'transcription' text of each result on its own line, in the order
# the results were returned. UTF-8 is forced so accented characters do not
# depend on the platform's default encoding.
with open('transcription.txt', 'w', encoding='utf-8') as f:
    f.writelines(item['transcription'] + '\n' for item in transcription)
# print(transcription)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment