kleysonr · June 27, 2023 23:13
diff --git a/transcription_audio.py b/transcription_audio.py
 # Convert the .mkv video into mono 16 kHz .mp3 audio:
 # ffmpeg -i video.mkv -acodec libmp3lame -ac 1 -ar 16000 output.mp3
 #
 # Split the .mp3 audio into 30-second chunks:
 # ffmpeg -i output.mp3 -f segment -segment_time 30 -c copy samples/sample%05d.mp3

 from huggingsound import SpeechRecognitionModel, KenshoLMDecoder
 import torch
 import os

 def listar_arquivos_mp3(diretorio):
     """Return the sorted paths of the '.mp3' entries inside ``diretorio``.

     Every name reported by ``os.listdir(diretorio)`` that ends with '.mp3'
     (case-sensitive) is joined with ``diretorio``; the resulting paths are
     returned as a list in ascending string order.
     """
     return sorted(
         os.path.join(diretorio, nome)
         for nome in os.listdir(diretorio)
         if nome.endswith('.mp3')
     )

 # Run the model on the GPU when CUDA is available, otherwise on the CPU.
 if torch.cuda.is_available():
     device = "cuda"
 else:
     device = "cpu"

 # Load the Portuguese wav2vec2 model, then collect the audio samples to transcribe.
 model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-portuguese", device=device)
 input_files = listar_arquivos_mp3('./samples')

 # Optional: LM-boosted decoding (disabled below).
 # The LM decoders expect a language model in KenLM format (an arpa or binary file).
 # Example LM files can be downloaded from:
 # https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-portuguese/tree/main/language_model
 # lm_path = "lm_files/lm.binary"
 # unigrams_path = "lm_files/unigrams.txt"
 #
 # Three decoders exist for LM-boosted decoding: KenshoLMDecoder, ParlanceLMDecoder, and FlashlightLMDecoder.
 # This example would use KenshoLMDecoder, which requires Kensho's pyctcdecode to be installed first
 # (https://github.com/kensho-technologies/pyctcdecode).
 # decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path, unigrams_path=unigrams_path)
 # transcription = model.transcribe(input_files, decoder=decoder)

 # Transcribe every sample without a language-model decoder.
 transcription = model.transcribe(input_files)

 # Write one transcription per line, in the order returned by transcribe().
 with open('transcription.txt', 'w', encoding='utf-8') as f:
     for item in transcription:
         f.write(item['transcription'] + '\n')

 # print(transcription)
	# Convert the .mkv video into mono 16 kHz .mp3 audio:
	# ffmpeg -i video.mkv -acodec libmp3lame -ac 1 -ar 16000 output.mp3
	#
	# Split the .mp3 audio into 30-second chunks:
	# ffmpeg -i output.mp3 -f segment -segment_time 30 -c copy samples/sample%05d.mp3

	from huggingsound import SpeechRecognitionModel, KenshoLMDecoder
	import torch
	import os

	def listar_arquivos_mp3(diretorio):
	    """Return the sorted paths of the '.mp3' entries inside ``diretorio``.

	    Names from ``os.listdir(diretorio)`` that end with '.mp3'
	    (case-sensitive) are joined with ``diretorio`` and returned as a list
	    sorted in ascending string order.
	    """
	    # The body must be nested under the ``def``; with the statements at the
	    # same indent as the header the function cannot be parsed.
	    arquivos_mp3 = [
	        os.path.join(diretorio, arquivo)
	        for arquivo in os.listdir(diretorio)
	        if arquivo.endswith('.mp3')
	    ]
	    arquivos_mp3.sort()
	    return arquivos_mp3

	# Run the model on the GPU when CUDA is available, otherwise on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load the Portuguese wav2vec2 model, then collect the audio samples to transcribe.
	model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-portuguese", device=device)
	input_files = listar_arquivos_mp3('./samples')

	# Optional: LM-boosted decoding (disabled below).
	# The LM format used by the LM decoders is the KenLM format (arpa or binary file).
	# You can download some example LM files from here:
	# https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-portuguese/tree/main/language_model
	# lm_path = "lm_files/lm.binary"
	# unigrams_path = "lm_files/unigrams.txt"

	# Three different decoders exist for LM-boosted decoding: KenshoLMDecoder, ParlanceLMDecoder, and FlashlightLMDecoder.
	# This example would use KenshoLMDecoder.
	# To use this decoder you need to install Kensho's pyctcdecode first (https://github.com/kensho-technologies/pyctcdecode)
	# decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path, unigrams_path=unigrams_path)

	# transcription = model.transcribe(input_files, decoder=decoder)

	# Transcribe every sample without a language-model decoder.
	transcription = model.transcribe(input_files)

	# Write one transcription per line. The loop must be nested inside the
	# ``with`` block (and the writes inside the loop) so the file is still open
	# when each line is written.
	with open('transcription.txt', 'w', encoding='utf-8') as f:
	    for i in transcription:
	        text = i['transcription']
	        f.write(text)
	        f.write('\n')

	# print(transcription)