Audio Streaming and Live Transcribing Pipeline using Kaldi
Generate & stream a tone:
ffmpeg -re -f lavfi -i aevalsrc="sin(400*2*PI*t)" -ar 8000 -f mulaw -f rtp rtp://127.0.0.1:1234
To stream:
ffmpeg -stream_loop -1 -re -i /Users/harims/code/archived/vosk_code/audiofiles/hellohowareyou.mp3 -ar 8000 -f mulaw -f rtp rtp://127.0.0.1:1234
To receive:
Take the SDP information from the output of the above command, create a something.sdp file, and paste the info into it. Then open that file with VLC player.
ffplay -protocol_whitelist file,udp,rtp -i /< fullpath >/something.sdp
Only one receiver at a time.
BELOW: something.sdp file
v=0
o=- 0 0 IN IP4 127.0.0.1
s=No Name
c=IN IP4 127.0.0.1
t=0 0
a=tool:libavformat 60.3.100
m=audio 1234 RTP/AVP 97
b=AS:128
a=rtpmap:97 PCMU/8000/2
Transcribing an audio file with the Vosk CLI:
python vosk-transcriber --input /Users/harims/code/archived/vosk_code/audiofiles/hellohowareyou.mp3
#!/Users/harims/code/venvs/voicerecognition/bin/python3
# -*- coding: utf-8 -*-
"""Console-script entry point that dispatches to the Vosk transcriber CLI."""
import re
import sys

from vosk.transcriber.cli import main

if __name__ == '__main__':
    # On Windows, pip-generated launchers leave a '-script.pyw' or '.exe'
    # suffix on argv[0]; strip it so the CLI reports a clean program name.
    prog = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.argv[0] = prog
    sys.exit(main())
Using Kaldi recognizer
#!/usr/bin/env python3
"""Live-transcribe an RTP audio stream (described by an SDP file) with Vosk/Kaldi.

ffmpeg receives the RTP stream, resamples it to 16 kHz mono s16le PCM, and
writes the raw samples to stdout; KaldiRecognizer consumes them in 4000-byte
chunks and prints each finalized utterance as JSON.
"""
import subprocess

from vosk import Model, KaldiRecognizer, SetLogLevel

SAMPLE_RATE = 16000  # Vosk en-us models expect 16 kHz mono input

SetLogLevel(0)

model = Model(lang="en-us")
rec = KaldiRecognizer(model, SAMPLE_RATE)

# FIX: "-loglevel quiet" must precede the inputs/outputs.  In the original
# command it trailed the "-" output, where ffmpeg treats it as a trailing
# option and ignores it (with a "Trailing option(s) found" warning).
ffmpeg_cmd = [
    "ffmpeg",
    "-loglevel", "quiet",
    "-protocol_whitelist", "file,udp,rtp",
    "-i", "/Users/harims/code/archived/vosk_code/foo.sdp",
    "-ar", str(SAMPLE_RATE),
    "-ac", "1",
    "-f", "s16le",
    "-",  # raw PCM to stdout
]

with subprocess.Popen(ffmpeg_cmd, stdout=subprocess.PIPE) as process:
    # read() returns b"" on EOF (stream ended or ffmpeg exited).
    while data := process.stdout.read(4000):
        if rec.AcceptWaveform(data):
            print(rec.Result())
        # else: rec.PartialResult() holds the in-progress hypothesis
    # Flush the recognizer so the trailing partial utterance is reported.
    print(rec.FinalResult())
**Working Input from Microphone & Transcribing**
#!/usr/bin/env python3
"""Live-transcribe microphone audio with Vosk/Kaldi via PyAudio.

Captures RECORD_SECONDS of mono 16-bit audio from the default input device
and feeds it to a KaldiRecognizer in CHUNK-sized buffers, printing partial
hypotheses while speaking and finalized utterances as they complete.
"""
import subprocess  # kept for the commented-out RTP variant below
import sys

from vosk import Model, KaldiRecognizer, SetLogLevel

SAMPLE_RATE = 16000  # Vosk en-us models expect 16 kHz mono input

SetLogLevel(0)

model = Model(lang="en-us")
# model = Model(model_path="/Users/harims/code/vosk-model-en-us-0.42-gigaspeech", lang="en-us")  # big model
rec = KaldiRecognizer(model, SAMPLE_RATE)

# FROM MIC
import pyaudio  # soundcard audio I/O access library
import wave     # stdlib module for reading/writing simple .wav files

# Capture settings
FORMAT = pyaudio.paInt16         # 16-bit signed samples — matches Kaldi's s16le input
CHANNELS = 1                     # mono
RATE = SAMPLE_RATE               # sample rate
CHUNK = 1024                     # frames per buffer
RECORD_SECONDS = 10              # capture duration
WAVE_OUTPUT_FILENAME = "file.wav"

# Start up the PyAudio instance and open the capture stream.
audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)
print("recording...")
frames = []

try:
    # Feed CHUNK-sized buffers to the recognizer for RECORD_SECONDS.
    for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        # exception_on_overflow=False: drop late frames instead of raising
        # IOError when the recognizer briefly falls behind the soundcard.
        data = stream.read(CHUNK, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            print(rec.Result())
        else:
            print(rec.PartialResult())
    # FIX: flush the recognizer so the last (unfinalized) utterance is not
    # silently dropped — consistent with the RTP-stream script above.
    print(rec.FinalResult())
    print("finished recording")
finally:
    # FIX: always release the audio device, even if capture/recognition raised.
    stream.stop_stream()
    stream.close()
    audio.terminate()
###########################
Write to file
###########################
# Write your new .wav file with built in Python 3 Wave module
# waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
# waveFile.setnchannels(CHANNELS)
# waveFile.setsampwidth(audio.get_sample_size(FORMAT))
# waveFile.setframerate(RATE)
# waveFile.writeframes(b''.join(frames))
# waveFile.close()
#####################################################################
Working code - From rtp stream (via sdp file)
#####################################################################
# with subprocess.Popen(["ffmpeg","-protocol_whitelist", "file,udp,rtp", "-i", "/Users/harims/code/archived/vosk_code/foo.sdp","-ar", str(SAMPLE_RATE) , "-ac", "1", "-f", "s16le", "-", "-loglevel", "quiet"],
# stdout=subprocess.PIPE) as process:
# while True:
# data = process.stdout.read(4000)
# if len(data) == 0:
# break
# if rec.AcceptWaveform(data):
# print(rec.Result())
# # else:
# # print(rec.PartialResult())
# print(rec.FinalResult())
Text to speech
tts --text "Text for TTS" \
--model_name "tts_models/en/ek1/tacotron2" \
--vocoder_name "vocoder_models/universal/libri-tts/wavegrad" \
--out_path ~/sound.wav
#sideprojects