Implementing Offline Speech Recognition With Vosk
I’ll help you convert the application to use Vosk for offline speech recognition instead of Google’s online service. This lets the application work without an internet connection and improves privacy, since audio never leaves your machine.
Overview of Changes
1. Replace the speech_recognition library with vosk (a minimal before/after sketch follows this list)
2. Add PyAudio for audio stream handling
3. Download and implement language model loading
4. Modify the speech recognition logic to work with Vosk’s API
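To make change 1 concrete before the full listing, here is a minimal sketch of the online call being replaced and its offline Vosk equivalent. It assumes the original app used speech_recognition's recognize_google on microphone audio, and that a Vosk model has already been extracted to a local folder named model; both names are placeholders, not part of the application below.
python
import json
import speech_recognition as sr
from vosk import Model, KaldiRecognizer

# Before (online): the audio is sent to Google's web API for transcription.
recognizer = sr.Recognizer()
with sr.Microphone(sample_rate=16000) as source:
    audio = recognizer.listen(source)
text_online = recognizer.recognize_google(audio)

# After (offline): the same audio bytes are decoded locally by Vosk.
# "model" is an assumed folder name for an extracted Vosk model.
vosk_model = Model("model")
vosk_rec = KaldiRecognizer(vosk_model, 16000)
vosk_rec.AcceptWaveform(audio.get_raw_data(convert_rate=16000, convert_width=2))
text_offline = json.loads(vosk_rec.FinalResult()).get("text", "")

print(text_online, "|", text_offline)

The full application below replaces this online path end to end.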
python
import os
import sys
import json
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import threading
import pyaudio
import wave
import time
import webbrowser  # used to open model download links in the default browser
from pynput.keyboard import Controller
from vosk import Model, KaldiRecognizer, SetLogLevel

class SpeechTypingApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Offline Speech to Typing")
        self.root.geometry("500x400")
        self.root.resizable(False, False)

        # Variables
        self.is_listening = False
        self.keyboard = Controller()
        self.listen_thread = None
        self.language_var = tk.StringVar(value="en-us")
        self.models_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")

        # Ensure models directory exists
        if not os.path.exists(self.models_dir):
            os.makedirs(self.models_dir)

        # Audio settings
        self.sample_rate = 16000
        self.chunk_size = 4000

        # Build the UI and populate the model dropdown
        self.create_widgets()
        self.load_available_models()

    def create_widgets(self):
        # Main frame
        main_frame = ttk.Frame(self.root, padding="20")
        main_frame.pack(fill=tk.BOTH, expand=True)

        # App title
        title_label = ttk.Label(
            main_frame,
            text="Offline Speech to Typing",
            font=("Arial", 16, "bold")
        )
        title_label.pack(pady=10)

        # Language selection and model download
        lang_frame = ttk.Frame(main_frame)
        lang_frame.pack(fill=tk.X, pady=10)
        ttk.Label(lang_frame, text="Language model:").pack(side=tk.LEFT)
        self.language_menu = ttk.Combobox(
            lang_frame,
            textvariable=self.language_var,
            state="readonly",
            width=20
        )
        self.language_menu.pack(side=tk.LEFT, padx=5)
        ttk.Button(
            lang_frame,
            text="Download Model",
            command=self.download_model_dialog
        ).pack(side=tk.LEFT, padx=5)

        # Listen button
        style = ttk.Style()
        style.configure("Listen.TButton", font=("Arial", 12))
        self.listen_button = ttk.Button(
            main_frame,
            text="Start Listening",
            command=self.toggle_listening,
            style="Listen.TButton",
            width=20
        )
        self.listen_button.pack(pady=10)

        # Recognition result
        result_frame = ttk.LabelFrame(main_frame, text="Last Recognition")
        result_frame.pack(fill=tk.X, pady=10, padx=5)
        self.result_text = tk.Text(
            result_frame,
            height=5,
            wrap=tk.WORD,
            font=("Arial", 10)
        )
        self.result_text.pack(fill=tk.X, padx=5, pady=5)

        # Status label used throughout the app to report progress and errors
        self.status_label = ttk.Label(main_frame, text="Ready", foreground="gray")
        self.status_label.pack(pady=5)

        # Instructions
        instructions = ("Click 'Start Listening' and speak. Your speech will be converted to text "
                        "and typed automatically. Make sure to download a language model first.")
        ttk.Label(
            main_frame,
            text=instructions,
            wraplength=450,
            justify="center",
            foreground="gray"
        ).pack(pady=5)

    def load_available_models(self):
        """Scan the models directory and populate the language dropdown"""
        models = []
        for name in sorted(os.listdir(self.models_dir)):
            if os.path.isdir(os.path.join(self.models_dir, name)):
                models.append(name)

        if models:
            self.language_menu['values'] = models
            self.language_var.set(models[0])
            self.listen_button.config(state="normal")
            self.status_label.config(text="Ready")
        else:
            self.language_menu['values'] = ["No models found"]
            self.language_var.set("No models found")
            self.status_label.config(text="Please download a language model")
            self.listen_button.config(state="disabled")

    def download_model_dialog(self):
        """Show dialog to guide user to download a model"""
        download_window = tk.Toplevel(self.root)
        download_window.title("Download Language Model")
        download_window.geometry("600x400")
        download_window.transient(self.root)
        download_window.grab_set()

        frame = ttk.Frame(download_window, padding="10")
        frame.pack(fill=tk.BOTH, expand=True)

        # Instructions
        instructions = (
            "To download a Vosk model:\n\n"
            "1. Visit https://alphacephei.com/vosk/models\n"
            "2. Download a model (small models are ~40MB, larger ones are more accurate)\n"
            "3. Extract the ZIP file\n"
            "4. Select the extracted folder below"
        )
        ttk.Label(
            frame,
            text=instructions,
            wraplength=550,
            justify="left"
        ).pack(pady=10)

        # Model links
        models_info = [
            ("English (Small)", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip", "42MB"),
            ("English (Large)", "https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip", "1.8GB"),
            ("French", "https://alphacephei.com/vosk/models/vosk-model-fr-0.22.zip", "1.4GB"),
            ("German", "https://alphacephei.com/vosk/models/vosk-model-de-0.21.zip", "1.9GB"),
            ("Spanish", "https://alphacephei.com/vosk/models/vosk-model-es-0.42.zip", "1.5GB"),
            ("Chinese", "https://alphacephei.com/vosk/models/vosk-model-cn-0.22.zip", "1.2GB"),
            ("Russian", "https://alphacephei.com/vosk/models/vosk-model-ru-0.22.zip", "1.5GB")
        ]

        # One clickable row per model
        for name, url, size in models_info:
            model_line = ttk.Frame(frame)
            model_line.pack(fill=tk.X, pady=2)
            ttk.Label(model_line, text=f"{name} ({size}):").pack(side=tk.LEFT)
            link = ttk.Label(
                model_line,
                text=url,
                foreground="blue",
                cursor="hand2"
            )
            link.pack(side=tk.LEFT, padx=5)
            link.bind("<Button-1>", lambda e, url=url: self.open_url(url))

        # Button to import an already-extracted model folder
        select_frame = ttk.Frame(frame)
        select_frame.pack(pady=15)
        ttk.Button(
            select_frame,
            text="Select Downloaded Model Folder",
            command=lambda: self.select_model_directory(download_window)
        ).pack()

    def open_url(self, url):
        """Open a model download link in the default browser"""
        webbrowser.open(url)

    def select_model_directory(self, download_window):
        """Copy a user-selected, extracted model folder into the models directory"""
        model_dir = filedialog.askdirectory(title="Select the extracted model folder")
        if not model_dir:
            return

        model_name = os.path.basename(os.path.normpath(model_dir))
        target_dir = os.path.join(self.models_dir, model_name)

        if os.path.exists(target_dir):
            overwrite = messagebox.askyesno(
                "Model Already Exists",
                f"A model named '{model_name}' already exists. Overwrite it?"
            )
            if not overwrite:
                return

        # Simple progress window shown while the files are copied
        progress_window = tk.Toplevel(self.root)
        progress_window.title("Installing Model")
        progress_window.transient(self.root)
        ttk.Label(
            progress_window,
            text=f"Installing model {model_name}...\nThis may take a moment.",
            justify="center"
        ).pack(pady=20, padx=20)
        self.root.update()

        # Copy the model files (this is a simplified approach - for large models,
        # a more sophisticated copy with progress reporting would be better)
        import shutil
        try:
            if os.path.exists(target_dir):
                shutil.rmtree(target_dir)
            shutil.copytree(model_dir, target_dir)
            progress_window.destroy()
            download_window.destroy()
            self.load_available_models()
            messagebox.showinfo(
                "Model Installed",
                f"The model '{model_name}' has been successfully installed."
            )
        except Exception as e:
            progress_window.destroy()
            messagebox.showerror(
                "Installation Error",
                f"An error occurred while installing the model: {str(e)}"
            )

    def toggle_listening(self):
        if self.is_listening:
            self.is_listening = False
            self.listen_button.config(text="Start Listening")
            self.status_label.config(text="Stopped")
        else:
            selected_model = self.language_var.get()
            if selected_model == "No models found":
                messagebox.showerror(
                    "No Model",
                    "Please download a language model first."
                )
                return
            self.is_listening = True
            self.listen_button.config(text="Stop Listening")
            self.status_label.config(text="Starting...")
            # Run recognition in a background thread so the UI stays responsive
            self.listen_thread = threading.Thread(target=self.listen_and_type, daemon=True)
            self.listen_thread.start()

    def listen_and_type(self):
        """Listen for speech using Vosk and type the recognized text"""
        model_name = self.language_var.get()
        model_path = os.path.join(self.models_dir, model_name)
        try:
            # Load the model
            self.status_label.config(text="Loading model...")
            model = Model(model_path)

            # Create recognizer
            recognizer = KaldiRecognizer(model, self.sample_rate)
            recognizer.SetWords(True)  # Get word timing information

            # Initialize PyAudio and open the microphone stream
            audio = pyaudio.PyAudio()
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size
            )

            # Start listening
            stream.start_stream()
            self.status_label.config(text="Listening...")

            while self.is_listening:
                data = stream.read(self.chunk_size, exception_on_overflow=False)
                if recognizer.AcceptWaveform(data):
                    result = json.loads(recognizer.Result())
                    text = result.get("text", "").strip()
                    if text:
                        # Type the recognized text into the focused window
                        self.keyboard.type(text + " ")

                        # Display the text
                        self.result_text.delete("1.0", tk.END)
                        self.result_text.insert(tk.END, text)

                        # Update status
                        display_text = text if len(text) < 30 else text[:27] + "..."
                        self.status_label.config(text=f"Recognized: {display_text}")

            # Clean up
            stream.stop_stream()
            stream.close()
            audio.terminate()
        except Exception as e:
            self.status_label.config(text=f"Error: {str(e)}")
            messagebox.showerror(
                "Recognition Error",
                f"An error occurred during speech recognition: {str(e)}"
            )
            self.is_listening = False
            self.listen_button.config(text="Start Listening")

def main():
    root = tk.Tk()
    app = SpeechTypingApp(root)
    root.mainloop()


if __name__ == "__main__":
    main()
Setup Instructions
1. Install Required Libraries:
bash
pip install vosk pyaudio pynput
2. Save the Code: Save the code above to a file named offline_speech_typing_app.py
3. Run the Application:
bash
python offline_speech_typing_app.py
4. Download a Language Model:
○ When you first run the app, you’ll need to download a language model
○ Click the “Download Model” button
○ For example, the Persian model: https://alphacephei.com/vosk/models/vosk-model-fa-0.42.zip
○ Follow the instructions to download a model from the Vosk website
○ Extract the ZIP file and select the extracted folder when prompted; a quick sanity check for the installed model is sketched after this list
5. Using the App: Click “Start Listening” and speak. The recognized text is typed into whichever window currently has keyboard focus; click “Stop Listening” when you are done.
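Once a model is installed, a quick way to confirm it loads correctly (independent of the GUI) is to transcribe a short WAV file with it. This is a minimal sketch with assumed names: models/vosk-model-small-en-us-0.15 and test.wav are placeholders for whatever you actually downloaded and recorded.
python
import json
import os
import sys
import wave

from vosk import Model, KaldiRecognizer

# Assumed locations; adjust to match your setup.
MODEL_DIR = os.path.join("models", "vosk-model-small-en-us-0.15")
WAV_PATH = "test.wav"  # expected to be 16-bit mono PCM

wf = wave.open(WAV_PATH, "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2:
    sys.exit("Please supply a 16-bit mono PCM WAV file")

model = Model(MODEL_DIR)
rec = KaldiRecognizer(model, wf.getframerate())

# Feed the file in chunks, printing each finalized segment.
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()).get("text", ""))

# Flush whatever is left in the recognizer.
print(json.loads(rec.FinalResult()).get("text", ""))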
Technical Details
1. Vosk API: Uses the Vosk speech recognition toolkit, which is based on Kaldi
2. Language Models: Manages pre-trained models that the user downloads from the Vosk site and installs into a local models folder
3. Audio Processing: Uses PyAudio to capture microphone input
4. Threading: Keeps the UI responsive by running speech recognition in a background thread (see the sketch after this list)
5. Error Handling: Comprehensive error handling for model loading and recognition issues
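To make the threading point concrete outside of Tkinter, here is a minimal sketch of the same pattern: the blocking PyAudio/Vosk loop runs in a daemon thread and a shared flag tells it when to stop. The model path is a placeholder assumption, and the sketch prints text where the GUI app types it with pynput.
python
import json
import threading
import time

import pyaudio
from vosk import Model, KaldiRecognizer

MODEL_DIR = "models/vosk-model-small-en-us-0.15"  # assumed placeholder path
running = True  # shared flag checked by the worker thread

def recognize_loop():
    model = Model(MODEL_DIR)
    rec = KaldiRecognizer(model, 16000)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
                     input=True, frames_per_buffer=4000)
    try:
        while running:
            data = stream.read(4000, exception_on_overflow=False)
            if rec.AcceptWaveform(data):
                text = json.loads(rec.Result()).get("text", "")
                if text:
                    print(text)  # the GUI app types this with pynput instead
    finally:
        stream.stop_stream()
        stream.close()
        pa.terminate()

# Start recognition in the background, let it run briefly, then stop it.
worker = threading.Thread(target=recognize_loop, daemon=True)
worker.start()
time.sleep(10)
running = False
worker.join()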
Would you like me to explain any part of the implementation in more detail or make any adjustments to
the functionality?