# Directory Structure
```
├── .gitignore
├── package-lock.json
├── package.json
├── README.md
├── server.js
├── src
│ ├── index.ts
│ └── types
│ ├── node-record-lpcm16.d.ts
│ └── say.d.ts
├── test
│ └── index.html
└── tsconfig.json
```
# Files
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
# Dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Build output
build/
dist/
*.js.map
# Environment variables
.env
.env.local
.env.*.local
# IDE files
.idea/
.vscode/
*.swp
*.swo
# Operating System
.DS_Store
Thumbs.db
# Temporary files
*.log
*.tmp
recording.wav
# Test coverage
coverage/
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
# MS-Lucidia-Voice-Gateway-MCP
A Model Context Protocol (MCP) server that provides text-to-speech and speech-to-text capabilities using Windows' built-in speech services. This server leverages the native Windows Speech API (SAPI) through PowerShell commands, eliminating the need for external APIs or services.
## Features
- Text-to-Speech (TTS) using Windows SAPI voices
- Speech-to-Text (STT) using Windows Speech Recognition
- Simple web interface for testing
- No external API dependencies
- Uses native Windows capabilities
## Prerequisites
- Windows 10/11 with Speech Recognition enabled
- Node.js 16+
- PowerShell
## Installation
1. Clone the repository:
```bash
git clone https://github.com/ExpressionsBot/MS-Lucidia-Voice-Gateway-MCP.git
cd MS-Lucidia-Voice-Gateway-MCP
```
2. Install dependencies:
```bash
npm install
```
3. Build the project:
```bash
npm run build
```
## Usage
### Testing Interface
1. Start the test server:
```bash
npm run test
```
2. Open `http://localhost:3000` in your browser
3. Use the web interface to test TTS and STT capabilities
### Available Tools
#### text_to_speech
Converts text to speech using Windows SAPI.
Parameters:
- `text` (required): The text to convert to speech
- `voice` (optional): The voice to use (e.g., "Microsoft David Desktop")
- `speed` (optional): Speech rate from 0.5 to 2.0 (default: 1.0)
Example:
```javascript
fetch('http://localhost:3000/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
text: "Hello, this is a test",
voice: "Microsoft David Desktop",
speed: 1.0
})
});
```
#### speech_to_text
Records audio and converts it to text using Windows Speech Recognition.
Parameters:
- `duration` (optional): Recording duration in seconds (default: 5, max: 60)
Example:
```javascript
fetch('http://localhost:3000/stt', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
duration: 5
})
}).then(response => response.json())
.then(data => console.log(data.text));
```
## Troubleshooting
1. Make sure Windows Speech Recognition is enabled:
- Open Windows Settings
- Go to Time & Language > Speech
- Enable Speech Recognition
2. Check available voices:
- Open PowerShell and run:
```powershell
Add-Type -AssemblyName System.Speech
(New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name
```
3. Test speech recognition:
- Open Speech Recognition in Windows Settings
- Run through the setup wizard if not already done
- Test that Windows can recognize your voice
## Contributing
1. Fork the repository
2. Create your feature branch
3. Commit your changes
4. Push to the branch
5. Create a new Pull Request
## License
MIT
```
--------------------------------------------------------------------------------
/src/types/say.d.ts:
--------------------------------------------------------------------------------
```typescript
// Minimal ambient type declarations for the 'say' package (OS-level TTS).
// NOTE(review): 'say' is not listed in package.json dependencies — confirm
// it is actually installed before relying on these declarations.
declare module 'say' {
  // Speak `text` aloud. `voice` and `speed` are passed through to the
  // platform speech engine; `callback` receives an error (or null) once
  // playback completes.
  export function speak(
    text: string,
    voice?: string,
    speed?: number,
    callback?: (err: Error | string | null) => void
  ): void;
  // Stop any in-progress speech.
  export function stop(): void;
}
```
--------------------------------------------------------------------------------
/src/types/node-record-lpcm16.d.ts:
--------------------------------------------------------------------------------
```typescript
// Minimal ambient type declarations for 'node-record-lpcm16'
// (microphone capture as 16-bit linear PCM).
// NOTE(review): not listed in package.json dependencies — confirm installed.
declare module 'node-record-lpcm16' {
  interface RecordOptions {
    sampleRate?: number; // samples per second (e.g. 16000)
    channels?: number;   // channel count (1 = mono)
    audioType?: string;  // output container/format identifier
  }
  interface Recording {
    stream(): NodeJS.ReadableStream; // live audio as a readable stream
    stop(): void;                    // end the capture
  }
  export function record(options?: RecordOptions): Recording;
}
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
{
"compilerOptions": {
"target": "ES2020",
"module": "commonjs",
"lib": ["ES2020"],
"outDir": "./build",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true,
"declaration": true,
"moduleResolution": "node"
},
"include": ["src/**/*"],
"exclude": ["node_modules", "build", "test"]
}
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
{
"name": "ms-lucidia-voice-gateway-mcp",
"version": "1.0.0",
"description": "Windows Speech server for Lucidia using native Windows speech capabilities",
"main": "build/index.js",
"scripts": {
"start": "node build/index.js",
"test": "node server.js",
"dev": "nodemon server.js",
"build": "tsc",
"watch": "tsc -w"
},
"dependencies": {
"cors": "^2.8.5",
"dotenv": "^16.3.1",
"express": "^4.18.2",
"openai": "^4.24.1"
},
"devDependencies": {
"@types/cors": "^2.8.17",
"@types/express": "^4.17.21",
"@types/node": "^20.11.5",
"nodemon": "^3.0.3",
"typescript": "^5.3.3"
},
"repository": {
"type": "git",
"url": "git+https://github.com/ExpressionsBot/MS-Lucidia-Voice-Gateway-MCP.git"
},
"keywords": [
"windows",
"speech",
"tts",
"stt",
"gpt4",
"lucidia"
],
"author": "ExpressionsBot",
"license": "MIT"
}
```
--------------------------------------------------------------------------------
/server.js:
--------------------------------------------------------------------------------
```javascript
const express = require('express');
const cors = require('cors');
const { exec } = require('child_process');
const { promisify } = require('util');
const fs = require('fs').promises;
const path = require('path');
const { createServer } = require('net');
const execAsync = promisify(exec);
const app = express();
// Helper function to find an available port.
// Scans at most 100 ports starting at startPort and returns the first one
// that can be bound. Previously this looped forever when every probe
// errored (e.g. EACCES on all ports); it is now bounded to match the
// equivalent helper in src/index.ts and throws when the range is exhausted.
async function findAvailablePort(startPort) {
  // Resolve true iff we can bind (and immediately release) the port.
  const isPortAvailable = (port) => {
    return new Promise((resolve) => {
      const server = createServer()
        .once('error', () => resolve(false))
        .once('listening', () => {
          server.close();
          resolve(true);
        })
        .listen(port);
    });
  };
  for (let port = startPort; port < startPort + 100; port++) {
    if (await isPortAvailable(port)) {
      return port;
    }
  }
  throw new Error('No available ports found');
}
// Global middleware: CORS for the browser test page, JSON body parsing,
// and static serving of the ./test directory.
app.use(cors());
app.use(express.json());
app.use(express.static('test'));
// Helper function to execute PowerShell commands.
// Escapes embedded double quotes so a script containing `"` (the routes
// interpolate user-supplied text into these scripts) cannot break out of
// the quoted -Command argument and inject shell commands.
async function runPowerShell(script) {
  try {
    const safeScript = script.replace(/"/g, '\\"');
    const { stdout } = await execAsync(`powershell -Command "${safeScript}"`);
    return stdout.trim();
  } catch (error) {
    throw new Error(`PowerShell execution failed: ${error.message}`);
  }
}
// GET /voices — return the installed Windows SAPI voice names as a JSON
// array of strings (one per line of PowerShell output).
app.get('/voices', async (req, res) => {
  try {
    // Enumerate installed voices via the System.Speech assembly.
    const script = `
Add-Type -AssemblyName System.Speech;
(New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name
`;
    const output = await runPowerShell(script);
    // One voice name per output line; trim and drop blanks.
    const voices = output.split('\n').map(v => v.trim()).filter(Boolean);
    res.json(voices);
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
// POST /tts — speak `text` aloud with Windows SAPI.
// Body: { text: string, voice?: string, speed?: number (0.5..2.0) }
app.post('/tts', async (req, res) => {
  try {
    const { text, voice = 'Microsoft David Desktop', speed = 1.0 } = req.body;
    if (!text) {
      return res.status(400).json({ error: 'Text is required' });
    }
    // Escape single quotes in BOTH interpolated values; previously only
    // `text` was escaped, leaving `voice` able to terminate the PowerShell
    // string literal (injection risk on untrusted input).
    const safeVoice = String(voice).replace(/'/g, "''");
    const safeText = String(text).replace(/'/g, "''");
    // SAPI Rate spans -10..10; map speed (0.5..2.0) onto it and clamp.
    // Non-numeric input falls back to 0 (normal rate) instead of emitting
    // a broken `Rate = NaN` statement.
    const numericSpeed = Number(speed);
    const rate = Number.isFinite(numericSpeed)
      ? Math.max(-10, Math.min(10, Math.round((numericSpeed - 1) * 10)))
      : 0;
    const script = `
Add-Type -AssemblyName System.Speech;
$synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
$synthesizer.SelectVoice('${safeVoice}');
$synthesizer.Rate = ${rate};
$synthesizer.Speak('${safeText}');
`;
    await runPowerShell(script);
    res.json({ success: true });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
// POST /stt — record `duration` seconds of microphone audio to a WAV file,
// transcribe it with Windows Speech Recognition, and return the text.
// NOTE(review): the recording script instantiates NAudio.Wave.* types,
// which are NOT available in a stock PowerShell session — the NAudio
// assembly must be loadable on the host. TODO confirm this actually runs.
app.post('/stt', async (req, res) => {
  try {
    // Recording length in seconds. README documents default 5 / max 60,
    // but the maximum is not enforced here.
    const { duration = 5 } = req.body;
    // Temp WAV written next to this script; deleted after transcription.
    const audioFile = path.join(__dirname, 'recording.wav');
    // Record audio using PowerShell
    const recordScript = `
Add-Type -AssemblyName System.Windows.Forms;
$audio = New-Object System.IO.MemoryStream;
$waveSource = New-Object NAudio.Wave.WaveInEvent;
$waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
$waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
$waveSource.DataAvailable = {
param($sender, $e)
$waveFile.Write($e.Buffer, 0, $e.BytesRecorded)
};
$waveSource.StartRecording();
Start-Sleep -Seconds ${duration};
$waveSource.StopRecording();
$waveFile.Dispose();
`;
    await runPowerShell(recordScript);
    // Transcribe the recorded audio
    const transcribeScript = `
Add-Type -AssemblyName System.Speech;
$recognizer = New-Object System.Speech.Recognition.SpeechRecognizer;
$grammar = New-Object System.Speech.Recognition.DictationGrammar;
$recognizer.LoadGrammar($grammar);
$audio = [System.IO.File]::ReadAllBytes('${audioFile}');
$stream = New-Object System.IO.MemoryStream(@(,$audio));
$result = $recognizer.RecognizeSync([System.Speech.AudioFormat.AudioStream]::new($stream));
$result.Text;
`;
    const transcription = await runPowerShell(transcribeScript);
    // Clean up the audio file
    // NOTE(review): cleanup is skipped when an earlier step throws, leaving
    // recording.wav behind (it is covered by .gitignore).
    await fs.unlink(audioFile);
    res.json({ text: transcription || 'No speech detected' });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
// Boot the HTTP server on the first free port at or above 3000 and log
// the chosen address; exits the process if startup fails.
async function startServer() {
  try {
    const chosenPort = await findAvailablePort(3000);
    const announce = () => {
      console.log(`Windows Speech Server running at http://localhost:${chosenPort}`);
    };
    app.listen(chosenPort, announce);
  } catch (error) {
    console.error('Failed to start server:', error);
    process.exit(1);
  }
}
startServer();
```
--------------------------------------------------------------------------------
/test/index.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Windows Speech MCP Test</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
.container {
background-color: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.section {
margin-bottom: 20px;
padding: 20px;
border: 1px solid #ddd;
border-radius: 4px;
}
h1 {
color: #333;
text-align: center;
}
textarea {
width: 100%;
height: 100px;
margin: 10px 0;
padding: 8px;
border: 1px solid #ddd;
border-radius: 4px;
resize: vertical;
}
button {
background-color: #007bff;
color: white;
border: none;
padding: 10px 20px;
border-radius: 4px;
cursor: pointer;
margin: 5px;
}
button:hover {
background-color: #0056b3;
}
select {
padding: 8px;
margin: 5px;
border-radius: 4px;
border: 1px solid #ddd;
}
.status {
margin-top: 10px;
padding: 10px;
border-radius: 4px;
}
.success {
background-color: #d4edda;
color: #155724;
}
.error {
background-color: #f8d7da;
color: #721c24;
}
</style>
</head>
<body>
<div class="container">
<h1>Windows Speech MCP Test</h1>
<div class="section">
<h2>Text to Speech</h2>
<textarea id="ttsText" placeholder="Enter text to speak...">Hello, this is a test of Windows speech synthesis.</textarea>
<div>
<select id="ttsVoice">
<option value="Microsoft David Desktop">David</option>
<option value="Microsoft Zira Desktop">Zira</option>
</select>
<select id="ttsSpeed">
<option value="0.5">0.5x Speed</option>
<option value="1.0" selected>1.0x Speed</option>
<option value="1.5">1.5x Speed</option>
<option value="2.0">2.0x Speed</option>
</select>
<button onclick="speak()">Speak</button>
</div>
<div id="ttsStatus" class="status"></div>
</div>
<div class="section">
<h2>Speech to Text</h2>
<div>
<select id="sttDuration">
<option value="5">5 seconds</option>
<option value="10">10 seconds</option>
<option value="15">15 seconds</option>
<option value="30">30 seconds</option>
</select>
<button onclick="startRecording()">Start Recording</button>
</div>
<textarea id="sttText" placeholder="Transcribed text will appear here..." readonly></textarea>
<div id="sttStatus" class="status"></div>
</div>
</div>
<script>
// Read the TTS form values, send them to the server, and report status.
async function speak() {
  const text = document.getElementById('ttsText').value;
  const voice = document.getElementById('ttsVoice').value;
  const speed = parseFloat(document.getElementById('ttsSpeed').value);
  const statusDiv = document.getElementById('ttsStatus');
  try {
    // BUG FIX: use a relative URL. The server binds the first free port
    // at/above 3000, so a hard-coded http://localhost:3000 misses the
    // server whenever 3000 was already taken.
    const response = await fetch('/tts', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ text, voice, speed })
    });
    if (!response.ok) throw new Error('Failed to synthesize speech');
    statusDiv.textContent = 'Speech synthesis successful!';
    statusDiv.className = 'status success';
  } catch (error) {
    statusDiv.textContent = `Error: ${error.message}`;
    statusDiv.className = 'status error';
  }
}
// Ask the server to record and transcribe speech, then display the result.
async function startRecording() {
  const duration = parseInt(document.getElementById('sttDuration').value);
  const statusDiv = document.getElementById('sttStatus');
  const textArea = document.getElementById('sttText');
  try {
    statusDiv.textContent = `Recording for ${duration} seconds...`;
    statusDiv.className = 'status';
    // BUG FIX: relative URL — the server may bind a port other than 3000
    // when 3000 is busy, so an absolute localhost:3000 URL can miss it.
    const response = await fetch('/stt', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ duration })
    });
    if (!response.ok) throw new Error('Failed to transcribe speech');
    const result = await response.json();
    textArea.value = result.text;
    statusDiv.textContent = 'Transcription successful!';
    statusDiv.className = 'status success';
  } catch (error) {
    statusDiv.textContent = `Error: ${error.message}`;
    statusDiv.className = 'status error';
    textArea.value = '';
  }
}
// Fetch available voices when the page loads and populate the dropdown.
async function loadVoices() {
  try {
    // BUG FIX: relative URL so the request targets whatever port served
    // this page (the server picks the first free port at/above 3000).
    const response = await fetch('/voices');
    if (!response.ok) throw new Error('Failed to fetch voices');
    const voices = await response.json();
    const voiceSelect = document.getElementById('ttsVoice');
    voiceSelect.innerHTML = '';
    voices.forEach(voice => {
      const option = document.createElement('option');
      option.value = voice;
      // Shorten "Microsoft X Desktop" to just "X" for display.
      option.textContent = voice.replace('Microsoft ', '').replace(' Desktop', '');
      voiceSelect.appendChild(option);
    });
  } catch (error) {
    // Keep the hard-coded fallback options if the fetch fails.
    console.error('Failed to load voices:', error);
  }
}
window.onload = loadVoices;
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
import express, { Request, Response } from 'express';
import cors from 'cors';
import { exec } from 'child_process';
import { promisify } from 'util';
import * as path from 'path';
import * as fs from 'fs';
import * as net from 'net';
import OpenAI from 'openai';
import dotenv from 'dotenv';
dotenv.config();
const execAsync = promisify(exec);
// Configuration
// NOTE(review): this default names a neural ("Natural") voice; classic SAPI
// installs typically expose "... Desktop" voices instead — confirm this
// voice exists on the target machine or SelectVoice will fail.
const DEFAULT_VOICE = 'Microsoft Jenny(Natural) - English (United States)';
// Per-operation timeout in milliseconds (env TIMEOUT overrides; default 30s).
const DEFAULT_TIMEOUT = parseInt(process.env.TIMEOUT || '30000', 10);
// Port scanning starts here (see findAvailablePort).
const DEFAULT_PORT = 3000;
// Initialize OpenAI
// NOTE(review): constructed eagerly at module load; if OPENAI_API_KEY is
// unset this may throw at startup even when only the TTS/STT endpoints are
// used — verify against the installed openai SDK version.
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});
// Type definitions for request arguments
interface TextToSpeechArgs {
  text: string;    // required text to speak
  voice?: string;  // SAPI voice name; defaults to DEFAULT_VOICE
  speed?: number;  // 0.5–2.0 per README; mapped onto SAPI Rate
}
interface SpeechToTextArgs {
  duration?: number; // recording length in seconds (README: default 5, max 60)
}
interface ChatArgs {
  message: string; // required user message for GPT-4
  voice?: string;  // voice used to speak the reply
  speed?: number;  // speech rate multiplier
}
// Scan ports [startPort, startPort + 100) and return the first one that can
// be bound; throws when the whole range is busy.
async function findAvailablePort(startPort: number): Promise<number> {
  // Resolve true iff a throwaway server can bind (and release) the port.
  const canBind = (port: number): Promise<boolean> =>
    new Promise((resolve) => {
      const probe = net.createServer();
      probe.once('error', () => resolve(false));
      probe.once('listening', () => {
        probe.close();
        resolve(true);
      });
      probe.listen(port);
    });
  for (let candidate = startPort; candidate < startPort + 100; candidate++) {
    if (await canBind(candidate)) {
      return candidate;
    }
  }
  throw new Error('No available ports found');
}
// Query Windows SAPI for the installed voice names. On any failure the
// error is logged and the default voice is returned so callers always get
// a non-empty list.
async function getWindowsVoices(): Promise<string[]> {
  const listVoicesCommand =
    'powershell -Command "Add-Type -AssemblyName System.Speech; (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name"';
  try {
    const { stdout } = await execAsync(listVoicesCommand, {
      timeout: DEFAULT_TIMEOUT
    });
    // One voice name per output line; trim and drop blanks.
    const lines = stdout.split('\n');
    return lines.map((name) => name.trim()).filter(Boolean);
  } catch (error) {
    console.error('Error getting voices:', error);
    return [DEFAULT_VOICE];
  }
}
// Helper function to speak text using Windows TTS.
// Runs a PowerShell script that selects `voice`, sets the rate, and speaks
// `text` synchronously; throws on PowerShell failure or timeout.
async function speakText(text: string, voice: string = DEFAULT_VOICE, speed: number = 1.0): Promise<void> {
  // Escape single quotes in BOTH interpolated values; previously only
  // `text` was escaped, so a voice name containing a quote could break out
  // of the PowerShell string literal (injection risk).
  const safeVoice = voice.replace(/'/g, "''");
  const safeText = text.replace(/'/g, "''");
  // SAPI Rate spans -10..10; map speed (0.5..2.0) onto it and clamp so
  // out-of-range speeds cannot produce an invalid Rate assignment.
  const rate = Math.max(-10, Math.min(10, Math.round((speed - 1) * 10)));
  const script = `
Add-Type -AssemblyName System.Speech;
$synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
$synthesizer.SelectVoice('${safeVoice}');
$synthesizer.Rate = ${rate};
$synthesizer.Speak('${safeText}');
`;
  await execAsync(`powershell -Command "${script}"`, { timeout: DEFAULT_TIMEOUT });
}
// Ask GPT-4 for a short reply suitable for being spoken aloud.
// Logs and rethrows API errors so callers can surface them to the client.
async function getChatResponse(message: string): Promise<string> {
  const systemPrompt =
    "You are a helpful assistant. Keep your responses concise and natural, as they will be spoken aloud.";
  try {
    const completion = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        { role: "system", content: systemPrompt },
        { role: "user", content: message }
      ],
      temperature: 0.7,
      max_tokens: 150
    });
    // Fall back to a canned apology when the API returns no usable text.
    const reply = completion.choices[0]?.message?.content;
    return reply || "I'm sorry, I couldn't generate a response.";
  } catch (error) {
    console.error('Error getting GPT-4 response:', error);
    throw error;
  }
}
// Initialize Express app
const app = express();
app.use(cors());
app.use(express.json());
// Serve the browser test page from ./test
app.use(express.static('test'));
// Add timeout middleware: replies 408 if a response is still pending after
// DEFAULT_TIMEOUT ms.
// NOTE(review): the route handler keeps running after the 408 is sent, so
// a late res.json() can cause a "headers already sent" error — verify.
app.use((req: Request, res: Response, next) => {
  res.setTimeout(DEFAULT_TIMEOUT, () => {
    res.status(408).json({ error: 'Request timeout' });
  });
  next();
});
// GET /voices — JSON array of installed Windows SAPI voice names.
app.get('/voices', async (_req: Request, res: Response) => {
  try {
    res.json(await getWindowsVoices());
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    res.status(500).json({ error: message });
  }
});
// POST /tts — speak `text` with Windows SAPI.
// 400 when text is missing, 408 on timeout, 500 on any other failure.
app.post('/tts', async (req: Request<{}, {}, TextToSpeechArgs>, res: Response) => {
  try {
    const { text, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
    if (!text) {
      return res.status(400).json({ error: 'Text is required' });
    }
    await speakText(text, voice, speed);
    return res.json({ success: true });
  } catch (error) {
    const timedOut = error instanceof Error && error.message.includes('timeout');
    if (timedOut) {
      return res.status(408).json({ error: 'Operation timed out' });
    }
    return res.status(500).json({
      error: error instanceof Error ? error.message : String(error)
    });
  }
});
// Speech to Text
// Records `duration` seconds of microphone audio to recording.wav via a
// PowerShell script, transcribes it with Windows Speech Recognition, then
// deletes the temp file. 408 on timeout, 500 otherwise.
// NOTE(review): the recording script instantiates NAudio.Wave.* types,
// which are NOT part of a stock PowerShell session — the NAudio assembly
// must be loadable on the host. TODO confirm.
app.post('/stt', async (req: Request<{}, {}, SpeechToTextArgs>, res: Response) => {
  const audioFile = path.join(__dirname, 'recording.wav');
  try {
    // Clamp duration to the documented range (default 5, max 60 seconds);
    // non-numeric input falls back to the default instead of injecting
    // garbage into the script.
    const requested = Number(req.body.duration ?? 5);
    const duration = Number.isFinite(requested)
      ? Math.min(60, Math.max(1, Math.floor(requested)))
      : 5;
    // Record audio using PowerShell
    const recordScript = `
Add-Type -AssemblyName System.Windows.Forms;
$audio = New-Object System.IO.MemoryStream;
$waveSource = New-Object NAudio.Wave.WaveInEvent;
$waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
$waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
$waveSource.DataAvailable = {
param($sender, $e)
$waveFile.Write($e.Buffer, 0, $e.BytesRecorded)
};
$waveSource.StartRecording();
Start-Sleep -Seconds ${duration};
$waveSource.StopRecording();
$waveFile.Dispose();
`;
    // BUG FIX: the script must be executed by PowerShell. Previously the
    // raw script text was passed to the system shell directly (no
    // `powershell -Command` wrapper), which cannot run it.
    await execAsync(`powershell -Command "${recordScript}"`, {
      timeout: DEFAULT_TIMEOUT + (duration * 1000)
    });
    // Transcribe the recorded audio
    const transcribeScript = `
Add-Type -AssemblyName System.Speech;
$recognizer = New-Object System.Speech.Recognition.SpeechRecognizer;
$grammar = New-Object System.Speech.Recognition.DictationGrammar;
$recognizer.LoadGrammar($grammar);
$audio = [System.IO.File]::ReadAllBytes('${audioFile}');
$stream = New-Object System.IO.MemoryStream(@(,$audio));
$result = $recognizer.RecognizeSync([System.Speech.AudioFormat.AudioStream]::new($stream));
$result.Text;
`;
    const { stdout } = await execAsync(`powershell -Command "${transcribeScript}"`, {
      timeout: DEFAULT_TIMEOUT
    });
    // Clean up the audio file
    await fs.promises.unlink(audioFile);
    res.json({ text: stdout.trim() || 'No speech detected' });
  } catch (error) {
    // Best-effort cleanup; swallow cleanup failures so the original error
    // (not an unlink race) determines the response.
    try {
      if (fs.existsSync(audioFile)) {
        await fs.promises.unlink(audioFile);
      }
    } catch {
      // ignore
    }
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});
// POST /chat — fetch a GPT-4 reply for `message`, speak it aloud, and
// return the spoken text. 400 when message is missing, 408 on timeout,
// 500 otherwise.
app.post('/chat', async (req: Request<{}, {}, ChatArgs>, res: Response) => {
  try {
    const { message, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
    if (!message) {
      return res.status(400).json({ error: 'Message is required' });
    }
    // Generate the reply, then speak it before responding.
    const response = await getChatResponse(message);
    await speakText(response, voice, speed);
    return res.json({ success: true, response, spoken: true });
  } catch (error) {
    const isTimeout = error instanceof Error && error.message.includes('timeout');
    if (isTimeout) {
      return res.status(408).json({ error: 'Operation timed out' });
    }
    return res.status(500).json({
      error: error instanceof Error ? error.message : String(error)
    });
  }
});
// Boot the HTTP server on the first free port at or above DEFAULT_PORT,
// logging the chosen address and configuration; exits on startup failure.
async function startServer() {
  try {
    const chosenPort = await findAvailablePort(DEFAULT_PORT);
    const announce = () => {
      console.log(`Windows Speech Server running at http://localhost:${chosenPort}`);
      console.log(`Using default voice: ${DEFAULT_VOICE}`);
      console.log(`Timeout set to: ${DEFAULT_TIMEOUT}ms`);
      console.log('GPT-4 integration enabled');
    };
    app.listen(chosenPort, announce);
  } catch (error) {
    console.error('Failed to start server:', error);
    process.exit(1);
  }
}
startServer();
```