# Directory Structure

```
├── .gitignore
├── package-lock.json
├── package.json
├── README.md
├── server.js
├── src
│   ├── index.ts
│   └── types
│       ├── node-record-lpcm16.d.ts
│       └── say.d.ts
├── test
│   └── index.html
└── tsconfig.json
```

# Files

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
# Dependencies
node_modules/
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Build output
build/
dist/
*.js.map

# Environment variables
.env
.env.local
.env.*.local

# IDE files
.idea/
.vscode/
*.swp
*.swo

# Operating System
.DS_Store
Thumbs.db

# Temporary files
*.log
*.tmp
recording.wav

# Test coverage
coverage/
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

````markdown
# MS-Lucidia-Voice-Gateway-MCP

A Model Context Protocol (MCP) server that provides text-to-speech and speech-to-text capabilities using Windows' built-in speech services. This server leverages the native Windows Speech API (SAPI) through PowerShell commands, eliminating the need for external APIs or services.
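
Under the hood, each request is turned into a short PowerShell script that drives the .NET `System.Speech` API. Below is a minimal sketch of that pattern (not the exact helper in `server.js`, but the same approach, shown outside of Express):

```javascript
// Hypothetical standalone helper illustrating the PowerShell/SAPI bridge.
const { exec } = require('child_process');
const { promisify } = require('util');
const execAsync = promisify(exec);

async function sayWithSapi(text) {
  // Double single quotes so the text is safe inside a PowerShell single-quoted string.
  const script = `Add-Type -AssemblyName System.Speech; ` +
    `(New-Object System.Speech.Synthesis.SpeechSynthesizer).Speak('${text.replace(/'/g, "''")}')`;
  await execAsync(`powershell -Command "${script}"`);
}

sayWithSapi('Hello from Windows SAPI');
```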

## Features

- Text-to-Speech (TTS) using Windows SAPI voices
- Speech-to-Text (STT) using Windows Speech Recognition
- Simple web interface for testing
- No external API dependencies
- Uses native Windows capabilities

## Prerequisites

- Windows 10/11 with Speech Recognition enabled
- Node.js 16+
- PowerShell
- An OpenAI API key set as `OPENAI_API_KEY` in a `.env` file (only needed for the compiled server's `/chat` endpoint)

## Installation

1. Clone the repository:
```bash
git clone https://github.com/ExpressionsBot/MS-Lucidia-Voice-Gateway-MCP.git
cd MS-Lucidia-Voice-Gateway-MCP
```

2. Install dependencies:
```bash
npm install
```

3. Build the project:
```bash
npm run build
```

## Usage

### Testing Interface

1. Start the test server:
```bash
npm run test
```

2. Open `http://localhost:3000` in your browser (if port 3000 is busy, the server picks the next free port and prints it to the console)
3. Use the web interface to test TTS and STT capabilities

### Available Tools

#### text_to_speech
Converts text to speech using Windows SAPI.

Parameters:
- `text` (required): The text to convert to speech
- `voice` (optional): The voice to use (e.g., "Microsoft David Desktop")
- `speed` (optional): Speech rate from 0.5 to 2.0 (default: 1.0)

Example:
```javascript
fetch('http://localhost:3000/tts', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    text: "Hello, this is a test",
    voice: "Microsoft David Desktop",
    speed: 1.0
  })
});
```

#### speech_to_text
Records audio and converts it to text using Windows Speech Recognition.

Parameters:
- `duration` (optional): Recording duration in seconds (default: 5, max: 60)

Example:
```javascript
fetch('http://localhost:3000/stt', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    duration: 5
  })
}).then(response => response.json())
  .then(data => console.log(data.text));
```
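
Both servers also expose a `GET /voices` endpoint (see `server.js`) that returns the installed SAPI voice names; the test page uses it to populate its voice picker:

```javascript
fetch('http://localhost:3000/voices')
  .then(response => response.json())
  .then(voices => console.log(voices)); // e.g. ["Microsoft David Desktop", "Microsoft Zira Desktop"]
```

The compiled server (`npm run build && npm start`) additionally exposes a `/chat` endpoint that sends a message to GPT-4 and speaks the reply aloud; it requires an OpenAI API key (see Prerequisites).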

## Troubleshooting

1. Make sure Windows Speech Recognition is enabled:
   - Open Windows Settings
   - Go to Time & Language > Speech
   - Enable Speech Recognition

2. Check available voices:
   - Open PowerShell and run:
   ```powershell
   Add-Type -AssemblyName System.Speech
   (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name
   ```

3. Test speech recognition:
   - Open Speech Recognition in Windows Settings
   - Run through the setup wizard if not already done
   - Test that Windows can recognize your voice

## Contributing

1. Fork the repository
2. Create your feature branch
3. Commit your changes
4. Push to the branch
5. Create a new Pull Request

## License

MIT

````

--------------------------------------------------------------------------------
/src/types/say.d.ts:
--------------------------------------------------------------------------------

```typescript
declare module 'say' {
  export function speak(
    text: string,
    voice?: string,
    speed?: number,
    callback?: (err: Error | string | null) => void
  ): void;

  export function stop(): void;
}
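
// Usage sketch (hypothetical; the 'say' package is not currently listed in
// package.json, so this assumes it has been installed separately):
//
//   import * as say from 'say';
//   say.speak('Hello from say', undefined, 1.0, (err) => {
//     if (err) console.error('TTS failed:', err);
//   });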
```

--------------------------------------------------------------------------------
/src/types/node-record-lpcm16.d.ts:
--------------------------------------------------------------------------------

```typescript
declare module 'node-record-lpcm16' {
  interface RecordOptions {
    sampleRate?: number;
    channels?: number;
    audioType?: string;
  }

  interface Recording {
    stream(): NodeJS.ReadableStream;
    stop(): void;
  }

  export function record(options?: RecordOptions): Recording;
}
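
// Usage sketch (hypothetical; node-record-lpcm16 is not currently listed in
// package.json, so this assumes it has been installed separately):
//
//   import { record } from 'node-record-lpcm16';
//   import * as fs from 'fs';
//
//   const recording = record({ sampleRate: 16000, channels: 1, audioType: 'wav' });
//   recording.stream().pipe(fs.createWriteStream('recording.wav'));
//   setTimeout(() => recording.stop(), 5000); // stop after 5 seconds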
```

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------

```json
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020"],
    "outDir": "./build",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "resolveJsonModule": true,
    "declaration": true,
    "moduleResolution": "node"
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "build", "test"]
}

```

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------

```json
{
  "name": "ms-lucidia-voice-gateway-mcp",
  "version": "1.0.0",
  "description": "Windows Speech server for Lucidia using native Windows speech capabilities",
  "main": "build/index.js",
  "scripts": {
    "start": "node build/index.js",
    "test": "node server.js",
    "dev": "nodemon server.js",
    "build": "tsc",
    "watch": "tsc -w"
  },
  "dependencies": {
    "cors": "^2.8.5",
    "dotenv": "^16.3.1",
    "express": "^4.18.2",
    "openai": "^4.24.1"
  },
  "devDependencies": {
    "@types/cors": "^2.8.17",
    "@types/express": "^4.17.21",
    "@types/node": "^20.11.5",
    "nodemon": "^3.0.3",
    "typescript": "^5.3.3"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/ExpressionsBot/MS-Lucidia-Voice-Gateway-MCP.git"
  },
  "keywords": [
    "windows",
    "speech",
    "tts",
    "stt",
    "gpt4",
    "lucidia"
  ],
  "author": "ExpressionsBot",
  "license": "MIT"
}

```

--------------------------------------------------------------------------------
/server.js:
--------------------------------------------------------------------------------

```javascript
const express = require('express');
const cors = require('cors');
const { exec } = require('child_process');
const { promisify } = require('util');
const fs = require('fs').promises;
const path = require('path');
const { createServer } = require('net');

const execAsync = promisify(exec);
const app = express();

// Helper function to find an available port
async function findAvailablePort(startPort) {
  const isPortAvailable = (port) => {
    return new Promise((resolve) => {
      const server = createServer()
        .listen(port, () => {
          server.once('close', () => resolve(true));
          server.close();
        })
        .on('error', () => resolve(false));
    });
  };

  let port = startPort;
  while (!(await isPortAvailable(port))) {
    port++;
  }
  return port;
}

app.use(cors());
app.use(express.json());
app.use(express.static('test'));

// Helper function to execute PowerShell commands
async function runPowerShell(script) {
    try {
        const { stdout } = await execAsync(`powershell -Command "${script}"`);
        return stdout.trim();
    } catch (error) {
        throw new Error(`PowerShell execution failed: ${error.message}`);
    }
}

// Get available voices
app.get('/voices', async (req, res) => {
    try {
        const script = `
            Add-Type -AssemblyName System.Speech;
            (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name
        `;
        const output = await runPowerShell(script);
        const voices = output.split('\n').map(v => v.trim()).filter(Boolean);
        res.json(voices);
    } catch (error) {
        res.status(500).json({ error: error.message });
    }
});

// Text to Speech
app.post('/tts', async (req, res) => {
    try {
        const { text, voice = 'Microsoft David Desktop', speed = 1.0 } = req.body;
        
        if (!text) {
            return res.status(400).json({ error: 'Text is required' });
        }

        const script = `
            Add-Type -AssemblyName System.Speech;
            $synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
            $synthesizer.SelectVoice('${voice}');
            $synthesizer.Rate = ${Math.round((speed - 1) * 10)};
            $synthesizer.Speak('${text.replace(/'/g, "''")}');
        `;

        await runPowerShell(script);
        res.json({ success: true });
    } catch (error) {
        res.status(500).json({ error: error.message });
    }
});

// Speech to Text
app.post('/stt', async (req, res) => {
    try {
        // Clamp duration to the documented 1-60 second range (default 5)
        const duration = Math.min(Math.max(parseInt(req.body.duration, 10) || 5, 1), 60);
        const audioFile = path.join(__dirname, 'recording.wav');

        // Record audio using PowerShell
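        // Note: the script below relies on NAudio types, which are not part of the base
        // .NET/PowerShell install; it assumes the NAudio assembly has been made loadable
        // (e.g. via Add-Type -Path to NAudio.dll) on the host machine.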
        const recordScript = `
            Add-Type -AssemblyName System.Windows.Forms;
            $audio = New-Object System.IO.MemoryStream;
            $waveSource = New-Object NAudio.Wave.WaveInEvent;
            $waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
            $waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
            $waveSource.DataAvailable = {
                param($sender, $e)
                $waveFile.Write($e.Buffer, 0, $e.BytesRecorded)
            };
            $waveSource.StartRecording();
            Start-Sleep -Seconds ${duration};
            $waveSource.StopRecording();
            $waveFile.Dispose();
        `;

        await runPowerShell(recordScript);

        // Transcribe the recorded audio
        const transcribeScript = `
            Add-Type -AssemblyName System.Speech;
            $recognizer = New-Object System.Speech.Recognition.SpeechRecognitionEngine;
            $grammar = New-Object System.Speech.Recognition.DictationGrammar;
            $recognizer.LoadGrammar($grammar);
            $recognizer.SetInputToWaveFile('${audioFile}');
            $result = $recognizer.Recognize();
            if ($result) { $result.Text };
        `;

        const transcription = await runPowerShell(transcribeScript);

        // Clean up the audio file
        await fs.unlink(audioFile);

        res.json({ text: transcription || 'No speech detected' });
    } catch (error) {
        res.status(500).json({ error: error.message });
    }
});

// Start the server
async function startServer() {
    try {
        const port = await findAvailablePort(3000);
        app.listen(port, () => {
            console.log(`Windows Speech Server running at http://localhost:${port}`);
        });
    } catch (error) {
        console.error('Failed to start server:', error);
        process.exit(1);
    }
}

startServer();
```

--------------------------------------------------------------------------------
/test/index.html:
--------------------------------------------------------------------------------

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Windows Speech MCP Test</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            background-color: white;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .section {
            margin-bottom: 20px;
            padding: 20px;
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        h1 {
            color: #333;
            text-align: center;
        }
        textarea {
            width: 100%;
            height: 100px;
            margin: 10px 0;
            padding: 8px;
            border: 1px solid #ddd;
            border-radius: 4px;
            resize: vertical;
        }
        button {
            background-color: #007bff;
            color: white;
            border: none;
            padding: 10px 20px;
            border-radius: 4px;
            cursor: pointer;
            margin: 5px;
        }
        button:hover {
            background-color: #0056b3;
        }
        select {
            padding: 8px;
            margin: 5px;
            border-radius: 4px;
            border: 1px solid #ddd;
        }
        .status {
            margin-top: 10px;
            padding: 10px;
            border-radius: 4px;
        }
        .success {
            background-color: #d4edda;
            color: #155724;
        }
        .error {
            background-color: #f8d7da;
            color: #721c24;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Windows Speech MCP Test</h1>
        
        <div class="section">
            <h2>Text to Speech</h2>
            <textarea id="ttsText" placeholder="Enter text to speak...">Hello, this is a test of Windows speech synthesis.</textarea>
            <div>
                <select id="ttsVoice">
                    <option value="Microsoft David Desktop">David</option>
                    <option value="Microsoft Zira Desktop">Zira</option>
                </select>
                <select id="ttsSpeed">
                    <option value="0.5">0.5x Speed</option>
                    <option value="1.0" selected>1.0x Speed</option>
                    <option value="1.5">1.5x Speed</option>
                    <option value="2.0">2.0x Speed</option>
                </select>
                <button onclick="speak()">Speak</button>
            </div>
            <div id="ttsStatus" class="status"></div>
        </div>

        <div class="section">
            <h2>Speech to Text</h2>
            <div>
                <select id="sttDuration">
                    <option value="5">5 seconds</option>
                    <option value="10">10 seconds</option>
                    <option value="15">15 seconds</option>
                    <option value="30">30 seconds</option>
                </select>
                <button onclick="startRecording()">Start Recording</button>
            </div>
            <textarea id="sttText" placeholder="Transcribed text will appear here..." readonly></textarea>
            <div id="sttStatus" class="status"></div>
        </div>
    </div>

    <script>
        async function speak() {
            const text = document.getElementById('ttsText').value;
            const voice = document.getElementById('ttsVoice').value;
            const speed = parseFloat(document.getElementById('ttsSpeed').value);
            const statusDiv = document.getElementById('ttsStatus');

            try {
                const response = await fetch('/tts', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json'
                    },
                    body: JSON.stringify({ text, voice, speed })
                });

                if (!response.ok) throw new Error('Failed to synthesize speech');
                
                statusDiv.textContent = 'Speech synthesis successful!';
                statusDiv.className = 'status success';
            } catch (error) {
                statusDiv.textContent = `Error: ${error.message}`;
                statusDiv.className = 'status error';
            }
        }

        async function startRecording() {
            const duration = parseInt(document.getElementById('sttDuration').value);
            const statusDiv = document.getElementById('sttStatus');
            const textArea = document.getElementById('sttText');

            try {
                statusDiv.textContent = `Recording for ${duration} seconds...`;
                statusDiv.className = 'status';

                const response = await fetch('/stt', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json'
                    },
                    body: JSON.stringify({ duration })
                });

                if (!response.ok) throw new Error('Failed to transcribe speech');
                
                const result = await response.json();
                textArea.value = result.text;
                statusDiv.textContent = 'Transcription successful!';
                statusDiv.className = 'status success';
            } catch (error) {
                statusDiv.textContent = `Error: ${error.message}`;
                statusDiv.className = 'status error';
                textArea.value = '';
            }
        }

        // Fetch available voices when the page loads
        async function loadVoices() {
            try {
                const response = await fetch('/voices');
                if (!response.ok) throw new Error('Failed to fetch voices');
                
                const voices = await response.json();
                const voiceSelect = document.getElementById('ttsVoice');
                voiceSelect.innerHTML = '';
                
                voices.forEach(voice => {
                    const option = document.createElement('option');
                    option.value = voice;
                    option.textContent = voice.replace('Microsoft ', '').replace(' Desktop', '');
                    voiceSelect.appendChild(option);
                });
            } catch (error) {
                console.error('Failed to load voices:', error);
            }
        }

        window.onload = loadVoices;
    </script>
</body>
</html>
```

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------

```typescript
#!/usr/bin/env node

import express, { Request, Response } from 'express';
import cors from 'cors';
import { exec } from 'child_process';
import { promisify } from 'util';
import * as path from 'path';
import * as fs from 'fs';
import * as net from 'net';
import OpenAI from 'openai';
import dotenv from 'dotenv';

dotenv.config();

const execAsync = promisify(exec);

// Configuration
const DEFAULT_VOICE = 'Microsoft Jenny(Natural) - English (United States)';
const DEFAULT_TIMEOUT = parseInt(process.env.TIMEOUT || '30000', 10);
const DEFAULT_PORT = 3000;

// Initialize OpenAI
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY
});

// Type definitions for request arguments
interface TextToSpeechArgs {
  text: string;
  voice?: string;
  speed?: number;
}

interface SpeechToTextArgs {
  duration?: number;
}

interface ChatArgs {
  message: string;
  voice?: string;
  speed?: number;
}

// Helper function to find an available port
async function findAvailablePort(startPort: number): Promise<number> {
  const isPortAvailable = (port: number): Promise<boolean> => {
    return new Promise((resolve) => {
      const server = net.createServer()
        .once('error', () => resolve(false))
        .once('listening', () => {
          server.close();
          resolve(true);
        })
        .listen(port);
    });
  };

  for (let port = startPort; port < startPort + 100; port++) {
    if (await isPortAvailable(port)) {
      return port;
    }
  }
  throw new Error('No available ports found');
}

// Helper function to get available Windows voices
async function getWindowsVoices(): Promise<string[]> {
  try {
    const { stdout } = await execAsync('powershell -Command "Add-Type -AssemblyName System.Speech; (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name"', {
      timeout: DEFAULT_TIMEOUT
    });
    return stdout.split('\n').map(v => v.trim()).filter(Boolean);
  } catch (error) {
    console.error('Error getting voices:', error);
    return [DEFAULT_VOICE];
  }
}

// Helper function to speak text using Windows TTS
async function speakText(text: string, voice: string = DEFAULT_VOICE, speed: number = 1.0): Promise<void> {
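  // SAPI's Rate property ranges from -10 to 10; the expression below maps the
  // 0.5-2.0 speed factor onto that scale (0.5 -> -5, 1.0 -> 0, 2.0 -> 10).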
  const script = `
    Add-Type -AssemblyName System.Speech;
    $synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
    $synthesizer.SelectVoice('${voice}');
    $synthesizer.Rate = ${Math.round((speed - 1) * 10)};
    $synthesizer.Speak('${text.replace(/'/g, "''")}');
  `;

  await execAsync(`powershell -Command "${script}"`, { timeout: DEFAULT_TIMEOUT });
}

// Helper function to get GPT-4 response
async function getChatResponse(message: string): Promise<string> {
  try {
    const completion = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        { 
          role: "system", 
          content: "You are a helpful assistant. Keep your responses concise and natural, as they will be spoken aloud."
        },
        { 
          role: "user", 
          content: message 
        }
      ],
      temperature: 0.7,
      max_tokens: 150
    });

    return completion.choices[0]?.message?.content || "I'm sorry, I couldn't generate a response.";
  } catch (error) {
    console.error('Error getting GPT-4 response:', error);
    throw error;
  }
}

// Initialize Express app
const app = express();

app.use(cors());
app.use(express.json());
app.use(express.static('test'));

// Add timeout middleware
app.use((req: Request, res: Response, next) => {
  res.setTimeout(DEFAULT_TIMEOUT, () => {
    res.status(408).json({ error: 'Request timeout' });
  });
  next();
});

// Get available voices
app.get('/voices', async (_req: Request, res: Response) => {
  try {
    const voices = await getWindowsVoices();
    res.json(voices);
  } catch (error) {
    res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
  }
});

// Text to Speech
app.post('/tts', async (req: Request<{}, {}, TextToSpeechArgs>, res: Response) => {
  try {
    const { text, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
    
    if (!text) {
      return res.status(400).json({ error: 'Text is required' });
    }

    await speakText(text, voice, speed);
    res.json({ success: true });
  } catch (error) {
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});

// Speech to Text
app.post('/stt', async (req: Request<{}, {}, SpeechToTextArgs>, res: Response) => {
  try {
    const { duration = 5 } = req.body;
    const audioFile = path.join(__dirname, 'recording.wav');

    // Record audio using PowerShell
    const recordScript = `
      Add-Type -AssemblyName System.Windows.Forms;
      $audio = New-Object System.IO.MemoryStream;
      $waveSource = New-Object NAudio.Wave.WaveInEvent;
      $waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
      $waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
      $waveSource.DataAvailable = {
        param($sender, $e)
        $waveFile.Write($e.Buffer, 0, $e.BytesRecorded)
      };
      $waveSource.StartRecording();
      Start-Sleep -Seconds ${duration};
      $waveSource.StopRecording();
      $waveFile.Dispose();
    `;

    await execAsync(`powershell -Command "${recordScript}"`, { timeout: DEFAULT_TIMEOUT + (duration * 1000) });

    // Transcribe the recorded audio
    const transcribeScript = `
      Add-Type -AssemblyName System.Speech;
      $recognizer = New-Object System.Speech.Recognition.SpeechRecognitionEngine;
      $grammar = New-Object System.Speech.Recognition.DictationGrammar;
      $recognizer.LoadGrammar($grammar);
      $recognizer.SetInputToWaveFile('${audioFile}');
      $result = $recognizer.Recognize();
      if ($result) { $result.Text };
    `;

    const { stdout } = await execAsync(`powershell -Command "${transcribeScript}"`, { timeout: DEFAULT_TIMEOUT });

    // Clean up the audio file
    await fs.promises.unlink(audioFile);

    res.json({ text: stdout.trim() || 'No speech detected' });
  } catch (error) {
    // Clean up the audio file if it exists
    const audioFile = path.join(__dirname, 'recording.wav');
    if (fs.existsSync(audioFile)) {
      await fs.promises.unlink(audioFile);
    }
    
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});

// Chat endpoint that gets GPT-4 response and speaks it
app.post('/chat', async (req: Request<{}, {}, ChatArgs>, res: Response) => {
  try {
    const { message, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
    
    if (!message) {
      return res.status(400).json({ error: 'Message is required' });
    }

    // Get GPT-4 response
    const response = await getChatResponse(message);
    
    // Speak the response
    await speakText(response, voice, speed);

    res.json({ 
      success: true,
      response,
      spoken: true
    });
  } catch (error) {
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});

// Start the server
async function startServer() {
  try {
    const port = await findAvailablePort(DEFAULT_PORT);
    app.listen(port, () => {
      console.log(`Windows Speech Server running at http://localhost:${port}`);
      console.log(`Using default voice: ${DEFAULT_VOICE}`);
      console.log(`Timeout set to: ${DEFAULT_TIMEOUT}ms`);
      console.log('GPT-4 integration enabled');
    });
  } catch (error) {
    console.error('Failed to start server:', error);
    process.exit(1);
  }
}

startServer();

```