# Directory Structure
```
├── .gitignore
├── package-lock.json
├── package.json
├── README.md
├── server.js
├── src
│   ├── index.ts
│   └── types
│       ├── node-record-lpcm16.d.ts
│       └── say.d.ts
├── test
│   └── index.html
└── tsconfig.json
```
# Files
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
1 | # Dependencies
2 | node_modules/
3 | npm-debug.log*
4 | yarn-debug.log*
5 | yarn-error.log*
6 |
7 | # Build output
8 | build/
9 | dist/
10 | *.js.map
11 |
12 | # Environment variables
13 | .env
14 | .env.local
15 | .env.*.local
16 |
17 | # IDE files
18 | .idea/
19 | .vscode/
20 | *.swp
21 | *.swo
22 |
23 | # Operating System
24 | .DS_Store
25 | Thumbs.db
26 |
27 | # Temporary files
28 | *.log
29 | *.tmp
30 | recording.wav
31 |
32 | # Test coverage
33 | coverage/
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | # MS-Lucidia-Voice-Gateway-MCP
2 |
3 | A Model Context Protocol (MCP) server that provides text-to-speech and speech-to-text capabilities using Windows' built-in speech services. This server leverages the native Windows Speech API (SAPI) through PowerShell commands, eliminating the need for external APIs or services.
4 |
5 | ## Features
6 |
7 | - Text-to-Speech (TTS) using Windows SAPI voices
8 | - Speech-to-Text (STT) using Windows Speech Recognition
9 | - Simple web interface for testing
10 | - No external API dependencies for speech (the optional GPT-4 chat endpoint uses the OpenAI API)
11 | - Uses native Windows capabilities
12 |
13 | ## Prerequisites
14 |
15 | - Windows 10/11 with Speech Recognition enabled
16 | - Node.js 16+
17 | - PowerShell
18 |
19 | ## Installation
20 |
21 | 1. Clone the repository:
22 | ```bash
23 | git clone https://github.com/ExpressionsBot/MS-Lucidia-Voice-Gateway-MCP.git
24 | cd MS-Lucidia-Voice-Gateway-MCP
25 | ```
26 |
27 | 2. Install dependencies:
28 | ```bash
29 | npm install
30 | ```
31 |
32 | 3. Build the project:
33 | ```bash
34 | npm run build
35 | ```
36 |
37 | ## Usage
38 |
39 | ### Testing Interface
40 |
41 | 1. Start the test server:
42 | ```bash
43 | npm run test
44 | ```
45 |
46 | 2. Open `http://localhost:3000` in your browser
47 | 3. Use the web interface to test TTS and STT capabilities
48 |
49 | ### Available Tools
50 |
51 | #### text_to_speech
52 | Converts text to speech using Windows SAPI.
53 |
54 | Parameters:
55 | - `text` (required): The text to convert to speech
56 | - `voice` (optional): The voice to use (e.g., "Microsoft David Desktop")
57 | - `speed` (optional): Speech rate from 0.5 to 2.0 (default: 1.0)
58 |
59 | Example:
60 | ```javascript
61 | fetch('http://localhost:3000/tts', {
62 | method: 'POST',
63 | headers: { 'Content-Type': 'application/json' },
64 | body: JSON.stringify({
65 | text: "Hello, this is a test",
66 | voice: "Microsoft David Desktop",
67 | speed: 1.0
68 | })
69 | });
70 | ```
71 |
72 | #### speech_to_text
73 | Records audio and converts it to text using Windows Speech Recognition.
74 |
75 | Parameters:
76 | - `duration` (optional): Recording duration in seconds (default: 5, max: 60)
77 |
78 | Example:
79 | ```javascript
80 | fetch('http://localhost:3000/stt', {
81 | method: 'POST',
82 | headers: { 'Content-Type': 'application/json' },
83 | body: JSON.stringify({
84 | duration: 5
85 | })
86 | }).then(response => response.json())
87 | .then(data => console.log(data.text));
88 | ```
89 |
90 | ## Troubleshooting
91 |
92 | 1. Make sure Windows Speech Recognition is enabled:
93 | - Open Windows Settings
94 | - Go to Time & Language > Speech
95 | - Enable Speech Recognition
96 |
97 | 2. Check available voices:
98 | - Open PowerShell and run:
99 | ```powershell
100 | Add-Type -AssemblyName System.Speech
101 | (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name
102 | ```
103 |
104 | 3. Test speech recognition:
105 | - Open Speech Recognition in Windows Settings
106 | - Run through the setup wizard if not already done
107 | - Test that Windows can recognize your voice
108 |
109 | ## Contributing
110 |
111 | 1. Fork the repository
112 | 2. Create your feature branch
113 | 3. Commit your changes
114 | 4. Push to the branch
115 | 5. Create a new Pull Request
116 |
117 | ## License
118 |
119 | MIT
120 |
```
--------------------------------------------------------------------------------
/src/types/say.d.ts:
--------------------------------------------------------------------------------
```typescript
declare module 'say' {
  /**
   * Speak `text` aloud through the platform TTS engine.
   *
   * @param text - The text to synthesize.
   * @param voice - Optional installed voice name (platform dependent).
   * @param speed - Optional rate multiplier; presumably 1.0 is normal speed — confirm against the `say` package docs.
   * @param callback - Invoked when playback finishes, with an error on failure.
   */
  export function speak(
    text: string,
    voice?: string,
    speed?: number,
    callback?: (err: Error | string | null) => void
  ): void;

  /** Stop any speech currently in progress. */
  export function stop(): void;
}
```
--------------------------------------------------------------------------------
/src/types/node-record-lpcm16.d.ts:
--------------------------------------------------------------------------------
```typescript
declare module 'node-record-lpcm16' {
  /** Options accepted by record(); all fields are optional. */
  interface RecordOptions {
    sampleRate?: number; // samples per second (e.g. 16000)
    channels?: number;   // channel count (e.g. 1 for mono)
    audioType?: string;  // output container/format name
  }

  /** Handle for an in-progress recording session. */
  interface Recording {
    /** Readable stream of the captured PCM audio data. */
    stream(): NodeJS.ReadableStream;
    /** Stop capturing audio. */
    stop(): void;
  }

  /** Begin recording from the default input device. */
  export function record(options?: RecordOptions): Recording;
}
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "compilerOptions": {
3 | "target": "ES2020",
4 | "module": "commonjs",
5 | "lib": ["ES2020"],
6 | "outDir": "./build",
7 | "rootDir": "./src",
8 | "strict": true,
9 | "esModuleInterop": true,
10 | "skipLibCheck": true,
11 | "forceConsistentCasingInFileNames": true,
12 | "resolveJsonModule": true,
13 | "declaration": true,
14 | "moduleResolution": "node"
15 | },
16 | "include": ["src/**/*"],
17 | "exclude": ["node_modules", "build", "test"]
18 | }
19 |
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "name": "ms-lucidia-voice-gateway-mcp",
3 | "version": "1.0.0",
4 | "description": "Windows Speech server for Lucidia using native Windows speech capabilities",
5 | "main": "build/index.js",
6 | "scripts": {
7 | "start": "node build/index.js",
8 | "test": "node server.js",
9 | "dev": "nodemon server.js",
10 | "build": "tsc",
11 | "watch": "tsc -w"
12 | },
13 | "dependencies": {
14 | "cors": "^2.8.5",
15 | "dotenv": "^16.3.1",
16 | "express": "^4.18.2",
17 | "openai": "^4.24.1"
18 | },
19 | "devDependencies": {
20 | "@types/cors": "^2.8.17",
21 | "@types/express": "^4.17.21",
22 | "@types/node": "^20.11.5",
23 | "nodemon": "^3.0.3",
24 | "typescript": "^5.3.3"
25 | },
26 | "repository": {
27 | "type": "git",
28 | "url": "git+https://github.com/ExpressionsBot/MS-Lucidia-Voice-Gateway-MCP.git"
29 | },
30 | "keywords": [
31 | "windows",
32 | "speech",
33 | "tts",
34 | "stt",
35 | "gpt4",
36 | "lucidia"
37 | ],
38 | "author": "ExpressionsBot",
39 | "license": "MIT"
40 | }
41 |
```
--------------------------------------------------------------------------------
/server.js:
--------------------------------------------------------------------------------
```javascript
1 | const express = require('express');
2 | const cors = require('cors');
3 | const { exec } = require('child_process');
4 | const { promisify } = require('util');
5 | const fs = require('fs').promises;
6 | const path = require('path');
7 | const { createServer } = require('net');
8 |
9 | const execAsync = promisify(exec);
10 | const app = express();
11 |
// Helper function to find an available port.
// Scans sequentially from startPort and gives up after 100 attempts, so a
// fully-occupied range cannot send the loop past the valid port space.
// (The original `while (true) port++` scan was unbounded; this mirrors the
// bounded implementation in src/index.ts.)
async function findAvailablePort(startPort) {
  // Resolve true when a throwaway server can bind and listen on the port.
  const isPortAvailable = (port) => {
    return new Promise((resolve) => {
      const server = createServer()
        .listen(port, () => {
          server.once('close', () => resolve(true));
          server.close();
        })
        .on('error', () => resolve(false));
    });
  };

  for (let port = startPort; port < startPort + 100; port++) {
    if (await isPortAvailable(port)) {
      return port;
    }
  }
  throw new Error('No available ports found');
}
31 |
32 | app.use(cors());
33 | app.use(express.json());
34 | app.use(express.static('test'));
35 |
// Helper function to execute PowerShell commands.
// The script text is embedded in a double-quoted -Command argument, so any
// embedded double quote must be escaped or the cmd.exe quoting boundary
// breaks (the original interpolated the script verbatim).
async function runPowerShell(script) {
  try {
    // \" survives the cmd.exe -> powershell quoting boundary; -NoProfile
    // keeps user profile scripts from slowing down or altering the command.
    const safeScript = script.replace(/"/g, '\\"');
    const { stdout } = await execAsync(`powershell -NoProfile -Command "${safeScript}"`);
    return stdout.trim();
  } catch (error) {
    throw new Error(`PowerShell execution failed: ${error.message}`);
  }
}
45 |
// GET /voices — enumerate the SAPI voices installed on this machine.
// Responds with a JSON array of voice names, or 500 on PowerShell failure.
app.get('/voices', async (req, res) => {
  try {
    const script = `
      Add-Type -AssemblyName System.Speech;
      (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name
    `;
    const rawOutput = await runPowerShell(script);

    // PowerShell prints one voice per line; drop blanks and padding.
    const voiceNames = [];
    for (const line of rawOutput.split('\n')) {
      const name = line.trim();
      if (name) voiceNames.push(name);
    }
    res.json(voiceNames);
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
60 |
// POST /tts — speak `text` aloud through Windows SAPI.
// Body: { text: string (required), voice?: string, speed?: number (0.5–2.0) }.
// Replies 400 when text is missing, 500 on synthesis failure.
app.post('/tts', async (req, res) => {
  try {
    const { text, voice = 'Microsoft David Desktop', speed = 1.0 } = req.body;

    if (!text) {
      return res.status(400).json({ error: 'Text is required' });
    }

    // Clamp speed to the documented 0.5–2.0 range; a non-numeric value
    // would otherwise interpolate "NaN" into the PowerShell script.
    const numericSpeed = Number(speed);
    const safeSpeed = Number.isFinite(numericSpeed)
      ? Math.min(2.0, Math.max(0.5, numericSpeed))
      : 1.0;

    // Double single quotes — the PowerShell escape inside a single-quoted
    // string. The voice value was previously interpolated unescaped,
    // allowing arbitrary PowerShell injection from the request body.
    const safeVoice = String(voice).replace(/'/g, "''");
    const safeText = String(text).replace(/'/g, "''");

    // SAPI Rate is an integer; speed 0.5–2.0 maps onto roughly -5..10.
    const script = `
      Add-Type -AssemblyName System.Speech;
      $synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
      $synthesizer.SelectVoice('${safeVoice}');
      $synthesizer.Rate = ${Math.round((safeSpeed - 1) * 10)};
      $synthesizer.Speak('${safeText}');
    `;

    await runPowerShell(script);
    res.json({ success: true });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
84 |
// POST /stt — record from the default microphone, then transcribe the clip.
// Body: { duration?: number } seconds, clamped to 1–60 (README documents max 60).
// NOTE(review): the record script instantiates NAudio.Wave types, which stock
// PowerShell does not provide — confirm NAudio is loadable on the target host.
app.post('/stt', async (req, res) => {
  const audioFile = path.join(__dirname, 'recording.wav');
  try {
    // Clamp duration so a malicious or buggy client cannot inject script
    // text or hold the microphone open indefinitely (it was previously
    // interpolated into the PowerShell script unvalidated).
    const requested = Number((req.body || {}).duration);
    const duration = Number.isFinite(requested)
      ? Math.min(60, Math.max(1, Math.round(requested)))
      : 5;

    // Record audio using PowerShell
    const recordScript = `
      Add-Type -AssemblyName System.Windows.Forms;
      $audio = New-Object System.IO.MemoryStream;
      $waveSource = New-Object NAudio.Wave.WaveInEvent;
      $waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
      $waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
      $waveSource.DataAvailable = {
        param($sender, $e)
        $waveFile.Write($e.Buffer, 0, $e.BytesRecorded)
      };
      $waveSource.StartRecording();
      Start-Sleep -Seconds ${duration};
      $waveSource.StopRecording();
      $waveFile.Dispose();
    `;

    await runPowerShell(recordScript);

    // Transcribe the recorded audio
    const transcribeScript = `
      Add-Type -AssemblyName System.Speech;
      $recognizer = New-Object System.Speech.Recognition.SpeechRecognizer;
      $grammar = New-Object System.Speech.Recognition.DictationGrammar;
      $recognizer.LoadGrammar($grammar);
      $audio = [System.IO.File]::ReadAllBytes('${audioFile}');
      $stream = New-Object System.IO.MemoryStream(@(,$audio));
      $result = $recognizer.RecognizeSync([System.Speech.AudioFormat.AudioStream]::new($stream));
      $result.Text;
    `;

    const transcription = await runPowerShell(transcribeScript);

    res.json({ text: transcription || 'No speech detected' });
  } catch (error) {
    res.status(500).json({ error: error.message });
  } finally {
    // Always remove the temporary recording — the original leaked it
    // whenever recording or transcription failed.
    await fs.unlink(audioFile).catch(() => {});
  }
});
132 |
// Boot sequence: pick a free port (starting at 3000) and begin listening.
// Any startup failure is fatal — log it and exit non-zero.
async function startServer() {
  try {
    const chosenPort = await findAvailablePort(3000);
    const announce = () => {
      console.log(`Windows Speech Server running at http://localhost:${chosenPort}`);
    };
    app.listen(chosenPort, announce);
  } catch (error) {
    console.error('Failed to start server:', error);
    process.exit(1);
  }
}

startServer();
```
--------------------------------------------------------------------------------
/test/index.html:
--------------------------------------------------------------------------------
```html
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 | <meta charset="UTF-8">
5 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
6 | <title>Windows Speech MCP Test</title>
7 | <style>
8 | body {
9 | font-family: Arial, sans-serif;
10 | max-width: 800px;
11 | margin: 0 auto;
12 | padding: 20px;
13 | background-color: #f5f5f5;
14 | }
15 | .container {
16 | background-color: white;
17 | padding: 20px;
18 | border-radius: 8px;
19 | box-shadow: 0 2px 4px rgba(0,0,0,0.1);
20 | }
21 | .section {
22 | margin-bottom: 20px;
23 | padding: 20px;
24 | border: 1px solid #ddd;
25 | border-radius: 4px;
26 | }
27 | h1 {
28 | color: #333;
29 | text-align: center;
30 | }
31 | textarea {
32 | width: 100%;
33 | height: 100px;
34 | margin: 10px 0;
35 | padding: 8px;
36 | border: 1px solid #ddd;
37 | border-radius: 4px;
38 | resize: vertical;
39 | }
40 | button {
41 | background-color: #007bff;
42 | color: white;
43 | border: none;
44 | padding: 10px 20px;
45 | border-radius: 4px;
46 | cursor: pointer;
47 | margin: 5px;
48 | }
49 | button:hover {
50 | background-color: #0056b3;
51 | }
52 | select {
53 | padding: 8px;
54 | margin: 5px;
55 | border-radius: 4px;
56 | border: 1px solid #ddd;
57 | }
58 | .status {
59 | margin-top: 10px;
60 | padding: 10px;
61 | border-radius: 4px;
62 | }
63 | .success {
64 | background-color: #d4edda;
65 | color: #155724;
66 | }
67 | .error {
68 | background-color: #f8d7da;
69 | color: #721c24;
70 | }
71 | </style>
72 | </head>
73 | <body>
74 | <div class="container">
75 | <h1>Windows Speech MCP Test</h1>
76 |
77 | <div class="section">
78 | <h2>Text to Speech</h2>
79 | <textarea id="ttsText" placeholder="Enter text to speak...">Hello, this is a test of Windows speech synthesis.</textarea>
80 | <div>
81 | <select id="ttsVoice">
82 | <option value="Microsoft David Desktop">David</option>
83 | <option value="Microsoft Zira Desktop">Zira</option>
84 | </select>
85 | <select id="ttsSpeed">
86 | <option value="0.5">0.5x Speed</option>
87 | <option value="1.0" selected>1.0x Speed</option>
88 | <option value="1.5">1.5x Speed</option>
89 | <option value="2.0">2.0x Speed</option>
90 | </select>
91 | <button onclick="speak()">Speak</button>
92 | </div>
93 | <div id="ttsStatus" class="status"></div>
94 | </div>
95 |
96 | <div class="section">
97 | <h2>Speech to Text</h2>
98 | <div>
99 | <select id="sttDuration">
100 | <option value="5">5 seconds</option>
101 | <option value="10">10 seconds</option>
102 | <option value="15">15 seconds</option>
103 | <option value="30">30 seconds</option>
104 | </select>
105 | <button onclick="startRecording()">Start Recording</button>
106 | </div>
107 | <textarea id="sttText" placeholder="Transcribed text will appear here..." readonly></textarea>
108 | <div id="sttStatus" class="status"></div>
109 | </div>
110 | </div>
111 |
112 | <script>
// Read the TTS form fields and ask the server to speak the text.
// Shows the outcome in the #ttsStatus banner.
async function speak() {
    const text = document.getElementById('ttsText').value;
    const voice = document.getElementById('ttsVoice').value;
    const speed = parseFloat(document.getElementById('ttsSpeed').value);
    const statusDiv = document.getElementById('ttsStatus');

    try {
        // Use a relative URL: this page is served by the same Express app,
        // which may bind a port other than 3000 (it scans for a free one),
        // so the previous hard-coded http://localhost:3000 could miss it.
        const response = await fetch('/tts', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ text, voice, speed })
        });

        if (!response.ok) throw new Error('Failed to synthesize speech');

        statusDiv.textContent = 'Speech synthesis successful!';
        statusDiv.className = 'status success';
    } catch (error) {
        statusDiv.textContent = `Error: ${error.message}`;
        statusDiv.className = 'status error';
    }
}
137 |
// Trigger a server-side recording of the selected duration and show the
// transcription in the #sttText area; errors land in the #sttStatus banner.
async function startRecording() {
    const duration = parseInt(document.getElementById('sttDuration').value);
    const statusDiv = document.getElementById('sttStatus');
    const textArea = document.getElementById('sttText');

    try {
        statusDiv.textContent = `Recording for ${duration} seconds...`;
        statusDiv.className = 'status';

        // Relative URL: the serving port may not be 3000 (the server scans
        // for a free port), so a hard-coded origin could point at nothing.
        const response = await fetch('/stt', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({ duration })
        });

        if (!response.ok) throw new Error('Failed to transcribe speech');

        const result = await response.json();
        textArea.value = result.text;
        statusDiv.textContent = 'Transcription successful!';
        statusDiv.className = 'status success';
    } catch (error) {
        statusDiv.textContent = `Error: ${error.message}`;
        statusDiv.className = 'status error';
        textArea.value = '';
    }
}
167 |
// Fetch available voices when the page loads and populate the #ttsVoice
// select. Uses a relative URL because the Express server may bind a port
// other than 3000 (it scans upward for a free one).
async function loadVoices() {
    try {
        const response = await fetch('/voices');
        if (!response.ok) throw new Error('Failed to fetch voices');

        const voices = await response.json();
        const voiceSelect = document.getElementById('ttsVoice');

        // Keep the hard-coded default options when the server reports no
        // voices — the original wiped the select before checking.
        if (Array.isArray(voices) && voices.length > 0) {
            voiceSelect.innerHTML = '';
            voices.forEach(voice => {
                const option = document.createElement('option');
                option.value = voice;
                option.textContent = voice.replace('Microsoft ', '').replace(' Desktop', '');
                voiceSelect.appendChild(option);
            });
        }
    } catch (error) {
        console.error('Failed to load voices:', error);
    }
}

window.onload = loadVoices;
190 | </script>
191 | </body>
192 | </html>
```
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
```typescript
1 | #!/usr/bin/env node
2 |
3 | import express, { Request, Response } from 'express';
4 | import cors from 'cors';
5 | import { exec } from 'child_process';
6 | import { promisify } from 'util';
7 | import * as path from 'path';
8 | import * as fs from 'fs';
9 | import * as net from 'net';
10 | import OpenAI from 'openai';
11 | import dotenv from 'dotenv';
12 |
13 | dotenv.config();
14 |
15 | const execAsync = promisify(exec);
16 |
17 | // Configuration
18 | const DEFAULT_VOICE = 'Microsoft Jenny(Natural) - English (United States)';
19 | const DEFAULT_TIMEOUT = parseInt(process.env.TIMEOUT || '30000', 10);
20 | const DEFAULT_PORT = 3000;
21 |
22 | // Initialize OpenAI
23 | const openai = new OpenAI({
24 | apiKey: process.env.OPENAI_API_KEY
25 | });
26 |
// Type definitions for request arguments

/** Request body for POST /tts. */
interface TextToSpeechArgs {
  // Text to synthesize (required).
  text: string;
  // Installed SAPI voice name; defaults to DEFAULT_VOICE when omitted.
  voice?: string;
  // Speech rate multiplier (README documents 0.5–2.0; default 1.0).
  speed?: number;
}

/** Request body for POST /stt. */
interface SpeechToTextArgs {
  // Recording length in seconds (default 5).
  duration?: number;
}

/** Request body for POST /chat. */
interface ChatArgs {
  // User message forwarded to the OpenAI chat completion (required).
  message: string;
  // Voice used to speak the reply aloud; defaults to DEFAULT_VOICE.
  voice?: string;
  // Speech rate multiplier for the spoken reply.
  speed?: number;
}
43 |
44 | // Helper function to find an available port
45 | async function findAvailablePort(startPort: number): Promise<number> {
46 | const isPortAvailable = (port: number): Promise<boolean> => {
47 | return new Promise((resolve) => {
48 | const server = net.createServer()
49 | .once('error', () => resolve(false))
50 | .once('listening', () => {
51 | server.close();
52 | resolve(true);
53 | })
54 | .listen(port);
55 | });
56 | };
57 |
58 | for (let port = startPort; port < startPort + 100; port++) {
59 | if (await isPortAvailable(port)) {
60 | return port;
61 | }
62 | }
63 | throw new Error('No available ports found');
64 | }
65 |
// Helper function to get available Windows voices.
// Shells out to PowerShell/System.Speech and parses one voice name per
// output line. On any failure (non-Windows host, PowerShell missing,
// timeout) it degrades to a one-element list containing DEFAULT_VOICE
// instead of rejecting, so /voices always has something to return.
async function getWindowsVoices(): Promise<string[]> {
  try {
    const { stdout } = await execAsync('powershell -Command "Add-Type -AssemblyName System.Speech; (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name"', {
      timeout: DEFAULT_TIMEOUT
    });
    // PowerShell prints one name per line; trim padding and drop blanks.
    return stdout.split('\n').map(v => v.trim()).filter(Boolean);
  } catch (error) {
    console.error('Error getting voices:', error);
    return [DEFAULT_VOICE];
  }
}
78 |
79 | // Helper function to speak text using Windows TTS
80 | async function speakText(text: string, voice: string = DEFAULT_VOICE, speed: number = 1.0): Promise<void> {
81 | const script = `
82 | Add-Type -AssemblyName System.Speech;
83 | $synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
84 | $synthesizer.SelectVoice('${voice}');
85 | $synthesizer.Rate = ${Math.round((speed - 1) * 10)};
86 | $synthesizer.Speak('${text.replace(/'/g, "''")}');
87 | `;
88 |
89 | await execAsync(`powershell -Command "${script}"`, { timeout: DEFAULT_TIMEOUT });
90 | }
91 |
// Helper function to get GPT-4 response.
// Sends `message` to the OpenAI chat-completions API with a system prompt
// asking for short, speakable replies (the /chat route pipes the result to
// TTS). Returns the first choice's text, or a fallback apology when the
// API returns no content. Logs and rethrows API errors to the caller.
async function getChatResponse(message: string): Promise<string> {
  try {
    const completion = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        {
          role: "system",
          content: "You are a helpful assistant. Keep your responses concise and natural, as they will be spoken aloud."
        },
        {
          role: "user",
          content: message
        }
      ],
      temperature: 0.7, // some variety, but stable enough for spoken replies
      max_tokens: 150   // keep responses short enough to speak aloud
    });

    return completion.choices[0]?.message?.content || "I'm sorry, I couldn't generate a response.";
  } catch (error) {
    console.error('Error getting GPT-4 response:', error);
    throw error;
  }
}
117 |
118 | // Initialize Express app
119 | const app = express();
120 |
121 | app.use(cors());
122 | app.use(express.json());
123 | app.use(express.static('test'));
124 |
125 | // Add timeout middleware
126 | app.use((req: Request, res: Response, next) => {
127 | res.setTimeout(DEFAULT_TIMEOUT, () => {
128 | res.status(408).json({ error: 'Request timeout' });
129 | });
130 | next();
131 | });
132 |
133 | // Get available voices
134 | app.get('/voices', async (_req: Request, res: Response) => {
135 | try {
136 | const voices = await getWindowsVoices();
137 | res.json(voices);
138 | } catch (error) {
139 | res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
140 | }
141 | });
142 |
143 | // Text to Speech
144 | app.post('/tts', async (req: Request<{}, {}, TextToSpeechArgs>, res: Response) => {
145 | try {
146 | const { text, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
147 |
148 | if (!text) {
149 | return res.status(400).json({ error: 'Text is required' });
150 | }
151 |
152 | await speakText(text, voice, speed);
153 | res.json({ success: true });
154 | } catch (error) {
155 | if (error instanceof Error && error.message.includes('timeout')) {
156 | res.status(408).json({ error: 'Operation timed out' });
157 | } else {
158 | res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
159 | }
160 | }
161 | });
162 |
163 | // Speech to Text
164 | app.post('/stt', async (req: Request<{}, {}, SpeechToTextArgs>, res: Response) => {
165 | try {
166 | const { duration = 5 } = req.body;
167 | const audioFile = path.join(__dirname, 'recording.wav');
168 |
169 | // Record audio using PowerShell
170 | const recordScript = `
171 | Add-Type -AssemblyName System.Windows.Forms;
172 | $audio = New-Object System.IO.MemoryStream;
173 | $waveSource = New-Object NAudio.Wave.WaveInEvent;
174 | $waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
175 | $waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
176 | $waveSource.DataAvailable = {
177 | param($sender, $e)
178 | $waveFile.Write($e.Buffer, 0, $e.BytesRecorded)
179 | };
180 | $waveSource.StartRecording();
181 | Start-Sleep -Seconds ${duration};
182 | $waveSource.StopRecording();
183 | $waveFile.Dispose();
184 | `;
185 |
186 | await execAsync(recordScript, { timeout: DEFAULT_TIMEOUT + (duration * 1000) });
187 |
188 | // Transcribe the recorded audio
189 | const transcribeScript = `
190 | Add-Type -AssemblyName System.Speech;
191 | $recognizer = New-Object System.Speech.Recognition.SpeechRecognizer;
192 | $grammar = New-Object System.Speech.Recognition.DictationGrammar;
193 | $recognizer.LoadGrammar($grammar);
194 | $audio = [System.IO.File]::ReadAllBytes('${audioFile}');
195 | $stream = New-Object System.IO.MemoryStream(@(,$audio));
196 | $result = $recognizer.RecognizeSync([System.Speech.AudioFormat.AudioStream]::new($stream));
197 | $result.Text;
198 | `;
199 |
200 | const { stdout } = await execAsync(`powershell -Command "${transcribeScript}"`, { timeout: DEFAULT_TIMEOUT });
201 |
202 | // Clean up the audio file
203 | await fs.promises.unlink(audioFile);
204 |
205 | res.json({ text: stdout.trim() || 'No speech detected' });
206 | } catch (error) {
207 | // Clean up the audio file if it exists
208 | const audioFile = path.join(__dirname, 'recording.wav');
209 | if (fs.existsSync(audioFile)) {
210 | await fs.promises.unlink(audioFile);
211 | }
212 |
213 | if (error instanceof Error && error.message.includes('timeout')) {
214 | res.status(408).json({ error: 'Operation timed out' });
215 | } else {
216 | res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
217 | }
218 | }
219 | });
220 |
// Chat endpoint that gets GPT-4 response and speaks it.
// Body: { message: string (required), voice?: string, speed?: number }.
// Replies 400 when `message` is missing, 408 on timeout, 500 otherwise;
// on success returns the GPT-4 text along with spoken: true.
app.post('/chat', async (req: Request<{}, {}, ChatArgs>, res: Response) => {
  try {
    const { message, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;

    if (!message) {
      return res.status(400).json({ error: 'Message is required' });
    }

    // Get GPT-4 response
    const response = await getChatResponse(message);

    // Speak the response aloud before answering the HTTP request, so the
    // client knows playback already happened when it sees spoken: true.
    await speakText(response, voice, speed);

    res.json({
      success: true,
      response,
      spoken: true
    });
  } catch (error) {
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});
249 |
250 | // Start the server
251 | async function startServer() {
252 | try {
253 | const port = await findAvailablePort(DEFAULT_PORT);
254 | app.listen(port, () => {
255 | console.log(`Windows Speech Server running at http://localhost:${port}`);
256 | console.log(`Using default voice: ${DEFAULT_VOICE}`);
257 | console.log(`Timeout set to: ${DEFAULT_TIMEOUT}ms`);
258 | console.log('GPT-4 integration enabled');
259 | });
260 | } catch (error) {
261 | console.error('Failed to start server:', error);
262 | process.exit(1);
263 | }
264 | }
265 |
266 | startServer();
267 |
```