# Directory Structure
```
├── .env.example
├── .gitignore
├── LICENSE
├── package.json
├── README.md
├── src
│ ├── index.ts
│ ├── server.ts
│ ├── services
│ │ └── gemini.ts
│ ├── tools
│ │ ├── audio-recognition.ts
│ │ ├── image-recognition.ts
│ │ └── video-recognition.ts
│ ├── types
│ │ └── index.ts
│ └── utils
│ └── logger.ts
└── tsconfig.json
```
# Files
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
```
# Google Gemini API key (required)
GOOGLE_API_KEY=your_api_key_here
# Transport type: 'stdio' or 'sse' (defaults to 'stdio')
TRANSPORT_TYPE=stdio
# Port for SSE transport (defaults to 3000)
PORT=3000
# Log level: 'verbose', 'debug', 'info', 'warn', 'error', 'fatal' (defaults to 'fatal')
LOG_LEVEL=fatal
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
# Dependencies
node_modules/
package-lock.json
yarn.lock
pnpm-lock.yaml
# Build output
dist/
build/
*.tsbuildinfo
# Environment variables
.env
.env.local
.env.*.local
# Logs
logs/
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Editor directories and files
.idea/
.vscode/
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# OS files
.DS_Store
Thumbs.db
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
# MCP Video Recognition Server
An MCP (Model Context Protocol) server that provides tools for image, audio, and video recognition using Google's Gemini AI.
<a href="https://glama.ai/mcp/servers/@mario-andreschak/mcp_video_recognition">
<img width="380" height="200" src="https://glama.ai/mcp/servers/@mario-andreschak/mcp_video_recognition/badge" alt="Video Recognition Server MCP server" />
</a>
## Features
- **Image Recognition**: Analyze and describe images using Google Gemini AI
- **Audio Recognition**: Analyze and transcribe audio using Google Gemini AI
- **Video Recognition**: Analyze and describe videos using Google Gemini AI
## Prerequisites
- Node.js 18 or higher
- Google Gemini API key
## Installation
### Manual Installation
1. Clone the repository:
```bash
git clone https://github.com/yourusername/mcp-video-recognition.git
cd mcp-video-recognition
```
2. Install dependencies:
```bash
npm install
```
3. Build the project:
```bash
npm run build
```
### Installing in [FLUJO](https://github.com/mario-andreschak/FLUJO/)
1. Click Add Server
2. Copy & Paste Github URL into FLUJO
3. Click Parse, Clone, Install, Build and Save.
### Installing via Configuration Files
To integrate this MCP server with Cline or other MCP clients via configuration files:
1. Open your Cline settings:
- In VS Code, go to File -> Preferences -> Settings
- Search for "Cline MCP Settings"
- Click "Edit in settings.json"
2. Add the server configuration to the `mcpServers` object:
```json
{
"mcpServers": {
"video-recognition": {
"command": "node",
"args": [
"/path/to/mcp-video-recognition/dist/index.js"
],
"disabled": false,
"autoApprove": []
}
}
}
```
3. Replace `/path/to/mcp-video-recognition/dist/index.js` with the actual path to the `index.js` file in your project directory. Use forward slashes (/) or double backslashes (\\\\) for the path on Windows.
4. Save the settings file. Cline should automatically connect to the server.
## Configuration
The server is configured using environment variables:
- `GOOGLE_API_KEY` (required): Your Google Gemini API key
- `TRANSPORT_TYPE`: Transport type to use (`stdio` or `sse`, defaults to `stdio`)
- `PORT`: Port number for SSE transport (defaults to 3000)
- `LOG_LEVEL`: Logging level (`verbose`, `debug`, `info`, `warn`, `error`, `fatal`, defaults to `fatal`)
## Usage
### Starting the Server
#### With stdio Transport (Default)
```bash
GOOGLE_API_KEY=your_api_key npm start
```
#### With SSE Transport
```bash
GOOGLE_API_KEY=your_api_key TRANSPORT_TYPE=sse PORT=3000 npm start
```
### Using the Tools
The server provides three tools that can be called by MCP clients:
#### Image Recognition
```json
{
"name": "image_recognition",
"arguments": {
"filepath": "/path/to/image.jpg",
"prompt": "Describe this image in detail",
"modelname": "gemini-2.0-flash"
}
}
```
#### Audio Recognition
```json
{
"name": "audio_recognition",
"arguments": {
"filepath": "/path/to/audio.mp3",
"prompt": "Transcribe this audio",
"modelname": "gemini-2.0-flash"
}
}
```
#### Video Recognition
```json
{
"name": "video_recognition",
"arguments": {
"filepath": "/path/to/video.mp4",
"prompt": "Describe what happens in this video",
"modelname": "gemini-2.0-flash"
}
}
```
### Tool Parameters
All tools accept the following parameters:
- `filepath` (required): Path to the media file to analyze
- `prompt` (optional): Custom prompt for the recognition (defaults to "Describe this content")
- `modelname` (optional): Gemini model to use for recognition (defaults to "gemini-2.0-flash")
## Development
### Running in Development Mode
```bash
GOOGLE_API_KEY=your_api_key npm run dev
```
### Project Structure
- `src/index.ts`: Entry point
- `src/server.ts`: MCP server implementation
- `src/tools/`: Tool implementations
- `src/services/`: Service implementations (Gemini API)
- `src/types/`: Type definitions
- `src/utils/`: Utility functions
## License
MIT
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
{
"compilerOptions": {
"target": "ES2022",
"module": "NodeNext",
"moduleResolution": "NodeNext",
"esModuleInterop": true,
"strict": true,
"outDir": "dist",
"sourceMap": true,
"declaration": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
{
"name": "mcp-video-recognition",
"version": "1.0.0",
"description": "MCP server for Google Gemini image, audio, and video recognition",
"main": "dist/index.js",
"type": "module",
"scripts": {
"build": "tsc",
"start": "node dist/index.js",
"dev": "tsc -w & node --watch dist/index.js",
"debug": "tsc & npx @modelcontextprotocol/inspector node dist/index.js",
"lint": "eslint src --ext .ts",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [
"mcp",
"gemini",
"video",
"audio",
"image",
"recognition"
],
"author": "",
"license": "MIT",
"dependencies": {
"@google/genai": "^0.9.0",
"@modelcontextprotocol/sdk": "^1.10.1",
"express": "^5.1.0",
"zod": "^3.24.3"
},
"devDependencies": {
"@types/express": "^5.0.1",
"@types/node": "^22.14.1",
"typescript": "^5.8.3"
}
}
```
--------------------------------------------------------------------------------
/src/utils/logger.ts:
--------------------------------------------------------------------------------
```typescript
/**
 * Logger utility for the MCP server.
 *
 * Writes to stdout/stderr via the console. The active level is a
 * process-wide (static) setting shared by every Logger instance.
 */
export enum LogLevel {
  VERBOSE = 'verbose',
  DEBUG = 'debug',
  INFO = 'info',
  WARN = 'warn',
  ERROR = 'error',
  FATAL = 'fatal'
}
export class Logger {
  private readonly name: string;
  // Default to FATAL so a server on stdio transport stays quiet unless
  // explicitly configured (stdout is the MCP protocol channel).
  private static level: LogLevel = LogLevel.FATAL;
  constructor(name: string) {
    this.name = name;
  }
  /** Set the process-wide minimum level for all Logger instances. */
  static setLogLevel(level: LogLevel): void {
    Logger.level = level;
  }
  /**
   * A message is emitted when its level ranks at or above the configured
   * level. Ranking relies on LogLevel's declaration order (verbose → fatal).
   */
  private shouldLog(level: LogLevel): boolean {
    const levels = Object.values(LogLevel);
    return levels.indexOf(level) >= levels.indexOf(Logger.level);
  }
  private formatMessage(level: LogLevel, message: string): string {
    const timestamp = new Date().toISOString();
    return `[${timestamp}] [${level.toUpperCase()}] [${this.name}] ${message}`;
  }
  verbose(message: string, data?: unknown): void {
    if (this.shouldLog(LogLevel.VERBOSE)) {
      // Explicit nullish check (not truthiness) so falsy payloads such as
      // 0, false or '' are still serialized instead of silently dropped.
      const formattedData = data != null ? JSON.stringify(data) : '';
      console.log(this.formatMessage(LogLevel.VERBOSE, message), formattedData);
    }
  }
  debug(message: string, data?: unknown): void {
    if (this.shouldLog(LogLevel.DEBUG)) {
      // `??` (not `||`) so falsy payloads (0, false, '') are preserved.
      console.log(this.formatMessage(LogLevel.DEBUG, message), data ?? '');
    }
  }
  info(message: string, data?: unknown): void {
    if (this.shouldLog(LogLevel.INFO)) {
      console.log(this.formatMessage(LogLevel.INFO, message), data ?? '');
    }
  }
  warn(message: string, data?: unknown): void {
    if (this.shouldLog(LogLevel.WARN)) {
      console.warn(this.formatMessage(LogLevel.WARN, message), data ?? '');
    }
  }
  error(message: string, error?: unknown): void {
    if (this.shouldLog(LogLevel.ERROR)) {
      console.error(this.formatMessage(LogLevel.ERROR, message), error ?? '');
    }
  }
  fatal(message: string, error?: unknown): void {
    if (this.shouldLog(LogLevel.FATAL)) {
      console.error(this.formatMessage(LogLevel.FATAL, message), error ?? '');
    }
  }
}
/** Convenience factory for a named logger. */
export const createLogger = (name: string): Logger => new Logger(name);
```
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Entry point for the MCP video recognition server
*/
import { Server } from './server.js';
import { createLogger, LogLevel, Logger } from './utils/logger.js';
import type { ServerConfig } from './server.js';
const log = createLogger('Main');

// Set the log level from the environment, validating the value first.
// Logger.shouldLog ranks levels by index; an unrecognized level string
// (e.g. LOG_LEVEL=trace) would rank as -1 and enable ALL log output, so
// unknown values fall back to the documented default, FATAL.
const envLevel = process.env.LOG_LEVEL;
const logLevel = (Object.values(LogLevel) as string[]).includes(envLevel ?? '')
  ? (envLevel as LogLevel)
  : LogLevel.FATAL;
Logger.setLogLevel(logLevel);
/**
 * Load configuration from environment variables.
 *
 * @returns The server configuration assembled from the environment.
 * @throws Error if GOOGLE_API_KEY is missing or PORT is not a valid
 *   TCP port number (1-65535).
 */
function loadConfig(): ServerConfig {
  // Check for required environment variables
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    throw new Error('GOOGLE_API_KEY environment variable is required');
  }
  // Anything other than 'sse' falls back to stdio.
  const transportType = process.env.TRANSPORT_TYPE === 'sse' ? 'sse' : 'stdio';
  // Parse and validate the port if provided; fail fast on garbage input
  // instead of silently passing NaN downstream.
  const portStr = process.env.PORT;
  let port: number | undefined;
  if (portStr) {
    const parsed = Number.parseInt(portStr, 10);
    if (!Number.isInteger(parsed) || parsed < 1 || parsed > 65535) {
      throw new Error(`Invalid PORT value: ${portStr}`);
    }
    port = parsed;
  }
  return {
    gemini: {
      apiKey
    },
    transport: transportType,
    port
  };
}
/**
 * Entry point: load the configuration, start the server, and install
 * signal handlers for a clean shutdown. Exits the process with a
 * non-zero status if startup fails.
 */
async function main(): Promise<void> {
  try {
    log.info('Starting MCP video recognition server');

    const config = loadConfig();
    log.info(`Using transport: ${config.transport}`);

    const server = new Server(config);
    await server.start();

    // Factory producing a handler that stops the server for a given signal.
    const shutdown = (signal: string) => async () => {
      log.info(`Received ${signal} signal, shutting down...`);
      await server.stop();
      process.exit(0);
    };
    process.on('SIGINT', shutdown('SIGINT'));
    process.on('SIGTERM', shutdown('SIGTERM'));

    log.info('Server started successfully');
  } catch (error) {
    log.error('Failed to start server', error);
    process.exit(1);
  }
}
// Start the server; any rejection that escapes main() is treated as fatal
// and terminates the process with a non-zero exit code.
main().catch(error => {
  console.error('Unhandled error:', error);
  process.exit(1);
});
```
--------------------------------------------------------------------------------
/src/types/index.ts:
--------------------------------------------------------------------------------
```typescript
/**
 * Type definitions for the MCP server
 */
import { z } from 'zod';
import type { Tool, CallToolResult } from '@modelcontextprotocol/sdk/types.js';
/**
 * Common parameters shared by all recognition tools (image/audio/video).
 */
export const RecognitionParamsSchema = z.object({
  filepath: z.string().describe('Path to the media file to analyze'),
  prompt: z.string().default('Describe this content').describe('Custom prompt for the recognition'),
  modelname: z.string().default('gemini-2.0-flash').describe('Gemini model to use for recognition')
});
export type RecognitionParams = z.infer<typeof RecognitionParamsSchema>;
/**
 * Video recognition specific types (currently identical to the base schema;
 * kept separate so video-only parameters can be added later).
 */
export const VideoRecognitionParamsSchema = RecognitionParamsSchema.extend({});
export type VideoRecognitionParams = z.infer<typeof VideoRecognitionParamsSchema>;
/**
 * Image recognition specific types
 */
export const ImageRecognitionParamsSchema = RecognitionParamsSchema.extend({});
export type ImageRecognitionParams = z.infer<typeof ImageRecognitionParamsSchema>;
/**
 * Audio recognition specific types
 */
export const AudioRecognitionParamsSchema = RecognitionParamsSchema.extend({});
export type AudioRecognitionParams = z.infer<typeof AudioRecognitionParamsSchema>;
/**
 * Shape of a tool registration object.
 *
 * `TArgs` lets a tool pin the callback's argument type to its
 * schema-inferred parameter type; it defaults to `any` for backward
 * compatibility with existing untyped usages.
 */
export interface ToolDefinition<TArgs = any> {
  name: string;
  description: string;
  inputSchema: z.ZodObject<any>;
  callback: (args: TArgs) => Promise<CallToolResult>;
}
/**
 * Gemini API types
 */
export interface GeminiConfig {
  /** Google Gemini API key. */
  apiKey: string;
}
/** A file as returned by the Gemini Files API (fields may be absent). */
export interface GeminiFile {
  uri: string;
  mimeType: string;
  name?: string;
  state?: string;
}
/** A Gemini file that has finished processing; all fields are present. */
export interface ProcessedGeminiFile {
  uri: string;
  mimeType: string;
  name: string;
  state: string;
}
/** Cache entry for an uploaded file, keyed by content checksum. */
export interface CachedFile {
  fileId: string;
  checksum: string;
  uri: string;
  mimeType: string;
  name: string;
  state: string;
  /** Insertion time (ms since epoch) used for cache expiry. */
  timestamp: number;
}
// File states from Gemini API
export enum FileState {
  UNSPECIFIED = 'STATE_UNSPECIFIED',
  PROCESSING = 'PROCESSING',
  ACTIVE = 'ACTIVE',
  FAILED = 'FAILED'
}
/** Normalized result of a Gemini generation call. */
export interface GeminiResponse {
  text: string;
  isError?: boolean;
}
```
--------------------------------------------------------------------------------
/src/tools/audio-recognition.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Audio recognition tool for MCP server
*/
import { z } from 'zod';
import { createLogger } from '../utils/logger.js';
import { GeminiService } from '../services/gemini.js';
import { AudioRecognitionParamsSchema } from '../types/index.js';
import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
import type { AudioRecognitionParams } from '../types/index.js';
import * as fs from 'node:fs';
import * as path from 'node:path';
const log = createLogger('AudioRecognitionTool');

/**
 * Build the `audio_recognition` MCP tool definition.
 *
 * The returned object carries the tool name, description, zod input schema
 * and an async callback that uploads the audio file to Gemini and returns
 * the generated transcription/description as a text content block.
 */
export const createAudioRecognitionTool = (geminiService: GeminiService) => {
  // Wrap a plain string as an MCP text result, optionally marked as error.
  const textResult = (text: string, isError?: boolean): CallToolResult =>
    isError
      ? { content: [{ type: 'text', text }], isError: true }
      : { content: [{ type: 'text', text }] };

  const handleRequest = async (args: AudioRecognitionParams): Promise<CallToolResult> => {
    try {
      log.info(`Processing audio recognition request for file: ${args.filepath}`);
      log.verbose('Audio recognition request', JSON.stringify(args));

      // The file must exist locally before attempting an upload.
      if (!fs.existsSync(args.filepath)) {
        throw new Error(`Audio file not found: ${args.filepath}`);
      }

      // Only a small allow-list of audio formats is supported.
      const extension = path.extname(args.filepath).toLowerCase();
      const supported = ['.mp3', '.wav', '.ogg'];
      if (!supported.includes(extension)) {
        throw new Error(`Unsupported audio format: ${extension}. Supported formats are: .mp3, .wav, .ogg`);
      }

      // Fallbacks in case the zod schema defaults were bypassed.
      const prompt = args.prompt || 'Describe this audio';
      const modelName = args.modelname || 'gemini-2.0-flash';

      log.info('Uploading audio file...');
      const uploaded = await geminiService.uploadFile(args.filepath);

      log.info('Generating content from audio...');
      const result = await geminiService.processFile(uploaded, prompt, modelName);

      if (result.isError) {
        log.error(`Error in audio recognition: ${result.text}`);
        return textResult(result.text, true);
      }

      log.info('Audio recognition completed successfully');
      log.verbose('Audio recognition result', JSON.stringify(result));
      return textResult(result.text);
    } catch (error) {
      log.error('Error in audio recognition tool', error);
      const errorMessage = error instanceof Error ? error.message : String(error);
      return textResult(`Error processing audio: ${errorMessage}`, true);
    }
  };

  return {
    name: 'audio_recognition',
    description: 'Analyze and transcribe audio using Google Gemini AI',
    inputSchema: AudioRecognitionParamsSchema,
    callback: handleRequest
  };
};
```
--------------------------------------------------------------------------------
/src/tools/image-recognition.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Image recognition tool for MCP server
*/
import { z } from 'zod';
import { createLogger } from '../utils/logger.js';
import { GeminiService } from '../services/gemini.js';
import { ImageRecognitionParamsSchema } from '../types/index.js';
import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
import type { ImageRecognitionParams } from '../types/index.js';
import * as fs from 'node:fs';
import * as path from 'node:path';
const log = createLogger('ImageRecognitionTool');

// Extensions accepted by the image recognition tool.
const IMAGE_EXTENSIONS = new Set(['.jpg', '.jpeg', '.png', '.webp']);

/**
 * Build the `image_recognition` MCP tool definition.
 *
 * The callback validates the local file, uploads it to Gemini and returns
 * the model's description of the image as a text content block.
 */
export const createImageRecognitionTool = (geminiService: GeminiService) => ({
  name: 'image_recognition',
  description: 'Analyze and describe images using Google Gemini AI',
  inputSchema: ImageRecognitionParamsSchema,
  callback: async (args: ImageRecognitionParams): Promise<CallToolResult> => {
    try {
      log.info(`Processing image recognition request for file: ${args.filepath}`);
      log.verbose('Image recognition request', JSON.stringify(args));

      // The file must exist locally before attempting an upload.
      if (!fs.existsSync(args.filepath)) {
        throw new Error(`Image file not found: ${args.filepath}`);
      }

      // Reject anything outside the supported image formats.
      const fileExt = path.extname(args.filepath).toLowerCase();
      if (!IMAGE_EXTENSIONS.has(fileExt)) {
        throw new Error(`Unsupported image format: ${fileExt}. Supported formats are: .jpg, .jpeg, .png, .webp`);
      }

      // Fallbacks in case the zod schema defaults were bypassed.
      const prompt = args.prompt || 'Describe this image';
      const modelName = args.modelname || 'gemini-2.0-flash';

      log.info('Uploading image file...');
      const uploaded = await geminiService.uploadFile(args.filepath);

      log.info('Generating content from image...');
      const result = await geminiService.processFile(uploaded, prompt, modelName);

      if (result.isError) {
        log.error(`Error in image recognition: ${result.text}`);
        return {
          content: [{ type: 'text', text: result.text }],
          isError: true
        };
      }

      log.info('Image recognition completed successfully');
      log.verbose('Image recognition result', JSON.stringify(result));
      return {
        content: [{ type: 'text', text: result.text }]
      };
    } catch (error) {
      log.error('Error in image recognition tool', error);
      const errorMessage = error instanceof Error ? error.message : String(error);
      return {
        content: [{ type: 'text', text: `Error processing image: ${errorMessage}` }],
        isError: true
      };
    }
  }
});
```
--------------------------------------------------------------------------------
/src/tools/video-recognition.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Video recognition tool for MCP server
*/
import { z } from 'zod';
import { createLogger } from '../utils/logger.js';
import { GeminiService } from '../services/gemini.js';
import { VideoRecognitionParamsSchema, FileState } from '../types/index.js';
import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
import type { VideoRecognitionParams } from '../types/index.js';
import * as fs from 'node:fs';
import * as path from 'node:path';
const log = createLogger('VideoRecognitionTool');

// Video formats accepted by this tool. NOTE: keep in sync with
// GeminiService.uploadFile's extension-to-MIME mapping.
const VIDEO_EXTENSIONS = ['.mp4', '.mpeg', '.mov', '.avi', '.webm'];

/**
 * Build the `video_recognition` MCP tool definition.
 *
 * The callback validates the local file, uploads it to Gemini (the upload
 * path waits for server-side video processing to finish) and returns the
 * model's description of the video as a text content block.
 */
export const createVideoRecognitionTool = (geminiService: GeminiService) => {
  return {
    name: 'video_recognition',
    description: 'Analyze and describe videos using Google Gemini AI',
    inputSchema: VideoRecognitionParamsSchema,
    callback: async (args: VideoRecognitionParams): Promise<CallToolResult> => {
      try {
        log.info(`Processing video recognition request for file: ${args.filepath}`);
        log.verbose('Video recognition request', JSON.stringify(args));

        // The file must exist locally before attempting an upload.
        if (!fs.existsSync(args.filepath)) {
          throw new Error(`Video file not found: ${args.filepath}`);
        }

        // Allow-list membership test, consistent with the image/audio tools
        // (previously a chain of !== comparisons).
        const ext = path.extname(args.filepath).toLowerCase();
        if (!VIDEO_EXTENSIONS.includes(ext)) {
          throw new Error(`Unsupported video format: ${ext}. Supported formats are: ${VIDEO_EXTENSIONS.join(', ')}`);
        }

        // Fallbacks in case the zod schema defaults were bypassed.
        const prompt = args.prompt || 'Describe this video';
        const modelName = args.modelname || 'gemini-2.0-flash';

        // Upload the file - this will handle waiting for video processing
        log.info('Uploading and processing video file...');
        const file = await geminiService.uploadFile(args.filepath);

        // Process with Gemini
        log.info('Video processing complete, generating content...');
        const result = await geminiService.processFile(file, prompt, modelName);

        if (result.isError) {
          log.error(`Error in video recognition: ${result.text}`);
          return {
            content: [{ type: 'text', text: result.text }],
            isError: true
          };
        }

        log.info('Video recognition completed successfully');
        log.verbose('Video recognition result', JSON.stringify(result));
        return {
          content: [{ type: 'text', text: result.text }]
        };
      } catch (error) {
        log.error('Error in video recognition tool', error);
        const errorMessage = error instanceof Error ? error.message : String(error);
        return {
          content: [{ type: 'text', text: `Error processing video: ${errorMessage}` }],
          isError: true
        };
      }
    }
  };
};
```
--------------------------------------------------------------------------------
/src/server.ts:
--------------------------------------------------------------------------------
```typescript
/**
* MCP server implementation
*/
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
import { randomUUID } from 'crypto';
import type { Request, Response } from 'express';
import { createLogger } from './utils/logger.js';
import { GeminiService } from './services/gemini.js';
import { createImageRecognitionTool } from './tools/image-recognition.js';
import { createAudioRecognitionTool } from './tools/audio-recognition.js';
import { createVideoRecognitionTool } from './tools/video-recognition.js';
import type { GeminiConfig } from './types/index.js';
const log = createLogger('Server');

/**
 * Configuration for the MCP server.
 */
export interface ServerConfig {
  /** Settings passed through to the Gemini service (API key). */
  gemini: GeminiConfig;
  /** Transport to expose: 'stdio' (default) or 'sse' (streamable HTTP). */
  transport: 'stdio' | 'sse';
  /** TCP port for the SSE transport; defaults to 3000 when omitted. */
  port?: number;
}

/**
 * MCP server wiring: owns the Gemini service, registers the recognition
 * tools, and starts/stops the configured transport.
 */
export class Server {
  private readonly mcpServer: McpServer;
  private readonly geminiService: GeminiService;
  private readonly config: ServerConfig;
  // HTTP listener created for the SSE transport, kept so stop() can close
  // it. Previously the return value of app.listen() was discarded, so the
  // port stayed bound after stop().
  private httpServer?: { close(callback?: (err?: Error) => void): void };

  constructor(config: ServerConfig) {
    this.config = config;
    // Initialize Gemini service
    this.geminiService = new GeminiService(config.gemini);
    // Create MCP server
    this.mcpServer = new McpServer({
      name: 'mcp-video-recognition',
      version: '1.0.0'
    });
    // Register tools
    this.registerTools();
    log.info('MCP server initialized');
  }

  /**
   * Register all recognition tools with the MCP server.
   */
  private registerTools(): void {
    // Create tools
    const imageRecognitionTool = createImageRecognitionTool(this.geminiService);
    const audioRecognitionTool = createAudioRecognitionTool(this.geminiService);
    const videoRecognitionTool = createVideoRecognitionTool(this.geminiService);
    // McpServer.tool() expects the raw zod shape, not the ZodObject itself.
    this.mcpServer.tool(
      imageRecognitionTool.name,
      imageRecognitionTool.description,
      imageRecognitionTool.inputSchema.shape,
      imageRecognitionTool.callback
    );
    this.mcpServer.tool(
      audioRecognitionTool.name,
      audioRecognitionTool.description,
      audioRecognitionTool.inputSchema.shape,
      audioRecognitionTool.callback
    );
    this.mcpServer.tool(
      videoRecognitionTool.name,
      videoRecognitionTool.description,
      videoRecognitionTool.inputSchema.shape,
      videoRecognitionTool.callback
    );
    log.info('All tools registered with MCP server');
  }

  /**
   * Start the server with the configured transport.
   * @throws when the transport is unknown or startup fails.
   */
  async start(): Promise<void> {
    try {
      if (this.config.transport === 'stdio') {
        await this.startWithStdio();
      } else if (this.config.transport === 'sse') {
        await this.startWithSSE();
      } else {
        throw new Error(`Unsupported transport: ${this.config.transport}`);
      }
    } catch (error) {
      log.error('Failed to start server', error);
      throw error;
    }
  }

  /**
   * Start the server with stdio transport.
   */
  private async startWithStdio(): Promise<void> {
    log.info('Starting server with stdio transport');
    const transport = new StdioServerTransport();
    transport.onclose = () => {
      log.info('Stdio transport closed');
    };
    transport.onerror = (error) => {
      log.error('Stdio transport error', error);
    };
    await this.mcpServer.connect(transport);
    log.info('Server started with stdio transport');
  }

  /**
   * Start the server with SSE (streamable HTTP) transport.
   *
   * Sessions are created on GET /mcp and tracked in a map keyed by the
   * transport-generated session ID; POST/DELETE requests must carry a
   * valid `mcp-session-id` header.
   */
  private async startWithSSE(): Promise<void> {
    log.info('Starting server with SSE transport');
    // Import express dynamically to avoid loading it when using stdio
    const express = await import('express');
    const app = express.default();
    const port = this.config.port || 3000;
    app.use(express.json());
    // Map to store transports by session ID
    const transports: { [sessionId: string]: StreamableHTTPServerTransport } = {};
    // Handle POST requests for client-to-server communication
    app.post('/mcp', async (req, res) => {
      try {
        // Check for existing session ID
        const sessionId = req.headers['mcp-session-id'] as string | undefined;
        let transport: StreamableHTTPServerTransport;
        if (sessionId && transports[sessionId]) {
          // Reuse existing transport
          transport = transports[sessionId];
          log.debug(`Using existing transport for session: ${sessionId}`);
        } else {
          // Sessions are only created on GET; a POST without a known
          // session is a protocol error.
          log.error('No valid session ID provided');
          res.status(400).json({
            jsonrpc: '2.0',
            error: {
              code: -32000,
              message: 'Bad Request: No valid session ID provided',
            },
            id: null,
          });
          return;
        }
        // Handle the request
        await transport.handleRequest(req, res, req.body);
      } catch (error) {
        log.error('Error handling MCP request', error);
        if (!res.headersSent) {
          res.status(500).json({
            jsonrpc: '2.0',
            error: {
              code: -32603,
              message: 'Internal server error',
            },
            id: null,
          });
        }
      }
    });
    // Reusable handler for GET and DELETE requests
    const handleSessionRequest = async (req: Request, res: Response) => {
      const sessionId = req.headers['mcp-session-id'] as string | undefined;
      if (!sessionId || !transports[sessionId]) {
        res.status(400).send('Invalid or missing session ID');
        return;
      }
      const transport = transports[sessionId];
      await transport.handleRequest(req, res);
    };
    // Handle GET requests for server-to-client notifications via SSE
    app.get('/mcp', async (req, res) => {
      try {
        // Create a new transport for this connection
        const transport = new StreamableHTTPServerTransport({
          sessionIdGenerator: () => randomUUID(),
          onsessioninitialized: (sessionId) => {
            // Store the transport by session ID
            transports[sessionId] = transport;
            log.info(`New session initialized: ${sessionId}`);
          }
        });
        // Clean up transport when closed
        transport.onclose = () => {
          if (transport.sessionId) {
            delete transports[transport.sessionId];
            log.info(`Session closed: ${transport.sessionId}`);
          }
        };
        // Connect to the MCP server
        await this.mcpServer.connect(transport);
        // Handle the initial GET request
        await transport.handleRequest(req, res);
      } catch (error) {
        log.error('Error handling SSE connection', error);
        if (!res.headersSent) {
          res.status(500).send('Internal server error');
        }
      }
    });
    // Handle DELETE requests for session termination
    app.delete('/mcp', handleSessionRequest);
    // Start the HTTP server, keeping the handle so stop() can close it.
    this.httpServer = app.listen(port, () => {
      log.info(`Server started with SSE transport on port ${port}`);
    });
  }

  /**
   * Stop the server: closes the MCP server and, when running with SSE
   * transport, releases the HTTP listener.
   */
  async stop(): Promise<void> {
    try {
      await this.mcpServer.close();
      if (this.httpServer) {
        await new Promise<void>((resolve, reject) => {
          this.httpServer!.close(err => (err ? reject(err) : resolve()));
        });
        this.httpServer = undefined;
      }
      log.info('Server stopped');
    } catch (error) {
      log.error('Error stopping server', error);
      throw error;
    }
  }
}
```
--------------------------------------------------------------------------------
/src/services/gemini.ts:
--------------------------------------------------------------------------------
```typescript
/**
* Service for interacting with Google's Gemini API
*/
import {
GoogleGenAI,
createUserContent,
createPartFromUri
} from '@google/genai';
import { createLogger } from '../utils/logger.js';
import type { GeminiConfig, GeminiFile, GeminiResponse, CachedFile, ProcessedGeminiFile } from '../types/index.js';
import { FileState } from '../types/index.js';
import * as fs from 'node:fs';
import * as path from 'node:path';
import * as crypto from 'node:crypto';
const log = createLogger('GeminiService');

// Supported file extensions mapped to the MIME type sent to the Files API.
// Video entries additionally require waiting for server-side processing
// after upload. NOTE: keep in sync with the recognition tools' own
// extension checks — the video tool accepts .mp4/.mpeg/.mov/.avi/.webm,
// and previously everything but .mp4 was rejected here.
// Video MIME names follow the Gemini API's supported-format list.
const EXTENSION_MIME_TYPES: Record<string, { mimeType: string; isVideo: boolean }> = {
  '.jpg': { mimeType: 'image/jpeg', isVideo: false },
  '.jpeg': { mimeType: 'image/jpeg', isVideo: false },
  '.png': { mimeType: 'image/png', isVideo: false },
  '.webp': { mimeType: 'image/webp', isVideo: false },
  '.mp4': { mimeType: 'video/mp4', isVideo: true },
  '.mpeg': { mimeType: 'video/mpeg', isVideo: true },
  '.mov': { mimeType: 'video/mov', isVideo: true },
  '.avi': { mimeType: 'video/avi', isVideo: true },
  '.webm': { mimeType: 'video/webm', isVideo: true },
  '.mp3': { mimeType: 'audio/mp3', isVideo: false },
  '.wav': { mimeType: 'audio/wav', isVideo: false },
  '.ogg': { mimeType: 'audio/ogg', isVideo: false }
};

/**
 * Thin wrapper around the Google GenAI SDK: uploads media files (with a
 * checksum-keyed in-memory cache), waits for video processing, and runs
 * content generation against an uploaded file.
 */
export class GeminiService {
  private readonly client: GoogleGenAI;
  // Keyed by MD5 checksum of the local file contents.
  private fileCache: Map<string, CachedFile> = new Map();
  private readonly cacheExpiration = 24 * 60 * 60 * 1000; // 24 hours in milliseconds

  constructor(config: GeminiConfig) {
    this.client = new GoogleGenAI({ apiKey: config.apiKey });
    log.info('Initialized Gemini service');
  }

  /**
   * Calculate an MD5 checksum of the file contents (streamed, so large
   * media files are not loaded into memory at once).
   */
  private async calculateChecksum(filePath: string): Promise<string> {
    return new Promise((resolve, reject) => {
      const hash = crypto.createHash('md5');
      const stream = fs.createReadStream(filePath);
      stream.on('error', err => reject(err));
      stream.on('data', chunk => hash.update(chunk));
      stream.on('end', () => resolve(hash.digest('hex')));
    });
  }

  /**
   * Check whether a cached upload exists for this checksum and has not
   * passed the 24h expiry window.
   */
  private isCacheValid(checksum: string): boolean {
    const cachedFile = this.fileCache.get(checksum);
    if (!cachedFile) return false;
    const now = Date.now();
    const isExpired = now - cachedFile.timestamp > this.cacheExpiration;
    return !isExpired;
  }

  /**
   * Get file metadata from the Gemini Files API by name.
   * @throws if the API returns incomplete data or the request fails.
   */
  async getFile(name: string): Promise<GeminiFile> {
    try {
      const file = await this.client.files.get({ name });
      log.debug(`Retrieved file details for ${name}`);
      log.verbose('File details', JSON.stringify(file));
      if (!file.uri || !file.mimeType) {
        throw new Error(`Invalid file data returned for ${name}`);
      }
      return {
        uri: file.uri,
        mimeType: file.mimeType,
        name: file.name,
        state: file.state?.toString()
      };
    } catch (error) {
      log.error(`Error retrieving file ${name}`, error);
      throw error;
    }
  }

  /**
   * Poll the Files API every 2s until a video file leaves the PROCESSING
   * state.
   * @param file The freshly uploaded file (must have a name).
   * @param maxWaitTimeMs Upper bound on total wait time (default 5 min).
   * @throws on timeout, processing failure, or missing file metadata.
   */
  async waitForVideoProcessing(file: GeminiFile, maxWaitTimeMs = 300000): Promise<ProcessedGeminiFile> {
    if (!file.name) {
      throw new Error('File name is required to check processing status');
    }
    log.info(`Waiting for video processing: ${file.name}`);
    const startTime = Date.now();
    let currentFile = file;
    while (currentFile.state === FileState.PROCESSING) {
      // Check if we've exceeded the maximum wait time
      if (Date.now() - startTime > maxWaitTimeMs) {
        throw new Error(`Timeout waiting for video processing: ${file.name}`);
      }
      // Wait 2 seconds before checking again
      await new Promise(resolve => setTimeout(resolve, 2000));
      // Get updated file status
      currentFile = await this.getFile(file.name);
      log.debug(`Video processing status: ${currentFile.state}`);
      if (currentFile.state === FileState.FAILED) {
        throw new Error(`Video processing failed: ${file.name}`);
      }
    }
    log.info(`Video processing completed: ${file.name}`);
    // Ensure all required fields are present
    if (!currentFile.name || !currentFile.state) {
      throw new Error('Missing required file information after processing');
    }
    return {
      uri: currentFile.uri,
      mimeType: currentFile.mimeType,
      name: currentFile.name,
      state: currentFile.state
    };
  }

  /**
   * Upload a file to the Gemini Files API, reusing a cached upload when the
   * same content (by checksum) was uploaded within the last 24 hours. For
   * video files this also waits for server-side processing to complete.
   * @throws for unsupported extensions or upload failures.
   */
  async uploadFile(filePath: string): Promise<GeminiFile> {
    try {
      log.debug(`Processing file upload request: ${filePath}`);
      // Calculate checksum for caching
      const checksum = await this.calculateChecksum(filePath);
      log.debug(`File checksum: ${checksum}`);
      // Check if file is in cache and still valid
      if (this.isCacheValid(checksum)) {
        const cachedFile = this.fileCache.get(checksum)!;
        log.info(`Using cached file: ${cachedFile.name}`);
        // Return cached file info
        return {
          uri: cachedFile.uri,
          mimeType: cachedFile.mimeType,
          name: cachedFile.name,
          state: cachedFile.state
        };
      }
      // Resolve MIME type from the file extension via the shared table.
      const ext = path.extname(filePath).toLowerCase();
      const typeInfo = EXTENSION_MIME_TYPES[ext];
      if (!typeInfo) {
        throw new Error(`Unsupported file extension: ${ext}`);
      }
      const { mimeType, isVideo } = typeInfo;
      // Upload file to Google's servers
      const uploadedFile = await this.client.files.upload({
        file: filePath,
        config: { mimeType }
      });
      log.info(`File uploaded successfully: ${filePath}`);
      log.verbose('Uploaded file details', JSON.stringify(uploadedFile));
      if (!uploadedFile.uri || !uploadedFile.name) {
        throw new Error('File upload failed: Missing URI or name');
      }
      // Create file object
      const file: GeminiFile = {
        uri: uploadedFile.uri,
        mimeType,
        name: uploadedFile.name,
        state: uploadedFile.state?.toString()
      };
      // For videos, wait for processing to complete
      if (isVideo && file.state === FileState.PROCESSING) {
        const processedFile = await this.waitForVideoProcessing(file);
        // Update cache with processed file
        this.fileCache.set(checksum, {
          fileId: processedFile.name,
          checksum,
          uri: processedFile.uri,
          mimeType: processedFile.mimeType,
          name: processedFile.name,
          state: processedFile.state,
          timestamp: Date.now()
        });
        return processedFile;
      }
      // Add to cache
      if (!file.name) {
        throw new Error('File name is required for caching');
      }
      this.fileCache.set(checksum, {
        fileId: file.name,
        checksum,
        uri: file.uri,
        mimeType: file.mimeType,
        name: file.name,
        state: file.state || FileState.ACTIVE,
        timestamp: Date.now()
      });
      return file;
    } catch (error) {
      log.error('Error uploading file', error);
      throw error;
    }
  }

  /**
   * Run content generation over an uploaded file.
   *
   * Unlike uploadFile, API failures here are modeled in the return value
   * (isError flag) rather than thrown, so tool callbacks can relay the
   * message to the client.
   */
  async processFile(file: GeminiFile, prompt: string, modelName: string): Promise<GeminiResponse> {
    try {
      log.debug(`Processing file with model ${modelName}`);
      log.verbose('Processing with parameters', JSON.stringify({ file, prompt, modelName }));
      const response = await this.client.models.generateContent({
        model: modelName,
        contents: createUserContent([
          createPartFromUri(file.uri, file.mimeType),
          prompt
        ])
      });
      log.debug('Received response from Gemini API');
      log.verbose('Gemini API response', JSON.stringify(response));
      const responseText = response.text || '';
      return {
        text: responseText
      };
    } catch (error) {
      log.error('Error processing file with Gemini API', error);
      return {
        text: `Error processing file: ${error instanceof Error ? error.message : String(error)}`,
        isError: true
      };
    }
  }
}
```