# Directory Structure
```
├── .env.example
├── .gitignore
├── LICENSE
├── package.json
├── README.md
├── src
│ ├── index.ts
│ ├── server.ts
│ ├── services
│ │ └── gemini.ts
│ ├── tools
│ │ ├── audio-recognition.ts
│ │ ├── image-recognition.ts
│ │ └── video-recognition.ts
│ ├── types
│ │ └── index.ts
│ └── utils
│ └── logger.ts
└── tsconfig.json
```
# Files
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
```
1 | # Google Gemini API key (required)
2 | GOOGLE_API_KEY=your_api_key_here
3 |
4 | # Transport type: 'stdio' or 'sse' (defaults to 'stdio')
5 | TRANSPORT_TYPE=stdio
6 |
7 | # Port for SSE transport (defaults to 3000)
8 | PORT=3000
9 |
10 | # Log level: 'verbose', 'debug', 'info', 'warn', 'error', 'fatal' (defaults to 'fatal')
11 | LOG_LEVEL=fatal
12 |
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
1 | # Dependencies
2 | node_modules/
3 | package-lock.json
4 | yarn.lock
5 | pnpm-lock.yaml
6 |
7 | # Build output
8 | dist/
9 | build/
10 | *.tsbuildinfo
11 |
12 | # Environment variables
13 | .env
14 | .env.local
15 | .env.*.local
16 |
17 | # Logs
18 | logs/
19 | *.log
20 | npm-debug.log*
21 | yarn-debug.log*
22 | yarn-error.log*
23 |
24 | # Editor directories and files
25 | .idea/
26 | .vscode/
27 | *.suo
28 | *.ntvs*
29 | *.njsproj
30 | *.sln
31 | *.sw?
32 |
33 | # OS files
34 | .DS_Store
35 | Thumbs.db
36 |
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
1 | # MCP Video Recognition Server
2 |
3 | An MCP (Model Context Protocol) server that provides tools for image, audio, and video recognition using Google's Gemini AI.
4 |
5 | <a href="https://glama.ai/mcp/servers/@mario-andreschak/mcp_video_recognition">
6 | <img width="380" height="200" src="https://glama.ai/mcp/servers/@mario-andreschak/mcp_video_recognition/badge" alt="Video Recognition Server MCP server" />
7 | </a>
8 |
9 | ## Features
10 |
11 | - **Image Recognition**: Analyze and describe images using Google Gemini AI
12 | - **Audio Recognition**: Analyze and transcribe audio using Google Gemini AI
13 | - **Video Recognition**: Analyze and describe videos using Google Gemini AI
14 |
15 | ## Prerequisites
16 |
17 | - Node.js 18 or higher
18 | - Google Gemini API key
19 |
20 | ## Installation
21 |
22 | ### Manual Installation
23 |
24 | 1. Clone the repository:
25 | ```bash
26 | git clone https://github.com/mario-andreschak/mcp_video_recognition.git
27 | cd mcp_video_recognition
28 | ```
29 |
30 | 2. Install dependencies:
31 | ```bash
32 | npm install
33 | ```
34 |
35 | 3. Build the project:
36 | ```bash
37 | npm run build
38 | ```
39 |
40 | ### Installing in [FLUJO](https://github.com/mario-andreschak/FLUJO/)
41 |
42 | 1. Click Add Server
43 | 2. Copy & Paste Github URL into FLUJO
44 | 3. Click Parse, Clone, Install, Build and Save.
45 |
46 | ### Installing via Configuration Files
47 |
48 | To integrate this MCP server with Cline or other MCP clients via configuration files:
49 |
50 | 1. Open your Cline settings:
51 | - In VS Code, go to File -> Preferences -> Settings
52 | - Search for "Cline MCP Settings"
53 | - Click "Edit in settings.json"
54 |
55 | 2. Add the server configuration to the `mcpServers` object:
56 | ```json
57 | {
58 | "mcpServers": {
59 | "video-recognition": {
60 | "command": "node",
61 | "args": [
62 | "/path/to/mcp-video-recognition/dist/index.js"
63 | ],
64 | "disabled": false,
65 | "autoApprove": []
66 | }
67 | }
68 | }
69 | ```
70 |
71 | 3. Replace `/path/to/mcp-video-recognition/dist/index.js` with the actual path to the `index.js` file in your project directory. Use forward slashes (/) or double backslashes (\\\\) for the path on Windows.
72 |
73 | 4. Save the settings file. Cline should automatically connect to the server.
74 |
75 | ## Configuration
76 |
77 | The server is configured using environment variables:
78 |
79 | - `GOOGLE_API_KEY` (required): Your Google Gemini API key
80 | - `TRANSPORT_TYPE`: Transport type to use (`stdio` or `sse`, defaults to `stdio`)
81 | - `PORT`: Port number for SSE transport (defaults to 3000)
82 | - `LOG_LEVEL`: Logging level (`verbose`, `debug`, `info`, `warn`, `error`, `fatal`; defaults to `fatal`)
83 |
84 | ## Usage
85 |
86 | ### Starting the Server
87 |
88 | #### With stdio Transport (Default)
89 |
90 | ```bash
91 | GOOGLE_API_KEY=your_api_key npm start
92 | ```
93 |
94 | #### With SSE Transport
95 |
96 | ```bash
97 | GOOGLE_API_KEY=your_api_key TRANSPORT_TYPE=sse PORT=3000 npm start
98 | ```
99 |
100 | ### Using the Tools
101 |
102 | The server provides three tools that can be called by MCP clients:
103 |
104 | #### Image Recognition
105 |
106 | ```json
107 | {
108 | "name": "image_recognition",
109 | "arguments": {
110 | "filepath": "/path/to/image.jpg",
111 | "prompt": "Describe this image in detail",
112 | "modelname": "gemini-2.0-flash"
113 | }
114 | }
115 | ```
116 |
117 | #### Audio Recognition
118 |
119 | ```json
120 | {
121 | "name": "audio_recognition",
122 | "arguments": {
123 | "filepath": "/path/to/audio.mp3",
124 | "prompt": "Transcribe this audio",
125 | "modelname": "gemini-2.0-flash"
126 | }
127 | }
128 | ```
129 |
130 | #### Video Recognition
131 |
132 | ```json
133 | {
134 | "name": "video_recognition",
135 | "arguments": {
136 | "filepath": "/path/to/video.mp4",
137 | "prompt": "Describe what happens in this video",
138 | "modelname": "gemini-2.0-flash"
139 | }
140 | }
141 | ```
142 |
143 | ### Tool Parameters
144 |
145 | All tools accept the following parameters:
146 |
147 | - `filepath` (required): Path to the media file to analyze
148 | - `prompt` (optional): Custom prompt for the recognition (defaults to "Describe this content")
149 | - `modelname` (optional): Gemini model to use for recognition (defaults to "gemini-2.0-flash")
150 |
151 | ## Development
152 |
153 | ### Running in Development Mode
154 |
155 | ```bash
156 | GOOGLE_API_KEY=your_api_key npm run dev
157 | ```
158 |
159 | ### Project Structure
160 |
161 | - `src/index.ts`: Entry point
162 | - `src/server.ts`: MCP server implementation
163 | - `src/tools/`: Tool implementations
164 | - `src/services/`: Service implementations (Gemini API)
165 | - `src/types/`: Type definitions
166 | - `src/utils/`: Utility functions
167 |
168 | ## License
169 |
170 | MIT
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "compilerOptions": {
3 | "target": "ES2022",
4 | "module": "NodeNext",
5 | "moduleResolution": "NodeNext",
6 | "esModuleInterop": true,
7 | "strict": true,
8 | "outDir": "dist",
9 | "sourceMap": true,
10 | "declaration": true,
11 | "skipLibCheck": true,
12 | "forceConsistentCasingInFileNames": true,
13 | "resolveJsonModule": true
14 | },
15 | "include": ["src/**/*"],
16 | "exclude": ["node_modules", "dist"]
17 | }
18 |
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
1 | {
2 | "name": "mcp-video-recognition",
3 | "version": "1.0.0",
4 | "description": "MCP server for Google Gemini image, audio, and video recognition",
5 | "main": "dist/index.js",
6 | "type": "module",
7 | "scripts": {
8 | "build": "tsc",
9 | "start": "node dist/index.js",
10 | "dev": "tsc -w & node --watch dist/index.js",
11 | "debug": "tsc & npx @modelcontextprotocol/inspector node dist/index.js",
12 | "lint": "eslint src --ext .ts",
13 | "test": "echo \"Error: no test specified\" && exit 1"
14 | },
15 | "keywords": [
16 | "mcp",
17 | "gemini",
18 | "video",
19 | "audio",
20 | "image",
21 | "recognition"
22 | ],
23 | "author": "",
24 | "license": "MIT",
25 | "dependencies": {
26 | "@google/genai": "^0.9.0",
27 | "@modelcontextprotocol/sdk": "^1.10.1",
28 | "express": "^5.1.0",
29 | "zod": "^3.24.3"
30 | },
31 | "devDependencies": {
32 | "@types/express": "^5.0.1",
33 | "@types/node": "^22.14.1",
34 | "typescript": "^5.8.3"
35 | }
36 | }
37 |
```
--------------------------------------------------------------------------------
/src/utils/logger.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Logger utility for the MCP server
3 | */
4 |
5 | export enum LogLevel {
6 | VERBOSE = 'verbose',
7 | DEBUG = 'debug',
8 | INFO = 'info',
9 | WARN = 'warn',
10 | ERROR = 'error',
11 | FATAL = 'fatal'
12 | }
13 |
14 | export class Logger {
15 | private readonly name: string;
16 | private static level: LogLevel = LogLevel.FATAL;
17 |
18 | constructor(name: string) {
19 | this.name = name;
20 | }
21 |
22 | static setLogLevel(level: LogLevel): void {
23 | Logger.level = level;
24 | }
25 |
26 | private shouldLog(level: LogLevel): boolean {
27 | const levels = Object.values(LogLevel);
28 | return levels.indexOf(level) >= levels.indexOf(Logger.level);
29 | }
30 |
31 | private formatMessage(level: LogLevel, message: string): string {
32 | const timestamp = new Date().toISOString();
33 | return `[${timestamp}] [${level.toUpperCase()}] [${this.name}] ${message}`;
34 | }
35 |
36 | verbose(message: string, data?: unknown): void {
37 | if (this.shouldLog(LogLevel.VERBOSE)) {
38 | const formattedData = data ? JSON.stringify(data) : '';
39 | console.log(this.formatMessage(LogLevel.VERBOSE, message), formattedData);
40 | }
41 | }
42 |
43 | debug(message: string, data?: unknown): void {
44 | if (this.shouldLog(LogLevel.DEBUG)) {
45 | console.log(this.formatMessage(LogLevel.DEBUG, message), data || '');
46 | }
47 | }
48 |
49 | info(message: string, data?: unknown): void {
50 | if (this.shouldLog(LogLevel.INFO)) {
51 | console.log(this.formatMessage(LogLevel.INFO, message), data || '');
52 | }
53 | }
54 |
55 | warn(message: string, data?: unknown): void {
56 | if (this.shouldLog(LogLevel.WARN)) {
57 | console.warn(this.formatMessage(LogLevel.WARN, message), data || '');
58 | }
59 | }
60 |
61 | error(message: string, error?: unknown): void {
62 | if (this.shouldLog(LogLevel.ERROR)) {
63 | console.error(this.formatMessage(LogLevel.ERROR, message), error || '');
64 | }
65 | }
66 |
67 | fatal(message: string, error?: unknown): void {
68 | if (this.shouldLog(LogLevel.FATAL)) {
69 | console.error(this.formatMessage(LogLevel.FATAL, message), error || '');
70 | }
71 | }
72 | }
73 |
74 | export const createLogger = (name: string): Logger => new Logger(name);
75 |
```
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Entry point for the MCP video recognition server
3 | */
4 |
5 | import { Server } from './server.js';
6 | import { createLogger, LogLevel, Logger } from './utils/logger.js';
7 | import type { ServerConfig } from './server.js';
8 |
9 | const log = createLogger('Main');
10 |
11 | // Set log level from environment variable
12 | const logLevel = ( process.env.LOG_LEVEL || LogLevel.FATAL ) as LogLevel;
13 | Logger.setLogLevel(logLevel as LogLevel);
14 |
15 | /**
16 | * Load configuration from environment variables
17 | */
18 | function loadConfig(): ServerConfig {
19 | // Check for required environment variables
20 | const apiKey = process.env.GOOGLE_API_KEY;
21 | if (!apiKey) {
22 | throw new Error('GOOGLE_API_KEY environment variable is required');
23 | }
24 |
25 | // Determine transport type
26 | const transportType = process.env.TRANSPORT_TYPE === 'sse' ? 'sse' : 'stdio';
27 |
28 | // Parse port if provided
29 | const portStr = process.env.PORT;
30 | const port = portStr ? parseInt(portStr, 10) : undefined;
31 |
32 | return {
33 | gemini: {
34 | apiKey
35 | },
36 | transport: transportType,
37 | port
38 | };
39 | }
40 |
41 | /**
42 | * Main function to start the server
43 | */
44 | async function main(): Promise<void> {
45 | try {
46 | log.info('Starting MCP video recognition server');
47 |
48 | // Load configuration
49 | const config = loadConfig();
50 | log.info(`Using transport: ${config.transport}`);
51 |
52 | // Create and start server
53 | const server = new Server(config);
54 | await server.start();
55 |
56 | // Handle process termination
57 | process.on('SIGINT', async () => {
58 | log.info('Received SIGINT signal, shutting down...');
59 | await server.stop();
60 | process.exit(0);
61 | });
62 |
63 | process.on('SIGTERM', async () => {
64 | log.info('Received SIGTERM signal, shutting down...');
65 | await server.stop();
66 | process.exit(0);
67 | });
68 |
69 | log.info('Server started successfully');
70 | } catch (error) {
71 | log.error('Failed to start server', error);
72 | process.exit(1);
73 | }
74 | }
75 |
// Start the server. main() handles its own startup errors, so anything
// landing here is truly unexpected (e.g. a failure thrown from inside the
// catch path itself).
main().catch(error => {
  console.error('Unhandled error:', error);
  process.exit(1);
});
81 |
```
--------------------------------------------------------------------------------
/src/types/index.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Type definitions for the MCP server
3 | */
4 |
5 | import { z } from 'zod';
6 | import type { Tool, CallToolResult } from '@modelcontextprotocol/sdk/types.js';
7 |
8 | /**
9 | * Common parameters for all recognition tools
10 | */
/**
 * Common parameters accepted by every recognition tool. `prompt` and
 * `modelname` carry zod defaults, so validated input always has all
 * three fields populated.
 */
export const RecognitionParamsSchema = z.object({
  filepath: z.string().describe('Path to the media file to analyze'),
  prompt: z.string().default('Describe this content').describe('Custom prompt for the recognition'),
  modelname: z.string().default('gemini-2.0-flash').describe('Gemini model to use for recognition')
});

// Input type inferred from the schema (defaults applied).
export type RecognitionParams = z.infer<typeof RecognitionParamsSchema>;

/**
 * Video recognition specific types. Currently identical to the base
 * schema; the empty extend() is kept so video-only fields can be added
 * later without touching the other tools.
 */
export const VideoRecognitionParamsSchema = RecognitionParamsSchema.extend({});
export type VideoRecognitionParams = z.infer<typeof VideoRecognitionParamsSchema>;

/**
 * Image recognition specific types (same shape as the base schema for now).
 */
export const ImageRecognitionParamsSchema = RecognitionParamsSchema.extend({});
export type ImageRecognitionParams = z.infer<typeof ImageRecognitionParamsSchema>;

/**
 * Audio recognition specific types (same shape as the base schema for now).
 */
export const AudioRecognitionParamsSchema = RecognitionParamsSchema.extend({});
export type AudioRecognitionParams = z.infer<typeof AudioRecognitionParamsSchema>;

/**
 * Shape of a tool as registered with the MCP server.
 * NOTE(review): `any` here disables type checking for both the schema and
 * the callback arguments; a generic tied to the schema's inferred type
 * would be safer — confirm before tightening, since all three tool
 * factories share this shape.
 */
export interface ToolDefinition {
  name: string;
  description: string;
  inputSchema: z.ZodObject<any>;
  callback: (args: any) => Promise<CallToolResult>;
}

/**
 * Gemini API types.
 */
// Configuration for the Gemini service: the API key only.
export interface GeminiConfig {
  apiKey: string;
}

// A file as returned by the Gemini Files API; `name`/`state` may be
// absent until upload processing has completed.
export interface GeminiFile {
  uri: string;
  mimeType: string;
  name?: string;
  state?: string;
}

// A Gemini file once all metadata is known (no optional fields).
export interface ProcessedGeminiFile {
  uri: string;
  mimeType: string;
  name: string;
  state: string;
}

// Cache entry for a previously uploaded file. `timestamp` is epoch
// milliseconds; presumably entries are keyed by content checksum so
// identical files are not re-uploaded — verify against GeminiService.
export interface CachedFile {
  fileId: string;
  checksum: string;
  uri: string;
  mimeType: string;
  name: string;
  state: string;
  timestamp: number;
}

// File states from Gemini API
export enum FileState {
  UNSPECIFIED = 'STATE_UNSPECIFIED',
  PROCESSING = 'PROCESSING',
  ACTIVE = 'ACTIVE',
  FAILED = 'FAILED'
}

// Result of a Gemini generation call; `isError` marks failure text.
export interface GeminiResponse {
  text: string;
  isError?: boolean;
}
```
--------------------------------------------------------------------------------
/src/tools/audio-recognition.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Audio recognition tool for MCP server
3 | */
4 |
5 | import { z } from 'zod';
6 | import { createLogger } from '../utils/logger.js';
7 | import { GeminiService } from '../services/gemini.js';
8 | import { AudioRecognitionParamsSchema } from '../types/index.js';
9 | import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
10 | import type { AudioRecognitionParams } from '../types/index.js';
11 | import * as fs from 'node:fs';
12 | import * as path from 'node:path';
13 |
14 | const log = createLogger('AudioRecognitionTool');
15 |
16 | export const createAudioRecognitionTool = (geminiService: GeminiService) => {
17 | return {
18 | name: 'audio_recognition',
19 | description: 'Analyze and transcribe audio using Google Gemini AI',
20 | inputSchema: AudioRecognitionParamsSchema,
21 | callback: async (args: AudioRecognitionParams): Promise<CallToolResult> => {
22 | try {
23 | log.info(`Processing audio recognition request for file: ${args.filepath}`);
24 | log.verbose('Audio recognition request', JSON.stringify(args));
25 |
26 | // Verify file exists
27 | if (!fs.existsSync(args.filepath)) {
28 | throw new Error(`Audio file not found: ${args.filepath}`);
29 | }
30 |
31 | // Verify file is an audio
32 | const ext = path.extname(args.filepath).toLowerCase();
33 | if (!['.mp3', '.wav', '.ogg'].includes(ext)) {
34 | throw new Error(`Unsupported audio format: ${ext}. Supported formats are: .mp3, .wav, .ogg`);
35 | }
36 |
37 | // Default prompt if not provided
38 | const prompt = args.prompt || 'Describe this audio';
39 | const modelName = args.modelname || 'gemini-2.0-flash';
40 |
41 | // Upload the file
42 | log.info('Uploading audio file...');
43 | const file = await geminiService.uploadFile(args.filepath);
44 |
45 | // Process with Gemini
46 | log.info('Generating content from audio...');
47 | const result = await geminiService.processFile(file, prompt, modelName);
48 |
49 | if (result.isError) {
50 | log.error(`Error in audio recognition: ${result.text}`);
51 | return {
52 | content: [
53 | {
54 | type: 'text',
55 | text: result.text
56 | }
57 | ],
58 | isError: true
59 | };
60 | }
61 |
62 | log.info('Audio recognition completed successfully');
63 | log.verbose('Audio recognition result', JSON.stringify(result));
64 |
65 | return {
66 | content: [
67 | {
68 | type: 'text',
69 | text: result.text
70 | }
71 | ]
72 | };
73 | } catch (error) {
74 | log.error('Error in audio recognition tool', error);
75 | const errorMessage = error instanceof Error ? error.message : String(error);
76 |
77 | return {
78 | content: [
79 | {
80 | type: 'text',
81 | text: `Error processing audio: ${errorMessage}`
82 | }
83 | ],
84 | isError: true
85 | };
86 | }
87 | }
88 | };
89 | };
90 |
```
--------------------------------------------------------------------------------
/src/tools/image-recognition.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Image recognition tool for MCP server
3 | */
4 |
5 | import { z } from 'zod';
6 | import { createLogger } from '../utils/logger.js';
7 | import { GeminiService } from '../services/gemini.js';
8 | import { ImageRecognitionParamsSchema } from '../types/index.js';
9 | import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
10 | import type { ImageRecognitionParams } from '../types/index.js';
11 | import * as fs from 'node:fs';
12 | import * as path from 'node:path';
13 |
14 | const log = createLogger('ImageRecognitionTool');
15 |
16 | export const createImageRecognitionTool = (geminiService: GeminiService) => {
17 | return {
18 | name: 'image_recognition',
19 | description: 'Analyze and describe images using Google Gemini AI',
20 | inputSchema: ImageRecognitionParamsSchema,
21 | callback: async (args: ImageRecognitionParams): Promise<CallToolResult> => {
22 | try {
23 | log.info(`Processing image recognition request for file: ${args.filepath}`);
24 | log.verbose('Image recognition request', JSON.stringify(args));
25 |
26 | // Verify file exists
27 | if (!fs.existsSync(args.filepath)) {
28 | throw new Error(`Image file not found: ${args.filepath}`);
29 | }
30 |
31 | // Verify file is an image
32 | const ext = path.extname(args.filepath).toLowerCase();
33 | if (!['.jpg', '.jpeg', '.png', '.webp'].includes(ext)) {
34 | throw new Error(`Unsupported image format: ${ext}. Supported formats are: .jpg, .jpeg, .png, .webp`);
35 | }
36 |
37 | // Default prompt if not provided
38 | const prompt = args.prompt || 'Describe this image';
39 | const modelName = args.modelname || 'gemini-2.0-flash';
40 |
41 | // Upload the file
42 | log.info('Uploading image file...');
43 | const file = await geminiService.uploadFile(args.filepath);
44 |
45 | // Process with Gemini
46 | log.info('Generating content from image...');
47 | const result = await geminiService.processFile(file, prompt, modelName);
48 |
49 | if (result.isError) {
50 | log.error(`Error in image recognition: ${result.text}`);
51 | return {
52 | content: [
53 | {
54 | type: 'text',
55 | text: result.text
56 | }
57 | ],
58 | isError: true
59 | };
60 | }
61 |
62 | log.info('Image recognition completed successfully');
63 | log.verbose('Image recognition result', JSON.stringify(result));
64 |
65 | return {
66 | content: [
67 | {
68 | type: 'text',
69 | text: result.text
70 | }
71 | ]
72 | };
73 | } catch (error) {
74 | log.error('Error in image recognition tool', error);
75 | const errorMessage = error instanceof Error ? error.message : String(error);
76 |
77 | return {
78 | content: [
79 | {
80 | type: 'text',
81 | text: `Error processing image: ${errorMessage}`
82 | }
83 | ],
84 | isError: true
85 | };
86 | }
87 | }
88 | };
89 | };
90 |
```
--------------------------------------------------------------------------------
/src/tools/video-recognition.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Video recognition tool for MCP server
3 | */
4 |
5 | import { z } from 'zod';
6 | import { createLogger } from '../utils/logger.js';
7 | import { GeminiService } from '../services/gemini.js';
8 | import { VideoRecognitionParamsSchema, FileState } from '../types/index.js';
9 | import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
10 | import type { VideoRecognitionParams } from '../types/index.js';
11 | import * as fs from 'node:fs';
12 | import * as path from 'node:path';
13 |
14 | const log = createLogger('VideoRecognitionTool');
15 |
16 | export const createVideoRecognitionTool = (geminiService: GeminiService) => {
17 | return {
18 | name: 'video_recognition',
19 | description: 'Analyze and describe videos using Google Gemini AI',
20 | inputSchema: VideoRecognitionParamsSchema,
21 | callback: async (args: VideoRecognitionParams): Promise<CallToolResult> => {
22 | try {
23 | log.info(`Processing video recognition request for file: ${args.filepath}`);
24 | log.verbose('Video recognition request', JSON.stringify(args));
25 |
26 | // Verify file exists
27 | if (!fs.existsSync(args.filepath)) {
28 | throw new Error(`Video file not found: ${args.filepath}`);
29 | }
30 |
31 | // Verify file is a video
32 | const ext = path.extname(args.filepath).toLowerCase();
33 | if (ext !== '.mp4' && ext !== '.mpeg' && ext !== '.mov' && ext !== '.avi' && ext !== '.webm') {
34 | throw new Error(`Unsupported video format: ${ext}. Supported formats are: .mp4, .mpeg, .mov, .avi, .webm`);
35 | }
36 |
37 | // Default prompt if not provided
38 | const prompt = args.prompt || 'Describe this video';
39 | const modelName = args.modelname || 'gemini-2.0-flash';
40 |
41 | // Upload the file - this will handle waiting for video processing
42 | log.info('Uploading and processing video file...');
43 | const file = await geminiService.uploadFile(args.filepath);
44 |
45 | // Process with Gemini
46 | log.info('Video processing complete, generating content...');
47 | const result = await geminiService.processFile(file, prompt, modelName);
48 |
49 | if (result.isError) {
50 | log.error(`Error in video recognition: ${result.text}`);
51 | return {
52 | content: [
53 | {
54 | type: 'text',
55 | text: result.text
56 | }
57 | ],
58 | isError: true
59 | };
60 | }
61 |
62 | log.info('Video recognition completed successfully');
63 | log.verbose('Video recognition result', JSON.stringify(result));
64 |
65 | return {
66 | content: [
67 | {
68 | type: 'text',
69 | text: result.text
70 | }
71 | ]
72 | };
73 | } catch (error) {
74 | log.error('Error in video recognition tool', error);
75 | const errorMessage = error instanceof Error ? error.message : String(error);
76 |
77 | return {
78 | content: [
79 | {
80 | type: 'text',
81 | text: `Error processing video: ${errorMessage}`
82 | }
83 | ],
84 | isError: true
85 | };
86 | }
87 | }
88 | };
89 | };
90 |
```
--------------------------------------------------------------------------------
/src/server.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * MCP server implementation
3 | */
4 |
5 | import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
6 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
7 | import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
8 | import { randomUUID } from 'crypto';
9 | import type { Request, Response } from 'express';
10 | import { createLogger } from './utils/logger.js';
11 | import { GeminiService } from './services/gemini.js';
12 | import { createImageRecognitionTool } from './tools/image-recognition.js';
13 | import { createAudioRecognitionTool } from './tools/audio-recognition.js';
14 | import { createVideoRecognitionTool } from './tools/video-recognition.js';
15 | import type { GeminiConfig } from './types/index.js';
16 |
17 | const log = createLogger('Server');
18 |
/**
 * Configuration for the Server.
 * NOTE(review): the 'sse' transport is actually served via the SDK's
 * StreamableHTTPServerTransport (see startWithSSE) — the name is legacy;
 * confirm before renaming, as the env var TRANSPORT_TYPE uses 'sse' too.
 */
export interface ServerConfig {
  gemini: GeminiConfig;
  transport: 'stdio' | 'sse';
  // Port for the HTTP transport; defaults to 3000 in startWithSSE.
  port?: number;
}

/**
 * MCP server wrapper: owns the McpServer instance, the GeminiService,
 * and the chosen transport (stdio or HTTP).
 */
export class Server {
  private readonly mcpServer: McpServer;
  private readonly geminiService: GeminiService;
  private readonly config: ServerConfig;

  /**
   * Build the server: create the Gemini service, the McpServer, and
   * register the three recognition tools. Does not start any transport —
   * call start() for that.
   */
  constructor(config: ServerConfig) {
    this.config = config;

    // Initialize Gemini service
    this.geminiService = new GeminiService(config.gemini);

    // Create MCP server
    this.mcpServer = new McpServer({
      name: 'mcp-video-recognition',
      version: '1.0.0'
    });

    // Register tools
    this.registerTools();

    log.info('MCP server initialized');
  }

  /**
   * Register all tools with the MCP server.
   * Each factory receives the shared GeminiService; registration passes
   * the zod schema's `.shape` as the SDK expects.
   */
  private registerTools(): void {
    // Create tools
    const imageRecognitionTool = createImageRecognitionTool(this.geminiService);
    const audioRecognitionTool = createAudioRecognitionTool(this.geminiService);
    const videoRecognitionTool = createVideoRecognitionTool(this.geminiService);

    // Register tools with MCP server
    this.mcpServer.tool(
      imageRecognitionTool.name,
      imageRecognitionTool.description,
      imageRecognitionTool.inputSchema.shape,
      imageRecognitionTool.callback
    );

    this.mcpServer.tool(
      audioRecognitionTool.name,
      audioRecognitionTool.description,
      audioRecognitionTool.inputSchema.shape,
      audioRecognitionTool.callback
    );

    this.mcpServer.tool(
      videoRecognitionTool.name,
      videoRecognitionTool.description,
      videoRecognitionTool.inputSchema.shape,
      videoRecognitionTool.callback
    );

    log.info('All tools registered with MCP server');
  }

  /**
   * Start the server with the configured transport.
   * @throws Error (re-thrown) when the transport fails to start or the
   *         configured transport name is unsupported.
   */
  async start(): Promise<void> {
    try {
      if (this.config.transport === 'stdio') {
        await this.startWithStdio();
      } else if (this.config.transport === 'sse') {
        await this.startWithSSE();
      } else {
        throw new Error(`Unsupported transport: ${this.config.transport}`);
      }
    } catch (error) {
      log.error('Failed to start server', error);
      throw error;
    }
  }

  /**
   * Start the server with stdio transport (single implicit session over
   * the process's stdin/stdout).
   */
  private async startWithStdio(): Promise<void> {
    log.info('Starting server with stdio transport');

    const transport = new StdioServerTransport();

    transport.onclose = () => {
      log.info('Stdio transport closed');
    };

    transport.onerror = (error) => {
      log.error('Stdio transport error', error);
    };

    await this.mcpServer.connect(transport);
    log.info('Server started with stdio transport');
  }

  /**
   * Start the server with SSE transport (implemented with the SDK's
   * Streamable HTTP transport on a single /mcp endpoint).
   *
   * Session model as written: GET /mcp creates a new transport + session;
   * POST /mcp only serves requests that already carry a valid
   * mcp-session-id header; DELETE /mcp terminates a session.
   *
   * NOTE(review): a Streamable HTTP client that opens with a POST
   * `initialize` (no session header yet) is rejected with -32000 here,
   * because sessions are only minted in the GET handler — confirm this
   * GET-first flow is the intended handshake.
   */
  private async startWithSSE(): Promise<void> {
    log.info('Starting server with SSE transport');

    // Import express dynamically to avoid loading it when using stdio
    const express = await import('express');
    const app = express.default();
    const port = this.config.port || 3000;

    app.use(express.json());

    // Map to store transports by session ID
    const transports: { [sessionId: string]: StreamableHTTPServerTransport } = {};

    // Handle POST requests for client-to-server communication
    app.post('/mcp', async (req, res) => {
      try {
        // Check for existing session ID
        const sessionId = req.headers['mcp-session-id'] as string | undefined;
        let transport: StreamableHTTPServerTransport;

        if (sessionId && transports[sessionId]) {
          // Reuse existing transport
          transport = transports[sessionId];
          log.debug(`Using existing transport for session: ${sessionId}`);
        } else {
          // No session: reject with a JSON-RPC error (see NOTE above).
          log.error('No valid session ID provided');
          res.status(400).json({
            jsonrpc: '2.0',
            error: {
              code: -32000,
              message: 'Bad Request: No valid session ID provided',
            },
            id: null,
          });
          return;
        }

        // Handle the request
        await transport.handleRequest(req, res, req.body);
      } catch (error) {
        log.error('Error handling MCP request', error);
        if (!res.headersSent) {
          res.status(500).json({
            jsonrpc: '2.0',
            error: {
              code: -32603,
              message: 'Internal server error',
            },
            id: null,
          });
        }
      }
    });

    // Reusable handler for GET and DELETE requests
    const handleSessionRequest = async (req: Request, res: Response) => {
      const sessionId = req.headers['mcp-session-id'] as string | undefined;
      if (!sessionId || !transports[sessionId]) {
        res.status(400).send('Invalid or missing session ID');
        return;
      }

      const transport = transports[sessionId];
      await transport.handleRequest(req, res);
    };

    // Handle GET requests for server-to-client notifications via SSE
    app.get('/mcp', async (req, res) => {
      try {
        // Create a new transport for this connection
        const transport = new StreamableHTTPServerTransport({
          sessionIdGenerator: () => randomUUID(),
          onsessioninitialized: (sessionId) => {
            // Store the transport by session ID
            transports[sessionId] = transport;
            log.info(`New session initialized: ${sessionId}`);
          }
        });

        // Clean up transport when closed
        transport.onclose = () => {
          if (transport.sessionId) {
            delete transports[transport.sessionId];
            log.info(`Session closed: ${transport.sessionId}`);
          }
        };

        // Connect to the MCP server
        await this.mcpServer.connect(transport);

        // Handle the initial GET request
        await transport.handleRequest(req, res);
      } catch (error) {
        log.error('Error handling SSE connection', error);
        if (!res.headersSent) {
          res.status(500).send('Internal server error');
        }
      }
    });

    // Handle DELETE requests for session termination
    app.delete('/mcp', handleSessionRequest);

    // Start the HTTP server.
    // NOTE(review): the http.Server returned by listen() is not retained,
    // so stop() below cannot close the listener — confirm whether HTTP
    // shutdown relies solely on process exit.
    app.listen(port, () => {
      log.info(`Server started with SSE transport on port ${port}`);
    });
  }

  /**
   * Stop the server by closing the McpServer (and its connected
   * transports). See NOTE in startWithSSE about the HTTP listener.
   * @throws Error (re-thrown) when close() fails.
   */
  async stop(): Promise<void> {
    try {
      await this.mcpServer.close();
      log.info('Server stopped');
    } catch (error) {
      log.error('Error stopping server', error);
      throw error;
    }
  }
}
245 |
```
--------------------------------------------------------------------------------
/src/services/gemini.ts:
--------------------------------------------------------------------------------
```typescript
1 | /**
2 | * Service for interacting with Google's Gemini API
3 | */
4 |
5 | import {
6 | GoogleGenAI,
7 | createUserContent,
8 | createPartFromUri
9 | } from '@google/genai';
10 | import { createLogger } from '../utils/logger.js';
11 | import type { GeminiConfig, GeminiFile, GeminiResponse, CachedFile, ProcessedGeminiFile } from '../types/index.js';
12 | import { FileState } from '../types/index.js';
13 | import * as fs from 'node:fs';
14 | import * as path from 'node:path';
15 | import * as crypto from 'node:crypto';
16 |
17 | const log = createLogger('GeminiService');
18 |
19 | export class GeminiService {
20 | private readonly client: GoogleGenAI;
21 | private fileCache: Map<string, CachedFile> = new Map();
22 | private readonly cacheExpiration = 24 * 60 * 60 * 1000; // 24 hours in milliseconds
23 |
24 | constructor(config: GeminiConfig) {
25 | this.client = new GoogleGenAI({ apiKey: config.apiKey });
26 | log.info('Initialized Gemini service');
27 | }
28 |
29 | /**
30 | * Calculate checksum for a file
31 | */
32 | private async calculateChecksum(filePath: string): Promise<string> {
33 | return new Promise((resolve, reject) => {
34 | const hash = crypto.createHash('md5');
35 | const stream = fs.createReadStream(filePath);
36 |
37 | stream.on('error', err => reject(err));
38 | stream.on('data', chunk => hash.update(chunk));
39 | stream.on('end', () => resolve(hash.digest('hex')));
40 | });
41 | }
42 |
43 | /**
44 | * Check if a file exists in cache and is still valid
45 | */
46 | private isCacheValid(checksum: string): boolean {
47 | const cachedFile = this.fileCache.get(checksum);
48 | if (!cachedFile) return false;
49 |
50 | const now = Date.now();
51 | const isExpired = now - cachedFile.timestamp > this.cacheExpiration;
52 |
53 | return !isExpired;
54 | }
55 |
56 | /**
57 | * Get file from Gemini API by name
58 | */
59 | async getFile(name: string): Promise<GeminiFile> {
60 | try {
61 | const file = await this.client.files.get({ name });
62 | log.debug(`Retrieved file details for ${name}`);
63 | log.verbose('File details', JSON.stringify(file));
64 |
65 | if (!file.uri || !file.mimeType) {
66 | throw new Error(`Invalid file data returned for ${name}`);
67 | }
68 |
69 | return {
70 | uri: file.uri,
71 | mimeType: file.mimeType,
72 | name: file.name,
73 | state: file.state?.toString()
74 | };
75 | } catch (error) {
76 | log.error(`Error retrieving file ${name}`, error);
77 | throw error;
78 | }
79 | }
80 |
81 | /**
82 | * Wait for a video file to be processed
83 | */
84 | async waitForVideoProcessing(file: GeminiFile, maxWaitTimeMs = 300000): Promise<ProcessedGeminiFile> {
85 | if (!file.name) {
86 | throw new Error('File name is required to check processing status');
87 | }
88 |
89 | log.info(`Waiting for video processing: ${file.name}`);
90 |
91 | const startTime = Date.now();
92 | let currentFile = file;
93 |
94 | while (currentFile.state === FileState.PROCESSING) {
95 | // Check if we've exceeded the maximum wait time
96 | if (Date.now() - startTime > maxWaitTimeMs) {
97 | throw new Error(`Timeout waiting for video processing: ${file.name}`);
98 | }
99 |
100 | // Wait 2 seconds before checking again
101 | await new Promise(resolve => setTimeout(resolve, 2000));
102 |
103 | // Get updated file status
104 | currentFile = await this.getFile(file.name);
105 | log.debug(`Video processing status: ${currentFile.state}`);
106 |
107 | if (currentFile.state === FileState.FAILED) {
108 | throw new Error(`Video processing failed: ${file.name}`);
109 | }
110 | }
111 |
112 | log.info(`Video processing completed: ${file.name}`);
113 |
114 | // Ensure all required fields are present
115 | if (!currentFile.name || !currentFile.state) {
116 | throw new Error('Missing required file information after processing');
117 | }
118 |
119 | return {
120 | uri: currentFile.uri,
121 | mimeType: currentFile.mimeType,
122 | name: currentFile.name,
123 | state: currentFile.state
124 | };
125 | }
126 |
127 | /**
128 | * Upload a file to Gemini API with caching
129 | */
130 | async uploadFile(filePath: string): Promise<GeminiFile> {
131 | try {
132 | log.debug(`Processing file upload request: ${filePath}`);
133 |
134 | // Calculate checksum for caching
135 | const checksum = await this.calculateChecksum(filePath);
136 | log.debug(`File checksum: ${checksum}`);
137 |
138 | // Check if file is in cache and still valid
139 | if (this.isCacheValid(checksum)) {
140 | const cachedFile = this.fileCache.get(checksum)!;
141 | log.info(`Using cached file: ${cachedFile.name}`);
142 |
143 | // Return cached file info
144 | return {
145 | uri: cachedFile.uri,
146 | mimeType: cachedFile.mimeType,
147 | name: cachedFile.name,
148 | state: cachedFile.state
149 | };
150 | }
151 |
152 | // Determine MIME type based on file extension
153 | const ext = path.extname(filePath).toLowerCase();
154 | let mimeType: string;
155 | let isVideo = false;
156 |
157 | if (['.jpg', '.jpeg'].includes(ext)) {
158 | mimeType = 'image/jpeg';
159 | } else if (ext === '.png') {
160 | mimeType = 'image/png';
161 | } else if (ext === '.webp') {
162 | mimeType = 'image/webp';
163 | } else if (ext === '.mp4') {
164 | mimeType = 'video/mp4';
165 | isVideo = true;
166 | } else if (ext === '.mp3') {
167 | mimeType = 'audio/mp3';
168 | } else if (ext === '.wav') {
169 | mimeType = 'audio/wav';
170 | } else if (ext === '.ogg') {
171 | mimeType = 'audio/ogg';
172 | } else {
173 | throw new Error(`Unsupported file extension: ${ext}`);
174 | }
175 |
176 | // Upload file to Google's servers
177 | const uploadedFile = await this.client.files.upload({
178 | file: filePath,
179 | config: { mimeType }
180 | });
181 |
182 | log.info(`File uploaded successfully: ${filePath}`);
183 | log.verbose('Uploaded file details', JSON.stringify(uploadedFile));
184 |
185 | if (!uploadedFile.uri || !uploadedFile.name) {
186 | throw new Error('File upload failed: Missing URI or name');
187 | }
188 |
189 | // Create file object
190 | const file: GeminiFile = {
191 | uri: uploadedFile.uri,
192 | mimeType,
193 | name: uploadedFile.name,
194 | state: uploadedFile.state?.toString()
195 | };
196 |
197 | // For videos, wait for processing to complete
198 | if (isVideo && file.state === FileState.PROCESSING) {
199 | const processedFile = await this.waitForVideoProcessing(file);
200 |
201 | // Update cache with processed file
202 | this.fileCache.set(checksum, {
203 | fileId: processedFile.name!,
204 | checksum,
205 | uri: processedFile.uri,
206 | mimeType: processedFile.mimeType,
207 | name: processedFile.name!,
208 | state: processedFile.state!,
209 | timestamp: Date.now()
210 | });
211 |
212 | return processedFile;
213 | }
214 |
215 | // Add to cache
216 | if (!file.name) {
217 | throw new Error('File name is required for caching');
218 | }
219 |
220 | this.fileCache.set(checksum, {
221 | fileId: file.name,
222 | checksum,
223 | uri: file.uri,
224 | mimeType: file.mimeType,
225 | name: file.name,
226 | state: file.state || FileState.ACTIVE,
227 | timestamp: Date.now()
228 | });
229 |
230 | return file;
231 | } catch (error) {
232 | log.error('Error uploading file', error);
233 | throw error;
234 | }
235 | }
236 |
237 | /**
238 | * Process a file with Gemini API
239 | */
240 | async processFile(file: GeminiFile, prompt: string, modelName: string): Promise<GeminiResponse> {
241 | try {
242 | log.debug(`Processing file with model ${modelName}`);
243 | log.verbose('Processing with parameters', JSON.stringify({ file, prompt, modelName }));
244 |
245 | const response = await this.client.models.generateContent({
246 | model: modelName,
247 | contents: createUserContent([
248 | createPartFromUri(file.uri, file.mimeType),
249 | prompt
250 | ])
251 | });
252 |
253 | log.debug('Received response from Gemini API');
254 | log.verbose('Gemini API response', JSON.stringify(response));
255 |
256 | const responseText = response.text || '';
257 |
258 | return {
259 | text: responseText
260 | };
261 | } catch (error) {
262 | log.error('Error processing file with Gemini API', error);
263 | return {
264 | text: `Error processing file: ${error instanceof Error ? error.message : String(error)}`,
265 | isError: true
266 | };
267 | }
268 | }
269 | }
270 |
```