# Directory Structure
```
├── .gitignore
├── Dockerfile
├── LICENSE
├── package-lock.json
├── package.json
├── README.md
├── smithery.yaml
├── src
│ ├── index.ts
│ ├── tools
│ │ ├── _index.ts
│ │ ├── documentReader.ts
│ │ ├── docxTools.ts
│ │ ├── excelTools.ts
│ │ ├── formatConverterPlus.ts
│ │ ├── htmlTools.ts
│ │ ├── pdfTools.ts
│ │ └── txtTools.ts
│ └── xhr-sync-worker.js
└── tsconfig.json
```
# Files
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
dist/*
node_modules/*
CodeReview.md
todo.md
develop-prompt-plan.md
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
[](https://mseep.ai/app/cablate-mcp-doc-forge)
# Simple Document Processing MCP Server
[](https://smithery.ai/server/@cablate/mcp-doc-forge)
A powerful Model Context Protocol (MCP) server providing comprehensive document processing capabilities.
<a href="https://glama.ai/mcp/servers/pb9df6lnel"><img width="380" height="200" src="https://glama.ai/mcp/servers/pb9df6lnel/badge" alt="Simple Document Processing Server MCP server" /></a>
## Features
### Document Reader
- Read DOCX, PDF, TXT, HTML, CSV
### Document Conversion
- DOCX to HTML/PDF conversion
- HTML to TXT/Markdown conversion
- PDF manipulation (merge, split)
### Text Processing
- Multi-encoding conversion support (UTF-8, Big5, GBK)
- Text formatting and cleaning
- Text comparison and diff generation
- Text splitting by lines or delimiter
### HTML Processing
- HTML cleaning and formatting
- Resource extraction (images, links, videos)
- Structure-preserving conversion
## Installation
### Installing via Smithery
To install Document Processing Server for Claude Desktop automatically via [Smithery](https://smithery.ai/server/@cablate/mcp-doc-forge):
```bash
npx -y @smithery/cli install @cablate/mcp-doc-forge --client claude
```
### Manual Installation
```bash
npm install -g @cablate/mcp-doc-forge
```
## Usage
### CLI
```bash
mcp-doc-forge
```
### With [Dive Desktop](https://github.com/OpenAgentPlatform/Dive)
1. Click "+ Add MCP Server" in Dive Desktop
2. Copy and paste this configuration:
```json
{
"mcpServers": {
"mcp-doc-forge": {
"command": "npx",
"args": [
"-y",
"@cablate/mcp-doc-forge"
],
"enabled": true
}
}
}
```
3. Click "Save" to install the MCP server
## License
MIT
## Contributing
Community participation and contributions are welcome! Here are some ways to contribute:
- ⭐️ Star the project if you find it helpful
- 🐛 Submit Issues: Report problems or provide suggestions
- 🔧 Create Pull Requests: Submit code improvements
## Contact
If you have any questions or suggestions, feel free to reach out:
- 📧 Email: [[email protected]](mailto:[email protected])
- 📧 GitHub: [CabLate](https://github.com/cablate/)
- 🤝 Collaboration: Welcome to discuss project cooperation
- 📚 Technical Guidance: Suggestions and guidance are sincerely welcome
```
--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------
```yaml
# Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml
startCommand:
type: stdio
configSchema:
# JSON Schema defining the configuration options for the MCP.
type: object
properties: {}
commandFunction:
# A function that produces the CLI command to start the MCP on stdio.
|-
config => ({ command: 'node', args: ['dist/index.cjs'], env: {} })
```
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
```json
{
"compilerOptions": {
"target": "ES2022",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true,
"outDir": "./dist",
"rootDir": "./src",
"moduleResolution": "NodeNext",
"module": "NodeNext",
"noImplicitAny": false
},
"exclude": ["node_modules"],
"include": ["src/**/*"]
}
```
--------------------------------------------------------------------------------
/src/xhr-sync-worker.js:
--------------------------------------------------------------------------------
```javascript
// This is a minimal implementation of a synchronous XMLHttpRequest worker
// It's needed by jsdom for synchronous XHR operations
/**
 * Handles one request message: performs the described XMLHttpRequest
 * synchronously and posts the outcome back to the sender.
 *
 * Expected message payload (`e.data`): `method`, `url`, optional `headers`
 * (object of header name -> value) and optional `data` (request body).
 *
 * Posts either `{ status, statusText, headers, response }` on completion or
 * `{ error }` carrying the failure message.
 */
self.onmessage = function (e) {
  const request = e.data;
  // Everything that can throw lives inside the try block: open() and
  // setRequestHeader() may throw as well as send(), and every failure has to
  // be reported to the sender instead of escaping as an uncaught worker error.
  try {
    const xhr = new XMLHttpRequest();
    xhr.open(request.method, request.url, false); // false = synchronous
    if (request.headers) {
      Object.keys(request.headers).forEach(function (key) {
        xhr.setRequestHeader(key, request.headers[key]);
      });
    }
    xhr.send(request.data || null);
    self.postMessage({
      status: xhr.status,
      statusText: xhr.statusText,
      headers: xhr.getAllResponseHeaders(),
      response: xhr.response
    });
  } catch (error) {
    self.postMessage({
      error: error.message
    });
  }
};
```
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
```dockerfile
# Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
# Use an official Node.js runtime as a parent image for the build
FROM node:22.12-alpine AS builder
# Set the working directory in the container
WORKDIR /app
# Copy the package.json and package-lock.json into the container
COPY package.json package-lock.json ./
# Install dependencies, skipping the `prepare` step to avoid running build scripts
RUN npm install --ignore-scripts
# Copy the source code into the container
COPY src ./src
# Build the project using the specified build command in the package.json
RUN npm run build
# Use a smaller base image for the runtime
FROM node:22.12-alpine AS runner
# Set the working directory in the container
WORKDIR /app
# Copy the build artifacts from the builder stage
COPY --from=builder /app/dist ./dist
# Copy the necessary node modules
COPY --from=builder /app/node_modules ./node_modules
# Define environment variable
ENV NODE_ENV=production
# Define the command to run the application
ENTRYPOINT ["node", "dist/index.cjs"]
```
--------------------------------------------------------------------------------
/src/tools/_index.ts:
--------------------------------------------------------------------------------
```typescript
import { DOCUMENT_READER_TOOL } from "./documentReader.js";
import { DOCX_TO_HTML_TOOL, DOCX_TO_PDF_TOOL } from "./docxTools.js";
import { EXCEL_READ_TOOL } from "./excelTools.js";
import { FORMAT_CONVERTER_TOOL } from "./formatConverterPlus.js";
import { HTML_CLEAN_TOOL, HTML_EXTRACT_RESOURCES_TOOL, HTML_FORMAT_TOOL, HTML_TO_MARKDOWN_TOOL, HTML_TO_TEXT_TOOL } from "./htmlTools.js";
import { PDF_MERGE_TOOL, PDF_SPLIT_TOOL } from "./pdfTools.js";
import { TEXT_DIFF_TOOL, TEXT_ENCODING_CONVERT_TOOL, TEXT_FORMAT_TOOL, TEXT_SPLIT_TOOL } from "./txtTools.js";
/**
 * Aggregated list of the tool descriptors exported by the modules in this
 * directory. The element order is part of the value and is kept as-is.
 */
export const tools = [
  // Generic document reading
  DOCUMENT_READER_TOOL,
  // PDF manipulation
  PDF_MERGE_TOOL,
  PDF_SPLIT_TOOL,
  // DOCX conversion
  DOCX_TO_PDF_TOOL,
  DOCX_TO_HTML_TOOL,
  // HTML processing
  HTML_CLEAN_TOOL,
  HTML_TO_TEXT_TOOL,
  HTML_TO_MARKDOWN_TOOL,
  HTML_EXTRACT_RESOURCES_TOOL,
  HTML_FORMAT_TOOL,
  // Plain-text processing
  TEXT_DIFF_TOOL,
  TEXT_SPLIT_TOOL,
  TEXT_FORMAT_TOOL,
  TEXT_ENCODING_CONVERT_TOOL,
  // Spreadsheets and in-memory format conversion
  EXCEL_READ_TOOL,
  FORMAT_CONVERTER_TOOL,
];
export * from "./documentReader.js";
export * from "./docxTools.js";
export * from "./excelTools.js";
export * from "./formatConverterPlus.js";
export * from "./htmlTools.js";
export * from "./pdfTools.js";
export * from "./txtTools.js";
```
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
```json
{
"name": "@cablate/mcp-doc-forge",
"version": "0.0.8",
"type": "module",
"description": "MCP server that provides doc forge capabilities",
"main": "dist/index.cjs",
"license": "MIT",
"scripts": {
"build": "esbuild src/index.ts --bundle --platform=node --outfile=dist/index.cjs --external:jsdom && shx chmod +x dist/index.cjs",
"start": "node dist/index.cjs",
"dev": "ts-node src/index.ts"
},
"dependencies": {
"@modelcontextprotocol/sdk": "^1.0.4",
"csv-parse": "^5.6.0",
"diff": "^5.1.0",
"docxtemplater": "^3.42.0",
"epub": "^1.2.1",
"exceljs": "^4.4.0",
"iconv-lite": "^0.6.3",
"image-size": "^1.1.1",
"jsdom": "^25.0.1",
"libreoffice-convert": "^1.6.0",
"mammoth": "^1.6.0",
"marked": "^15.0.7",
"pdf-lib": "^1.17.1",
"pdf2pic": "^3.1.3",
"pdfreader": "^3.0.6",
"pizzip": "^3.1.4",
"sharp": "^0.33.2",
"turndown": "^7.2.0",
"unzipper": "^0.12.3",
"util": "^0.12.5",
"xml2js": "^0.6.2"
},
"devDependencies": {
"@modelcontextprotocol/sdk": "^1.0.4",
"@types/jsdom": "^21.1.7",
"@types/node": "^20.17.10",
"esbuild": "^0.20.2",
"shx": "^0.3.4",
"ts-node": "^10.9.2",
"typescript": "^5.0.0"
},
"author": "CabLate",
"files": [
"dist",
"dist/**/*.map",
"README.md"
],
"bin": {
"mcp-doc-forge": "./dist/index.cjs"
},
"keywords": [
"mcp",
"mcp-server",
"doc-forge",
"document",
"pdf",
"docx",
"txt",
"html",
"csv",
"ai",
"dive"
],
"homepage": "https://github.com/cablate/mcp-doc-forge#readme",
"repository": {
"type": "git",
"url": "git+https://github.com/cablate/mcp-doc-forge.git"
},
"bugs": {
"url": "https://github.com/cablate/mcp-doc-forge/issues"
}
}
```
--------------------------------------------------------------------------------
/src/tools/documentReader.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import { parse } from "csv-parse";
import * as fs from "fs/promises";
import { JSDOM } from "jsdom";
import mammoth from "mammoth";
import * as path from "path";
import { Item, ItemHandler, PdfReader } from "pdfreader";
/**
 * Tool descriptor for `document_reader`.
 *
 * Declares a single required string argument, `filePath`. The extensions named
 * in the description are the ones `readFile` in this module dispatches on.
 */
export const DOCUMENT_READER_TOOL: Tool = {
name: "document_reader",
description:
"Read content from non-image document-files at specified paths, supporting various file formats: .pdf, .docx, .txt, .html, .csv",
inputSchema: {
type: "object",
properties: {
filePath: {
type: "string",
description: "Path to the file to be read",
},
},
required: ["filePath"],
},
};
/** Arguments accepted by the `document_reader` tool. */
export interface FileReaderArgs {
  /** Path to the file to be read. */
  filePath: string;
}

/**
 * Runtime type guard for {@link FileReaderArgs}.
 *
 * @param args Untrusted value to inspect.
 * @returns `true` only when `args` is a non-null object that has a `filePath`
 *   property holding a string.
 */
export function isFileReaderArgs(args: unknown): args is FileReaderArgs {
  if (typeof args !== "object" || args === null) {
    return false;
  }
  if (!("filePath" in args)) {
    return false;
  }
  const { filePath } = args as FileReaderArgs;
  return typeof filePath === "string";
}
/**
 * Reads the file at `filePath` and returns its contents decoded as UTF-8.
 *
 * @param filePath Path of the text file to read.
 * @returns The decoded file contents; rejects if the file cannot be read.
 */
async function readTextFile(filePath: string): Promise<string> {
  const text = await fs.readFile(filePath, "utf-8");
  return text;
}
/**
 * Extracts the text items of a PDF file, each followed by a single space.
 *
 * The file is read into memory and handed to `PdfReader.parseBuffer`, whose
 * callback is invoked with an error, with a parsed item, or with no item at
 * all; the no-item call is what resolves the promise.
 *
 * @param filePath Path of the PDF file to read.
 * @returns The concatenated text of all items that carry a `text` value.
 */
async function readPDFFile(filePath: string): Promise<string> {
  const pdfBytes = await fs.readFile(filePath);

  return new Promise((resolve, reject) => {
    const chunks: string[] = [];

    const onItem = (err: null | Error, item: Item | undefined) => {
      if (err) {
        reject(err);
        return;
      }
      if (!item) {
        // No item: parsing is finished, so deliver what was collected.
        resolve(chunks.join(""));
        return;
      }
      if (item.text) {
        chunks.push(`${item.text} `);
      }
    };

    new PdfReader().parseBuffer(pdfBytes, onItem as ItemHandler);
  });
}
/**
 * Reads a DOCX file and returns the raw text that mammoth extracts from it.
 *
 * @param filePath Path of the DOCX file to read.
 * @returns The `value` field of mammoth's `extractRawText` result.
 */
async function readDocxFile(filePath: string): Promise<string> {
  const docxBytes = await fs.readFile(filePath);
  const { value } = await mammoth.extractRawText({ buffer: docxBytes });
  return value;
}
/**
 * Parses a UTF-8 CSV file and returns the parsed records as a JSON string.
 *
 * @param filePath Path of the CSV file to read.
 * @returns `JSON.stringify` of the records delivered by csv-parse; rejects
 *   with the parser's error when parsing fails.
 */
async function readCSVFile(filePath: string): Promise<string> {
  const csvText = await fs.readFile(filePath, "utf-8");

  return new Promise((resolve, reject) => {
    parse(csvText, (parseError, rows) => {
      if (parseError) {
        reject(parseError);
        // Stop here; the promise is already settled.
        return;
      }
      resolve(JSON.stringify(rows));
    });
  });
}
/**
 * Reads an HTML file and returns the text content of its `<body>`.
 *
 * @param filePath Path of the HTML file to read (decoded as UTF-8).
 * @returns The body's `textContent`, or an empty string when it is empty or null.
 */
async function readHTMLFile(filePath: string): Promise<string> {
  const markup = await fs.readFile(filePath, "utf-8");
  const { document } = new JSDOM(markup).window;
  return document.body.textContent || "";
}
/**
 * Reads a document and returns its textual content, choosing the reader from
 * the lower-cased file extension (.pdf, .docx, .txt, .html, .csv).
 *
 * Never rejects: every failure, including an unsupported extension, is
 * reported through the returned object.
 *
 * @param filePath Path of the document to read.
 * @returns `{ success: true, data }` with the extracted content, or
 *   `{ success: false, error }` with the failure message.
 */
export async function readFile(filePath: string) {
  const loadContent = async (): Promise<string> => {
    const extension = path.extname(filePath).toLowerCase();
    if (extension === ".pdf") return readPDFFile(filePath);
    if (extension === ".docx") return readDocxFile(filePath);
    if (extension === ".txt") return readTextFile(filePath);
    if (extension === ".html") return readHTMLFile(filePath);
    if (extension === ".csv") return readCSVFile(filePath);
    throw new Error(`Unsupported file format: ${extension}`);
  };

  try {
    const data = await loadContent();
    return {
      success: true,
      data,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
```
--------------------------------------------------------------------------------
/src/tools/docxTools.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import { randomBytes } from "crypto";
import * as fs from "fs/promises";
import mammoth from "mammoth";
import * as path from "path";
// @ts-ignore
import { convert } from "libreoffice-convert";
import { promisify } from "util";
/**
 * Produces an identifier for naming output files.
 *
 * @returns 18 lowercase hexadecimal characters encoding 9 random bytes.
 */
function generateUniqueId(): string {
  const entropy = randomBytes(9);
  return entropy.toString("hex");
}
// DOCX-to-HTML tool
/**
 * Tool descriptor for `docx_to_html`.
 *
 * Takes the path of the source DOCX file and the directory the generated HTML
 * file is written into; both arguments are required strings.
 */
export const DOCX_TO_HTML_TOOL: Tool = {
name: "docx_to_html",
description: "Convert DOCX to HTML while preserving formatting",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input DOCX file",
},
outputDir: {
type: "string",
description: "Directory where HTML should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
// DOCX-to-PDF tool
/**
 * Tool descriptor for `docx_to_pdf`.
 *
 * Unlike `docx_to_html`, this tool takes a full output file path
 * (`outputPath`) rather than an output directory; both arguments are required
 * strings.
 */
export const DOCX_TO_PDF_TOOL: Tool = {
name: "docx_to_pdf",
description: "Convert DOCX files to PDF format",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input DOCX file",
},
outputPath: {
type: "string",
description: "Path where the output PDF file should be saved",
},
},
required: ["inputPath", "outputPath"],
},
};
/** Arguments of the `docx_to_pdf` tool, as checked by `isDocxToPdfArgs`. */
export interface DocxToPdfArgs {
// Path to the input DOCX file.
inputPath: string;
// Path where the output PDF file should be saved.
outputPath: string;
}
// DOCX-to-HTML implementation
/**
 * Converts a DOCX file to HTML with mammoth and writes the result to
 * `converted_<uniqueId>.html` inside `outputDir`, creating the directory when
 * it cannot be accessed.
 *
 * Progress is logged with `console.error`. Never rejects: failures are
 * reported through the returned object.
 *
 * @param inputPath Path of the DOCX file to convert.
 * @param outputDir Directory that receives the generated HTML file.
 * @returns `{ success: true, data }` with a message naming the written file,
 *   or `{ success: false, error }` with the failure message.
 */
export async function docxToHtml(inputPath: string, outputDir: string) {
  try {
    console.error(`Starting DOCX to HTML conversion...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Make sure the output directory exists
    let outputDirExists = true;
    try {
      await fs.access(outputDir);
    } catch {
      outputDirExists = false;
    }
    if (outputDirExists) {
      console.error(`Output directory exists: ${outputDir}`);
    } else {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const fileId = generateUniqueId();
    const docxBytes = await fs.readFile(inputPath);
    const conversion = await mammoth.convertToHtml({ buffer: docxBytes });
    console.error(
      `Conversion completed with ${conversion.messages.length} messages`
    );

    const htmlPath = path.join(outputDir, `converted_${fileId}.html`);
    await fs.writeFile(htmlPath, conversion.value);
    console.error(`Written HTML to ${htmlPath}`);

    return {
      success: true,
      data: `Successfully converted DOCX to HTML: ${htmlPath}`,
    };
  } catch (error) {
    console.error(`Error in docxToHtml:`, error);
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
// DOCX-to-PDF implementation
/**
 * Runtime type guard for `DocxToPdfArgs`.
 *
 * @param args Untrusted value to inspect.
 * @returns `true` only when `args` is a non-null object whose `inputPath` and
 *   `outputPath` properties both exist and hold strings.
 */
export function isDocxToPdfArgs(args: unknown): args is DocxToPdfArgs {
  if (typeof args !== "object" || args === null) {
    return false;
  }
  if (!("inputPath" in args) || !("outputPath" in args)) {
    return false;
  }
  const candidate = args as DocxToPdfArgs;
  return (
    typeof candidate.inputPath === "string" &&
    typeof candidate.outputPath === "string"
  );
}
// Promise-returning wrapper around libreoffice-convert's `convert`, built with util.promisify.
const convertAsyncPromise = promisify(convert);
/**
 * Converts a DOCX file to PDF through `convertAsyncPromise` and writes the
 * result to `outputPath`.
 *
 * Both paths are validated by extension (case-insensitive) before any file is
 * touched. Never rejects: failures are reported through the returned object.
 *
 * @param inputPath Path of the source file; must end in `.docx`.
 * @param outputPath Path of the file to write; must end in `.pdf`.
 * @returns `{ success: true, data }` with a confirmation message, or
 *   `{ success: false, error }` with the failure message.
 */
export async function convertDocxToPdf(inputPath: string, outputPath: string) {
  try {
    if (path.extname(inputPath).toLowerCase() !== ".docx") {
      throw new Error("Input file must be a .docx file");
    }
    const outputExt = path.extname(outputPath).toLowerCase();
    if (outputExt !== ".pdf") {
      throw new Error("Output file must have .pdf extension");
    }

    const sourceBytes = await fs.readFile(inputPath);
    const renderedPdf = await convertAsyncPromise(sourceBytes, ".pdf", undefined);
    await fs.writeFile(outputPath, renderedPdf);

    return {
      success: true,
      data: `Successfully converted ${inputPath} to ${outputPath}`,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
```
--------------------------------------------------------------------------------
/src/tools/formatConverterPlus.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import { marked } from "marked";
import * as xml2js from "xml2js";
/**
 * Supported format types for conversion.
 *
 * The string values are what the `format_convert` tool schema lists for
 * `fromFormat` / `toFormat` (via `Object.values(FormatType)`). Not every
 * source/target pair is implemented; see `convertFormat` for the pairs that are.
 */
export enum FormatType {
MARKDOWN = "markdown",
HTML = "html",
XML = "xml",
JSON = "json",
}
/**
 * Tool descriptor for `format_convert`.
 *
 * Takes the content to convert (`input`) plus a source and target format, both
 * restricted to the values of `FormatType`. The description names the
 * conversions that `convertFormat` actually implements, so clients do not
 * request pairs that can only fail.
 */
export const FORMAT_CONVERTER_TOOL: Tool = {
  name: "format_convert",
  description:
    "Convert document content between formats. Supported conversions: markdown to html, xml to json, json to xml",
  inputSchema: {
    type: "object",
    properties: {
      input: {
        type: "string",
        description: "Input content to convert",
      },
      fromFormat: {
        type: "string",
        enum: Object.values(FormatType),
        description: "Source format",
      },
      toFormat: {
        type: "string",
        enum: Object.values(FormatType),
        description: "Target format",
      },
    },
    required: ["input", "fromFormat", "toFormat"],
  },
};
/** Arguments accepted by the `format_convert` tool. */
export interface FormatConverterArgs {
  /** Content to convert. */
  input: string;
  /** Format the input is written in. */
  fromFormat: FormatType;
  /** Format to produce. */
  toFormat: FormatType;
}

/**
 * Runtime type guard for {@link FormatConverterArgs}.
 *
 * @param args Untrusted value to inspect.
 * @returns `true` only when `args` is a non-null object with a string `input`
 *   and with `fromFormat` / `toFormat` values that are members of `FormatType`.
 */
export function isFormatConverterArgs(args: unknown): args is FormatConverterArgs {
  if (typeof args !== "object" || args === null) {
    return false;
  }
  if (!("input" in args) || !("fromFormat" in args) || !("toFormat" in args)) {
    return false;
  }

  const candidate = args as FormatConverterArgs;
  if (typeof candidate.input !== "string") {
    return false;
  }

  const knownFormats = Object.values(FormatType);
  return knownFormats.includes(candidate.fromFormat) && knownFormats.includes(candidate.toFormat);
}
// Shared xml2js parser and builder instances, created with default options
const xmlParser = new xml2js.Parser();
const xmlBuilder = new xml2js.Builder();
/**
 * Converts Markdown to HTML using `marked`.
 *
 * @param input Markdown source text.
 * @returns The HTML produced by `marked`.
 */
async function markdownToHtml(input: string): Promise<string> {
  const html = await marked(input);
  return html;
}
/**
 * Converts XML to JSON.
 *
 * @param input XML source text.
 * @returns The parsed document serialized as JSON with 2-space indentation.
 * @throws Error prefixed with "Failed to parse XML: " when parsing or
 *   serialization fails.
 */
async function xmlToJson(input: string): Promise<string> {
  try {
    const parsedDocument = await xmlParser.parseStringPromise(input);
    return JSON.stringify(parsedDocument, null, 2);
  } catch (error: unknown) {
    let reason = "Unknown error occurred";
    if (error instanceof Error) {
      reason = error.message;
    }
    throw new Error(`Failed to parse XML: ${reason}`);
  }
}
/**
 * Converts JSON to XML.
 *
 * @param input JSON source text.
 * @returns The XML that the shared xml2js builder produces for the parsed value.
 * @throws Error prefixed with "Failed to parse JSON: " when parsing the JSON
 *   or building the XML fails.
 */
function jsonToXml(input: string): string {
  try {
    const parsedValue = JSON.parse(input);
    const xml = xmlBuilder.buildObject(parsedValue);
    return xml;
  } catch (error: unknown) {
    let reason = "Unknown error occurred";
    if (error instanceof Error) {
      reason = error.message;
    }
    throw new Error(`Failed to parse JSON: ${reason}`);
  }
}
/**
 * Converts content from one format to another.
 *
 * Implemented paths: markdown -> html, xml -> json, json -> xml. Every other
 * pair (including html -> markdown) yields a failure result. Never rejects:
 * conversion errors are reported through the returned object.
 *
 * All diagnostics go to stderr (`console.error`). This package is started as a
 * stdio server, where stdout carries the protocol messages, so nothing else
 * may be printed there.
 *
 * @param input Input content to convert
 * @param fromFormat Source format
 * @param toFormat Target format
 * @returns `{ success: true, data }` with the converted content, or
 *   `{ success: false, error }` with the reason the conversion failed
 */
export async function convertFormat(input: string, fromFormat: FormatType, toFormat: FormatType) {
  try {
    console.error(`Converting from ${fromFormat} to ${toFormat}`);

    // Validate formats
    if (!Object.values(FormatType).includes(fromFormat)) {
      return {
        success: false,
        error: `Unsupported source format: ${fromFormat}`,
      };
    }
    if (!Object.values(FormatType).includes(toFormat)) {
      return {
        success: false,
        error: `Unsupported target format: ${toFormat}`,
      };
    }

    // Handle different conversion paths
    let result: string;
    switch (`${fromFormat}-${toFormat}`) {
      case `${FormatType.MARKDOWN}-${FormatType.HTML}`:
        result = await markdownToHtml(input);
        break;
      case `${FormatType.HTML}-${FormatType.MARKDOWN}`:
        return {
          success: false,
          error: "HTML to Markdown conversion is not supported yet",
        };
      case `${FormatType.XML}-${FormatType.JSON}`:
        result = await xmlToJson(input);
        break;
      case `${FormatType.JSON}-${FormatType.XML}`:
        result = jsonToXml(input);
        break;
      default:
        return {
          success: false,
          error: `Unsupported conversion path: ${fromFormat} to ${toFormat}`,
        };
    }

    return {
      success: true,
      data: result,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : "Unknown error occurred";
    console.error(`Error converting format: ${errorMessage}`);
    return {
      success: false,
      error: errorMessage,
    };
  }
}
```
--------------------------------------------------------------------------------
/src/tools/excelTools.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import * as ExcelJS from "exceljs";
import * as fs from "fs";
import * as path from "path";
/**
 * Interface for Excel file processing options
 * (the `options` argument of `ExcelTools.readExcelFile`).
 */
interface ExcelProcessOptions {
// NOTE(review): declared but not read by any code visible in this file.
sheetName?: string;
// When truthy, the cells of row 1 are pushed into the output individually instead of as a row object.
includeHeaders?: boolean;
}
// Excel read tool
/**
 * Tool descriptor for `excel_read`.
 *
 * `inputPath` is required; `includeHeaders` is an optional boolean whose
 * schema default is `true`.
 */
export const EXCEL_READ_TOOL: Tool = {
name: "excel_read",
description: "Read Excel file and convert to JSON format while preserving structure",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input Excel file",
},
includeHeaders: {
type: "boolean",
description: "Whether to include headers in the output",
default: true,
},
},
required: ["inputPath"],
},
};
/** Arguments accepted by the `excel_read` tool. */
export interface ExcelReadArgs {
  /** Path to the input Excel file. */
  inputPath: string;
  /** Optional header flag; see the tool schema. */
  includeHeaders?: boolean;
}

// Type-check function
/**
 * Runtime type guard for {@link ExcelReadArgs}.
 *
 * @param args Untrusted value to inspect.
 * @returns `true` only when `args` is a non-null object with a string
 *   `inputPath` and an `includeHeaders` that is either absent/undefined or a
 *   boolean.
 */
export function isExcelReadArgs(args: unknown): args is ExcelReadArgs {
  if (typeof args !== "object" || args === null || !("inputPath" in args)) {
    return false;
  }

  const candidate = args as ExcelReadArgs;
  if (typeof candidate.inputPath !== "string") {
    return false;
  }

  const headersKind = typeof candidate.includeHeaders;
  return headersKind === "undefined" || headersKind === "boolean";
}
/**
 * Class for handling Excel file operations.
 *
 * All diagnostics are written to stderr (`console.error`): this package is
 * started as a stdio server, where stdout carries the protocol messages.
 */
export class ExcelTools {
  /**
   * Reads an Excel file and returns its content as a plain object keyed by
   * worksheet name.
   *
   * For each sheet the value is an array. With `options.includeHeaders`
   * truthy, every cell value of row 1 is pushed as its own array element and
   * the remaining rows follow as objects keyed by column number; otherwise
   * every row, including row 1, is such an object.
   * NOTE(review): the flattened header cells look unintentional — confirm the
   * intended output shape before relying on it.
   * NOTE(review): `.xls` passes the extension check but the file is read with
   * the `xlsx` reader — confirm that legacy binary workbooks are supported.
   *
   * @param filePath Path to the Excel file (`.xlsx` or `.xls` extension)
   * @param options Processing options; defaults to `{ includeHeaders: true }`
   * @returns Promise resolving to the parsed Excel data
   * @throws Error when the file does not exist, has another extension, or
   *   cannot be read; the error is logged and rethrown unchanged
   */
  public static async readExcelFile(filePath: string, options: ExcelProcessOptions = { includeHeaders: true }): Promise<any> {
    try {
      // Verify file exists
      if (!fs.existsSync(filePath)) {
        throw new Error(`File not found: ${filePath}`);
      }

      // Verify file extension
      const ext = path.extname(filePath).toLowerCase();
      if (ext !== ".xlsx" && ext !== ".xls") {
        throw new Error(`Unsupported file format: ${ext}`);
      }

      console.error(`Reading Excel file: ${filePath}`);
      const workbook = new ExcelJS.Workbook();
      await workbook.xlsx.readFile(filePath);

      const result: Record<string, any[]> = {};
      workbook.worksheets.forEach((worksheet) => {
        const rows: any[] = [];
        worksheet.eachRow((row, rowNumber) => {
          const rowData: Record<number, any> = {};
          row.eachCell((cell, colNumber) => {
            if (options.includeHeaders && rowNumber === 1) {
              // Handle headers
              rows.push(cell.value);
            } else {
              // Handle data rows
              rowData[colNumber] = cell.value;
            }
          });
          if (rowNumber > 1 || !options.includeHeaders) {
            rows.push(rowData);
          }
        });
        result[worksheet.name] = rows;
      });

      console.error(`Successfully parsed Excel file: ${filePath}`);
      return result;
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : "Unknown error occurred";
      console.error(`Error processing Excel file: ${errorMessage}`);
      throw error;
    }
  }
}
// Excel read implementation
/**
 * Reads every worksheet of an Excel workbook into an object keyed by sheet
 * name. Never rejects: failures are reported through the returned object.
 *
 * For each sheet the value is an array. With `includeHeaders` true, every
 * cell value of row 1 is pushed as its own array element and the remaining
 * rows follow as objects keyed by column number; otherwise every row,
 * including row 1, is such an object.
 * NOTE(review): the flattened header cells look unintentional — confirm the
 * intended output shape before relying on it.
 * NOTE(review): `.xls` passes the extension check but the file is read with
 * the `xlsx` reader — confirm that legacy binary workbooks are supported.
 *
 * Diagnostics are written to stderr (`console.error`): this package is
 * started as a stdio server, where stdout carries the protocol messages.
 *
 * @param inputPath Path to the Excel file (`.xlsx` or `.xls` extension)
 * @param includeHeaders Header handling flag described above; defaults to true
 * @returns `{ success: true, data }` with the parsed sheets, or
 *   `{ success: false, error }` with the failure message
 */
export async function readExcelFile(inputPath: string, includeHeaders: boolean = true) {
  try {
    // Verify the file exists
    if (!fs.existsSync(inputPath)) {
      return {
        success: false,
        error: `File not found: ${inputPath}`,
      };
    }

    // Verify the file extension
    const ext = path.extname(inputPath).toLowerCase();
    if (ext !== ".xlsx" && ext !== ".xls") {
      return {
        success: false,
        error: `Unsupported file format: ${ext}`,
      };
    }

    console.error(`Reading Excel file: ${inputPath}`);
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(inputPath);

    const result: Record<string, any[]> = {};
    workbook.worksheets.forEach((worksheet) => {
      const rows: any[] = [];
      worksheet.eachRow((row, rowNumber) => {
        const rowData: Record<number, any> = {};
        row.eachCell((cell, colNumber) => {
          if (includeHeaders && rowNumber === 1) {
            rows.push(cell.value);
          } else {
            rowData[colNumber] = cell.value;
          }
        });
        if (rowNumber > 1 || !includeHeaders) {
          rows.push(rowData);
        }
      });
      result[worksheet.name] = rows;
    });

    console.error(`Successfully parsed Excel file: ${inputPath}`);
    return {
      success: true,
      data: result,
    };
  } catch (error: unknown) {
    const errorMessage = error instanceof Error ? error.message : "Unknown error occurred";
    console.error(`Error processing Excel file: ${errorMessage}`);
    return {
      success: false,
      error: errorMessage,
    };
  }
}
```
--------------------------------------------------------------------------------
/src/tools/pdfTools.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import { randomBytes } from "crypto";
import * as fs from "fs/promises";
import * as path from "path";
import { PDFDocument } from "pdf-lib";
import { fromPath } from "pdf2pic";
/**
 * Produces an identifier for naming a batch of output files.
 *
 * @returns 18 lowercase hexadecimal characters encoding 9 random bytes.
 */
function generateUniqueId(): string {
  const randomPart = randomBytes(9);
  return randomPart.toString("hex");
}
// PDF merge tool
/**
 * Tool descriptor for `pdf_merger`.
 *
 * Takes an array of input PDF paths and the directory the merged file is
 * written into; both arguments are required.
 */
export const PDF_MERGE_TOOL: Tool = {
name: "pdf_merger",
description: "Merge multiple PDF files into one",
inputSchema: {
type: "object",
properties: {
inputPaths: {
type: "array",
items: { type: "string" },
description: "Paths to the input PDF files",
},
outputDir: {
type: "string",
description: "Directory where merged PDFs should be saved",
},
},
required: ["inputPaths", "outputDir"],
},
};
// PDF split tool
/**
 * Tool descriptor for `pdf_splitter`.
 *
 * Each entry of `pageRanges` selects the pages `start`..`end` of the input
 * PDF. Page numbers are 1-based and both ends are inclusive, matching how
 * `splitPDF` maps them to zero-based indexes (`start - 1 + i`). The schema
 * therefore requires both bounds and restricts them to integers >= 1.
 */
export const PDF_SPLIT_TOOL: Tool = {
  name: "pdf_splitter",
  description: "Split a PDF file into multiple files",
  inputSchema: {
    type: "object",
    properties: {
      inputPath: {
        type: "string",
        description: "Path to the input PDF file",
      },
      outputDir: {
        type: "string",
        description: "Directory where split PDFs should be saved",
      },
      pageRanges: {
        type: "array",
        items: {
          type: "object",
          properties: {
            start: {
              type: "integer",
              minimum: 1,
              description: "First page of the range (1-based, inclusive)",
            },
            end: {
              type: "integer",
              minimum: 1,
              description: "Last page of the range (1-based, inclusive)",
            },
          },
          required: ["start", "end"],
        },
        description: "Array of page ranges to split; each range produces one output file",
      },
    },
    required: ["inputPath", "outputDir", "pageRanges"],
  },
};
// Implementation functions
/**
 * Merges the given PDF files, in order, into a single document written to
 * `merged_<uniqueId>.pdf` inside `outputDir`, creating the directory when it
 * cannot be accessed.
 *
 * Progress and errors are logged with `console.error`. Never rejects:
 * failures are reported through the returned object.
 *
 * @param inputPaths Paths of the PDF files to merge, in output order.
 * @param outputDir Directory that receives the merged PDF.
 * @returns `{ success: true, data }` with a message naming the written file,
 *   or `{ success: false, error }` with the failure message.
 */
export async function mergePDFs(inputPaths: string[], outputDir: string) {
  try {
    console.error(`Starting PDF merge operation...`);
    console.error(`Input files:`, inputPaths);
    console.error(`Output directory: ${outputDir}`);

    // Make sure the output directory exists
    let outputDirExists = true;
    try {
      await fs.access(outputDir);
    } catch {
      outputDirExists = false;
    }
    if (outputDirExists) {
      console.error(`Output directory exists: ${outputDir}`);
    } else {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const batchId = generateUniqueId();
    console.error(`Generated unique ID for this batch: ${batchId}`);

    // The output file name carries the unique ID
    const mergedFilePath = path.join(outputDir, `merged_${batchId}.pdf`);
    console.error(`New output path with unique ID: ${mergedFilePath}`);

    const combined = await PDFDocument.create();
    for (const sourcePath of inputPaths) {
      console.error(`Processing input file: ${sourcePath}`);
      const sourceBytes = await fs.readFile(sourcePath);
      console.error(`Read ${sourceBytes.length} bytes from ${sourcePath}`);

      const sourceDoc = await PDFDocument.load(sourceBytes);
      const sourcePageCount = sourceDoc.getPageCount();
      console.error(`Loaded PDF with ${sourcePageCount} pages from ${sourcePath}`);

      const transferred = await combined.copyPages(sourceDoc, sourceDoc.getPageIndices());
      console.error(`Copied ${transferred.length} pages from ${sourcePath}`);

      let position = 0;
      for (const page of transferred) {
        combined.addPage(page);
        position += 1;
        console.error(`Added page ${position} from ${sourcePath}`);
      }
    }

    const combinedBytes = await combined.save();
    console.error(`Generated merged PDF: ${combinedBytes.length} bytes`);
    await fs.writeFile(mergedFilePath, combinedBytes);
    console.error(`Successfully wrote merged PDF to ${mergedFilePath}`);

    return {
      success: true,
      data: `Successfully merged ${inputPaths.length} PDFs into ${mergedFilePath}`,
    };
  } catch (error) {
    console.error(`Error in mergePDFs:`);
    console.error(error);
    if (!(error instanceof Error)) {
      return {
        success: false,
        error: "Unknown error",
      };
    }
    console.error(`Error name: ${error.name}`);
    console.error(`Error message: ${error.message}`);
    console.error(`Error stack: ${error.stack}`);
    return {
      success: false,
      error: error.message,
    };
  }
}
/**
 * Splits a PDF into one output file per requested page range, written as
 * `split_<uniqueId>_<n>.pdf` inside `outputDir`.
 *
 * Page numbers are 1-based and both ends of a range are inclusive. All ranges
 * are validated before anything is written: their shape (integers, start >= 1,
 * start <= end) before the file system is touched, and their upper bound once
 * the page count is known. An invalid range therefore never leaves a partial
 * set of output files behind.
 *
 * Progress and errors are logged with `console.error`. Never rejects:
 * failures are reported through the returned object.
 *
 * @param inputPath Path of the PDF to split.
 * @param outputDir Directory that receives the split PDFs; created when it
 *   cannot be accessed.
 * @param pageRanges Ranges to extract, one output file each.
 * @returns `{ success: true, data }` with a message listing the written
 *   files, or `{ success: false, error }` with the failure message.
 */
export async function splitPDF(
  inputPath: string,
  outputDir: string,
  pageRanges: Array<{ start: number; end: number }>
) {
  try {
    console.error(`Starting PDF split operation...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);
    console.error(`Page ranges:`, JSON.stringify(pageRanges, null, 2));

    // Reject malformed ranges up front. A start below 1 or a fractional bound
    // would otherwise become an invalid zero-based page index further down.
    for (const { start, end } of pageRanges) {
      if (!Number.isInteger(start) || !Number.isInteger(end) || start < 1) {
        throw new Error(
          `Invalid page range: ${start}-${end}. Page numbers must be integers starting at 1`
        );
      }
      if (start > end) {
        throw new Error(
          `Invalid page range: start (${start}) is greater than end (${end})`
        );
      }
    }

    // Make sure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const pdfBytes = await fs.readFile(inputPath);
    console.error(
      `Successfully read input PDF, size: ${pdfBytes.length} bytes`
    );
    const pdf = await PDFDocument.load(pdfBytes);
    const totalPages = pdf.getPageCount();
    console.error(`PDF loaded successfully. Total pages: ${totalPages}`);

    // start <= end was checked above, so testing the end bound is enough.
    for (const { start, end } of pageRanges) {
      if (end > totalPages) {
        throw new Error(
          `Invalid page range: ${start}-${end}. PDF only has ${totalPages} pages`
        );
      }
    }

    const uniqueId = generateUniqueId();
    console.error(`Generated unique ID for this batch: ${uniqueId}`);

    const results: string[] = [];
    for (const [rangeIndex, { start, end }] of pageRanges.entries()) {
      console.error(`Processing range ${rangeIndex + 1}: pages ${start} to ${end}`);

      const newPdf = await PDFDocument.create();
      // Convert the 1-based inclusive range to zero-based page indexes.
      const pageIndexes = Array.from(
        { length: end - start + 1 },
        (_, offset) => start - 1 + offset
      );
      console.error(`Copying pages with indexes:`, pageIndexes);
      const pages = await newPdf.copyPages(pdf, pageIndexes);
      console.error(`Successfully copied ${pages.length} pages`);
      pages.forEach((page, pageIndex) => {
        newPdf.addPage(page);
        console.error(`Added page ${pageIndex + 1} to new PDF`);
      });

      const outputPath = path.join(outputDir, `split_${uniqueId}_${rangeIndex + 1}.pdf`);
      console.error(`Saving split PDF to: ${outputPath}`);
      const newPdfBytes = await newPdf.save();
      console.error(`Generated PDF bytes: ${newPdfBytes.length}`);
      await fs.writeFile(outputPath, newPdfBytes);
      console.error(`Successfully wrote PDF to ${outputPath}`);
      results.push(outputPath);
    }

    console.error(`Split operation completed successfully`);
    return {
      success: true,
      data: `Successfully split PDF into ${
        results.length
      } files: ${results.join(", ")}`,
    };
  } catch (error) {
    console.error(`Error in splitPDF:`);
    console.error(error);
    if (error instanceof Error) {
      console.error(`Error name: ${error.name}`);
      console.error(`Error message: ${error.message}`);
      console.error(`Error stack: ${error.stack}`);
    }
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
/**
 * Renders every page of a PDF to an image file in `outputDir` using pdf2pic,
 * with files named from the `page_<uniqueId>` base name.
 *
 * The page count is obtained by loading the PDF with pdf-lib; pages are then
 * converted one at a time, numbered from 1. Never rejects: failures are
 * reported through the returned object.
 *
 * @param inputPath Path of the PDF to render.
 * @param outputDir Directory that receives the images; created when it cannot
 *   be accessed.
 * @param format Image format passed to pdf2pic; defaults to "png".
 * @param dpi Value passed to pdf2pic as `density`; defaults to 300.
 * @returns `{ success: true, data }` with a summary message, or
 *   `{ success: false, error }` with the failure message.
 */
export async function pdfToImages(
  inputPath: string,
  outputDir: string,
  format: "png" | "jpeg" = "png",
  dpi: number = 300
) {
  try {
    // Make sure the output directory exists
    let outputDirExists = true;
    try {
      await fs.access(outputDir);
    } catch {
      outputDirExists = false;
    }
    if (outputDirExists) {
      console.error(`Output directory exists: ${outputDir}`);
    } else {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const batchId = generateUniqueId();
    console.error(`Generated unique ID for this batch: ${batchId}`);

    const renderPage = fromPath(inputPath, {
      density: dpi,
      format: format as string,
      width: 2048,
      height: 2048,
      saveFilename: `page_${batchId}`,
      savePath: outputDir,
    });

    const sourceBytes = await fs.readFile(inputPath);
    const sourceDoc = await PDFDocument.load(sourceBytes);
    const pageCount = sourceDoc.getPageCount();

    const writtenPaths: string[] = [];
    let pageNumber = 1;
    while (pageNumber <= pageCount) {
      const rendered = await renderPage(pageNumber);
      if (rendered.path) {
        writtenPaths.push(rendered.path);
      }
      pageNumber += 1;
    }

    return {
      success: true,
      data: `Successfully converted ${pageCount} pages to images in ${outputDir}`,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
```
--------------------------------------------------------------------------------
/src/tools/txtTools.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import { randomBytes } from "crypto";
import { diffLines } from "diff";
import * as fs from "fs/promises";
import iconv from "iconv-lite";
import * as path from "path";
/** Build a random identifier: 9 random bytes rendered as 18 hex characters. */
function generateUniqueId(): string {
  const entropy = randomBytes(9);
  return entropy.toString("hex");
}
/**
 * Text encoding conversion tool.
 *
 * Tool descriptor named `text_encoding_converter`. Its input schema requires
 * four string fields: `inputPath`, `outputDir`, `fromEncoding` and
 * `toEncoding`.
 */
export const TEXT_ENCODING_CONVERT_TOOL: Tool = {
name: "text_encoding_converter",
description: "Convert text between different encodings",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input text file",
},
outputDir: {
type: "string",
description: "Directory where converted file should be saved",
},
fromEncoding: {
type: "string",
description: "Source encoding (e.g., 'big5', 'gbk', 'utf8')",
},
toEncoding: {
type: "string",
description: "Target encoding (e.g., 'utf8', 'big5', 'gbk')",
},
},
required: ["inputPath", "outputDir", "fromEncoding", "toEncoding"],
},
};
/**
 * Text formatting tool.
 *
 * Tool descriptor named `text_formatter`. Its input schema requires the
 * string fields `inputPath` and `outputDir`.
 */
export const TEXT_FORMAT_TOOL: Tool = {
name: "text_formatter",
description: "Format text with proper indentation and line spacing",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input text file",
},
outputDir: {
type: "string",
description: "Directory where formatted file should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
/**
 * Text comparison tool.
 *
 * Tool descriptor named `text_diff`. Its input schema requires the string
 * fields `file1Path`, `file2Path` and `outputDir`.
 */
export const TEXT_DIFF_TOOL: Tool = {
name: "text_diff",
description: "Compare two text files and show differences",
inputSchema: {
type: "object",
properties: {
file1Path: {
type: "string",
description: "Path to the first text file",
},
file2Path: {
type: "string",
description: "Path to the second text file",
},
outputDir: {
type: "string",
description: "Directory where diff result should be saved",
},
},
required: ["file1Path", "file2Path", "outputDir"],
},
};
/**
 * Text splitting tool.
 *
 * Tool descriptor named `text_splitter`. Its input schema requires
 * `inputPath`, `outputDir`, `splitBy` (either "lines" or "delimiter") and
 * `value`. `value` is declared as a string even when it carries a line count.
 */
export const TEXT_SPLIT_TOOL: Tool = {
name: "text_splitter",
description: "Split text file by specified delimiter or line count",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input text file",
},
outputDir: {
type: "string",
description: "Directory where split files should be saved",
},
splitBy: {
type: "string",
enum: ["lines", "delimiter"],
description: "Split method: by line count or delimiter",
},
value: {
type: "string",
description: "Line count (number) or delimiter string",
},
},
required: ["inputPath", "outputDir", "splitBy", "value"],
},
};
/**
 * Text encoding conversion implementation.
 *
 * Reads `inputPath` as raw bytes, decodes them with `fromEncoding`, encodes
 * the text with `toEncoding` and writes the result to
 * `converted_<id>.txt` inside `outputDir` (created when missing).
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function convertTextEncoding(
  inputPath: string,
  outputDir: string,
  fromEncoding: string,
  toEncoding: string
) {
  try {
    const banner = [
      `Starting text encoding conversion...`,
      `Input file: ${inputPath}`,
      `Output directory: ${outputDir}`,
      `From encoding: ${fromEncoding}`,
      `To encoding: ${toEncoding}`,
    ];
    for (const message of banner) {
      console.error(message);
    }

    // Make sure the output directory exists
    let dirPresent = true;
    try {
      await fs.access(outputDir);
    } catch {
      dirPresent = false;
    }
    if (dirPresent) {
      console.error(`Output directory exists: ${outputDir}`);
    } else {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const fileId = generateUniqueId();
    const rawBytes = await fs.readFile(inputPath);
    const decoded = iconv.decode(rawBytes, fromEncoding);
    const reencoded = iconv.encode(decoded, toEncoding);

    const destination = path.join(outputDir, `converted_${fileId}.txt`);
    await fs.writeFile(destination, reencoded);
    console.error(`Written converted text to ${destination}`);

    return {
      success: true,
      data: `Successfully converted text encoding: ${destination}`,
    };
  } catch (error) {
    console.error(`Error in convertTextEncoding:`, error);
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
/**
 * Text formatting implementation.
 *
 * Reads `inputPath` as UTF-8, trims every line, collapses runs of blank lines
 * into a single blank line and writes the result to `formatted_<id>.txt`
 * inside `outputDir` (created when missing).
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function formatText(inputPath: string, outputDir: string) {
  try {
    console.error(`Starting text formatting...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Make sure the output directory exists
    let dirPresent = true;
    try {
      await fs.access(outputDir);
    } catch {
      dirPresent = false;
    }
    if (dirPresent) {
      console.error(`Output directory exists: ${outputDir}`);
    } else {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const fileId = generateUniqueId();
    const rawText = await fs.readFile(inputPath, "utf-8");

    // Basic formatting: trim each line and drop a blank line that directly
    // follows another blank line.
    const keptLines: string[] = [];
    let previousLine: string | undefined;
    for (const rawLine of rawText.split("\n")) {
      const line = rawLine.trim();
      const repeatsBlank = line === "" && previousLine === "";
      if (!repeatsBlank) {
        keptLines.push(line);
      }
      previousLine = line;
    }
    const formatted = keptLines.join("\n");

    const destination = path.join(outputDir, `formatted_${fileId}.txt`);
    await fs.writeFile(destination, formatted);
    console.error(`Written formatted text to ${destination}`);

    return {
      success: true,
      data: `Successfully formatted text: ${destination}`,
    };
  } catch (error) {
    console.error(`Error in formatText:`, error);
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
/**
 * Text comparison implementation.
 *
 * Reads both files as UTF-8, computes a line-based diff and writes it to
 * `diff_<id>.txt` inside `outputDir` (created when missing). Every output
 * line carries a marker: `+ ` for added lines, `- ` for removed lines and a
 * single space for unchanged lines.
 *
 * @param file1Path - Path of the original text file.
 * @param file2Path - Path of the text file compared against the original.
 * @param outputDir - Directory that receives the diff file.
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function compareTexts(
  file1Path: string,
  file2Path: string,
  outputDir: string
) {
  try {
    console.error(`Starting text comparison...`);
    console.error(`File 1: ${file1Path}`);
    console.error(`File 2: ${file2Path}`);
    console.error(`Output directory: ${outputDir}`);

    // Ensure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const uniqueId = generateUniqueId();
    const text1 = await fs.readFile(file1Path, "utf-8");
    const text2 = await fs.readFile(file2Path, "utf-8");

    const diff = diffLines(text1, text2);
    const diffResult = diff
      .map((part) => {
        const prefix = part.added ? "+ " : part.removed ? "- " : " ";
        // A diff chunk may span several lines, so mark each line instead of
        // only the first one.
        const lines = part.value.split("\n");
        // A chunk ending in "\n" yields a trailing empty element that is not
        // a real line.
        if (lines[lines.length - 1] === "") {
          lines.pop();
        }
        // Terminate every line so a chunk without a final newline cannot run
        // into the chunk that follows it.
        return lines.map((line) => `${prefix}${line}\n`).join("");
      })
      .join("");

    const outputPath = path.join(outputDir, `diff_${uniqueId}.txt`);
    await fs.writeFile(outputPath, diffResult);
    console.error(`Written diff result to ${outputPath}`);

    return {
      success: true,
      data: `Successfully compared texts: ${outputPath}`,
    };
  } catch (error) {
    console.error(`Error in compareTexts:`, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
/**
 * Text splitting implementation.
 *
 * Reads `inputPath` as UTF-8 and writes each piece to
 * `part_<id>_<n>.txt` inside `outputDir` (created when missing).
 *
 * @param inputPath - Path of the text file to split.
 * @param outputDir - Directory that receives the part files.
 * @param splitBy - `"lines"` to cut every N lines, `"delimiter"` to cut at
 *   each occurrence of a string.
 * @param value - The line count (parsed as a base-10 integer) or the
 *   delimiter string; the delimiter must not be empty.
 * @returns `{ success: true, data }` listing the written files, or
 *   `{ success: false, error }` when the arguments are invalid or any step
 *   throws.
 */
export async function splitText(
  inputPath: string,
  outputDir: string,
  splitBy: "lines" | "delimiter",
  value: string
) {
  try {
    console.error(`Starting text splitting...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);
    console.error(`Split by: ${splitBy}`);
    console.error(`Value: ${value}`);

    // Validate the arguments before touching the filesystem.
    let lineCount = 0;
    if (splitBy === "lines") {
      lineCount = parseInt(value, 10);
      if (isNaN(lineCount) || lineCount <= 0) {
        throw new Error("Invalid line count");
      }
    } else if (value === "") {
      // An empty delimiter would split the text into one file per character.
      throw new Error("Delimiter must not be empty");
    }

    // Ensure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const uniqueId = generateUniqueId();
    const content = await fs.readFile(inputPath, "utf-8");

    let parts: string[];
    if (splitBy === "lines") {
      const lines = content.split("\n");
      // A file ending in "\n" produces a trailing empty element; dropping it
      // avoids emitting a spurious empty last part.
      if (lines.length > 1 && lines[lines.length - 1] === "") {
        lines.pop();
      }
      parts = [];
      for (let i = 0; i < lines.length; i += lineCount) {
        parts.push(lines.slice(i, i + lineCount).join("\n"));
      }
    } else {
      // Use the array directly: spreading a very large array into push()
      // can exceed the engine's argument limit.
      parts = content.split(value);
    }

    const results: string[] = [];
    for (let i = 0; i < parts.length; i++) {
      const outputPath = path.join(outputDir, `part_${uniqueId}_${i + 1}.txt`);
      await fs.writeFile(outputPath, parts[i]);
      results.push(outputPath);
      console.error(`Written part ${i + 1} to ${outputPath}`);
    }

    return {
      success: true,
      data: `Successfully split text into ${parts.length} parts: ${results.join(
        ", "
      )}`,
    };
  } catch (error) {
    console.error(`Error in splitText:`, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
```
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
```typescript
#!/usr/bin/env node
// Replace console.log and console.error with no-ops, so nothing logged
// through them is printed.
// NOTE(review): presumably this keeps stray log output away from the stdio
// transport this server connects to; it also discards every console.error
// diagnostic, including the fatal-error message at startup. Confirm intended.
console.log = () => {};
console.error = () => {};
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import {
cleanHtml,
compareTexts,
convertDocxToPdf,
convertTextEncoding,
docxToHtml,
extractHtmlResources,
formatHtml,
formatText,
htmlToMarkdown,
htmlToText,
isDocxToPdfArgs,
isFileReaderArgs,
mergePDFs,
readFile,
splitPDF,
splitText,
tools,
} from "./tools/_index.js";
// MCP server instance. It identifies itself as "mcp-server/common_doc_executor"
// version "0.0.1" and declares a `tools` capability with no extra options.
const server = new Server(
{
name: "mcp-server/common_doc_executor",
version: "0.0.1",
},
{
capabilities: {
description:
"A MCP server providing file reading capabilities for various file formats!",
tools: {},
},
}
);
// Tool discovery: answer list-tools requests with the imported `tools` value.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return { tools };
});
// Tool invocation: dispatch on the requested tool name, run the matching
// implementation and wrap its outcome in a text content item. Every failure,
// including a thrown error or an unknown tool name, is reported with
// `isError: true` instead of being rethrown.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  try {
    const { name, arguments: args } = request.params;

    if (!args) {
      throw new Error("No arguments provided");
    }

    switch (name) {
      case "document_reader": {
        if (!isFileReaderArgs(args)) {
          throw new Error("Invalid arguments for document_reader");
        }
        const outcome = await readFile(args.filePath);
        if (outcome.success) {
          // The reader returns the document text itself, not a file path.
          return {
            content: [{ type: "text", text: outcome.data }],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "docx_to_pdf": {
        if (!isDocxToPdfArgs(args)) {
          throw new Error("Invalid arguments for docx_to_pdf");
        }
        const outcome = await convertDocxToPdf(args.inputPath, args.outputPath);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "pdf_merger": {
        const { inputPaths, outputDir } = args as {
          inputPaths: string[];
          outputDir: string;
        };
        const outcome = await mergePDFs(inputPaths, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "pdf_splitter": {
        const { inputPath, outputDir, pageRanges } = args as {
          inputPath: string;
          outputDir: string;
          pageRanges: Array<{ start: number; end: number }>;
        };
        const outcome = await splitPDF(inputPath, outputDir, pageRanges);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "docx_to_html": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await docxToHtml(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "html_cleaner": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await cleanHtml(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "html_to_text": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await htmlToText(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "html_to_markdown": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await htmlToMarkdown(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "html_extract_resources": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await extractHtmlResources(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "html_formatter": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await formatHtml(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "text_encoding_converter": {
        const { inputPath, outputDir, fromEncoding, toEncoding } = args as {
          inputPath: string;
          outputDir: string;
          fromEncoding: string;
          toEncoding: string;
        };
        const outcome = await convertTextEncoding(
          inputPath,
          outputDir,
          fromEncoding,
          toEncoding
        );
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "text_formatter": {
        const { inputPath, outputDir } = args as {
          inputPath: string;
          outputDir: string;
        };
        const outcome = await formatText(inputPath, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "text_diff": {
        const { file1Path, file2Path, outputDir } = args as {
          file1Path: string;
          file2Path: string;
          outputDir: string;
        };
        const outcome = await compareTexts(file1Path, file2Path, outputDir);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      case "text_splitter": {
        const { inputPath, outputDir, splitBy, value } = args as {
          inputPath: string;
          outputDir: string;
          splitBy: "lines" | "delimiter";
          value: string;
        };
        const outcome = await splitText(inputPath, outputDir, splitBy, value);
        if (outcome.success) {
          return {
            content: [
              { type: "text", text: fileOperationResponse(outcome.data) },
            ],
            isError: false,
          };
        }
        return {
          content: [{ type: "text", text: `Error: ${outcome.error}` }],
          isError: true,
        };
      }

      default:
        return {
          content: [{ type: "text", text: `Unknown tool: ${name}` }],
          isError: true,
        };
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return {
      content: [
        {
          type: "text",
          text: `Error: ${reason}`,
        },
      ],
      isError: true,
    };
  }
});
/**
 * Connect the server to a freshly created stdio transport, then log a
 * start-up message once the connection call resolves.
 */
async function runServer() {
  const stdioTransport = new StdioServerTransport();
  await server.connect(stdioTransport);
  console.log("MCP Doc Forge Server is running");
}
// Start the server; a rejected start-up is logged and ends the process with
// exit code 1.
runServer().catch((startupFailure) => {
  console.error("Fatal error running server:", startupFailure);
  process.exit(1);
});
/**
 * Wrap the result text of a file-producing tool in a fixed notice that asks
 * the client to offer a download link, with the result itself placed between
 * `<result>` tags.
 *
 * @param data - Value interpolated into the `<result>` section.
 * @returns The multi-line notice, starting and ending with a newline.
 */
function fileOperationResponse(data: any) {
  const noticeLines = [
    "",
    "Note: This operation has generated a file.",
    "The file path is in <result>",
    "Please provide a blank_link download for the file.",
    "ex: The download link: [file_name](/filepath)",
    "<result>",
    `${data}`,
    "</result>",
    "",
  ];
  return noticeLines.join("\n");
}
```
--------------------------------------------------------------------------------
/src/tools/htmlTools.ts:
--------------------------------------------------------------------------------
```typescript
import { Tool } from "@modelcontextprotocol/sdk/types.js";
import { randomBytes } from "crypto";
import * as fs from "fs/promises";
import { JSDOM } from "jsdom";
import * as path from "path";
import TurndownService from "turndown";
/** Build a random identifier: 9 random bytes rendered as 18 hex characters. */
function generateUniqueId(): string {
  const entropy = randomBytes(9);
  return entropy.toString("hex");
}
/**
 * HTML cleaning tool.
 *
 * Tool descriptor named `html_cleaner`. Its input schema requires the string
 * fields `inputPath` and `outputDir`.
 */
export const HTML_CLEAN_TOOL: Tool = {
name: "html_cleaner",
description: "Clean HTML by removing unnecessary tags and attributes",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input HTML file",
},
outputDir: {
type: "string",
description: "Directory where cleaned HTML should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
/**
 * HTML to plain text tool.
 *
 * Tool descriptor named `html_to_text`. Its input schema requires the string
 * fields `inputPath` and `outputDir`.
 */
export const HTML_TO_TEXT_TOOL: Tool = {
name: "html_to_text",
description: "Convert HTML to plain text while preserving structure",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input HTML file",
},
outputDir: {
type: "string",
description: "Directory where text file should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
/**
 * HTML to Markdown tool.
 *
 * Tool descriptor named `html_to_markdown`. Its input schema requires the
 * string fields `inputPath` and `outputDir`.
 */
export const HTML_TO_MARKDOWN_TOOL: Tool = {
name: "html_to_markdown",
description: "Convert HTML to Markdown format",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input HTML file",
},
outputDir: {
type: "string",
description: "Directory where Markdown file should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
/**
 * HTML resource extraction tool.
 *
 * Tool descriptor named `html_extract_resources`. Its input schema requires
 * the string fields `inputPath` and `outputDir`.
 */
export const HTML_EXTRACT_RESOURCES_TOOL: Tool = {
name: "html_extract_resources",
description: "Extract all resources (images, videos, links) from HTML",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input HTML file",
},
outputDir: {
type: "string",
description: "Directory where resources should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
/**
 * HTML formatting tool.
 *
 * Tool descriptor named `html_formatter`. Its input schema requires the
 * string fields `inputPath` and `outputDir`.
 */
export const HTML_FORMAT_TOOL: Tool = {
name: "html_formatter",
description: "Format and beautify HTML code",
inputSchema: {
type: "object",
properties: {
inputPath: {
type: "string",
description: "Path to the input HTML file",
},
outputDir: {
type: "string",
description: "Directory where formatted HTML should be saved",
},
},
required: ["inputPath", "outputDir"],
},
};
/**
 * HTML cleaning implementation.
 *
 * Parses `inputPath` as UTF-8 HTML, removes `<script>`, `<style>`,
 * `<iframe>` and `<noscript>` elements, strips the `style` attribute and
 * every inline event-handler attribute (any attribute whose name starts with
 * `on`), then writes the serialized document to `cleaned_<id>.html` inside
 * `outputDir` (created when missing).
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function cleanHtml(inputPath: string, outputDir: string) {
  try {
    console.error(`Starting HTML cleaning...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Ensure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const uniqueId = generateUniqueId();
    const htmlContent = await fs.readFile(inputPath, "utf-8");
    const dom = new JSDOM(htmlContent);
    const { document } = dom.window;

    // Remove unwanted tags
    const unwantedTags = ["script", "style", "iframe", "noscript"];
    unwantedTags.forEach((tag) => {
      document.querySelectorAll(tag).forEach((el) => el.remove());
    });

    // Remove inline styling and all inline event handlers. Matching on the
    // "on" prefix covers handlers such as onmouseover or onfocus that a fixed
    // list of names would leave behind.
    document.querySelectorAll("*").forEach((el) => {
      for (const attr of el.getAttributeNames()) {
        const lowered = attr.toLowerCase();
        if (lowered === "style" || lowered.startsWith("on")) {
          el.removeAttribute(attr);
        }
      }
    });

    const cleanedHtml = dom.serialize();
    const outputPath = path.join(outputDir, `cleaned_${uniqueId}.html`);
    await fs.writeFile(outputPath, cleanedHtml);
    console.error(`Written cleaned HTML to ${outputPath}`);

    return {
      success: true,
      data: `Successfully cleaned HTML and saved to ${outputPath}`,
    };
  } catch (error) {
    console.error(`Error in cleanHtml:`, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
/**
 * HTML to plain text implementation.
 *
 * Parses `inputPath` as UTF-8 HTML, drops `<script>` and `<style>` elements
 * so their source code does not leak into the output, and writes the trimmed
 * text content of `<body>` to `text_<id>.txt` inside `outputDir` (created
 * when missing).
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function htmlToText(inputPath: string, outputDir: string) {
  try {
    console.error(`Starting HTML to text conversion...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Ensure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const uniqueId = generateUniqueId();
    const htmlContent = await fs.readFile(inputPath, "utf-8");
    const dom = new JSDOM(htmlContent);
    const { document } = dom.window;

    // textContent concatenates every descendant text node, including the
    // contents of <script> and <style>; remove those elements first.
    document
      .querySelectorAll("script, style")
      .forEach((el) => el.remove());

    const text = document.body.textContent?.trim() ?? "";
    const outputPath = path.join(outputDir, `text_${uniqueId}.txt`);
    await fs.writeFile(outputPath, text);
    console.error(`Written text to ${outputPath}`);

    return {
      success: true,
      data: `Successfully converted HTML to text: ${outputPath}`,
    };
  } catch (error) {
    console.error(`Error in htmlToText:`, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
/**
 * HTML to Markdown implementation.
 *
 * Reads `inputPath` as UTF-8, converts the markup with a default
 * `TurndownService` and writes the result to `markdown_<id>.md` inside
 * `outputDir` (created when missing).
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function htmlToMarkdown(inputPath: string, outputDir: string) {
  try {
    console.error(`Starting HTML to Markdown conversion...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Make sure the output directory exists
    let dirPresent = true;
    try {
      await fs.access(outputDir);
    } catch {
      dirPresent = false;
    }
    if (dirPresent) {
      console.error(`Output directory exists: ${outputDir}`);
    } else {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const fileId = generateUniqueId();
    const html = await fs.readFile(inputPath, "utf-8");
    const converter = new TurndownService();
    const markdown = converter.turndown(html);

    const destination = path.join(outputDir, `markdown_${fileId}.md`);
    await fs.writeFile(destination, markdown);
    console.error(`Written Markdown to ${destination}`);

    return {
      success: true,
      data: `Successfully converted HTML to Markdown: ${destination}`,
    };
  } catch (error) {
    console.error(`Error in htmlToMarkdown:`, error);
    const message = error instanceof Error ? error.message : "Unknown error";
    return {
      success: false,
      error: message,
    };
  }
}
/**
 * HTML resource extraction implementation.
 *
 * Parses `inputPath` as UTF-8 HTML and collects, in document order:
 * - `images`: the `src` of every `<img>` that has a `src` attribute,
 * - `links`: the `href` of every `<a>` that has an `href` attribute,
 * - `videos`: the `src` of every `<video src>` and of every `<source src>`
 *   nested in a `<video>`.
 * The lists are written as pretty-printed JSON to `resources_<id>.json`
 * inside `outputDir` (created when missing).
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function extractHtmlResources(
  inputPath: string,
  outputDir: string
) {
  try {
    console.error(`Starting resource extraction...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Ensure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const uniqueId = generateUniqueId();
    const htmlContent = await fs.readFile(inputPath, "utf-8");
    const dom = new JSDOM(htmlContent);
    const { document } = dom.window;

    // Extract resources. Attribute selectors skip elements that carry no
    // URL at all, which would otherwise contribute empty strings.
    const resources = {
      images: Array.from(document.querySelectorAll("img[src]")).map(
        (img) => (img as HTMLImageElement).src
      ),
      links: Array.from(document.querySelectorAll("a[href]")).map(
        (a) => (a as HTMLAnchorElement).href
      ),
      // A video may name its file directly on <video src> or through nested
      // <source> elements; cover both forms.
      videos: Array.from(
        document.querySelectorAll("video[src], video source[src]")
      ).map((media) => (media as HTMLVideoElement | HTMLSourceElement).src),
    };

    const outputPath = path.join(outputDir, `resources_${uniqueId}.json`);
    await fs.writeFile(outputPath, JSON.stringify(resources, null, 2));
    console.error(`Written resources to ${outputPath}`);

    return {
      success: true,
      data: `Successfully extracted resources: ${outputPath}`,
    };
  } catch (error) {
    console.error(`Error in extractHtmlResources:`, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
/**
 * HTML formatting implementation.
 *
 * Parses `inputPath` as UTF-8 HTML and writes the re-serialized document to
 * `formatted_<id>.html` inside `outputDir` (created when missing).
 *
 * NOTE(review): the only transformation applied is a parse followed by
 * `dom.serialize()`; no re-indentation step is performed here. Confirm
 * whether real beautification is expected by the `html_formatter` tool.
 *
 * @returns `{ success: true, data }` naming the written file, or
 *   `{ success: false, error }` when any step throws.
 */
export async function formatHtml(inputPath: string, outputDir: string) {
  try {
    console.error(`Starting HTML formatting...`);
    console.error(`Input file: ${inputPath}`);
    console.error(`Output directory: ${outputDir}`);

    // Ensure the output directory exists
    try {
      await fs.access(outputDir);
      console.error(`Output directory exists: ${outputDir}`);
    } catch {
      console.error(`Creating output directory: ${outputDir}`);
      await fs.mkdir(outputDir, { recursive: true });
      console.error(`Created output directory: ${outputDir}`);
    }

    const uniqueId = generateUniqueId();
    const htmlContent = await fs.readFile(inputPath, "utf-8");
    const dom = new JSDOM(htmlContent);

    // Serialize the parsed document back to markup.
    const formattedHtml = dom.serialize();
    const outputPath = path.join(outputDir, `formatted_${uniqueId}.html`);
    await fs.writeFile(outputPath, formattedHtml);
    console.error(`Written formatted HTML to ${outputPath}`);

    return {
      success: true,
      data: `Successfully formatted HTML: ${outputPath}`,
    };
  } catch (error) {
    console.error(`Error in formatHtml:`, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}
```