microsoft/markitdown # codebase.md

This is page 1 of 5. Use http://codebase.md/microsoft/markitdown?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .devcontainer
│   └── devcontainer.json
├── .dockerignore
├── .gitattributes
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── pre-commit.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE
├── packages
│   ├── markitdown
│   │   ├── pyproject.toml
│   │   ├── README.md
│   │   ├── src
│   │   │   └── markitdown
│   │   │       ├── __about__.py
│   │   │       ├── __init__.py
│   │   │       ├── __main__.py
│   │   │       ├── _base_converter.py
│   │   │       ├── _exceptions.py
│   │   │       ├── _markitdown.py
│   │   │       ├── _stream_info.py
│   │   │       ├── _uri_utils.py
│   │   │       ├── converter_utils
│   │   │       │   ├── __init__.py
│   │   │       │   └── docx
│   │   │       │       ├── __init__.py
│   │   │       │       ├── math
│   │   │       │       │   ├── __init__.py
│   │   │       │       │   ├── latex_dict.py
│   │   │       │       │   └── omml.py
│   │   │       │       └── pre_process.py
│   │   │       ├── converters
│   │   │       │   ├── __init__.py
│   │   │       │   ├── _audio_converter.py
│   │   │       │   ├── _bing_serp_converter.py
│   │   │       │   ├── _csv_converter.py
│   │   │       │   ├── _doc_intel_converter.py
│   │   │       │   ├── _docx_converter.py
│   │   │       │   ├── _epub_converter.py
│   │   │       │   ├── _exiftool.py
│   │   │       │   ├── _html_converter.py
│   │   │       │   ├── _image_converter.py
│   │   │       │   ├── _ipynb_converter.py
│   │   │       │   ├── _llm_caption.py
│   │   │       │   ├── _markdownify.py
│   │   │       │   ├── _outlook_msg_converter.py
│   │   │       │   ├── _pdf_converter.py
│   │   │       │   ├── _plain_text_converter.py
│   │   │       │   ├── _pptx_converter.py
│   │   │       │   ├── _rss_converter.py
│   │   │       │   ├── _transcribe_audio.py
│   │   │       │   ├── _wikipedia_converter.py
│   │   │       │   ├── _xlsx_converter.py
│   │   │       │   ├── _youtube_converter.py
│   │   │       │   └── _zip_converter.py
│   │   │       └── py.typed
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── _test_vectors.py
│   │   │   ├── test_cli_misc.py
│   │   │   ├── test_cli_vectors.py
│   │   │   ├── test_docintel_html.py
│   │   │   ├── test_files
│   │   │   │   ├── equations.docx
│   │   │   │   ├── random.bin
│   │   │   │   ├── rlink.docx
│   │   │   │   ├── test_blog.html
│   │   │   │   ├── test_files.zip
│   │   │   │   ├── test_llm.jpg
│   │   │   │   ├── test_mskanji.csv
│   │   │   │   ├── test_notebook.ipynb
│   │   │   │   ├── test_outlook_msg.msg
│   │   │   │   ├── test_rss.xml
│   │   │   │   ├── test_serp.html
│   │   │   │   ├── test_wikipedia.html
│   │   │   │   ├── test_with_comment.docx
│   │   │   │   ├── test.docx
│   │   │   │   ├── test.epub
│   │   │   │   ├── test.jpg
│   │   │   │   ├── test.json
│   │   │   │   ├── test.m4a
│   │   │   │   ├── test.mp3
│   │   │   │   ├── test.pdf
│   │   │   │   ├── test.pptx
│   │   │   │   ├── test.wav
│   │   │   │   ├── test.xls
│   │   │   │   └── test.xlsx
│   │   │   ├── test_module_misc.py
│   │   │   └── test_module_vectors.py
│   │   └── ThirdPartyNotices.md
│   ├── markitdown-mcp
│   │   ├── Dockerfile
│   │   ├── pyproject.toml
│   │   ├── README.md
│   │   ├── src
│   │   │   └── markitdown_mcp
│   │   │       ├── __about__.py
│   │   │       ├── __init__.py
│   │   │       ├── __main__.py
│   │   │       └── py.typed
│   │   └── tests
│   │       └── __init__.py
│   └── markitdown-sample-plugin
│       ├── pyproject.toml
│       ├── README.md
│       ├── src
│       │   └── markitdown_sample_plugin
│       │       ├── __about__.py
│       │       ├── __init__.py
│       │       ├── _plugin.py
│       │       └── py.typed
│       └── tests
│           ├── __init__.py
│           ├── test_files
│           │   └── test.rtf
│           └── test_sample_plugin.py
├── README.md
├── SECURITY.md
└── SUPPORT.md
```

# Files

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------

```
1 | *
2 | !packages/
3 | 
```

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------

```yaml
1 | repos:
2 |   - repo: https://github.com/psf/black
3 |     rev: 23.7.0 # Use the latest version of Black
4 |     hooks:
5 |       - id: black
6 | 
```

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------

```
1 | packages/markitdown/tests/test_files/** linguist-vendored
2 | packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
3 | 
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
  1 | .vscode
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | cover/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # poetry
100 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
102 | #   commonly ignored for libraries.
103 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # pdm
107 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | #   in version control.
111 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | src/.DS_Store
166 | .DS_Store
167 | .cursorrules
168 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/README.md:
--------------------------------------------------------------------------------

```markdown
 1 | # MarkItDown
 2 | 
 3 | > [!IMPORTANT]
 4 | > MarkItDown is a Python package and command-line utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). 
 5 | >
 6 | > For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
 7 | 
 8 | ## Installation
 9 | 
10 | From PyPI:
11 | 
12 | ```bash
13 | pip install markitdown[all]
14 | ```
15 | 
16 | From source:
17 | 
18 | ```bash
19 | git clone git@github.com:microsoft/markitdown.git
20 | cd markitdown
21 | pip install -e packages/markitdown[all]
22 | ```
23 | 
24 | ## Usage
25 | 
26 | ### Command-Line
27 | 
28 | ```bash
29 | markitdown path-to-file.pdf > document.md
30 | ```
31 | 
32 | ### Python API
33 | 
34 | ```python
35 | from markitdown import MarkItDown
36 | 
37 | md = MarkItDown()
38 | result = md.convert("test.xlsx")
39 | print(result.text_content)
40 | ```
41 | 
42 | ### More Information
43 | 
44 | For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
45 | 
46 | ## Trademarks
47 | 
48 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
49 | trademarks or logos is subject to and must follow
50 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
51 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
52 | Any use of third-party trademarks or logos are subject to those third-party's policies.
53 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/README.md:
--------------------------------------------------------------------------------

```markdown
  1 | # MarkItDown Sample Plugin
  2 | 
  3 | [![PyPI](https://img.shields.io/pypi/v/markitdown-sample-plugin.svg)](https://pypi.org/project/markitdown-sample-plugin/)
  4 | ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-sample-plugin)
  5 | [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
  6 | 
  7 | 
  8 | This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows:
  9 | 
 10 | First, implement your custom DocumentConverter:
 11 | 
 12 | ```python
 13 | from typing import BinaryIO, Any
 14 | from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
 15 | 
 16 | class RtfConverter(DocumentConverter):
 17 | 
 18 |     def __init__(
 19 |         self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
 20 |     ):
 21 |         super().__init__(priority=priority)
 22 | 
 23 |     def accepts(
 24 |         self,
 25 |         file_stream: BinaryIO,
 26 |         stream_info: StreamInfo,
 27 |         **kwargs: Any,
 28 |     ) -> bool:
 29 | 
 30 |         # Implement logic to check if the file stream is an RTF file
 31 |         # ...
 32 |         raise NotImplementedError()
 33 | 
 34 | 
 35 |     def convert(
 36 |         self,
 37 |         file_stream: BinaryIO,
 38 |         stream_info: StreamInfo,
 39 |         **kwargs: Any,
 40 |     ) -> DocumentConverterResult:
 41 | 
 42 |         # Implement logic to convert the file stream to Markdown
 43 |         # ...
 44 |         raise NotImplementedError()
 45 | ```
 46 | 
 47 | Next, make sure your package implements and exports the following:
 48 | 
 49 | ```python
 50 | # The version of the plugin interface that this plugin uses. 
 51 | # The only supported version is 1 for now.
 52 | __plugin_interface_version__ = 1 
 53 | 
 54 | # The main entrypoint for the plugin. This is called each time MarkItDown instances are created.
 55 | def register_converters(markitdown: MarkItDown, **kwargs):
 56 |     """
 57 |     Called during construction of MarkItDown instances to register converters provided by plugins.
 58 |     """
 59 | 
 60 |     # Simply create and attach an RtfConverter instance
 61 |     markitdown.register_converter(RtfConverter())
 62 | ```
 63 | 
 64 | 
 65 | Finally, create an entrypoint in the `pyproject.toml` file:
 66 | 
 67 | ```toml
 68 | [project.entry-points."markitdown.plugin"]
 69 | sample_plugin = "markitdown_sample_plugin"
 70 | ```
 71 | 
 72 | Here, the key `sample_plugin` can be any name, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin.
 73 | 
 74 | 
 75 | ## Installation
 76 | 
 77 | To use the plugin with MarkItDown, it must be installed. To install the plugin from the current directory use:
 78 | 
 79 | ```bash
 80 | pip install -e .
 81 | ```
 82 | 
 83 | Once the plugin package is installed, verify that it is available to MarkItDown by running:
 84 | 
 85 | ```bash
 86 | markitdown --list-plugins
 87 | ```
 88 | 
 89 | To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file:
 90 | 
 91 | ```bash
 92 | markitdown --use-plugins path-to-file.rtf
 93 | ```
 94 | 
 95 | In Python, plugins can be enabled as follows:
 96 | 
 97 | ```python
 98 | from markitdown import MarkItDown
 99 | 
100 | md = MarkItDown(enable_plugins=True) 
101 | result = md.convert("path-to-file.rtf")
102 | print(result.text_content)
103 | ```
104 | 
105 | ## Trademarks
106 | 
107 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
108 | trademarks or logos is subject to and must follow
109 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
110 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
111 | Any use of third-party trademarks or logos are subject to those third-party's policies.
112 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/README.md:
--------------------------------------------------------------------------------

```markdown
  1 | # MarkItDown-MCP
  2 | 
  3 | [![PyPI](https://img.shields.io/pypi/v/markitdown-mcp.svg)](https://pypi.org/project/markitdown-mcp/)
  4 | ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown-mcp)
  5 | [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
  6 | 
  7 | The `markitdown-mcp` package provides a lightweight STDIO, Streamable HTTP, and SSE MCP server for calling MarkItDown.
  8 | 
  9 | It exposes one tool: `convert_to_markdown(uri)`, where uri can be any `http:`, `https:`, `file:`, or `data:` URI.
 10 | 
 11 | ## Installation
 12 | 
 13 | To install the package, use pip:
 14 | 
 15 | ```bash
 16 | pip install markitdown-mcp
 17 | ```
 18 | 
 19 | ## Usage
 20 | 
 21 | To run the MCP server, using STDIO (default) use the following command:
 22 | 
 23 | 
 24 | ```bash	
 25 | markitdown-mcp
 26 | ```
 27 | 
 28 | To run the MCP server, using Streamable HTTP and SSE use the following command:
 29 | 
 30 | ```bash	
 31 | markitdown-mcp --http --host 127.0.0.1 --port 3001
 32 | ```
 33 | 
 34 | ## Running in Docker
 35 | 
 36 | To run `markitdown-mcp` in Docker, build the Docker image using the provided Dockerfile:
 37 | ```bash
 38 | docker build -t markitdown-mcp:latest .
 39 | ```
 40 | 
 41 | And run it using:
 42 | ```bash
 43 | docker run -it --rm markitdown-mcp:latest
 44 | ```
 45 | This will be sufficient for remote URIs. To access local files, you need to mount the local directory into the container. For example, if you want to access files in `/home/user/data`, you can run:
 46 | 
 47 | ```bash
 48 | docker run -it --rm -v /home/user/data:/workdir markitdown-mcp:latest
 49 | ```
 50 | 
 51 | Once mounted, all files under `/home/user/data` will be accessible under `/workdir` in the container. For example, if you have a file `example.txt` in `/home/user/data`, it will be accessible in the container at `/workdir/example.txt`.
 52 | 
 53 | ## Accessing from Claude Desktop
 54 | 
 55 | It is recommended to use the Docker image when running the MCP server for Claude Desktop.
 56 | 
 57 | Follow [these instructions](https://modelcontextprotocol.io/quickstart/user#for-claude-desktop-users) to access Claude's `claude_desktop_config.json` file.
 58 | 
 59 | Edit it to include the following JSON entry:
 60 | 
 61 | ```json
 62 | {
 63 |   "mcpServers": {
 64 |     "markitdown": {
 65 |       "command": "docker",
 66 |       "args": [
 67 |         "run",
 68 |         "--rm",
 69 |         "-i",
 70 |         "markitdown-mcp:latest"
 71 |       ]
 72 |     }
 73 |   }
 74 | }
 75 | ```
 76 | 
 77 | If you want to mount a directory, adjust it accordingly:
 78 | 
 79 | ```json
 80 | {
 81 |   "mcpServers": {
 82 |     "markitdown": {
 83 |       "command": "docker",
 84 |       "args": [
 85 | 	"run",
 86 | 	"--rm",
 87 | 	"-i",
 88 | 	"-v",
 89 | 	"/home/user/data:/workdir",
 90 | 	"markitdown-mcp:latest"
 91 |       ]
 92 |     }
 93 |   }
 94 | }
 95 | ```
 96 | 
 97 | ## Debugging
 98 | 
 99 | To debug the MCP server you can use the MCP Inspector tool.
100 | 
101 | ```bash
102 | npx @modelcontextprotocol/inspector
103 | ```
104 | 
105 | You can then connect to the inspector through the specified host and port (e.g., `http://localhost:5173/`).
106 | 
107 | If using STDIO:
108 | * select `STDIO` as the transport type,
109 | * input `markitdown-mcp` as the command, and
110 | * click `Connect`
111 | 
112 | If using Streamable HTTP:
113 | * select `Streamable HTTP` as the transport type,
114 | * input `http://127.0.0.1:3001/mcp` as the URL, and
115 | * click `Connect`
116 | 
117 | If using SSE:
118 | * select `SSE` as the transport type,
119 | * input `http://127.0.0.1:3001/sse` as the URL, and
120 | * click `Connect`
121 | 
122 | Finally:
123 | * click the `Tools` tab,
124 | * click `List Tools`,
125 | * click `convert_to_markdown`, and
126 | * run the tool on any valid URI.
127 | 
128 | ## Security Considerations
129 | 
130 | The server does not support authentication, and runs with the privileges of the user running it. For this reason, when running in SSE or Streamable HTTP mode, it is recommended to run the server bound to `localhost` (default).
131 | 
132 | ## Trademarks
133 | 
134 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
135 | trademarks or logos is subject to and must follow
136 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
137 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
138 | Any use of third-party trademarks or logos are subject to those third-party's policies.
139 | 
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
  1 | # MarkItDown
  2 | 
  3 | [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
  4 | ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
  5 | [![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)
  6 | 
  7 | > [!TIP]
  8 | > MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
  9 | 
 10 | > [!IMPORTANT]
 11 | > Breaking changes between 0.0.1 and 0.1.0:
 12 | > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior. 
 13 | > * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, which also accepted text file-like objects, like io.StringIO.
 14 | > * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
 15 | 
 16 | MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
 17 | 
 18 | MarkItDown currently supports the conversion from:
 19 | 
 20 | - PDF
 21 | - PowerPoint
 22 | - Word
 23 | - Excel
 24 | - Images (EXIF metadata and OCR)
 25 | - Audio (EXIF metadata and speech transcription)
 26 | - HTML
 27 | - Text-based formats (CSV, JSON, XML)
 28 | - ZIP files (iterates over contents)
 29 | - YouTube URLs
 30 | - EPubs
 31 | - ... and more!
 32 | 
 33 | ## Why Markdown?
 34 | 
 35 | Markdown is extremely close to plain text, with minimal markup or formatting, but still
 36 | provides a way to represent important document structure. Mainstream LLMs, such as
 37 | OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their
 38 | responses unprompted. This suggests that they have been trained on vast amounts of
 39 | Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions
 40 | are also highly token-efficient.
 41 | 
 42 | ## Prerequisites
 43 | MarkItDown requires Python 3.10 or higher. It is recommended to use a virtual environment to avoid dependency conflicts.
 44 | 
 45 | With the standard Python installation, you can create and activate a virtual environment using the following commands:
 46 | 
 47 | ```bash
 48 | python -m venv .venv
 49 | source .venv/bin/activate
 50 | ```
 51 | 
 52 | If using `uv`, you can create a virtual environment with:
 53 | 
 54 | ```bash
 55 | uv venv --python=3.12 .venv
 56 | source .venv/bin/activate
 57 | # NOTE: Be sure to use 'uv pip install' rather than just 'pip install' to install packages in this virtual environment
 58 | ```
 59 | 
 60 | If you are using Anaconda, you can create a virtual environment with:
 61 | 
 62 | ```bash
 63 | conda create -n markitdown python=3.12
 64 | conda activate markitdown
 65 | ```
 66 | 
 67 | ## Installation
 68 | 
 69 | To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
 70 | 
 71 | ```bash
 72 | git clone git@github.com:microsoft/markitdown.git
 73 | cd markitdown
 74 | pip install -e 'packages/markitdown[all]'
 75 | ```
 76 | 
 77 | ## Usage
 78 | 
 79 | ### Command-Line
 80 | 
 81 | ```bash
 82 | markitdown path-to-file.pdf > document.md
 83 | ```
 84 | 
 85 | Or use `-o` to specify the output file:
 86 | 
 87 | ```bash
 88 | markitdown path-to-file.pdf -o document.md
 89 | ```
 90 | 
 91 | You can also pipe content:
 92 | 
 93 | ```bash
 94 | cat path-to-file.pdf | markitdown
 95 | ```
 96 | 
 97 | ### Optional Dependencies
 98 | MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
 99 | 
100 | ```bash
101 | pip install 'markitdown[pdf, docx, pptx]'
102 | ```
103 | 
104 | will install only the dependencies for PDF, DOCX, and PPTX files.
105 | 
106 | At the moment, the following optional dependencies are available:
107 | 
108 | * `[all]` Installs all optional dependencies
109 | * `[pptx]` Installs dependencies for PowerPoint files
110 | * `[docx]` Installs dependencies for Word files
111 | * `[xlsx]` Installs dependencies for Excel files
112 | * `[xls]` Installs dependencies for older Excel files
113 | * `[pdf]` Installs dependencies for PDF files
114 | * `[outlook]` Installs dependencies for Outlook messages
115 | * `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
116 | * `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
117 | * `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
118 | 
119 | ### Plugins
120 | 
121 | MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins:
122 | 
123 | ```bash
124 | markitdown --list-plugins
125 | ```
126 | 
127 | To enable plugins use:
128 | 
129 | ```bash
130 | markitdown --use-plugins path-to-file.pdf
131 | ```
132 | 
133 | To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. To develop a plugin, see `packages/markitdown-sample-plugin`.
134 | 
135 | ### Azure Document Intelligence
136 | 
137 | To use Microsoft Document Intelligence for conversion:
138 | 
139 | ```bash
140 | markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
141 | ```
142 | 
143 | More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
144 | 
145 | ### Python API
146 | 
147 | Basic usage in Python:
148 | 
149 | ```python
150 | from markitdown import MarkItDown
151 | 
152 | md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
153 | result = md.convert("test.xlsx")
154 | print(result.text_content)
155 | ```
156 | 
157 | Document Intelligence conversion in Python:
158 | 
159 | ```python
160 | from markitdown import MarkItDown
161 | 
162 | md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
163 | result = md.convert("test.pdf")
164 | print(result.text_content)
165 | ```
166 | 
167 | To use Large Language Models for image descriptions (currently only for pptx and image files), provide `llm_client` and `llm_model`:
168 | 
169 | ```python
170 | from markitdown import MarkItDown
171 | from openai import OpenAI
172 | 
173 | client = OpenAI()
174 | md = MarkItDown(llm_client=client, llm_model="gpt-4o", llm_prompt="optional custom prompt")
175 | result = md.convert("example.jpg")
176 | print(result.text_content)
177 | ```
178 | 
179 | ### Docker
180 | 
181 | ```sh
182 | docker build -t markitdown:latest .
183 | docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
184 | ```
185 | 
186 | ## Contributing
187 | 
188 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
189 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
190 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
191 | 
192 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
193 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
194 | provided by the bot. You will only need to do this once across all repos using our CLA.
195 | 
196 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
197 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
198 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
199 | 
200 | ### How to Contribute
201 | 
202 | You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
203 | 
204 | <div align="center">
205 | 
206 | |            | All                                                          | Especially Needs Help from Community                                                                                                      |
207 | | ---------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
208 | | **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
209 | | **PRs**    | [All PRs](https://github.com/microsoft/markitdown/pulls)     | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22)              |
210 | 
211 | </div>
212 | 
213 | ### Running Tests and Checks
214 | 
215 | - Navigate to the MarkItDown package:
216 | 
217 |   ```sh
218 |   cd packages/markitdown
219 |   ```
220 | 
221 | - Install `hatch` in your environment and run tests:
222 | 
223 |   ```sh
224 |   pip install hatch  # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
225 |   hatch shell
226 |   hatch test
227 |   ```
228 | 
229 |   (Alternative) Use the Devcontainer which has all the dependencies installed:
230 | 
231 |   ```sh
232 |   # Reopen the project in Devcontainer and run:
233 |   hatch test
234 |   ```
235 | 
236 | - Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
237 | 
238 | ### Contributing 3rd-party Plugins
239 | 
240 | You can also contribute by creating and sharing 3rd party plugins. See `packages/markitdown-sample-plugin` for more details.
241 | 
242 | ## Trademarks
243 | 
244 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
245 | trademarks or logos is subject to and must follow
246 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
247 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
248 | Any use of third-party trademarks or logos is subject to those third parties' policies.
249 | 
```

--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------

```markdown
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 
```

--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------

```markdown
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converter_utils/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/tests/__init__.py:
--------------------------------------------------------------------------------

```python
1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
2 | #
3 | # SPDX-License-Identifier: MIT
4 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/tests/__init__.py:
--------------------------------------------------------------------------------

```python
1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
2 | #
3 | # SPDX-License-Identifier: MIT
4 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/__init__.py:
--------------------------------------------------------------------------------

```python
1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
2 | #
3 | # SPDX-License-Identifier: MIT
4 | 
```

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------

```yaml
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/__about__.py:
--------------------------------------------------------------------------------

```python
1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
2 | #
3 | # SPDX-License-Identifier: MIT
4 | __version__ = "0.1.4"
5 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/src/markitdown_mcp/__about__.py:
--------------------------------------------------------------------------------

```python
1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
2 | #
3 | # SPDX-License-Identifier: MIT
4 | __version__ = "0.0.1a4"
5 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py:
--------------------------------------------------------------------------------

```python
1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
2 | #
3 | # SPDX-License-Identifier: MIT
4 | __version__ = "0.1.0a1"
5 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/src/markitdown_mcp/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
 2 | #
 3 | # SPDX-License-Identifier: MIT
 4 | 
 5 | from .__about__ import __version__
 6 | 
 7 | __all__ = [
 8 |     "__version__",
 9 | ]
10 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_files/test.json:
--------------------------------------------------------------------------------

```json
 1 | {
 2 |     "key1": "string_value",
 3 |     "key2": 1234,
 4 |     "key3": [
 5 |         "list_value1",
 6 |         "list_value2"
 7 |     ],
 8 |     "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
 9 |     "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
10 | }
11 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
 2 | #
 3 | # SPDX-License-Identifier: MIT
 4 | 
 5 | from ._plugin import __plugin_interface_version__, register_converters, RtfConverter
 6 | from .__about__ import __version__
 7 | 
 8 | __all__ = [
 9 |     "__version__",
10 |     "__plugin_interface_version__",
11 |     "register_converters",
12 |     "RtfConverter",
13 | ]
14 | 
```

--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------

```yaml
 1 | name: tests
 2 | on: [pull_request]
 3 | 
 4 | jobs:
 5 |   tests:
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |       - uses: actions/checkout@v5
 9 |       - uses: actions/setup-python@v5
10 |         with:
11 |           python-version: |
12 |             3.10
13 |             3.11
14 |             3.12
15 |       - name: Install Hatch
16 |         run: pipx install hatch
17 |       - name: Run tests
18 |         run: cd packages/markitdown; hatch test
19 | 
```

--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------

```yaml
 1 | name: pre-commit
 2 | on: [pull_request]
 3 | 
 4 | jobs:
 5 |   pre-commit:
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |       - uses: actions/checkout@v5
 9 |       - name: Set up Python
10 |         uses: actions/setup-python@v5
11 |         with:
12 |           python-version: "3.x"
13 | 
14 |       - name: Install pre-commit
15 |         run: |
16 |           pip install pre-commit
17 |           pre-commit install --install-hooks
18 | 
19 |       - name: Run pre-commit
20 |         run: pre-commit run --all-files
21 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/Dockerfile:
--------------------------------------------------------------------------------

```dockerfile
 1 | FROM python:3.13-slim-bullseye
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | ENV EXIFTOOL_PATH=/usr/bin/exiftool
 5 | ENV FFMPEG_PATH=/usr/bin/ffmpeg
 6 | ENV MARKITDOWN_ENABLE_PLUGINS=True
 7 | 
 8 | # Runtime dependency
 9 | # NOTE: Add any additional MarkItDown plugins here
10 | RUN apt-get update && apt-get install -y --no-install-recommends \
11 |     ffmpeg \
12 |     exiftool
13 | 
14 | # Cleanup
15 | RUN rm -rf /var/lib/apt/lists/*
16 | 
17 | COPY . /app
18 | RUN pip --no-cache-dir install /app
19 | 
20 | WORKDIR /workdir
21 | 
22 | # Default USERID and GROUPID
23 | ARG USERID=nobody
24 | ARG GROUPID=nogroup
25 | 
26 | USER $USERID:$GROUPID
27 | 
28 | ENTRYPOINT [ "markitdown-mcp" ]
29 | 
```

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------

```dockerfile
 1 | FROM python:3.13-slim-bullseye
 2 | 
 3 | ENV DEBIAN_FRONTEND=noninteractive
 4 | ENV EXIFTOOL_PATH=/usr/bin/exiftool
 5 | ENV FFMPEG_PATH=/usr/bin/ffmpeg
 6 | 
 7 | # Runtime dependency
 8 | RUN apt-get update && apt-get install -y --no-install-recommends \
 9 |     ffmpeg \
10 |     exiftool
11 | 
12 | ARG INSTALL_GIT=false
13 | RUN if [ "$INSTALL_GIT" = "true" ]; then \
14 |     apt-get install -y --no-install-recommends \
15 |     git; \
16 |     fi
17 | 
18 | # Cleanup
19 | RUN rm -rf /var/lib/apt/lists/*
20 | 
21 | WORKDIR /app
22 | COPY . /app
23 | RUN pip --no-cache-dir install \
24 |     /app/packages/markitdown[all] \
25 |     /app/packages/markitdown-sample-plugin
26 | 
27 | # Default USERID and GROUPID
28 | ARG USERID=nobody
29 | ARG GROUPID=nogroup
30 | 
31 | USER $USERID:$GROUPID
32 | 
33 | ENTRYPOINT [ "markitdown" ]
34 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
 2 | #
 3 | # SPDX-License-Identifier: MIT
 4 | 
 5 | from .__about__ import __version__
 6 | from ._markitdown import (
 7 |     MarkItDown,
 8 |     PRIORITY_SPECIFIC_FILE_FORMAT,
 9 |     PRIORITY_GENERIC_FILE_FORMAT,
10 | )
11 | from ._base_converter import DocumentConverterResult, DocumentConverter
12 | from ._stream_info import StreamInfo
13 | from ._exceptions import (
14 |     MarkItDownException,
15 |     MissingDependencyException,
16 |     FailedConversionAttempt,
17 |     FileConversionException,
18 |     UnsupportedFormatException,
19 | )
20 | 
21 | __all__ = [
22 |     "__version__",
23 |     "MarkItDown",
24 |     "DocumentConverter",
25 |     "DocumentConverterResult",
26 |     "MarkItDownException",
27 |     "MissingDependencyException",
28 |     "FailedConversionAttempt",
29 |     "FileConversionException",
30 |     "UnsupportedFormatException",
31 |     "StreamInfo",
32 |     "PRIORITY_SPECIFIC_FILE_FORMAT",
33 |     "PRIORITY_GENERIC_FILE_FORMAT",
34 | ]
35 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_docintel_html.py:
--------------------------------------------------------------------------------

```python
 1 | import io
 2 | from markitdown.converters._doc_intel_converter import (
 3 |     DocumentIntelligenceConverter,
 4 |     DocumentIntelligenceFileType,
 5 | )
 6 | from markitdown._stream_info import StreamInfo
 7 | 
 8 | 
 9 | def _make_converter(file_types):
10 |     conv = DocumentIntelligenceConverter.__new__(DocumentIntelligenceConverter)
11 |     conv._file_types = file_types
12 |     return conv
13 | 
14 | 
15 | def test_docintel_accepts_html_extension():
16 |     conv = _make_converter([DocumentIntelligenceFileType.HTML])
17 |     stream_info = StreamInfo(mimetype=None, extension=".html")
18 |     assert conv.accepts(io.BytesIO(b""), stream_info)
19 | 
20 | 
21 | def test_docintel_accepts_html_mimetype():
22 |     conv = _make_converter([DocumentIntelligenceFileType.HTML])
23 |     stream_info = StreamInfo(mimetype="text/html", extension=None)
24 |     assert conv.accepts(io.BytesIO(b""), stream_info)
25 |     stream_info = StreamInfo(mimetype="application/xhtml+xml", extension=None)
26 |     assert conv.accepts(io.BytesIO(b""), stream_info)
27 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_stream_info.py:
--------------------------------------------------------------------------------

```python
 1 | from dataclasses import dataclass, asdict
 2 | from typing import Optional
 3 | 
 4 | 
 5 | @dataclass(kw_only=True, frozen=True)
 6 | class StreamInfo:
 7 |     """The StreamInfo class is used to store information about a file stream.
 8 |     All fields can be None, and will depend on how the stream was opened.
 9 |     """
10 | 
11 |     mimetype: Optional[str] = None
12 |     extension: Optional[str] = None
13 |     charset: Optional[str] = None
14 |     filename: Optional[
15 |         str
16 |     ] = None  # From local path, url, or Content-Disposition header
17 |     local_path: Optional[str] = None  # If read from disk
18 |     url: Optional[str] = None  # If read from url
19 | 
20 |     def copy_and_update(self, *args, **kwargs):
21 |         """Copy the StreamInfo object and update it with the given StreamInfo
22 |         instance and/or other keyword arguments."""
23 |         new_info = asdict(self)
24 | 
25 |         for si in args:
26 |             assert isinstance(si, StreamInfo)
27 |             new_info.update({k: v for k, v in asdict(si).items() if v is not None})
28 | 
29 |         if len(kwargs) > 0:
30 |             new_info.update(kwargs)
31 | 
32 |         return StreamInfo(**new_info)
33 | 
```

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------

```json
 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 3 | {
 4 | 	"name": "Existing Dockerfile",
 5 | 	"build": {
 6 | 		// Sets the run context to one level up instead of the .devcontainer folder.
 7 | 		"context": "..",
 8 | 		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
 9 | 		"dockerfile": "../Dockerfile",
10 | 		"args": {
11 | 			"INSTALL_GIT": "true"
12 | 		}
13 | 	},
14 | 
15 | 	// Features to add to the dev container. More info: https://containers.dev/features.
16 | 	// "features": {},
17 | 	"features": {
18 | 		"ghcr.io/devcontainers-extra/features/hatch:2": {}
19 | 	},
20 | 
21 | 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
22 | 	// "forwardPorts": [],
23 | 
24 | 	// Uncomment the next line to run commands after the container is created.
25 | 	// "postCreateCommand": "cat /etc/os-release",
26 | 
27 | 	// Configure tool-specific properties.
28 | 	// "customizations": {},
29 | 
30 | 	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
31 | 	"remoteUser": "root"
32 | }
33 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_cli_misc.py:
--------------------------------------------------------------------------------

```python
 1 | #!/usr/bin/env python3 -m pytest
 2 | import subprocess
 3 | from markitdown import __version__
 4 | 
 5 | # This file contains CLI tests that are not directly tested by the FileTestVectors.
 6 | # This includes things like help messages, version numbers, and invalid flags.
 7 | 
 8 | 
 9 | def test_version() -> None:
10 |     result = subprocess.run(
11 |         ["python", "-m", "markitdown", "--version"], capture_output=True, text=True
12 |     )
13 | 
14 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
15 |     assert __version__ in result.stdout, f"Version not found in output: {result.stdout}"
16 | 
17 | 
18 | def test_invalid_flag() -> None:
19 |     result = subprocess.run(
20 |         ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True
21 |     )
22 | 
23 |     assert result.returncode != 0, f"CLI exited with error: {result.stderr}"
24 |     assert (
25 |         "unrecognized arguments" in result.stderr
26 |     ), "Expected 'unrecognized arguments' to appear in STDERR"
27 |     assert "SYNTAX" in result.stderr, "Expected 'SYNTAX' to appear in STDERR"
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     """Runs this file's tests from the command line."""
32 |     test_version()
33 |     test_invalid_flag()
34 |     print("All tests passed!")
35 | 
```

--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------

```markdown
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/tests/test_sample_plugin.py:
--------------------------------------------------------------------------------

```python
 1 | #!/usr/bin/env python3 -m pytest
 2 | import os
 3 | 
 4 | from markitdown import MarkItDown, StreamInfo
 5 | from markitdown_sample_plugin import RtfConverter
 6 | 
 7 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 8 | 
 9 | RTF_TEST_STRINGS = {
10 |     "This is a Sample RTF File",
11 |     "It is included to test if the MarkItDown sample plugin can correctly convert RTF files.",
12 | }
13 | 
14 | 
15 | def test_converter() -> None:
16 |     """Tests the RTF converter directly."""
17 |     with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
18 |         converter = RtfConverter()
19 |         result = converter.convert(
20 |             file_stream=file_stream,
21 |             stream_info=StreamInfo(
22 |                 mimetype="text/rtf", extension=".rtf", filename="test.rtf"
23 |             ),
24 |         )
25 | 
26 |         for test_string in RTF_TEST_STRINGS:
27 |             assert test_string in result.text_content
28 | 
29 | 
30 | def test_markitdown() -> None:
31 |     """Tests that MarkItDown correctly loads the plugin."""
32 |     md = MarkItDown(enable_plugins=True)
33 |     result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
34 | 
35 |     for test_string in RTF_TEST_STRINGS:
36 |         assert test_string in result.text_content
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     """Runs this file's tests from the command line."""
41 |     test_converter()
42 |     test_markitdown()
43 |     print("All tests passed.")
44 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_llm_caption.py:
--------------------------------------------------------------------------------

```python
 1 | from typing import BinaryIO, Union
 2 | import base64
 3 | import mimetypes
 4 | from .._stream_info import StreamInfo
 5 | 
 6 | 
 7 | def llm_caption(
 8 |     file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
 9 | ) -> Union[None, str]:
10 |     if prompt is None or prompt.strip() == "":
11 |         prompt = "Write a detailed caption for this image."
12 | 
13 |     # Get the content type
14 |     content_type = stream_info.mimetype
15 |     if not content_type:
16 |         content_type, _ = mimetypes.guess_type("_dummy" + (stream_info.extension or ""))
17 |     if not content_type:
18 |         content_type = "application/octet-stream"
19 | 
20 |     # Convert to base64
21 |     cur_pos = file_stream.tell()
22 |     try:
23 |         base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
24 |     except Exception as e:
25 |         return None
26 |     finally:
27 |         file_stream.seek(cur_pos)
28 | 
29 |     # Prepare the data-uri
30 |     data_uri = f"data:{content_type};base64,{base64_image}"
31 | 
32 |     # Prepare the OpenAI API request
33 |     messages = [
34 |         {
35 |             "role": "user",
36 |             "content": [
37 |                 {"type": "text", "text": prompt},
38 |                 {
39 |                     "type": "image_url",
40 |                     "image_url": {
41 |                         "url": data_uri,
42 |                     },
43 |                 },
44 |             ],
45 |         }
46 |     ]
47 | 
48 |     # Call the OpenAI API
49 |     response = client.chat.completions.create(model=model, messages=messages)
50 |     return response.choices[0].message.content
51 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_exiftool.py:
--------------------------------------------------------------------------------

```python
 1 | import json
 2 | import locale
 3 | import subprocess
 4 | from typing import Any, BinaryIO, Union
 5 | 
 6 | 
 7 | def _parse_version(version: str) -> tuple:
 8 |     return tuple(map(int, (version.split("."))))
 9 | 
10 | 
11 | def exiftool_metadata(
12 |     file_stream: BinaryIO,
13 |     *,
14 |     exiftool_path: Union[str, None],
15 | ) -> Any:  # Need a better type for json data
16 |     # Nothing to do
17 |     if not exiftool_path:
18 |         return {}
19 | 
20 |     # Verify exiftool version
21 |     try:
22 |         version_output = subprocess.run(
23 |             [exiftool_path, "-ver"],
24 |             capture_output=True,
25 |             text=True,
26 |             check=True,
27 |         ).stdout.strip()
28 |         version = _parse_version(version_output)
29 |         min_version = (12, 24)
30 |         if version < min_version:
31 |             raise RuntimeError(
32 |                 f"ExifTool version {version_output} is vulnerable to CVE-2021-22204. "
33 |                 "Please upgrade to version 12.24 or later."
34 |             )
35 |     except (subprocess.CalledProcessError, ValueError) as e:
36 |         raise RuntimeError("Failed to verify ExifTool version.") from e
37 | 
38 |     # Run exiftool
39 |     cur_pos = file_stream.tell()
40 |     try:
41 |         output = subprocess.run(
42 |             [exiftool_path, "-json", "-"],
43 |             input=file_stream.read(),
44 |             capture_output=True,
45 |             text=False,
46 |         ).stdout
47 | 
48 |         return json.loads(
49 |             output.decode(locale.getpreferredencoding(False)),
50 |         )[0]
51 |     finally:
52 |         file_stream.seek(cur_pos)
53 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
 2 | #
 3 | # SPDX-License-Identifier: MIT
 4 | 
 5 | from ._plain_text_converter import PlainTextConverter
 6 | from ._html_converter import HtmlConverter
 7 | from ._rss_converter import RssConverter
 8 | from ._wikipedia_converter import WikipediaConverter
 9 | from ._youtube_converter import YouTubeConverter
10 | from ._ipynb_converter import IpynbConverter
11 | from ._bing_serp_converter import BingSerpConverter
12 | from ._pdf_converter import PdfConverter
13 | from ._docx_converter import DocxConverter
14 | from ._xlsx_converter import XlsxConverter, XlsConverter
15 | from ._pptx_converter import PptxConverter
16 | from ._image_converter import ImageConverter
17 | from ._audio_converter import AudioConverter
18 | from ._outlook_msg_converter import OutlookMsgConverter
19 | from ._zip_converter import ZipConverter
20 | from ._doc_intel_converter import (
21 |     DocumentIntelligenceConverter,
22 |     DocumentIntelligenceFileType,
23 | )
24 | from ._epub_converter import EpubConverter
25 | from ._csv_converter import CsvConverter
26 | 
27 | __all__ = [
28 |     "PlainTextConverter",
29 |     "HtmlConverter",
30 |     "RssConverter",
31 |     "WikipediaConverter",
32 |     "YouTubeConverter",
33 |     "IpynbConverter",
34 |     "BingSerpConverter",
35 |     "PdfConverter",
36 |     "DocxConverter",
37 |     "XlsxConverter",
38 |     "XlsConverter",
39 |     "PptxConverter",
40 |     "ImageConverter",
41 |     "AudioConverter",
42 |     "OutlookMsgConverter",
43 |     "ZipConverter",
44 |     "DocumentIntelligenceConverter",
45 |     "DocumentIntelligenceFileType",
46 |     "EpubConverter",
47 |     "CsvConverter",
48 | ]
49 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_uri_utils.py:
--------------------------------------------------------------------------------

```python
 1 | import base64
 2 | import os
 3 | from typing import Tuple, Dict
 4 | from urllib.request import url2pathname
 5 | from urllib.parse import urlparse, unquote_to_bytes
 6 | 
 7 | 
 8 | def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
 9 |     """Convert a file URI to a local file path"""
10 |     parsed = urlparse(file_uri)
11 |     if parsed.scheme != "file":
12 |         raise ValueError(f"Not a file URL: {file_uri}")
13 | 
14 |     netloc = parsed.netloc if parsed.netloc else None
15 |     path = os.path.abspath(url2pathname(parsed.path))
16 |     return netloc, path
17 | 
18 | 
19 | def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
20 |     if not uri.startswith("data:"):
21 |         raise ValueError("Not a data URI")
22 | 
23 |     header, _, data = uri.partition(",")
24 |     if not _:
25 |         raise ValueError("Malformed data URI, missing ',' separator")
26 | 
27 |     meta = header[5:]  # Strip 'data:'
28 |     parts = meta.split(";")
29 | 
30 |     is_base64 = False
31 |     # Ends with base64?
32 |     if parts[-1] == "base64":
33 |         parts.pop()
34 |         is_base64 = True
35 | 
36 |     mime_type = None  # Normally this would default to text/plain but we won't assume
37 |     if len(parts) and len(parts[0]) > 0:
38 |         # First part is the mime type
39 |         mime_type = parts.pop(0)
40 | 
41 |     attributes: Dict[str, str] = {}
42 |     for part in parts:
43 |         # Handle key=value pairs in the middle
44 |         if "=" in part:
45 |             key, value = part.split("=", 1)
46 |             attributes[key] = value
47 |         elif len(part) > 0:
48 |             attributes[part] = ""
49 | 
50 |     content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
51 | 
52 |     return mime_type, attributes, content
53 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/pyproject.toml:
--------------------------------------------------------------------------------

```toml
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "markitdown-mcp"
 7 | dynamic = ["version"]
 8 | description = 'An MCP server for the "markitdown" library.'
 9 | readme = "README.md"
10 | requires-python = ">=3.10"
11 | license = "MIT"
12 | keywords = []
13 | authors = [
14 |   { name = "Adam Fourney", email = "[email protected]" },
15 | ]
16 | classifiers = [
17 |   "Development Status :: 4 - Beta",
18 |   "Programming Language :: Python",
19 |   "Programming Language :: Python :: 3.10",
20 |   "Programming Language :: Python :: 3.11",
21 |   "Programming Language :: Python :: 3.12",
22 |   "Programming Language :: Python :: 3.13",
23 |   "Programming Language :: Python :: Implementation :: CPython",
24 |   "Programming Language :: Python :: Implementation :: PyPy",
25 | ]
26 | dependencies = [
27 |   "mcp~=1.8.0",
28 |   "markitdown[all]>=0.1.1,<0.2.0",
29 | ]
30 | 
31 | [project.urls]
32 | Documentation = "https://github.com/microsoft/markitdown#readme"
33 | Issues = "https://github.com/microsoft/markitdown/issues"
34 | Source = "https://github.com/microsoft/markitdown"
35 | 
36 | [tool.hatch.version]
37 | path = "src/markitdown_mcp/__about__.py"
38 | 
39 | [project.scripts]
40 | markitdown-mcp = "markitdown_mcp.__main__:main"
41 | 
42 | [tool.hatch.envs.types]
43 | extra-dependencies = [
44 |   "mypy>=1.0.0",
45 | ]
46 | [tool.hatch.envs.types.scripts]
47 | check = "mypy --install-types --non-interactive {args:src/markitdown_mcp tests}"
48 | 
49 | [tool.coverage.run]
50 | source_pkgs = ["markitdown-mcp", "tests"]
51 | branch = true
52 | parallel = true
53 | omit = [
54 |   "src/markitdown_mcp/__about__.py",
55 | ]
56 | 
57 | [tool.coverage.paths]
58 | markitdown-mcp = ["src/markitdown_mcp", "*/markitdown-mcp/src/markitdown_mcp"]
59 | tests = ["tests", "*/markitdown-mcp/tests"]
60 | 
61 | [tool.coverage.report]
62 | exclude_lines = [
63 |   "no cov",
64 |   "if __name__ == .__main__.:",
65 |   "if TYPE_CHECKING:",
66 | ]
67 | 
68 | [tool.hatch.build.targets.sdist]
69 | only-include = ["src/markitdown_mcp"]
70 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py:
--------------------------------------------------------------------------------

```python
 1 | import locale
 2 | from typing import BinaryIO, Any
 3 | from striprtf.striprtf import rtf_to_text
 4 | 
 5 | from markitdown import (
 6 |     MarkItDown,
 7 |     DocumentConverter,
 8 |     DocumentConverterResult,
 9 |     StreamInfo,
10 | )
11 | 
12 | 
13 | __plugin_interface_version__ = (
14 |     1  # The version of the plugin interface that this plugin uses
15 | )
16 | 
17 | ACCEPTED_MIME_TYPE_PREFIXES = [
18 |     "text/rtf",
19 |     "application/rtf",
20 | ]
21 | 
22 | ACCEPTED_FILE_EXTENSIONS = [".rtf"]
23 | 
24 | 
25 | def register_converters(markitdown: MarkItDown, **kwargs):
26 |     """
27 |     Called during construction of MarkItDown instances to register converters provided by plugins.
28 |     """
29 | 
30 |     # Simply create and attach an RtfConverter instance
31 |     markitdown.register_converter(RtfConverter())
32 | 
33 | 
34 | class RtfConverter(DocumentConverter):
35 |     """
36 |     Converts an RTF file to plain text in the simplest possible way.
37 |     """
38 | 
39 |     def accepts(
40 |         self,
41 |         file_stream: BinaryIO,
42 |         stream_info: StreamInfo,
43 |         **kwargs: Any,
44 |     ) -> bool:
45 |         mimetype = (stream_info.mimetype or "").lower()
46 |         extension = (stream_info.extension or "").lower()
47 | 
48 |         if extension in ACCEPTED_FILE_EXTENSIONS:
49 |             return True
50 | 
51 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
52 |             if mimetype.startswith(prefix):
53 |                 return True
54 | 
55 |         return False
56 | 
57 |     def convert(
58 |         self,
59 |         file_stream: BinaryIO,
60 |         stream_info: StreamInfo,
61 |         **kwargs: Any,
62 |     ) -> DocumentConverterResult:
63 |         # Read the file stream into a str using the provided charset encoding, or using the system default
64 |         encoding = stream_info.charset or locale.getpreferredencoding()
65 |         stream_data = file_stream.read().decode(encoding)
66 | 
67 |         # Return the result
68 |         return DocumentConverterResult(
69 |             title=None,
70 |             markdown=rtf_to_text(stream_data),
71 |         )
72 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_transcribe_audio.py:
--------------------------------------------------------------------------------

```python
 1 | import io
 2 | import sys
 3 | from typing import BinaryIO
 4 | from .._exceptions import MissingDependencyException
 5 | 
 6 | # Try loading optional (but in this case, required) dependencies
 7 | # Save reporting of any exceptions for later
 8 | _dependency_exc_info = None
 9 | try:
10 |     # Suppress some warnings on library import
11 |     import warnings
12 | 
13 |     with warnings.catch_warnings():
14 |         warnings.filterwarnings("ignore", category=DeprecationWarning)
15 |         warnings.filterwarnings("ignore", category=SyntaxWarning)
16 |         import speech_recognition as sr
17 |         import pydub
18 | except ImportError:
19 |     # Preserve the error and stack trace for later
20 |     _dependency_exc_info = sys.exc_info()
21 | 
22 | 
23 | def transcribe_audio(file_stream: BinaryIO, *, audio_format: str = "wav") -> str:
24 |     # Check for installed dependencies
25 |     if _dependency_exc_info is not None:
26 |         raise MissingDependencyException(
27 |             "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
28 |         ) from _dependency_exc_info[
29 |             1
30 |         ].with_traceback(  # type: ignore[union-attr]
31 |             _dependency_exc_info[2]
32 |         )
33 | 
34 |     if audio_format in ["wav", "aiff", "flac"]:
35 |         audio_source = file_stream
36 |     elif audio_format in ["mp3", "mp4"]:
37 |         audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
38 | 
39 |         audio_source = io.BytesIO()
40 |         audio_segment.export(audio_source, format="wav")
41 |         audio_source.seek(0)
42 |     else:
43 |         raise ValueError(f"Unsupported audio format: {audio_format}")
44 | 
45 |     recognizer = sr.Recognizer()
46 |     with sr.AudioFile(audio_source) as source:
47 |         audio = recognizer.record(source)
48 |         transcript = recognizer.recognize_google(audio).strip()
49 |         return "[No speech detected]" if transcript == "" else transcript
50 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-sample-plugin/pyproject.toml:
--------------------------------------------------------------------------------

```toml
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "markitdown-sample-plugin"
 7 | dynamic = ["version"]
 8 | description = 'A sample plugin for the "markitdown" library.'
 9 | readme = "README.md"
10 | requires-python = ">=3.10"
11 | license = "MIT"
12 | keywords = []
13 | authors = [
14 |   { name = "Adam Fourney", email = "[email protected]" },
15 | ]
16 | classifiers = [
17 |   "Development Status :: 4 - Beta",
18 |   "Programming Language :: Python",
19 |   "Programming Language :: Python :: 3.10",
20 |   "Programming Language :: Python :: 3.11",
21 |   "Programming Language :: Python :: 3.12",
22 |   "Programming Language :: Python :: 3.13",
23 |   "Programming Language :: Python :: Implementation :: CPython",
24 |   "Programming Language :: Python :: Implementation :: PyPy",
25 | ]
26 | dependencies = [
27 |   "markitdown>=0.1.0a1",
28 |   "striprtf",
29 | ]
30 | 
31 | [project.urls]
32 | Documentation = "https://github.com/microsoft/markitdown#readme"
33 | Issues = "https://github.com/microsoft/markitdown/issues"
34 | Source = "https://github.com/microsoft/markitdown"
35 | 
36 | [tool.hatch.version]
37 | path = "src/markitdown_sample_plugin/__about__.py"
38 | 
39 | # IMPORTANT: MarkItDown will look for this entry point to find the plugin.
40 | [project.entry-points."markitdown.plugin"]
41 | sample_plugin = "markitdown_sample_plugin"
42 | 
43 | [tool.hatch.envs.types]
44 | extra-dependencies = [
45 |   "mypy>=1.0.0",
46 | ]
47 | [tool.hatch.envs.types.scripts]
48 | check = "mypy --install-types --non-interactive {args:src/markitdown_sample_plugin tests}"
49 | 
50 | [tool.coverage.run]
51 | source_pkgs = ["markitdown-sample-plugin", "tests"]
52 | branch = true
53 | parallel = true
54 | omit = [
55 |   "src/markitdown_sample_plugin/__about__.py",
56 | ]
57 | 
58 | [tool.coverage.paths]
59 | markitdown-sample-plugin = ["src/markitdown_sample_plugin", "*/markitdown-sample-plugin/src/markitdown_sample_plugin"]
60 | tests = ["tests", "*/markitdown-sample-plugin/tests"]
61 | 
62 | [tool.coverage.report]
63 | exclude_lines = [
64 |   "no cov",
65 |   "if __name__ == .__main__.:",
66 |   "if TYPE_CHECKING:",
67 | ]
68 | 
69 | [tool.hatch.build.targets.sdist]
70 | only-include = ["src/markitdown_sample_plugin"]
71 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_plain_text_converter.py:
--------------------------------------------------------------------------------

```python
 1 | import sys
 2 | 
 3 | from typing import BinaryIO, Any
 4 | from charset_normalizer import from_bytes
 5 | from .._base_converter import DocumentConverter, DocumentConverterResult
 6 | from .._stream_info import StreamInfo
 7 | 
 8 | # Try loading optional (but in this case, required) dependencies
 9 | # Save reporting of any exceptions for later
10 | _dependency_exc_info = None
11 | try:
12 |     import mammoth  # noqa: F401
13 | except ImportError:
14 |     # Preserve the error and stack trace for later
15 |     _dependency_exc_info = sys.exc_info()
16 | 
17 | ACCEPTED_MIME_TYPE_PREFIXES = [
18 |     "text/",
19 |     "application/json",
20 |     "application/markdown",
21 | ]
22 | 
23 | ACCEPTED_FILE_EXTENSIONS = [
24 |     ".txt",
25 |     ".text",
26 |     ".md",
27 |     ".markdown",
28 |     ".json",
29 |     ".jsonl",
30 | ]
31 | 
32 | 
33 | class PlainTextConverter(DocumentConverter):
34 |     """Anything with content type text/plain"""
35 | 
36 |     def accepts(
37 |         self,
38 |         file_stream: BinaryIO,
39 |         stream_info: StreamInfo,
40 |         **kwargs: Any,  # Options to pass to the converter
41 |     ) -> bool:
42 |         mimetype = (stream_info.mimetype or "").lower()
43 |         extension = (stream_info.extension or "").lower()
44 | 
45 |         # If we have a charset, we can safely assume it's text
46 |         # With Magika in the earlier stages, this handles most cases
47 |         if stream_info.charset is not None:
48 |             return True
49 | 
50 |         # Otherwise, check the mimetype and extension
51 |         if extension in ACCEPTED_FILE_EXTENSIONS:
52 |             return True
53 | 
54 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
55 |             if mimetype.startswith(prefix):
56 |                 return True
57 | 
58 |         return False
59 | 
60 |     def convert(
61 |         self,
62 |         file_stream: BinaryIO,
63 |         stream_info: StreamInfo,
64 |         **kwargs: Any,  # Options to pass to the converter
65 |     ) -> DocumentConverterResult:
66 |         if stream_info.charset:
67 |             text_content = file_stream.read().decode(stream_info.charset)
68 |         else:
69 |             text_content = str(from_bytes(file_stream.read()).best())
70 | 
71 |         return DocumentConverterResult(markdown=text_content)
72 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_pdf_converter.py:
--------------------------------------------------------------------------------

```python
 1 | import sys
 2 | import io
 3 | 
 4 | from typing import BinaryIO, Any
 5 | 
 6 | 
 7 | from .._base_converter import DocumentConverter, DocumentConverterResult
 8 | from .._stream_info import StreamInfo
 9 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
10 | 
11 | 
12 | # Try loading optional (but in this case, required) dependencies
13 | # Save reporting of any exceptions for later
14 | _dependency_exc_info = None
15 | try:
16 |     import pdfminer
17 |     import pdfminer.high_level
18 | except ImportError:
19 |     # Preserve the error and stack trace for later
20 |     _dependency_exc_info = sys.exc_info()
21 | 
22 | 
23 | ACCEPTED_MIME_TYPE_PREFIXES = [
24 |     "application/pdf",
25 |     "application/x-pdf",
26 | ]
27 | 
28 | ACCEPTED_FILE_EXTENSIONS = [".pdf"]
29 | 
30 | 
31 | class PdfConverter(DocumentConverter):
32 |     """
33 |     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
34 |     """
35 | 
36 |     def accepts(
37 |         self,
38 |         file_stream: BinaryIO,
39 |         stream_info: StreamInfo,
40 |         **kwargs: Any,  # Options to pass to the converter
41 |     ) -> bool:
42 |         mimetype = (stream_info.mimetype or "").lower()
43 |         extension = (stream_info.extension or "").lower()
44 | 
45 |         if extension in ACCEPTED_FILE_EXTENSIONS:
46 |             return True
47 | 
48 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
49 |             if mimetype.startswith(prefix):
50 |                 return True
51 | 
52 |         return False
53 | 
54 |     def convert(
55 |         self,
56 |         file_stream: BinaryIO,
57 |         stream_info: StreamInfo,
58 |         **kwargs: Any,  # Options to pass to the converter
59 |     ) -> DocumentConverterResult:
60 |         # Check the dependencies
61 |         if _dependency_exc_info is not None:
62 |             raise MissingDependencyException(
63 |                 MISSING_DEPENDENCY_MESSAGE.format(
64 |                     converter=type(self).__name__,
65 |                     extension=".pdf",
66 |                     feature="pdf",
67 |                 )
68 |             ) from _dependency_exc_info[
69 |                 1
70 |             ].with_traceback(  # type: ignore[union-attr]
71 |                 _dependency_exc_info[2]
72 |             )
73 | 
74 |         assert isinstance(file_stream, io.IOBase)  # for mypy
75 |         return DocumentConverterResult(
76 |             markdown=pdfminer.high_level.extract_text(file_stream),
77 |         )
78 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_csv_converter.py:
--------------------------------------------------------------------------------

```python
 1 | import csv
 2 | import io
 3 | from typing import BinaryIO, Any
 4 | from charset_normalizer import from_bytes
 5 | from .._base_converter import DocumentConverter, DocumentConverterResult
 6 | from .._stream_info import StreamInfo
 7 | 
 8 | ACCEPTED_MIME_TYPE_PREFIXES = [
 9 |     "text/csv",
10 |     "application/csv",
11 | ]
12 | ACCEPTED_FILE_EXTENSIONS = [".csv"]
13 | 
14 | 
15 | class CsvConverter(DocumentConverter):
16 |     """
17 |     Converts CSV files to Markdown tables.
18 |     """
19 | 
20 |     def __init__(self):
21 |         super().__init__()
22 | 
23 |     def accepts(
24 |         self,
25 |         file_stream: BinaryIO,
26 |         stream_info: StreamInfo,
27 |         **kwargs: Any,  # Options to pass to the converter
28 |     ) -> bool:
29 |         mimetype = (stream_info.mimetype or "").lower()
30 |         extension = (stream_info.extension or "").lower()
31 |         if extension in ACCEPTED_FILE_EXTENSIONS:
32 |             return True
33 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
34 |             if mimetype.startswith(prefix):
35 |                 return True
36 |         return False
37 | 
38 |     def convert(
39 |         self,
40 |         file_stream: BinaryIO,
41 |         stream_info: StreamInfo,
42 |         **kwargs: Any,  # Options to pass to the converter
43 |     ) -> DocumentConverterResult:
44 |         # Read the file content
45 |         if stream_info.charset:
46 |             content = file_stream.read().decode(stream_info.charset)
47 |         else:
48 |             content = str(from_bytes(file_stream.read()).best())
49 | 
50 |         # Parse CSV content
51 |         reader = csv.reader(io.StringIO(content))
52 |         rows = list(reader)
53 | 
54 |         if not rows:
55 |             return DocumentConverterResult(markdown="")
56 | 
57 |         # Create markdown table
58 |         markdown_table = []
59 | 
60 |         # Add header row
61 |         markdown_table.append("| " + " | ".join(rows[0]) + " |")
62 | 
63 |         # Add separator row
64 |         markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
65 | 
66 |         # Add data rows
67 |         for row in rows[1:]:
68 |             # Make sure row has the same number of columns as header
69 |             while len(row) < len(rows[0]):
70 |                 row.append("")
71 |             # Truncate if row has more columns than header
72 |             row = row[: len(rows[0])]
73 |             markdown_table.append("| " + " | ".join(row) + " |")
74 | 
75 |         result = "\n".join(markdown_table)
76 | 
77 |         return DocumentConverterResult(markdown=result)
78 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_exceptions.py:
--------------------------------------------------------------------------------

```python
 1 | from typing import Optional, List, Any
 2 | 
 3 | MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
 4 | 
 5 | * pip install markitdown[{feature}]
 6 | * pip install markitdown[all]
 7 | * pip install markitdown[{feature}, ...]
 8 | * etc."""
 9 | 
10 | 
11 | class MarkItDownException(Exception):
12 |     """
13 |     Base exception class for MarkItDown.
14 |     """
15 | 
16 |     pass
17 | 
18 | 
19 | class MissingDependencyException(MarkItDownException):
20 |     """
21 |     Converters shipped with MarkItDown may depend on optional
22 |     dependencies. This exception is thrown when a converter's
23 |     convert() method is called, but the required dependency is not
24 |     installed. This is not necessarily a fatal error, as the converter
25 |     will simply be skipped (an error will bubble up only if no other
26 |     suitable converter is found).
27 | 
28 |     Error messages should clearly indicate which dependency is missing.
29 |     """
30 | 
31 |     pass
32 | 
33 | 
34 | class UnsupportedFormatException(MarkItDownException):
35 |     """
36 |     Thrown when no suitable converter was found for the given file.
37 |     """
38 | 
39 |     pass
40 | 
41 | 
42 | class FailedConversionAttempt(object):
43 |     """
44 |     Represents a single attempt to convert a file.
45 |     """
46 | 
47 |     def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
48 |         self.converter = converter
49 |         self.exc_info = exc_info
50 | 
51 | 
52 | class FileConversionException(MarkItDownException):
53 |     """
54 |     Thrown when a suitable converter was found, but the conversion
55 |     process fails for any reason.
56 |     """
57 | 
58 |     def __init__(
59 |         self,
60 |         message: Optional[str] = None,
61 |         attempts: Optional[List[FailedConversionAttempt]] = None,
62 |     ):
63 |         self.attempts = attempts
64 | 
65 |         if message is None:
66 |             if attempts is None:
67 |                 message = "File conversion failed."
68 |             else:
69 |                 message = f"File conversion failed after {len(attempts)} attempts:\n"
70 |                 for attempt in attempts:
71 |                     if attempt.exc_info is None:
72 |                         message += f" -  {type(attempt.converter).__name__} provided no execution info."
73 |                     else:
74 |                         message += f" - {type(attempt.converter).__name__} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
75 | 
76 |         super().__init__(message)
77 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_docx_converter.py:
--------------------------------------------------------------------------------

```python
 1 | import sys
 2 | import io
 3 | from warnings import warn
 4 | 
 5 | from typing import BinaryIO, Any
 6 | 
 7 | from ._html_converter import HtmlConverter
 8 | from ..converter_utils.docx.pre_process import pre_process_docx
 9 | from .._base_converter import DocumentConverterResult
10 | from .._stream_info import StreamInfo
11 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
12 | 
13 | # Try loading optional (but in this case, required) dependencies
14 | # Save reporting of any exceptions for later
15 | _dependency_exc_info = None
16 | try:
17 |     import mammoth
18 | 
19 | except ImportError:
20 |     # Preserve the error and stack trace for later
21 |     _dependency_exc_info = sys.exc_info()
22 | 
23 | 
24 | ACCEPTED_MIME_TYPE_PREFIXES = [
25 |     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
26 | ]
27 | 
28 | ACCEPTED_FILE_EXTENSIONS = [".docx"]
29 | 
30 | 
31 | class DocxConverter(HtmlConverter):
32 |     """
33 |     Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
34 |     """
35 | 
36 |     def __init__(self):
37 |         super().__init__()
38 |         self._html_converter = HtmlConverter()
39 | 
40 |     def accepts(
41 |         self,
42 |         file_stream: BinaryIO,
43 |         stream_info: StreamInfo,
44 |         **kwargs: Any,  # Options to pass to the converter
45 |     ) -> bool:
46 |         mimetype = (stream_info.mimetype or "").lower()
47 |         extension = (stream_info.extension or "").lower()
48 | 
49 |         if extension in ACCEPTED_FILE_EXTENSIONS:
50 |             return True
51 | 
52 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
53 |             if mimetype.startswith(prefix):
54 |                 return True
55 | 
56 |         return False
57 | 
58 |     def convert(
59 |         self,
60 |         file_stream: BinaryIO,
61 |         stream_info: StreamInfo,
62 |         **kwargs: Any,  # Options to pass to the converter
63 |     ) -> DocumentConverterResult:
64 |         # Check the dependencies
65 |         if _dependency_exc_info is not None:
66 |             raise MissingDependencyException(
67 |                 MISSING_DEPENDENCY_MESSAGE.format(
68 |                     converter=type(self).__name__,
69 |                     extension=".docx",
70 |                     feature="docx",
71 |                 )
72 |             ) from _dependency_exc_info[
73 |                 1
74 |             ].with_traceback(  # type: ignore[union-attr]
75 |                 _dependency_exc_info[2]
76 |             )
77 | 
78 |         style_map = kwargs.get("style_map", None)
79 |         pre_process_stream = pre_process_docx(file_stream)
80 |         return self._html_converter.convert_string(
81 |             mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
82 |             **kwargs,
83 |         )
84 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py:
--------------------------------------------------------------------------------

```python
 1 | import re
 2 | import bs4
 3 | from typing import Any, BinaryIO
 4 | 
 5 | from .._base_converter import DocumentConverter, DocumentConverterResult
 6 | from .._stream_info import StreamInfo
 7 | from ._markdownify import _CustomMarkdownify
 8 | 
 9 | ACCEPTED_MIME_TYPE_PREFIXES = [
10 |     "text/html",
11 |     "application/xhtml",
12 | ]
13 | 
14 | ACCEPTED_FILE_EXTENSIONS = [
15 |     ".html",
16 |     ".htm",
17 | ]
18 | 
19 | 
20 | class WikipediaConverter(DocumentConverter):
21 |     """Handle Wikipedia pages separately, focusing only on the main document content."""
22 | 
23 |     def accepts(
24 |         self,
25 |         file_stream: BinaryIO,
26 |         stream_info: StreamInfo,
27 |         **kwargs: Any,  # Options to pass to the converter
28 |     ) -> bool:
29 |         """
30 |         Make sure we're dealing with HTML content *from* Wikipedia.
31 |         """
32 | 
33 |         url = stream_info.url or ""
34 |         mimetype = (stream_info.mimetype or "").lower()
35 |         extension = (stream_info.extension or "").lower()
36 | 
37 |         if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
38 |             # Not a Wikipedia URL
39 |             return False
40 | 
41 |         if extension in ACCEPTED_FILE_EXTENSIONS:
42 |             return True
43 | 
44 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
45 |             if mimetype.startswith(prefix):
46 |                 return True
47 | 
48 |         # Not HTML content
49 |         return False
50 | 
51 |     def convert(
52 |         self,
53 |         file_stream: BinaryIO,
54 |         stream_info: StreamInfo,
55 |         **kwargs: Any,  # Options to pass to the converter
56 |     ) -> DocumentConverterResult:
57 |         # Parse the stream
58 |         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
59 |         soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
60 | 
61 |         # Remove javascript and style blocks
62 |         for script in soup(["script", "style"]):
63 |             script.extract()
64 | 
65 |         # Print only the main content
66 |         body_elm = soup.find("div", {"id": "mw-content-text"})
67 |         title_elm = soup.find("span", {"class": "mw-page-title-main"})
68 | 
69 |         webpage_text = ""
70 |         main_title = None if soup.title is None else soup.title.string
71 | 
72 |         if body_elm:
73 |             # What's the title
74 |             if title_elm and isinstance(title_elm, bs4.Tag):
75 |                 main_title = title_elm.string
76 | 
77 |             # Convert the page
78 |             webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
79 |                 **kwargs
80 |             ).convert_soup(body_elm)
81 |         else:
82 |             webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
83 | 
84 |         return DocumentConverterResult(
85 |             markdown=webpage_text,
86 |             title=main_title,
87 |         )
88 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/pyproject.toml:
--------------------------------------------------------------------------------

```toml
  1 | [build-system]
  2 | requires = ["hatchling"]
  3 | build-backend = "hatchling.build"
  4 | 
  5 | [project]
  6 | name = "markitdown"
  7 | dynamic = ["version"]
  8 | description = 'Utility tool for converting various files to Markdown'
  9 | readme = "README.md"
 10 | requires-python = ">=3.10"
 11 | license = "MIT"
 12 | keywords = []
 13 | authors = [
 14 |   { name = "Adam Fourney", email = "[email protected]" },
 15 | ]
 16 | classifiers = [
 17 |   "Development Status :: 4 - Beta",
 18 |   "Programming Language :: Python",
 19 |   "Programming Language :: Python :: 3.10",
 20 |   "Programming Language :: Python :: 3.11",
 21 |   "Programming Language :: Python :: 3.12",
 22 |   "Programming Language :: Python :: 3.13",
 23 |   "Programming Language :: Python :: Implementation :: CPython",
 24 |   "Programming Language :: Python :: Implementation :: PyPy",
 25 | ]
 26 | dependencies = [
 27 |   "beautifulsoup4",
 28 |   "requests",
 29 |   "markdownify",
 30 |   "magika~=0.6.1",
 31 |   "charset-normalizer",
 32 |   "defusedxml",
 33 |   "onnxruntime<=1.20.1; sys_platform == 'win32'",
 34 | ]
 35 | 
 36 | [project.optional-dependencies]
 37 | all = [
 38 |   "python-pptx",
 39 |   "mammoth~=1.11.0",
 40 |   "pandas",
 41 |   "openpyxl",
 42 |   "xlrd",
 43 |   "lxml",
 44 |   "pdfminer.six>=20251107",
 45 |   "olefile",
 46 |   "pydub",
 47 |   "SpeechRecognition",
 48 |   "youtube-transcript-api~=1.0.0",
 49 |   "azure-ai-documentintelligence",
 50 |   "azure-identity"
 51 | ]
 52 | pptx = ["python-pptx"]
 53 | docx = ["mammoth~=1.11.0", "lxml"]
 54 | xlsx = ["pandas", "openpyxl"]
 55 | xls = ["pandas", "xlrd"]
 56 | pdf = ["pdfminer.six"]
 57 | outlook = ["olefile"]
 58 | audio-transcription = ["pydub", "SpeechRecognition"]
 59 | youtube-transcription = ["youtube-transcript-api"]
 60 | az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
 61 | 
 62 | [project.urls]
 63 | Documentation = "https://github.com/microsoft/markitdown#readme"
 64 | Issues = "https://github.com/microsoft/markitdown/issues"
 65 | Source = "https://github.com/microsoft/markitdown"
 66 | 
 67 | [tool.hatch.version]
 68 | path = "src/markitdown/__about__.py"
 69 | 
 70 | [project.scripts]
 71 | markitdown = "markitdown.__main__:main"
 72 | 
 73 | [tool.hatch.envs.default]
 74 | features = ["all"]
 75 | 
 76 | [tool.hatch.envs.hatch-test]
 77 | features = ["all"]
 78 | extra-dependencies = [
 79 |   "openai",
 80 | ]
 81 | 
 82 | [tool.hatch.envs.types]
 83 | features = ["all"]
 84 | extra-dependencies = [
 85 |   "openai",
 86 |   "mypy>=1.0.0",
 87 | ]
 88 | 
 89 | [tool.hatch.envs.types.scripts]
 90 | check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
 91 | 
 92 | [tool.coverage.run]
 93 | source_pkgs = ["markitdown", "tests"]
 94 | branch = true
 95 | parallel = true
 96 | omit = [
 97 |   "src/markitdown/__about__.py",
 98 | ]
 99 | 
100 | [tool.coverage.paths]
101 | markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
102 | tests = ["tests", "*/markitdown/tests"]
103 | 
104 | [tool.coverage.report]
105 | exclude_lines = [
106 |   "no cov",
107 |   "if __name__ == .__main__.:",
108 |   "if TYPE_CHECKING:",
109 | ]
110 | 
111 | [tool.hatch.build.targets.sdist]
112 | only-include = ["src/markitdown"]
113 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_html_converter.py:
--------------------------------------------------------------------------------

```python
 1 | import io
 2 | from typing import Any, BinaryIO, Optional
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from .._base_converter import DocumentConverter, DocumentConverterResult
 6 | from .._stream_info import StreamInfo
 7 | from ._markdownify import _CustomMarkdownify
 8 | 
 9 | ACCEPTED_MIME_TYPE_PREFIXES = [
10 |     "text/html",
11 |     "application/xhtml",
12 | ]
13 | 
14 | ACCEPTED_FILE_EXTENSIONS = [
15 |     ".html",
16 |     ".htm",
17 | ]
18 | 
19 | 
20 | class HtmlConverter(DocumentConverter):
21 |     """Anything with content type text/html"""
22 | 
23 |     def accepts(
24 |         self,
25 |         file_stream: BinaryIO,
26 |         stream_info: StreamInfo,
27 |         **kwargs: Any,  # Options to pass to the converter
28 |     ) -> bool:
29 |         mimetype = (stream_info.mimetype or "").lower()
30 |         extension = (stream_info.extension or "").lower()
31 | 
32 |         if extension in ACCEPTED_FILE_EXTENSIONS:
33 |             return True
34 | 
35 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
36 |             if mimetype.startswith(prefix):
37 |                 return True
38 | 
39 |         return False
40 | 
41 |     def convert(
42 |         self,
43 |         file_stream: BinaryIO,
44 |         stream_info: StreamInfo,
45 |         **kwargs: Any,  # Options to pass to the converter
46 |     ) -> DocumentConverterResult:
47 |         # Parse the stream
48 |         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
49 |         soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
50 | 
51 |         # Remove javascript and style blocks
52 |         for script in soup(["script", "style"]):
53 |             script.extract()
54 | 
55 |         # Print only the main content
56 |         body_elm = soup.find("body")
57 |         webpage_text = ""
58 |         if body_elm:
59 |             webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
60 |         else:
61 |             webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
62 | 
63 |         assert isinstance(webpage_text, str)
64 | 
65 |         # remove leading and trailing \n
66 |         webpage_text = webpage_text.strip()
67 | 
68 |         return DocumentConverterResult(
69 |             markdown=webpage_text,
70 |             title=None if soup.title is None else soup.title.string,
71 |         )
72 | 
73 |     def convert_string(
74 |         self, html_content: str, *, url: Optional[str] = None, **kwargs
75 |     ) -> DocumentConverterResult:
76 |         """
77 |         Non-standard convenience method to convert a string to markdown.
78 |         Given that many converters produce HTML as intermediate output, this
79 |         allows for easy conversion of HTML to markdown.
80 |         """
81 |         return self.convert(
82 |             file_stream=io.BytesIO(html_content.encode("utf-8")),
83 |             stream_info=StreamInfo(
84 |                 mimetype="text/html",
85 |                 extension=".html",
86 |                 charset="utf-8",
87 |                 url=url,
88 |             ),
89 |             **kwargs,
90 |         )
91 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_audio_converter.py:
--------------------------------------------------------------------------------

```python
  1 | from typing import Any, BinaryIO
  2 | 
  3 | from ._exiftool import exiftool_metadata
  4 | from ._transcribe_audio import transcribe_audio
  5 | from .._base_converter import DocumentConverter, DocumentConverterResult
  6 | from .._stream_info import StreamInfo
  7 | from .._exceptions import MissingDependencyException
  8 | 
  9 | ACCEPTED_MIME_TYPE_PREFIXES = [  # accepts() prefix-matches the lowercased MIME type against these
 10 |     "audio/x-wav",
 11 |     "audio/mpeg",
 12 |     "video/mp4",
 13 | ]
 14 | 
 15 | ACCEPTED_FILE_EXTENSIONS = [  # accepts() looks up the lowercased extension (dot included) here
 16 |     ".wav",
 17 |     ".mp3",
 18 |     ".m4a",
 19 |     ".mp4",
 20 | ]
 21 | 
 22 | 
 23 | class AudioConverter(DocumentConverter):
 24 |     """
 25 |     Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
 26 |     """
 27 | 
 28 |     def accepts(
 29 |         self,
 30 |         file_stream: BinaryIO,
 31 |         stream_info: StreamInfo,
 32 |         **kwargs: Any,  # Options to pass to the converter
 33 |     ) -> bool:
 34 |         mimetype = (stream_info.mimetype or "").lower()
 35 |         extension = (stream_info.extension or "").lower()
 36 | 
 37 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 38 |             return True
 39 | 
 40 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 41 |             if mimetype.startswith(prefix):
 42 |                 return True
 43 | 
 44 |         return False
 45 | 
 46 |     def convert(
 47 |         self,
 48 |         file_stream: BinaryIO,
 49 |         stream_info: StreamInfo,
 50 |         **kwargs: Any,  # Options to pass to the converter
 51 |     ) -> DocumentConverterResult:
 52 |         md_content = ""
 53 | 
 54 |         # Add metadata
 55 |         metadata = exiftool_metadata(
 56 |             file_stream, exiftool_path=kwargs.get("exiftool_path")
 57 |         )
 58 |         if metadata:
 59 |             for f in [
 60 |                 "Title",
 61 |                 "Artist",
 62 |                 "Author",
 63 |                 "Band",
 64 |                 "Album",
 65 |                 "Genre",
 66 |                 "Track",
 67 |                 "DateTimeOriginal",
 68 |                 "CreateDate",
 69 |                 # "Duration", -- Wrong values when read from memory
 70 |                 "NumChannels",
 71 |                 "SampleRate",
 72 |                 "AvgBytesPerSec",
 73 |                 "BitsPerSample",
 74 |             ]:
 75 |                 if f in metadata:
 76 |                     md_content += f"{f}: {metadata[f]}\n"
 77 | 
 78 |         # Figure out the audio format for transcription
 79 |         if stream_info.extension == ".wav" or stream_info.mimetype == "audio/x-wav":
 80 |             audio_format = "wav"
 81 |         elif stream_info.extension == ".mp3" or stream_info.mimetype == "audio/mpeg":
 82 |             audio_format = "mp3"
 83 |         elif (
 84 |             stream_info.extension in [".mp4", ".m4a"]
 85 |             or stream_info.mimetype == "video/mp4"
 86 |         ):
 87 |             audio_format = "mp4"
 88 |         else:
 89 |             audio_format = None
 90 | 
 91 |         # Transcribe
 92 |         if audio_format:
 93 |             try:
 94 |                 transcript = transcribe_audio(file_stream, audio_format=audio_format)
 95 |                 if transcript:
 96 |                     md_content += "\n\n### Audio Transcript:\n" + transcript
 97 |             except MissingDependencyException:
 98 |                 pass
 99 | 
100 |         # Return the result
101 |         return DocumentConverterResult(markdown=md_content.strip())
102 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_ipynb_converter.py:
--------------------------------------------------------------------------------

```python
 1 | from typing import BinaryIO, Any
 2 | import json
 3 | 
 4 | from .._base_converter import DocumentConverter, DocumentConverterResult
 5 | from .._exceptions import FileConversionException
 6 | from .._stream_info import StreamInfo
 7 | 
 8 | CANDIDATE_MIME_TYPE_PREFIXES = [  # a match only makes accepts() inspect the content further
 9 |     "application/json",
10 | ]
11 | 
12 | ACCEPTED_FILE_EXTENSIONS = [".ipynb"]  # accepted by accepts() without inspecting the content
13 | 
14 | 
15 | class IpynbConverter(DocumentConverter):
16 |     """Converts Jupyter Notebook (.ipynb) files to Markdown."""
17 | 
18 |     def accepts(
19 |         self,
20 |         file_stream: BinaryIO,
21 |         stream_info: StreamInfo,
22 |         **kwargs: Any,  # Options to pass to the converter
23 |     ) -> bool:
24 |         mimetype = (stream_info.mimetype or "").lower()
25 |         extension = (stream_info.extension or "").lower()
26 | 
27 |         if extension in ACCEPTED_FILE_EXTENSIONS:
28 |             return True
29 | 
30 |         for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
31 |             if mimetype.startswith(prefix):
32 |                 # Read further to see if it's a notebook
33 |                 cur_pos = file_stream.tell()
34 |                 try:
35 |                     encoding = stream_info.charset or "utf-8"
36 |                     notebook_content = file_stream.read().decode(encoding)
37 |                     return (
38 |                         "nbformat" in notebook_content
39 |                         and "nbformat_minor" in notebook_content
40 |                     )
41 |                 finally:
42 |                     file_stream.seek(cur_pos)
43 | 
44 |         return False
45 | 
46 |     def convert(
47 |         self,
48 |         file_stream: BinaryIO,
49 |         stream_info: StreamInfo,
50 |         **kwargs: Any,  # Options to pass to the converter
51 |     ) -> DocumentConverterResult:
52 |         # Parse and convert the notebook
53 |         encoding = stream_info.charset or "utf-8"
54 |         notebook_content = file_stream.read().decode(encoding=encoding)
55 |         return self._convert(json.loads(notebook_content))
56 | 
57 |     def _convert(self, notebook_content: dict) -> DocumentConverterResult:
58 |         """Helper function that converts notebook JSON content to Markdown."""
59 |         try:
60 |             md_output = []
61 |             title = None
62 | 
63 |             for cell in notebook_content.get("cells", []):
64 |                 cell_type = cell.get("cell_type", "")
65 |                 source_lines = cell.get("source", [])
66 | 
67 |                 if cell_type == "markdown":
68 |                     md_output.append("".join(source_lines))
69 | 
70 |                     # Extract the first # heading as title if not already found
71 |                     if title is None:
72 |                         for line in source_lines:
73 |                             if line.startswith("# "):
74 |                                 title = line.lstrip("# ").strip()
75 |                                 break
76 | 
77 |                 elif cell_type == "code":
78 |                     # Code cells are wrapped in Markdown code blocks
79 |                     md_output.append(f"```python\n{''.join(source_lines)}\n```")
80 |                 elif cell_type == "raw":
81 |                     md_output.append(f"```\n{''.join(source_lines)}\n```")
82 | 
83 |             md_text = "\n\n".join(md_output)
84 | 
85 |             # Check for title in notebook metadata
86 |             title = notebook_content.get("metadata", {}).get("title", title)
87 | 
88 |             return DocumentConverterResult(
89 |                 markdown=md_text,
90 |                 title=title,
91 |             )
92 | 
93 |         except Exception as e:
94 |             raise FileConversionException(
95 |                 f"Error converting .ipynb file: {str(e)}"
96 |             ) from e
97 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_zip_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import zipfile
  2 | import io
  3 | import os
  4 | 
  5 | from typing import BinaryIO, Any, TYPE_CHECKING
  6 | 
  7 | from .._base_converter import DocumentConverter, DocumentConverterResult
  8 | from .._stream_info import StreamInfo
  9 | from .._exceptions import UnsupportedFormatException, FileConversionException
 10 | 
 11 | # Break otherwise circular import for type hinting
 12 | if TYPE_CHECKING:
 13 |     from .._markitdown import MarkItDown
 14 | 
 15 | ACCEPTED_MIME_TYPE_PREFIXES = [  # accepts() prefix-matches the lowercased MIME type against these
 16 |     "application/zip",
 17 | ]
 18 | 
 19 | ACCEPTED_FILE_EXTENSIONS = [".zip"]  # accepts() looks up the lowercased extension here
 20 | 
 21 | 
 22 | class ZipConverter(DocumentConverter):
 23 |     """Converts ZIP files to markdown by extracting and converting all contained files.
 24 | 
 25 |     The converter extracts the ZIP contents to a temporary directory, processes each file
 26 |     using appropriate converters based on file extensions, and then combines the results
 27 |     into a single markdown document. The temporary directory is cleaned up after processing.
 28 | 
 29 |     Example output format:
 30 |     ```markdown
 31 |     Content from the zip file `example.zip`:
 32 | 
 33 |     ## File: docs/readme.txt
 34 | 
 35 |     This is the content of readme.txt
 36 |     Multiple lines are preserved
 37 | 
 38 |     ## File: images/example.jpg
 39 | 
 40 |     ImageSize: 1920x1080
 41 |     DateTimeOriginal: 2024-02-15 14:30:00
 42 |     Description: A beautiful landscape photo
 43 | 
 44 |     ## File: data/report.xlsx
 45 | 
 46 |     ## Sheet1
 47 |     | Column1 | Column2 | Column3 |
 48 |     |---------|---------|---------|
 49 |     | data1   | data2   | data3   |
 50 |     | data4   | data5   | data6   |
 51 |     ```
 52 | 
 53 |     Key features:
 54 |     - Maintains original file structure in headings
 55 |     - Processes nested files recursively
 56 |     - Uses appropriate converters for each file type
 57 |     - Preserves formatting of converted content
 58 |     - Cleans up temporary files after processing
 59 |     """
 60 | 
 61 |     def __init__(
 62 |         self,
 63 |         *,
 64 |         markitdown: "MarkItDown",
 65 |     ):
 66 |         super().__init__()
 67 |         self._markitdown = markitdown
 68 | 
 69 |     def accepts(
 70 |         self,
 71 |         file_stream: BinaryIO,
 72 |         stream_info: StreamInfo,
 73 |         **kwargs: Any,  # Options to pass to the converter
 74 |     ) -> bool:
 75 |         mimetype = (stream_info.mimetype or "").lower()
 76 |         extension = (stream_info.extension or "").lower()
 77 | 
 78 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 79 |             return True
 80 | 
 81 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 82 |             if mimetype.startswith(prefix):
 83 |                 return True
 84 | 
 85 |         return False
 86 | 
 87 |     def convert(
 88 |         self,
 89 |         file_stream: BinaryIO,
 90 |         stream_info: StreamInfo,
 91 |         **kwargs: Any,  # Options to pass to the converter
 92 |     ) -> DocumentConverterResult:
 93 |         file_path = stream_info.url or stream_info.local_path or stream_info.filename
 94 |         md_content = f"Content from the zip file `{file_path}`:\n\n"
 95 | 
 96 |         with zipfile.ZipFile(file_stream, "r") as zipObj:
 97 |             for name in zipObj.namelist():
 98 |                 try:
 99 |                     z_file_stream = io.BytesIO(zipObj.read(name))
100 |                     z_file_stream_info = StreamInfo(
101 |                         extension=os.path.splitext(name)[1],
102 |                         filename=os.path.basename(name),
103 |                     )
104 |                     result = self._markitdown.convert_stream(
105 |                         stream=z_file_stream,
106 |                         stream_info=z_file_stream_info,
107 |                     )
108 |                     if result is not None:
109 |                         md_content += f"## File: {name}\n\n"
110 |                         md_content += result.markdown + "\n\n"
111 |                 except UnsupportedFormatException:
112 |                     pass
113 |                 except FileConversionException:
114 |                     pass
115 | 
116 |         return DocumentConverterResult(markdown=md_content.strip())
117 | 
```

--------------------------------------------------------------------------------
/packages/markitdown-mcp/src/markitdown_mcp/__main__.py:
--------------------------------------------------------------------------------

```python
  1 | import contextlib
  2 | import sys
  3 | import os
  4 | from collections.abc import AsyncIterator
  5 | from mcp.server.fastmcp import FastMCP
  6 | from starlette.applications import Starlette
  7 | from mcp.server.sse import SseServerTransport
  8 | from starlette.requests import Request
  9 | from starlette.routing import Mount, Route
 10 | from starlette.types import Receive, Scope, Send
 11 | from mcp.server import Server
 12 | from mcp.server.streamable_http_manager import StreamableHTTPSessionManager
 13 | from markitdown import MarkItDown
 14 | import uvicorn
 15 | 
 16 | # FastMCP server for MarkItDown; tools are attached via @mcp.tool() and main() serves it over STDIO or HTTP/SSE
 17 | mcp = FastMCP("markitdown")
 18 | 
 19 | 
 20 | @mcp.tool()
 21 | async def convert_to_markdown(uri: str) -> str:
 22 |     """Convert a resource described by an http:, https:, file: or data: URI to markdown"""
 23 |     return MarkItDown(enable_plugins=check_plugins_enabled()).convert_uri(uri).markdown  # new MarkItDown per call; plugin flag re-read from the environment each time
 24 | 
 25 | 
 26 | def check_plugins_enabled() -> bool:
 27 |     return os.getenv("MARKITDOWN_ENABLE_PLUGINS", "false").strip().lower() in (
 28 |         "true",
 29 |         "1",
 30 |         "yes",
 31 |     )
 32 | 
 33 | 
 34 | def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette:
 35 |     sse = SseServerTransport("/messages/")
 36 |     session_manager = StreamableHTTPSessionManager(
 37 |         app=mcp_server,
 38 |         event_store=None,
 39 |         json_response=True,
 40 |         stateless=True,
 41 |     )
 42 | 
 43 |     async def handle_sse(request: Request) -> None:
 44 |         async with sse.connect_sse(
 45 |             request.scope,
 46 |             request.receive,
 47 |             request._send,
 48 |         ) as (read_stream, write_stream):
 49 |             await mcp_server.run(
 50 |                 read_stream,
 51 |                 write_stream,
 52 |                 mcp_server.create_initialization_options(),
 53 |             )
 54 | 
 55 |     async def handle_streamable_http(
 56 |         scope: Scope, receive: Receive, send: Send
 57 |     ) -> None:
 58 |         await session_manager.handle_request(scope, receive, send)
 59 | 
 60 |     @contextlib.asynccontextmanager
 61 |     async def lifespan(app: Starlette) -> AsyncIterator[None]:
 62 |         """Context manager for session manager."""
 63 |         async with session_manager.run():
 64 |             print("Application started with StreamableHTTP session manager!")
 65 |             try:
 66 |                 yield
 67 |             finally:
 68 |                 print("Application shutting down...")
 69 | 
 70 |     return Starlette(
 71 |         debug=debug,
 72 |         routes=[
 73 |             Route("/sse", endpoint=handle_sse),
 74 |             Mount("/mcp", app=handle_streamable_http),
 75 |             Mount("/messages/", app=sse.handle_post_message),
 76 |         ],
 77 |         lifespan=lifespan,
 78 |     )
 79 | 
 80 | 
 81 | # Main entry point
 82 | def main():
 83 |     import argparse
 84 | 
 85 |     mcp_server = mcp._mcp_server
 86 | 
 87 |     parser = argparse.ArgumentParser(description="Run a MarkItDown MCP server")
 88 | 
 89 |     parser.add_argument(
 90 |         "--http",
 91 |         action="store_true",
 92 |         help="Run the server with Streamable HTTP and SSE transport rather than STDIO (default: False)",
 93 |     )
 94 |     parser.add_argument(
 95 |         "--sse",
 96 |         action="store_true",
 97 |         help="(Deprecated) An alias for --http (default: False)",
 98 |     )
 99 |     parser.add_argument(
100 |         "--host", default=None, help="Host to bind to (default: 127.0.0.1)"
101 |     )
102 |     parser.add_argument(
103 |         "--port", type=int, default=None, help="Port to listen on (default: 3001)"
104 |     )
105 |     args = parser.parse_args()
106 | 
107 |     use_http = args.http or args.sse
108 | 
109 |     if not use_http and (args.host or args.port):
110 |         parser.error(
111 |             "Host and port arguments are only valid when using streamable HTTP or SSE transport (see: --http)."
112 |         )
113 |         sys.exit(1)
114 | 
115 |     if use_http:
116 |         starlette_app = create_starlette_app(mcp_server, debug=True)
117 |         uvicorn.run(
118 |             starlette_app,
119 |             host=args.host if args.host else "127.0.0.1",
120 |             port=args.port if args.port else 3001,
121 |         )
122 |     else:
123 |         mcp.run()
124 | 
125 | 
126 | if __name__ == "__main__":  # only start the server when this module is run as a script
127 |     main()
128 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | import base64
  3 | import binascii
  4 | from urllib.parse import parse_qs, urlparse
  5 | from typing import Any, BinaryIO
  6 | from bs4 import BeautifulSoup
  7 | 
  8 | from .._base_converter import DocumentConverter, DocumentConverterResult
  9 | from .._stream_info import StreamInfo
 10 | from ._markdownify import _CustomMarkdownify
 11 | 
 12 | ACCEPTED_MIME_TYPE_PREFIXES = [  # accepts() prefix-matches the lowercased MIME type against these
 13 |     "text/html",
 14 |     "application/xhtml",
 15 | ]
 16 | 
 17 | ACCEPTED_FILE_EXTENSIONS = [  # accepts() looks up the lowercased extension (dot included) here
 18 |     ".html",
 19 |     ".htm",
 20 | ]
 21 | 
 22 | 
 23 | class BingSerpConverter(DocumentConverter):
 24 |     """
 25 |     Handle Bing results pages (only the organic search results).
 26 |     NOTE: It is better to use the Bing API
 27 |     """
 28 | 
 29 |     def accepts(
 30 |         self,
 31 |         file_stream: BinaryIO,
 32 |         stream_info: StreamInfo,
 33 |         **kwargs: Any,  # Options to pass to the converter
 34 |     ) -> bool:
 35 |         """
 36 |         Make sure we're dealing with HTML content *from* Bing.
 37 |         """
 38 | 
 39 |         url = stream_info.url or ""
 40 |         mimetype = (stream_info.mimetype or "").lower()
 41 |         extension = (stream_info.extension or "").lower()
 42 | 
 43 |         if not re.search(r"^https://www\.bing\.com/search\?q=", url):
 44 |             # Not a Bing SERP URL
 45 |             return False
 46 | 
 47 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 48 |             return True
 49 | 
 50 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 51 |             if mimetype.startswith(prefix):
 52 |                 return True
 53 | 
 54 |         # Not HTML content
 55 |         return False
 56 | 
 57 |     def convert(
 58 |         self,
 59 |         file_stream: BinaryIO,
 60 |         stream_info: StreamInfo,
 61 |         **kwargs: Any,  # Options to pass to the converter
 62 |     ) -> DocumentConverterResult:
 63 |         assert stream_info.url is not None
 64 | 
 65 |         # Parse the query parameters
 66 |         parsed_params = parse_qs(urlparse(stream_info.url).query)
 67 |         query = parsed_params.get("q", [""])[0]
 68 | 
 69 |         # Parse the stream
 70 |         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
 71 |         soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
 72 | 
 73 |         # Clean up some formatting
 74 |         for tptt in soup.find_all(class_="tptt"):
 75 |             if hasattr(tptt, "string") and tptt.string:
 76 |                 tptt.string += " "
 77 |         for slug in soup.find_all(class_="algoSlug_icon"):
 78 |             slug.extract()
 79 | 
 80 |         # Parse the algorithmic results
 81 |         _markdownify = _CustomMarkdownify(**kwargs)
 82 |         results = list()
 83 |         for result in soup.find_all(class_="b_algo"):
 84 |             if not hasattr(result, "find_all"):
 85 |                 continue
 86 | 
 87 |             # Rewrite redirect urls
 88 |             for a in result.find_all("a", href=True):
 89 |                 parsed_href = urlparse(a["href"])
 90 |                 qs = parse_qs(parsed_href.query)
 91 | 
 92 |                 # The destination is contained in the u parameter,
 93 |                 # but appears to be base64 encoded, with some prefix
 94 |                 if "u" in qs:
 95 |                     u = (
 96 |                         qs["u"][0][2:].strip() + "=="
 97 |                     )  # Python 3 doesn't care about extra padding
 98 | 
 99 |                     try:
100 |                         # RFC 4648 / Base64URL" variant, which uses "-" and "_"
101 |                         a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
102 |                     except UnicodeDecodeError:
103 |                         pass
104 |                     except binascii.Error:
105 |                         pass
106 | 
107 |             # Convert to markdown
108 |             md_result = _markdownify.convert_soup(result).strip()
109 |             lines = [line.strip() for line in re.split(r"\n+", md_result)]
110 |             results.append("\n".join([line for line in lines if len(line) > 0]))
111 | 
112 |         webpage_text = (
113 |             f"## A Bing search for '{query}' found the following results:\n\n"
114 |             + "\n\n".join(results)
115 |         )
116 | 
117 |         return DocumentConverterResult(
118 |             markdown=webpage_text,
119 |             title=None if soup.title is None else soup.title.string,
120 |         )
121 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_image_converter.py:
--------------------------------------------------------------------------------

```python
  1 | from typing import BinaryIO, Any, Union
  2 | import base64
  3 | import mimetypes
  4 | from ._exiftool import exiftool_metadata
  5 | from .._base_converter import DocumentConverter, DocumentConverterResult
  6 | from .._stream_info import StreamInfo
  7 | 
  8 | ACCEPTED_MIME_TYPE_PREFIXES = [  # accepts() prefix-matches the lowercased MIME type against these
  9 |     "image/jpeg",
 10 |     "image/png",
 11 | ]
 12 | 
 13 | ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]  # accepts() looks up the lowercased extension here
 14 | 
 15 | 
 16 | class ImageConverter(DocumentConverter):
 17 |     """
 18 |     Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
 19 |     """
 20 | 
 21 |     def accepts(
 22 |         self,
 23 |         file_stream: BinaryIO,
 24 |         stream_info: StreamInfo,
 25 |         **kwargs: Any,
 26 |     ) -> bool:
 27 |         mimetype = (stream_info.mimetype or "").lower()
 28 |         extension = (stream_info.extension or "").lower()
 29 | 
 30 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 31 |             return True
 32 | 
 33 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 34 |             if mimetype.startswith(prefix):
 35 |                 return True
 36 | 
 37 |         return False
 38 | 
 39 |     def convert(
 40 |         self,
 41 |         file_stream: BinaryIO,
 42 |         stream_info: StreamInfo,
 43 |         **kwargs: Any,  # Options to pass to the converter
 44 |     ) -> DocumentConverterResult:
 45 |         md_content = ""
 46 | 
 47 |         # Add metadata
 48 |         metadata = exiftool_metadata(
 49 |             file_stream, exiftool_path=kwargs.get("exiftool_path")
 50 |         )
 51 | 
 52 |         if metadata:
 53 |             for f in [
 54 |                 "ImageSize",
 55 |                 "Title",
 56 |                 "Caption",
 57 |                 "Description",
 58 |                 "Keywords",
 59 |                 "Artist",
 60 |                 "Author",
 61 |                 "DateTimeOriginal",
 62 |                 "CreateDate",
 63 |                 "GPSPosition",
 64 |             ]:
 65 |                 if f in metadata:
 66 |                     md_content += f"{f}: {metadata[f]}\n"
 67 | 
 68 |         # Try describing the image with GPT
 69 |         llm_client = kwargs.get("llm_client")
 70 |         llm_model = kwargs.get("llm_model")
 71 |         if llm_client is not None and llm_model is not None:
 72 |             llm_description = self._get_llm_description(
 73 |                 file_stream,
 74 |                 stream_info,
 75 |                 client=llm_client,
 76 |                 model=llm_model,
 77 |                 prompt=kwargs.get("llm_prompt"),
 78 |             )
 79 | 
 80 |             if llm_description is not None:
 81 |                 md_content += "\n# Description:\n" + llm_description.strip() + "\n"
 82 | 
 83 |         return DocumentConverterResult(
 84 |             markdown=md_content,
 85 |         )
 86 | 
 87 |     def _get_llm_description(
 88 |         self,
 89 |         file_stream: BinaryIO,
 90 |         stream_info: StreamInfo,
 91 |         *,
 92 |         client,
 93 |         model,
 94 |         prompt=None,
 95 |     ) -> Union[None, str]:
 96 |         if prompt is None or prompt.strip() == "":
 97 |             prompt = "Write a detailed caption for this image."
 98 | 
 99 |         # Get the content type
100 |         content_type = stream_info.mimetype
101 |         if not content_type:
102 |             content_type, _ = mimetypes.guess_type(
103 |                 "_dummy" + (stream_info.extension or "")
104 |             )
105 |         if not content_type:
106 |             content_type = "application/octet-stream"
107 | 
108 |         # Convert to base64
109 |         cur_pos = file_stream.tell()
110 |         try:
111 |             base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
112 |         except Exception as e:
113 |             return None
114 |         finally:
115 |             file_stream.seek(cur_pos)
116 | 
117 |         # Prepare the data-uri
118 |         data_uri = f"data:{content_type};base64,{base64_image}"
119 | 
120 |         # Prepare the OpenAI API request
121 |         messages = [
122 |             {
123 |                 "role": "user",
124 |                 "content": [
125 |                     {"type": "text", "text": prompt},
126 |                     {
127 |                         "type": "image_url",
128 |                         "image_url": {
129 |                             "url": data_uri,
130 |                         },
131 |                     },
132 |                 ],
133 |             }
134 |         ]
135 | 
136 |         # Call the OpenAI API
137 |         response = client.chat.completions.create(model=model, messages=messages)
138 |         return response.choices[0].message.content
139 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_markdownify.py:
--------------------------------------------------------------------------------

```python
  1 | import re
  2 | import markdownify
  3 | 
  4 | from typing import Any, Optional
  5 | from urllib.parse import quote, unquote, urlparse, urlunparse
  6 | 
  7 | 
  8 | class _CustomMarkdownify(markdownify.MarkdownConverter):
  9 |     """
 10 |     A custom version of markdownify's MarkdownConverter. Changes include:
 11 | 
 12 |     - Altering the default heading style to use '#', '##', etc.
 13 |     - Removing javascript hyperlinks.
 14 |     - Truncating images with large data:uri sources.
 15 |     - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
 16 |     """
 17 | 
 18 |     def __init__(self, **options: Any):
 19 |         options["heading_style"] = options.get("heading_style", markdownify.ATX)
 20 |         options["keep_data_uris"] = options.get("keep_data_uris", False)
 21 |         # Explicitly cast options to the expected type if necessary
 22 |         super().__init__(**options)
 23 | 
 24 |     def convert_hn(
 25 |         self,
 26 |         n: int,
 27 |         el: Any,
 28 |         text: str,
 29 |         convert_as_inline: Optional[bool] = False,
 30 |         **kwargs,
 31 |     ) -> str:
 32 |         """Same as usual, but be sure to start with a new line"""
 33 |         if not convert_as_inline:
 34 |             if not re.search(r"^\n", text):
 35 |                 return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
 36 | 
 37 |         return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
 38 | 
 39 |     def convert_a(
 40 |         self,
 41 |         el: Any,
 42 |         text: str,
 43 |         convert_as_inline: Optional[bool] = False,
 44 |         **kwargs,
 45 |     ):
 46 |         """Same as usual converter, but removes Javascript links and escapes URIs."""
 47 |         prefix, suffix, text = markdownify.chomp(text)  # type: ignore
 48 |         if not text:
 49 |             return ""
 50 | 
 51 |         if el.find_parent("pre") is not None:
 52 |             return text
 53 | 
 54 |         href = el.get("href")
 55 |         title = el.get("title")
 56 | 
 57 |         # Escape URIs and skip non-http or file schemes
 58 |         if href:
 59 |             try:
 60 |                 parsed_url = urlparse(href)  # type: ignore
 61 |                 if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
 62 |                     return "%s%s%s" % (prefix, text, suffix)
 63 |                 href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
 64 |             except ValueError:  # It's not clear if this ever gets thrown
 65 |                 return "%s%s%s" % (prefix, text, suffix)
 66 | 
 67 |         # For the replacement see #29: text nodes underscores are escaped
 68 |         if (
 69 |             self.options["autolinks"]
 70 |             and text.replace(r"\_", "_") == href
 71 |             and not title
 72 |             and not self.options["default_title"]
 73 |         ):
 74 |             # Shortcut syntax
 75 |             return "<%s>" % href
 76 |         if self.options["default_title"] and not title:
 77 |             title = href
 78 |         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
 79 |         return (
 80 |             "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
 81 |             if href
 82 |             else text
 83 |         )
 84 | 
 85 |     def convert_img(
 86 |         self,
 87 |         el: Any,
 88 |         text: str,
 89 |         convert_as_inline: Optional[bool] = False,
 90 |         **kwargs,
 91 |     ) -> str:
 92 |         """Same as usual converter, but removes data URIs"""
 93 | 
 94 |         alt = el.attrs.get("alt", None) or ""
 95 |         src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
 96 |         title = el.attrs.get("title", None) or ""
 97 |         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
 98 |         # Remove all line breaks from alt
 99 |         alt = alt.replace("\n", " ")
100 |         if (
101 |             convert_as_inline
102 |             and el.parent.name not in self.options["keep_inline_images_in"]
103 |         ):
104 |             return alt
105 | 
106 |         # Remove dataURIs
107 |         if src.startswith("data:") and not self.options["keep_data_uris"]:
108 |             src = src.split(",")[0] + "..."
109 | 
110 |         return "![%s](%s%s)" % (alt, src, title_part)
111 | 
112 |     def convert_input(
113 |         self,
114 |         el: Any,
115 |         text: str,
116 |         convert_as_inline: Optional[bool] = False,
117 |         **kwargs,
118 |     ) -> str:
119 |         """Convert checkboxes to Markdown [x]/[ ] syntax."""
120 | 
121 |         if el.get("type") == "checkbox":
122 |             return "[x] " if el.has_attr("checked") else "[ ] "
123 |         return ""
124 | 
125 |     def convert_soup(self, soup: Any) -> str:
126 |         return super().convert_soup(soup)  # type: ignore
127 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_base_converter.py:
--------------------------------------------------------------------------------

```python
  1 | from typing import Any, BinaryIO, Optional
  2 | from ._stream_info import StreamInfo
  3 | 
  4 | 
  5 | class DocumentConverterResult:
  6 |     """The result of converting a document to Markdown."""
  7 | 
  8 |     def __init__(
  9 |         self,
 10 |         markdown: str,
 11 |         *,
 12 |         title: Optional[str] = None,
 13 |     ):
 14 |         """
 15 |         Initialize the DocumentConverterResult.
 16 | 
 17 |         The only required parameter is the converted Markdown text.
 18 |         The title, and any other metadata that may be added in the future, are optional.
 19 | 
 20 |         Parameters:
 21 |         - markdown: The converted Markdown text.
 22 |         - title: Optional title of the document.
 23 |         """
 24 |         self.markdown = markdown
 25 |         self.title = title
 26 | 
 27 |     @property
 28 |     def text_content(self) -> str:
 29 |         """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
 30 |         return self.markdown
 31 | 
 32 |     @text_content.setter
 33 |     def text_content(self, markdown: str):
 34 |         """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
 35 |         self.markdown = markdown
 36 | 
 37 |     def __str__(self) -> str:
 38 |         """Return the converted Markdown text."""
 39 |         return self.markdown
 40 | 
 41 | 
class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return a quick determination of whether the converter should attempt converting the document.
        This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
        In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
        make a determination (e.g., special converters for Wikipedia, YouTube etc).
        Finally, it is conceivable that the `stream_info.filename` might be used in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc).

        NOTE: The method signature is designed to match that of the convert() method. This provides some
        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.

        IMPORTANT: In rare cases (e.g., OutlookMsgConverter), we need to read more from the stream to make a final
        determination. Read operations inevitably advance the position in file_stream. In these cases, the position
        MUST be reset before returning. This is because the convert() method may be called immediately
        after accepts(), and will expect the file_stream to be at the original position.

        E.g.,
        cur_pos = file_stream.tell() # Save the current position
        data = file_stream.read(100) # ... peek at the first 100 bytes, etc.
        file_stream.seek(cur_pos)    # Reset the position to the original position

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - bool: True if the converter can handle the document, False otherwise.

        Raises:
        - NotImplementedError: Always, in this base class; subclasses must override.
        """
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a document to Markdown text.

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.

        Raises:
        - NotImplementedError: Always, in this base class; subclasses must override.
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        """
        raise NotImplementedError("Subclasses must implement this method")
106 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_xlsx_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import sys
  2 | from typing import BinaryIO, Any
  3 | from ._html_converter import HtmlConverter
  4 | from .._base_converter import DocumentConverter, DocumentConverterResult
  5 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
  6 | from .._stream_info import StreamInfo
  7 | 
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
# The .xlsx path needs pandas plus openpyxl. On failure the exc_info tuple is
# kept so XlsxConverter.convert() can chain it onto MissingDependencyException.
_xlsx_dependency_exc_info = None
try:
    import pandas as pd
    import openpyxl  # noqa: F401
except ImportError:
    _xlsx_dependency_exc_info = sys.exc_info()

# The .xls path is probed separately (pandas plus xlrd), so one format can be
# available while the other is not.
_xls_dependency_exc_info = None
try:
    import pandas as pd  # noqa: F811
    import xlrd  # noqa: F401
except ImportError:
    _xls_dependency_exc_info = sys.exc_info()

# MIME type prefixes and file extensions XlsxConverter.accepts() matches against.
ACCEPTED_XLSX_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
]
ACCEPTED_XLSX_FILE_EXTENSIONS = [".xlsx"]

# MIME type prefixes and file extensions XlsConverter.accepts() matches against.
ACCEPTED_XLS_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-excel",
    "application/excel",
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]
 34 | 
 35 | 
 36 | class XlsxConverter(DocumentConverter):
 37 |     """
 38 |     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
 39 |     """
 40 | 
 41 |     def __init__(self):
 42 |         super().__init__()
 43 |         self._html_converter = HtmlConverter()
 44 | 
 45 |     def accepts(
 46 |         self,
 47 |         file_stream: BinaryIO,
 48 |         stream_info: StreamInfo,
 49 |         **kwargs: Any,  # Options to pass to the converter
 50 |     ) -> bool:
 51 |         mimetype = (stream_info.mimetype or "").lower()
 52 |         extension = (stream_info.extension or "").lower()
 53 | 
 54 |         if extension in ACCEPTED_XLSX_FILE_EXTENSIONS:
 55 |             return True
 56 | 
 57 |         for prefix in ACCEPTED_XLSX_MIME_TYPE_PREFIXES:
 58 |             if mimetype.startswith(prefix):
 59 |                 return True
 60 | 
 61 |         return False
 62 | 
 63 |     def convert(
 64 |         self,
 65 |         file_stream: BinaryIO,
 66 |         stream_info: StreamInfo,
 67 |         **kwargs: Any,  # Options to pass to the converter
 68 |     ) -> DocumentConverterResult:
 69 |         # Check the dependencies
 70 |         if _xlsx_dependency_exc_info is not None:
 71 |             raise MissingDependencyException(
 72 |                 MISSING_DEPENDENCY_MESSAGE.format(
 73 |                     converter=type(self).__name__,
 74 |                     extension=".xlsx",
 75 |                     feature="xlsx",
 76 |                 )
 77 |             ) from _xlsx_dependency_exc_info[
 78 |                 1
 79 |             ].with_traceback(  # type: ignore[union-attr]
 80 |                 _xlsx_dependency_exc_info[2]
 81 |             )
 82 | 
 83 |         sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
 84 |         md_content = ""
 85 |         for s in sheets:
 86 |             md_content += f"## {s}\n"
 87 |             html_content = sheets[s].to_html(index=False)
 88 |             md_content += (
 89 |                 self._html_converter.convert_string(
 90 |                     html_content, **kwargs
 91 |                 ).markdown.strip()
 92 |                 + "\n\n"
 93 |             )
 94 | 
 95 |         return DocumentConverterResult(markdown=md_content.strip())
 96 | 
 97 | 
 98 | class XlsConverter(DocumentConverter):
 99 |     """
100 |     Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
101 |     """
102 | 
103 |     def __init__(self):
104 |         super().__init__()
105 |         self._html_converter = HtmlConverter()
106 | 
107 |     def accepts(
108 |         self,
109 |         file_stream: BinaryIO,
110 |         stream_info: StreamInfo,
111 |         **kwargs: Any,  # Options to pass to the converter
112 |     ) -> bool:
113 |         mimetype = (stream_info.mimetype or "").lower()
114 |         extension = (stream_info.extension or "").lower()
115 | 
116 |         if extension in ACCEPTED_XLS_FILE_EXTENSIONS:
117 |             return True
118 | 
119 |         for prefix in ACCEPTED_XLS_MIME_TYPE_PREFIXES:
120 |             if mimetype.startswith(prefix):
121 |                 return True
122 | 
123 |         return False
124 | 
125 |     def convert(
126 |         self,
127 |         file_stream: BinaryIO,
128 |         stream_info: StreamInfo,
129 |         **kwargs: Any,  # Options to pass to the converter
130 |     ) -> DocumentConverterResult:
131 |         # Load the dependencies
132 |         if _xls_dependency_exc_info is not None:
133 |             raise MissingDependencyException(
134 |                 MISSING_DEPENDENCY_MESSAGE.format(
135 |                     converter=type(self).__name__,
136 |                     extension=".xls",
137 |                     feature="xls",
138 |                 )
139 |             ) from _xls_dependency_exc_info[
140 |                 1
141 |             ].with_traceback(  # type: ignore[union-attr]
142 |                 _xls_dependency_exc_info[2]
143 |             )
144 | 
145 |         sheets = pd.read_excel(file_stream, sheet_name=None, engine="xlrd")
146 |         md_content = ""
147 |         for s in sheets:
148 |             md_content += f"## {s}\n"
149 |             html_content = sheets[s].to_html(index=False)
150 |             md_content += (
151 |                 self._html_converter.convert_string(
152 |                     html_content, **kwargs
153 |                 ).markdown.strip()
154 |                 + "\n\n"
155 |             )
156 | 
157 |         return DocumentConverterResult(markdown=md_content.strip())
158 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import sys
  2 | from typing import Any, Union, BinaryIO
  3 | from .._stream_info import StreamInfo
  4 | from .._base_converter import DocumentConverter, DocumentConverterResult
  5 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
  6 | 
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
# `olefile` stays None when the import fails; the converter checks for that
# before touching the module.
olefile = None
try:
    import olefile  # type: ignore[no-redef]
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

# MIME type prefixes that OutlookMsgConverter.accepts() treats as a match.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.ms-outlook",
]

# File extensions that OutlookMsgConverter.accepts() treats as a match.
ACCEPTED_FILE_EXTENSIONS = [".msg"]
 22 | 
 23 | 
 24 | class OutlookMsgConverter(DocumentConverter):
 25 |     """Converts Outlook .msg files to markdown by extracting email metadata and content.
 26 | 
 27 |     Uses the olefile package to parse the .msg file structure and extract:
 28 |     - Email headers (From, To, Subject)
 29 |     - Email body content
 30 |     """
 31 | 
 32 |     def accepts(
 33 |         self,
 34 |         file_stream: BinaryIO,
 35 |         stream_info: StreamInfo,
 36 |         **kwargs: Any,  # Options to pass to the converter
 37 |     ) -> bool:
 38 |         mimetype = (stream_info.mimetype or "").lower()
 39 |         extension = (stream_info.extension or "").lower()
 40 | 
 41 |         # Check the extension and mimetype
 42 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 43 |             return True
 44 | 
 45 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 46 |             if mimetype.startswith(prefix):
 47 |                 return True
 48 | 
 49 |         # Brute force, check if we have an OLE file
 50 |         cur_pos = file_stream.tell()
 51 |         try:
 52 |             if olefile and not olefile.isOleFile(file_stream):
 53 |                 return False
 54 |         finally:
 55 |             file_stream.seek(cur_pos)
 56 | 
 57 |         # Brue force, check if it's an Outlook file
 58 |         try:
 59 |             if olefile is not None:
 60 |                 msg = olefile.OleFileIO(file_stream)
 61 |                 toc = "\n".join([str(stream) for stream in msg.listdir()])
 62 |                 return (
 63 |                     "__properties_version1.0" in toc
 64 |                     and "__recip_version1.0_#00000000" in toc
 65 |                 )
 66 |         except Exception as e:
 67 |             pass
 68 |         finally:
 69 |             file_stream.seek(cur_pos)
 70 | 
 71 |         return False
 72 | 
 73 |     def convert(
 74 |         self,
 75 |         file_stream: BinaryIO,
 76 |         stream_info: StreamInfo,
 77 |         **kwargs: Any,  # Options to pass to the converter
 78 |     ) -> DocumentConverterResult:
 79 |         # Check: the dependencies
 80 |         if _dependency_exc_info is not None:
 81 |             raise MissingDependencyException(
 82 |                 MISSING_DEPENDENCY_MESSAGE.format(
 83 |                     converter=type(self).__name__,
 84 |                     extension=".msg",
 85 |                     feature="outlook",
 86 |                 )
 87 |             ) from _dependency_exc_info[
 88 |                 1
 89 |             ].with_traceback(  # type: ignore[union-attr]
 90 |                 _dependency_exc_info[2]
 91 |             )
 92 | 
 93 |         assert (
 94 |             olefile is not None
 95 |         )  # If we made it this far, olefile should be available
 96 |         msg = olefile.OleFileIO(file_stream)
 97 | 
 98 |         # Extract email metadata
 99 |         md_content = "# Email Message\n\n"
100 | 
101 |         # Get headers
102 |         headers = {
103 |             "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
104 |             "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
105 |             "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
106 |         }
107 | 
108 |         # Add headers to markdown
109 |         for key, value in headers.items():
110 |             if value:
111 |                 md_content += f"**{key}:** {value}\n"
112 | 
113 |         md_content += "\n## Content\n\n"
114 | 
115 |         # Get email body
116 |         body = self._get_stream_data(msg, "__substg1.0_1000001F")
117 |         if body:
118 |             md_content += body
119 | 
120 |         msg.close()
121 | 
122 |         return DocumentConverterResult(
123 |             markdown=md_content.strip(),
124 |             title=headers.get("Subject"),
125 |         )
126 | 
127 |     def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
128 |         """Helper to safely extract and decode stream data from the MSG file."""
129 |         assert olefile is not None
130 |         assert isinstance(
131 |             msg, olefile.OleFileIO
132 |         )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
133 | 
134 |         try:
135 |             if msg.exists(stream_path):
136 |                 data = msg.openstream(stream_path).read()
137 |                 # Try UTF-16 first (common for .msg files)
138 |                 try:
139 |                     return data.decode("utf-16-le").strip()
140 |                 except UnicodeDecodeError:
141 |                     # Fall back to UTF-8
142 |                     try:
143 |                         return data.decode("utf-8").strip()
144 |                     except UnicodeDecodeError:
145 |                         # Last resort - ignore errors
146 |                         return data.decode("utf-8", errors="ignore").strip()
147 |         except Exception:
148 |             pass
149 |         return None
150 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_epub_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import zipfile
  3 | from defusedxml import minidom
  4 | from xml.dom.minidom import Document
  5 | 
  6 | from typing import BinaryIO, Any, Dict, List
  7 | 
  8 | from ._html_converter import HtmlConverter
  9 | from .._base_converter import DocumentConverterResult
 10 | from .._stream_info import StreamInfo
 11 | 
# MIME type prefixes that EpubConverter.accepts() treats as a match.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

# File extensions that EpubConverter.accepts() treats as a match.
ACCEPTED_FILE_EXTENSIONS = [".epub"]

# Extension -> MIME type used when handing each content file inside the EPUB
# to the HTML converter; other extensions map to no MIME type.
MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}
 24 | 
 25 | 
 26 | class EpubConverter(HtmlConverter):
 27 |     """
 28 |     Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
 29 |     """
 30 | 
 31 |     def __init__(self):
 32 |         super().__init__()
 33 |         self._html_converter = HtmlConverter()
 34 | 
 35 |     def accepts(
 36 |         self,
 37 |         file_stream: BinaryIO,
 38 |         stream_info: StreamInfo,
 39 |         **kwargs: Any,  # Options to pass to the converter
 40 |     ) -> bool:
 41 |         mimetype = (stream_info.mimetype or "").lower()
 42 |         extension = (stream_info.extension or "").lower()
 43 | 
 44 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 45 |             return True
 46 | 
 47 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 48 |             if mimetype.startswith(prefix):
 49 |                 return True
 50 | 
 51 |         return False
 52 | 
 53 |     def convert(
 54 |         self,
 55 |         file_stream: BinaryIO,
 56 |         stream_info: StreamInfo,
 57 |         **kwargs: Any,  # Options to pass to the converter
 58 |     ) -> DocumentConverterResult:
 59 |         with zipfile.ZipFile(file_stream, "r") as z:
 60 |             # Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file."""
 61 | 
 62 |             # Locate content.opf
 63 |             container_dom = minidom.parse(z.open("META-INF/container.xml"))
 64 |             opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
 65 |                 "full-path"
 66 |             )
 67 | 
 68 |             # Parse content.opf
 69 |             opf_dom = minidom.parse(z.open(opf_path))
 70 |             metadata: Dict[str, Any] = {
 71 |                 "title": self._get_text_from_node(opf_dom, "dc:title"),
 72 |                 "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
 73 |                 "language": self._get_text_from_node(opf_dom, "dc:language"),
 74 |                 "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
 75 |                 "date": self._get_text_from_node(opf_dom, "dc:date"),
 76 |                 "description": self._get_text_from_node(opf_dom, "dc:description"),
 77 |                 "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
 78 |             }
 79 | 
 80 |             # Extract manifest items (ID → href mapping)
 81 |             manifest = {
 82 |                 item.getAttribute("id"): item.getAttribute("href")
 83 |                 for item in opf_dom.getElementsByTagName("item")
 84 |             }
 85 | 
 86 |             # Extract spine order (ID refs)
 87 |             spine_items = opf_dom.getElementsByTagName("itemref")
 88 |             spine_order = [item.getAttribute("idref") for item in spine_items]
 89 | 
 90 |             # Convert spine order to actual file paths
 91 |             base_path = "/".join(
 92 |                 opf_path.split("/")[:-1]
 93 |             )  # Get base directory of content.opf
 94 |             spine = [
 95 |                 f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
 96 |                 for item_id in spine_order
 97 |                 if item_id in manifest
 98 |             ]
 99 | 
100 |             # Extract and convert the content
101 |             markdown_content: List[str] = []
102 |             for file in spine:
103 |                 if file in z.namelist():
104 |                     with z.open(file) as f:
105 |                         filename = os.path.basename(file)
106 |                         extension = os.path.splitext(filename)[1].lower()
107 |                         mimetype = MIME_TYPE_MAPPING.get(extension)
108 |                         converted_content = self._html_converter.convert(
109 |                             f,
110 |                             StreamInfo(
111 |                                 mimetype=mimetype,
112 |                                 extension=extension,
113 |                                 filename=filename,
114 |                             ),
115 |                         )
116 |                         markdown_content.append(converted_content.markdown.strip())
117 | 
118 |             # Format and add the metadata
119 |             metadata_markdown = []
120 |             for key, value in metadata.items():
121 |                 if isinstance(value, list):
122 |                     value = ", ".join(value)
123 |                 if value:
124 |                     metadata_markdown.append(f"**{key.capitalize()}:** {value}")
125 | 
126 |             markdown_content.insert(0, "\n".join(metadata_markdown))
127 | 
128 |             return DocumentConverterResult(
129 |                 markdown="\n\n".join(markdown_content), title=metadata["title"]
130 |             )
131 | 
132 |     def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
133 |         """Convenience function to extract a single occurrence of a tag (e.g., title)."""
134 |         texts = self._get_all_texts_from_nodes(dom, tag_name)
135 |         if len(texts) > 0:
136 |             return texts[0]
137 |         else:
138 |             return None
139 | 
140 |     def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
141 |         """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
142 |         texts: List[str] = []
143 |         for node in dom.getElementsByTagName(tag_name):
144 |             if node.firstChild and hasattr(node.firstChild, "nodeValue"):
145 |                 texts.append(node.firstChild.nodeValue.strip())
146 |         return texts
147 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py:
--------------------------------------------------------------------------------

```python
  1 | import zipfile
  2 | from io import BytesIO
  3 | from typing import BinaryIO
  4 | from xml.etree import ElementTree as ET
  5 | 
  6 | from bs4 import BeautifulSoup, Tag
  7 | 
  8 | from .math.omml import OMML_NS, oMath2Latex
  9 | 
# Skeleton <w:document> element declaring the namespace prefixes listed below.
# The "{0}" placeholder is filled with a serialized math tag so the fragment
# can be parsed as a standalone XML document.
MATH_ROOT_TEMPLATE = "".join(
    (
        "<w:document ",
        'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" ',
        'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" ',
        'xmlns:o="urn:schemas-microsoft-com:office:office" ',
        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" ',
        'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" ',
        'xmlns:v="urn:schemas-microsoft-com:vml" ',
        'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" ',
        'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" ',
        'xmlns:w10="urn:schemas-microsoft-com:office:word" ',
        'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" ',
        'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" ',
        'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" ',
        'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" ',
        'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" ',
        'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">',
        "{0}</w:document>",
    )
)
 31 | 
 32 | 
 33 | def _convert_omath_to_latex(tag: Tag) -> str:
 34 |     """
 35 |     Converts an OMML (Office Math Markup Language) tag to LaTeX format.
 36 | 
 37 |     Args:
 38 |         tag (Tag): A BeautifulSoup Tag object representing the OMML element.
 39 | 
 40 |     Returns:
 41 |         str: The LaTeX representation of the OMML element.
 42 |     """
 43 |     # Format the tag into a complete XML document string
 44 |     math_root = ET.fromstring(MATH_ROOT_TEMPLATE.format(str(tag)))
 45 |     # Find the 'oMath' element within the XML document
 46 |     math_element = math_root.find(OMML_NS + "oMath")
 47 |     # Convert the 'oMath' element to LaTeX using the oMath2Latex function
 48 |     latex = oMath2Latex(math_element).latex
 49 |     return latex
 50 | 
 51 | 
 52 | def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
 53 |     """
 54 |     Creates a replacement tag for an OMML (Office Math Markup Language) element.
 55 | 
 56 |     Args:
 57 |         tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
 58 |         block (bool, optional): If True, the LaTeX will be wrapped in double dollar signs for block mode. Defaults to False.
 59 | 
 60 |     Returns:
 61 |         Tag: A BeautifulSoup Tag object representing the replacement element.
 62 |     """
 63 |     t_tag = Tag(name="w:t")
 64 |     t_tag.string = (
 65 |         f"$${_convert_omath_to_latex(tag)}$$"
 66 |         if block
 67 |         else f"${_convert_omath_to_latex(tag)}$"
 68 |     )
 69 |     r_tag = Tag(name="w:r")
 70 |     r_tag.append(t_tag)
 71 |     return r_tag
 72 | 
 73 | 
 74 | def _replace_equations(tag: Tag):
 75 |     """
 76 |     Replaces OMML (Office Math Markup Language) elements with their LaTeX equivalents.
 77 | 
 78 |     Args:
 79 |         tag (Tag): A BeautifulSoup Tag object representing the OMML element. Could be either "oMathPara" or "oMath".
 80 | 
 81 |     Raises:
 82 |         ValueError: If the tag is not supported.
 83 |     """
 84 |     if tag.name == "oMathPara":
 85 |         # Create a new paragraph tag
 86 |         p_tag = Tag(name="w:p")
 87 |         # Replace each 'oMath' child tag with its LaTeX equivalent as block equations
 88 |         for child_tag in tag.find_all("oMath"):
 89 |             p_tag.append(_get_omath_tag_replacement(child_tag, block=True))
 90 |         # Replace the original 'oMathPara' tag with the new paragraph tag
 91 |         tag.replace_with(p_tag)
 92 |     elif tag.name == "oMath":
 93 |         # Replace the 'oMath' tag with its LaTeX equivalent as inline equation
 94 |         tag.replace_with(_get_omath_tag_replacement(tag, block=False))
 95 |     else:
 96 |         raise ValueError(f"Not supported tag: {tag.name}")
 97 | 
 98 | 
 99 | def _pre_process_math(content: bytes) -> bytes:
100 |     """
101 |     Pre-processes the math content in a DOCX -> XML file by converting OMML (Office Math Markup Language) elements to LaTeX.
102 |     This preprocessed content can be directly replaced in the DOCX file -> XMLs.
103 | 
104 |     Args:
105 |         content (bytes): The XML content of the DOCX file as bytes.
106 | 
107 |     Returns:
108 |         bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes.
109 |     """
110 |     soup = BeautifulSoup(content.decode(), features="xml")
111 |     for tag in soup.find_all("oMathPara"):
112 |         _replace_equations(tag)
113 |     for tag in soup.find_all("oMath"):
114 |         _replace_equations(tag)
115 |     return str(soup).encode()
116 | 
117 | 
def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
    """
    Runs the pre-processing steps over a DOCX file entirely in memory.

    The DOCX archive is unzipped in memory, selected XML parts are transformed
    (currently: OMML math converted to LaTeX), and everything is zipped back into a
    new in-memory archive. Nothing is written to disk.

    Args:
        input_docx (BinaryIO): A binary input stream holding the DOCX file.

    Returns:
        BinaryIO: A binary stream, positioned at offset 0, holding the processed DOCX file.
    """
    # Archive members that are run through the pre-processing steps
    targets = (
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
    )

    result_stream = BytesIO()
    with zipfile.ZipFile(input_docx, mode="r") as source_zip:
        # Load every member up front, keyed by its name
        members = {}
        for member_name in source_zip.namelist():
            members[member_name] = source_zip.read(member_name)

        with zipfile.ZipFile(result_stream, mode="w") as target_zip:
            target_zip.comment = source_zip.comment
            for member_name, raw in members.items():
                if member_name not in targets:
                    target_zip.writestr(member_name, raw)
                    continue
                try:
                    # Further pre-processing steps can be chained here in the future
                    target_zip.writestr(member_name, _pre_process_math(raw))
                except Exception:
                    # Best effort: on any failure keep the member's original bytes
                    target_zip.writestr(member_name, raw)

    result_stream.seek(0)
    return result_stream
157 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/__main__.py:
--------------------------------------------------------------------------------

```python
  1 | # SPDX-FileCopyrightText: 2024-present Adam Fourney <[email protected]>
  2 | #
  3 | # SPDX-License-Identifier: MIT
  4 | import argparse
  5 | import sys
  6 | import codecs
  7 | from textwrap import dedent
  8 | from importlib.metadata import entry_points
  9 | from .__about__ import __version__
 10 | from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult
 11 | 
 12 | 
 13 | def main():
 14 |     parser = argparse.ArgumentParser(
 15 |         description="Convert various file formats to markdown.",
 16 |         prog="markitdown",
 17 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 18 |         usage=dedent(
 19 |             """
 20 |             SYNTAX:
 21 | 
 22 |                 markitdown <OPTIONAL: FILENAME>
 23 |                 If FILENAME is empty, markitdown reads from stdin.
 24 | 
 25 |             EXAMPLE:
 26 | 
 27 |                 markitdown example.pdf
 28 | 
 29 |                 OR
 30 | 
 31 |                 cat example.pdf | markitdown
 32 | 
 33 |                 OR
 34 | 
 35 |                 markitdown < example.pdf
 36 | 
 37 |                 OR to save to a file use
 38 | 
 39 |                 markitdown example.pdf -o example.md
 40 | 
 41 |                 OR
 42 | 
 43 |                 markitdown example.pdf > example.md
 44 |             """
 45 |         ).strip(),
 46 |     )
 47 | 
 48 |     parser.add_argument(
 49 |         "-v",
 50 |         "--version",
 51 |         action="version",
 52 |         version=f"%(prog)s {__version__}",
 53 |         help="show the version number and exit",
 54 |     )
 55 | 
 56 |     parser.add_argument(
 57 |         "-o",
 58 |         "--output",
 59 |         help="Output file name. If not provided, output is written to stdout.",
 60 |     )
 61 | 
 62 |     parser.add_argument(
 63 |         "-x",
 64 |         "--extension",
 65 |         help="Provide a hint about the file extension (e.g., when reading from stdin).",
 66 |     )
 67 | 
 68 |     parser.add_argument(
 69 |         "-m",
 70 |         "--mime-type",
 71 |         help="Provide a hint about the file's MIME type.",
 72 |     )
 73 | 
 74 |     parser.add_argument(
 75 |         "-c",
 76 |         "--charset",
 77 |         help="Provide a hint about the file's charset (e.g, UTF-8).",
 78 |     )
 79 | 
 80 |     parser.add_argument(
 81 |         "-d",
 82 |         "--use-docintel",
 83 |         action="store_true",
 84 |         help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
 85 |     )
 86 | 
 87 |     parser.add_argument(
 88 |         "-e",
 89 |         "--endpoint",
 90 |         type=str,
 91 |         help="Document Intelligence Endpoint. Required if using Document Intelligence.",
 92 |     )
 93 | 
 94 |     parser.add_argument(
 95 |         "-p",
 96 |         "--use-plugins",
 97 |         action="store_true",
 98 |         help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.",
 99 |     )
100 | 
101 |     parser.add_argument(
102 |         "--list-plugins",
103 |         action="store_true",
104 |         help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
105 |     )
106 | 
107 |     parser.add_argument(
108 |         "--keep-data-uris",
109 |         action="store_true",
110 |         help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
111 |     )
112 | 
113 |     parser.add_argument("filename", nargs="?")
114 |     args = parser.parse_args()
115 | 
116 |     # Parse the extension hint
117 |     extension_hint = args.extension
118 |     if extension_hint is not None:
119 |         extension_hint = extension_hint.strip().lower()
120 |         if len(extension_hint) > 0:
121 |             if not extension_hint.startswith("."):
122 |                 extension_hint = "." + extension_hint
123 |         else:
124 |             extension_hint = None
125 | 
126 |     # Parse the mime type
127 |     mime_type_hint = args.mime_type
128 |     if mime_type_hint is not None:
129 |         mime_type_hint = mime_type_hint.strip()
130 |         if len(mime_type_hint) > 0:
131 |             if mime_type_hint.count("/") != 1:
132 |                 _exit_with_error(f"Invalid MIME type: {mime_type_hint}")
133 |         else:
134 |             mime_type_hint = None
135 | 
136 |     # Parse the charset
137 |     charset_hint = args.charset
138 |     if charset_hint is not None:
139 |         charset_hint = charset_hint.strip()
140 |         if len(charset_hint) > 0:
141 |             try:
142 |                 charset_hint = codecs.lookup(charset_hint).name
143 |             except LookupError:
144 |                 _exit_with_error(f"Invalid charset: {charset_hint}")
145 |         else:
146 |             charset_hint = None
147 | 
148 |     stream_info = None
149 |     if (
150 |         extension_hint is not None
151 |         or mime_type_hint is not None
152 |         or charset_hint is not None
153 |     ):
154 |         stream_info = StreamInfo(
155 |             extension=extension_hint, mimetype=mime_type_hint, charset=charset_hint
156 |         )
157 | 
158 |     if args.list_plugins:
159 |         # List installed plugins, then exit
160 |         print("Installed MarkItDown 3rd-party Plugins:\n")
161 |         plugin_entry_points = list(entry_points(group="markitdown.plugin"))
162 |         if len(plugin_entry_points) == 0:
163 |             print("  * No 3rd-party plugins installed.")
164 |             print(
165 |                 "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n"
166 |             )
167 |         else:
168 |             for entry_point in plugin_entry_points:
169 |                 print(f"  * {entry_point.name:<16}\t(package: {entry_point.value})")
170 |             print(
171 |                 "\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n"
172 |             )
173 |         sys.exit(0)
174 | 
175 |     if args.use_docintel:
176 |         if args.endpoint is None:
177 |             _exit_with_error(
178 |                 "Document Intelligence Endpoint is required when using Document Intelligence."
179 |             )
180 |         elif args.filename is None:
181 |             _exit_with_error("Filename is required when using Document Intelligence.")
182 | 
183 |         markitdown = MarkItDown(
184 |             enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
185 |         )
186 |     else:
187 |         markitdown = MarkItDown(enable_plugins=args.use_plugins)
188 | 
189 |     if args.filename is None:
190 |         result = markitdown.convert_stream(
191 |             sys.stdin.buffer,
192 |             stream_info=stream_info,
193 |             keep_data_uris=args.keep_data_uris,
194 |         )
195 |     else:
196 |         result = markitdown.convert(
197 |             args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
198 |         )
199 | 
200 |     _handle_output(args, result)
201 | 
202 | 
203 | def _handle_output(args, result: DocumentConverterResult):
204 |     """Handle output to stdout or file"""
205 |     if args.output:
206 |         with open(args.output, "w", encoding="utf-8") as f:
207 |             f.write(result.markdown)
208 |     else:
209 |         # Handle stdout encoding errors more gracefully
210 |         print(
211 |             result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
212 |                 sys.stdout.encoding
213 |             )
214 |         )
215 | 
216 | 
217 | def _exit_with_error(message: str):
218 |     print(message)
219 |     sys.exit(1)
220 | 
221 | 
# Run the command-line interface when this module is executed as a script
# (for example with `python -m markitdown`).
if __name__ == "__main__":
    main()
224 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_rss_converter.py:
--------------------------------------------------------------------------------

```python
  1 | from defusedxml import minidom
  2 | from xml.dom.minidom import Document, Element
  3 | from typing import BinaryIO, Any, Union
  4 | from bs4 import BeautifulSoup
  5 | 
  6 | from ._markdownify import _CustomMarkdownify
  7 | from .._stream_info import StreamInfo
  8 | from .._base_converter import DocumentConverter, DocumentConverterResult
  9 | 
# MIME type prefixes that identify a feed on their own: RssConverter.accepts()
# returns True as soon as the stream's MIME type starts with one of them.
PRECISE_MIME_TYPE_PREFIXES = [
    "application/rss",
    "application/rss+xml",
    "application/atom",
    "application/atom+xml",
]

# File extensions that identify a feed on their own.
PRECISE_FILE_EXTENSIONS = [".rss", ".atom"]

# Generic XML MIME type prefixes: for these, RssConverter.accepts() also parses
# the stream and only accepts it when it looks like an RSS or Atom document.
CANDIDATE_MIME_TYPE_PREFIXES = [
    "text/xml",
    "application/xml",
]

# Generic XML file extensions, subject to the same content check.
CANDIDATE_FILE_EXTENSIONS = [
    ".xml",
]
 27 | 
 28 | 
 29 | class RssConverter(DocumentConverter):
 30 |     """Convert RSS / Atom type to markdown"""
 31 | 
 32 |     def __init__(self):
 33 |         super().__init__()
 34 |         self._kwargs = {}
 35 | 
 36 |     def accepts(
 37 |         self,
 38 |         file_stream: BinaryIO,
 39 |         stream_info: StreamInfo,
 40 |         **kwargs: Any,  # Options to pass to the converter
 41 |     ) -> bool:
 42 |         mimetype = (stream_info.mimetype or "").lower()
 43 |         extension = (stream_info.extension or "").lower()
 44 | 
 45 |         # Check for precise mimetypes and file extensions
 46 |         if extension in PRECISE_FILE_EXTENSIONS:
 47 |             return True
 48 | 
 49 |         for prefix in PRECISE_MIME_TYPE_PREFIXES:
 50 |             if mimetype.startswith(prefix):
 51 |                 return True
 52 | 
 53 |         # Check for precise mimetypes and file extensions
 54 |         if extension in CANDIDATE_FILE_EXTENSIONS:
 55 |             return self._check_xml(file_stream)
 56 | 
 57 |         for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
 58 |             if mimetype.startswith(prefix):
 59 |                 return self._check_xml(file_stream)
 60 | 
 61 |         return False
 62 | 
 63 |     def _check_xml(self, file_stream: BinaryIO) -> bool:
 64 |         cur_pos = file_stream.tell()
 65 |         try:
 66 |             doc = minidom.parse(file_stream)
 67 |             return self._feed_type(doc) is not None
 68 |         except BaseException as _:
 69 |             pass
 70 |         finally:
 71 |             file_stream.seek(cur_pos)
 72 |         return False
 73 | 
 74 |     def _feed_type(self, doc: Any) -> str | None:
 75 |         if doc.getElementsByTagName("rss"):
 76 |             return "rss"
 77 |         elif doc.getElementsByTagName("feed"):
 78 |             root = doc.getElementsByTagName("feed")[0]
 79 |             if root.getElementsByTagName("entry"):
 80 |                 # An Atom feed must have a root element of <feed> and at least one <entry>
 81 |                 return "atom"
 82 |         return None
 83 | 
 84 |     def convert(
 85 |         self,
 86 |         file_stream: BinaryIO,
 87 |         stream_info: StreamInfo,
 88 |         **kwargs: Any,  # Options to pass to the converter
 89 |     ) -> DocumentConverterResult:
 90 |         self._kwargs = kwargs
 91 |         doc = minidom.parse(file_stream)
 92 |         feed_type = self._feed_type(doc)
 93 | 
 94 |         if feed_type == "rss":
 95 |             return self._parse_rss_type(doc)
 96 |         elif feed_type == "atom":
 97 |             return self._parse_atom_type(doc)
 98 |         else:
 99 |             raise ValueError("Unknown feed type")
100 | 
101 |     def _parse_atom_type(self, doc: Document) -> DocumentConverterResult:
102 |         """Parse the type of an Atom feed.
103 | 
104 |         Returns None if the feed type is not recognized or something goes wrong.
105 |         """
106 |         root = doc.getElementsByTagName("feed")[0]
107 |         title = self._get_data_by_tag_name(root, "title")
108 |         subtitle = self._get_data_by_tag_name(root, "subtitle")
109 |         entries = root.getElementsByTagName("entry")
110 |         md_text = f"# {title}\n"
111 |         if subtitle:
112 |             md_text += f"{subtitle}\n"
113 |         for entry in entries:
114 |             entry_title = self._get_data_by_tag_name(entry, "title")
115 |             entry_summary = self._get_data_by_tag_name(entry, "summary")
116 |             entry_updated = self._get_data_by_tag_name(entry, "updated")
117 |             entry_content = self._get_data_by_tag_name(entry, "content")
118 | 
119 |             if entry_title:
120 |                 md_text += f"\n## {entry_title}\n"
121 |             if entry_updated:
122 |                 md_text += f"Updated on: {entry_updated}\n"
123 |             if entry_summary:
124 |                 md_text += self._parse_content(entry_summary)
125 |             if entry_content:
126 |                 md_text += self._parse_content(entry_content)
127 | 
128 |         return DocumentConverterResult(
129 |             markdown=md_text,
130 |             title=title,
131 |         )
132 | 
133 |     def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
134 |         """Parse the type of an RSS feed.
135 | 
136 |         Returns None if the feed type is not recognized or something goes wrong.
137 |         """
138 |         root = doc.getElementsByTagName("rss")[0]
139 |         channel_list = root.getElementsByTagName("channel")
140 |         if not channel_list:
141 |             raise ValueError("No channel found in RSS feed")
142 |         channel = channel_list[0]
143 |         channel_title = self._get_data_by_tag_name(channel, "title")
144 |         channel_description = self._get_data_by_tag_name(channel, "description")
145 |         items = channel.getElementsByTagName("item")
146 |         if channel_title:
147 |             md_text = f"# {channel_title}\n"
148 |         if channel_description:
149 |             md_text += f"{channel_description}\n"
150 |         for item in items:
151 |             title = self._get_data_by_tag_name(item, "title")
152 |             description = self._get_data_by_tag_name(item, "description")
153 |             pubDate = self._get_data_by_tag_name(item, "pubDate")
154 |             content = self._get_data_by_tag_name(item, "content:encoded")
155 | 
156 |             if title:
157 |                 md_text += f"\n## {title}\n"
158 |             if pubDate:
159 |                 md_text += f"Published on: {pubDate}\n"
160 |             if description:
161 |                 md_text += self._parse_content(description)
162 |             if content:
163 |                 md_text += self._parse_content(content)
164 | 
165 |         return DocumentConverterResult(
166 |             markdown=md_text,
167 |             title=channel_title,
168 |         )
169 | 
170 |     def _parse_content(self, content: str) -> str:
171 |         """Parse the content of an RSS feed item"""
172 |         try:
173 |             # using bs4 because many RSS feeds have HTML-styled content
174 |             soup = BeautifulSoup(content, "html.parser")
175 |             return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
176 |         except BaseException as _:
177 |             return content
178 | 
179 |     def _get_data_by_tag_name(
180 |         self, element: Element, tag_name: str
181 |     ) -> Union[str, None]:
182 |         """Get data from first child element with the given tag name.
183 |         Returns None when no such element is found.
184 |         """
185 |         nodes = element.getElementsByTagName(tag_name)
186 |         if not nodes:
187 |             return None
188 |         fc = nodes[0].firstChild
189 |         if fc:
190 |             if hasattr(fc, "data"):
191 |                 return fc.data
192 |         return None
193 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py:
--------------------------------------------------------------------------------

```python
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
  5 | On 25/03/2025
  6 | """
  7 | 
  8 | from __future__ import unicode_literals
  9 | 
# Characters with a special meaning in LaTeX.
# NOTE(review): presumably the set that the converter escapes — confirm in omml.py.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""  # the empty string
BACKSLASH = "\\"  # a single backslash character
ALN = "&"  # ampersand; presumably used as the LaTeX alignment marker — confirm against its users
 15 | 
 16 | CHR = {
 17 |     # Unicode : Latex Math Symbols
 18 |     # Top accents
 19 |     "\u0300": "\\grave{{{0}}}",
 20 |     "\u0301": "\\acute{{{0}}}",
 21 |     "\u0302": "\\hat{{{0}}}",
 22 |     "\u0303": "\\tilde{{{0}}}",
 23 |     "\u0304": "\\bar{{{0}}}",
 24 |     "\u0305": "\\overbar{{{0}}}",
 25 |     "\u0306": "\\breve{{{0}}}",
 26 |     "\u0307": "\\dot{{{0}}}",
 27 |     "\u0308": "\\ddot{{{0}}}",
 28 |     "\u0309": "\\ovhook{{{0}}}",
 29 |     "\u030a": "\\ocirc{{{0}}}}",
 30 |     "\u030c": "\\check{{{0}}}}",
 31 |     "\u0310": "\\candra{{{0}}}",
 32 |     "\u0312": "\\oturnedcomma{{{0}}}",
 33 |     "\u0315": "\\ocommatopright{{{0}}}",
 34 |     "\u031a": "\\droang{{{0}}}",
 35 |     "\u0338": "\\not{{{0}}}",
 36 |     "\u20d0": "\\leftharpoonaccent{{{0}}}",
 37 |     "\u20d1": "\\rightharpoonaccent{{{0}}}",
 38 |     "\u20d2": "\\vertoverlay{{{0}}}",
 39 |     "\u20d6": "\\overleftarrow{{{0}}}",
 40 |     "\u20d7": "\\vec{{{0}}}",
 41 |     "\u20db": "\\dddot{{{0}}}",
 42 |     "\u20dc": "\\ddddot{{{0}}}",
 43 |     "\u20e1": "\\overleftrightarrow{{{0}}}",
 44 |     "\u20e7": "\\annuity{{{0}}}",
 45 |     "\u20e9": "\\widebridgeabove{{{0}}}",
 46 |     "\u20f0": "\\asteraccent{{{0}}}",
 47 |     # Bottom accents
 48 |     "\u0330": "\\wideutilde{{{0}}}",
 49 |     "\u0331": "\\underbar{{{0}}}",
 50 |     "\u20e8": "\\threeunderdot{{{0}}}",
 51 |     "\u20ec": "\\underrightharpoondown{{{0}}}",
 52 |     "\u20ed": "\\underleftharpoondown{{{0}}}",
 53 |     "\u20ee": "\\underledtarrow{{{0}}}",
 54 |     "\u20ef": "\\underrightarrow{{{0}}}",
 55 |     # Over | group
 56 |     "\u23b4": "\\overbracket{{{0}}}",
 57 |     "\u23dc": "\\overparen{{{0}}}",
 58 |     "\u23de": "\\overbrace{{{0}}}",
 59 |     # Under| group
 60 |     "\u23b5": "\\underbracket{{{0}}}",
 61 |     "\u23dd": "\\underparen{{{0}}}",
 62 |     "\u23df": "\\underbrace{{{0}}}",
 63 | }
 64 | 
# Maps the Unicode character of a big (n-ary) operator to its LaTeX command.
# Unlike the CHR templates, these values contain no format placeholder.
CHR_BO = {
    # Big operators,
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}
 80 | 
 81 | T = {
 82 |     "\u2192": "\\rightarrow ",
 83 |     # Greek letters
 84 |     "\U0001d6fc": "\\alpha ",
 85 |     "\U0001d6fd": "\\beta ",
 86 |     "\U0001d6fe": "\\gamma ",
 87 |     "\U0001d6ff": "\\theta ",
 88 |     "\U0001d700": "\\epsilon ",
 89 |     "\U0001d701": "\\zeta ",
 90 |     "\U0001d702": "\\eta ",
 91 |     "\U0001d703": "\\theta ",
 92 |     "\U0001d704": "\\iota ",
 93 |     "\U0001d705": "\\kappa ",
 94 |     "\U0001d706": "\\lambda ",
 95 |     "\U0001d707": "\\m ",
 96 |     "\U0001d708": "\\n ",
 97 |     "\U0001d709": "\\xi ",
 98 |     "\U0001d70a": "\\omicron ",
 99 |     "\U0001d70b": "\\pi ",
100 |     "\U0001d70c": "\\rho ",
101 |     "\U0001d70d": "\\varsigma ",
102 |     "\U0001d70e": "\\sigma ",
103 |     "\U0001d70f": "\\ta ",
104 |     "\U0001d710": "\\upsilon ",
105 |     "\U0001d711": "\\phi ",
106 |     "\U0001d712": "\\chi ",
107 |     "\U0001d713": "\\psi ",
108 |     "\U0001d714": "\\omega ",
109 |     "\U0001d715": "\\partial ",
110 |     "\U0001d716": "\\varepsilon ",
111 |     "\U0001d717": "\\vartheta ",
112 |     "\U0001d718": "\\varkappa ",
113 |     "\U0001d719": "\\varphi ",
114 |     "\U0001d71a": "\\varrho ",
115 |     "\U0001d71b": "\\varpi ",
116 |     # Relation symbols
117 |     "\u2190": "\\leftarrow ",
118 |     "\u2191": "\\uparrow ",
119 |     "\u2192": "\\rightarrow ",
120 |     "\u2193": "\\downright ",
121 |     "\u2194": "\\leftrightarrow ",
122 |     "\u2195": "\\updownarrow ",
123 |     "\u2196": "\\nwarrow ",
124 |     "\u2197": "\\nearrow ",
125 |     "\u2198": "\\searrow ",
126 |     "\u2199": "\\swarrow ",
127 |     "\u22ee": "\\vdots ",
128 |     "\u22ef": "\\cdots ",
129 |     "\u22f0": "\\adots ",
130 |     "\u22f1": "\\ddots ",
131 |     "\u2260": "\\ne ",
132 |     "\u2264": "\\leq ",
133 |     "\u2265": "\\geq ",
134 |     "\u2266": "\\leqq ",
135 |     "\u2267": "\\geqq ",
136 |     "\u2268": "\\lneqq ",
137 |     "\u2269": "\\gneqq ",
138 |     "\u226a": "\\ll ",
139 |     "\u226b": "\\gg ",
140 |     "\u2208": "\\in ",
141 |     "\u2209": "\\notin ",
142 |     "\u220b": "\\ni ",
143 |     "\u220c": "\\nni ",
144 |     # Ordinary symbols
145 |     "\u221e": "\\infty ",
146 |     # Binary relations
147 |     "\u00b1": "\\pm ",
148 |     "\u2213": "\\mp ",
149 |     # Italic, Latin, uppercase
150 |     "\U0001d434": "A",
151 |     "\U0001d435": "B",
152 |     "\U0001d436": "C",
153 |     "\U0001d437": "D",
154 |     "\U0001d438": "E",
155 |     "\U0001d439": "F",
156 |     "\U0001d43a": "G",
157 |     "\U0001d43b": "H",
158 |     "\U0001d43c": "I",
159 |     "\U0001d43d": "J",
160 |     "\U0001d43e": "K",
161 |     "\U0001d43f": "L",
162 |     "\U0001d440": "M",
163 |     "\U0001d441": "N",
164 |     "\U0001d442": "O",
165 |     "\U0001d443": "P",
166 |     "\U0001d444": "Q",
167 |     "\U0001d445": "R",
168 |     "\U0001d446": "S",
169 |     "\U0001d447": "T",
170 |     "\U0001d448": "U",
171 |     "\U0001d449": "V",
172 |     "\U0001d44a": "W",
173 |     "\U0001d44b": "X",
174 |     "\U0001d44c": "Y",
175 |     "\U0001d44d": "Z",
176 |     # Italic, Latin, lowercase
177 |     "\U0001d44e": "a",
178 |     "\U0001d44f": "b",
179 |     "\U0001d450": "c",
180 |     "\U0001d451": "d",
181 |     "\U0001d452": "e",
182 |     "\U0001d453": "f",
183 |     "\U0001d454": "g",
184 |     "\U0001d456": "i",
185 |     "\U0001d457": "j",
186 |     "\U0001d458": "k",
187 |     "\U0001d459": "l",
188 |     "\U0001d45a": "m",
189 |     "\U0001d45b": "n",
190 |     "\U0001d45c": "o",
191 |     "\U0001d45d": "p",
192 |     "\U0001d45e": "q",
193 |     "\U0001d45f": "r",
194 |     "\U0001d460": "s",
195 |     "\U0001d461": "t",
196 |     "\U0001d462": "u",
197 |     "\U0001d463": "v",
198 |     "\U0001d464": "w",
199 |     "\U0001d465": "x",
200 |     "\U0001d466": "y",
201 |     "\U0001d467": "z",
202 | }
203 | 
# Maps a function name to its LaTeX rendering; "{fe}" marks where the
# function's argument goes (the same text as FUNC_PLACE below).
FUNC = {
    "sin": "\\sin({fe})",
    "cos": "\\cos({fe})",
    "tan": "\\tan({fe})",
    "arcsin": "\\arcsin({fe})",
    "arccos": "\\arccos({fe})",
    "arctan": "\\arctan({fe})",
    "arccot": "\\arccot({fe})",
    "sinh": "\\sinh({fe})",
    "cosh": "\\cosh({fe})",
    "tanh": "\\tanh({fe})",
    "coth": "\\coth({fe})",
    "sec": "\\sec({fe})",
    "csc": "\\csc({fe})",
}

# The argument placeholder that appears in the FUNC templates.
# NOTE(review): whether users fill it with str.format(fe=...) or str.replace — confirm in omml.py.
FUNC_PLACE = "{fe}"

# Two backslash characters, i.e. the LaTeX line break "\\"
BRK = "\\\\"

# The templates below are str.format() templates: doubled braces "{{" / "}}"
# yield literal braces, single-braced names / indexes are the placeholders.

# Fallback accent template: \hat{<0>}
CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

# Bar templates keyed by position: \overline{<0>} / \underline{<0>}
POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

# Fallback bar template: \overline{<0>}
POS_DEFAULT = {
    "BAR_VAL": "\\overline{{{0}}}",
}

# Subscript: _{<0>}
SUB = "_{{{0}}}"

# Superscript: ^{<0>}
SUP = "^{{{0}}}"

# Fraction templates with placeholders "num" and "den", keyed by fraction kind
# (presumably the OMML fraction type values — confirm against the users):
#   bar   -> \frac{num}{den}
#   skw   -> ^{num}/_{den}
#   noBar -> \genfrac{}{}{0pt}{}{num}{den}
#   lin   -> {num}/{den}
F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": r"^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
# Fallback fraction template: \frac{num}{den}
F_DEFAULT = "\\frac{{{num}}}{{{den}}}"

# Delimited expression: \left<left><text>\right<right>
D = "\\left{left}{text}\\right{right}"

# Delimiter defaults; "." after \left / \right is LaTeX's invisible delimiter
D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

# Radical with an explicit degree: \sqrt[deg]{text}
RAD = "\\sqrt[{deg}]{{{text}}}"

# Radical without a degree: \sqrt{text}
RAD_DEFAULT = "\\sqrt{{{text}}}"

# Single centered-column array: \begin{array}{c}text\end{array}
ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"

# Limit-style functions with placeholder "lim": \lim_{lim}, \max_{lim}, \min_{lim}
LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

# NOTE(review): looks like a (search, replacement) pair that rewrites
# "\rightarrow" to "\to" inside limits — confirm against the users.
LIM_TO = ("\\rightarrow", "\\to")

# Upper limit: \overset{lim}{text}
LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

# Matrix environment: \begin{matrix}text\end{matrix}
M = "\\begin{{matrix}}{text}\\end{{matrix}}"
274 | 
```