This is page 2 of 5. Use http://codebase.md/microsoft/markitdown?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .devcontainer
│   └── devcontainer.json
├── .dockerignore
├── .gitattributes
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── pre-commit.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE
├── packages
│   ├── markitdown
│   │   ├── pyproject.toml
│   │   ├── README.md
│   │   ├── src
│   │   │   └── markitdown
│   │   │       ├── __about__.py
│   │   │       ├── __init__.py
│   │   │       ├── __main__.py
│   │   │       ├── _base_converter.py
│   │   │       ├── _exceptions.py
│   │   │       ├── _markitdown.py
│   │   │       ├── _stream_info.py
│   │   │       ├── _uri_utils.py
│   │   │       ├── converter_utils
│   │   │       │   ├── __init__.py
│   │   │       │   └── docx
│   │   │       │       ├── __init__.py
│   │   │       │       ├── math
│   │   │       │       │   ├── __init__.py
│   │   │       │       │   ├── latex_dict.py
│   │   │       │       │   └── omml.py
│   │   │       │       └── pre_process.py
│   │   │       ├── converters
│   │   │       │   ├── __init__.py
│   │   │       │   ├── _audio_converter.py
│   │   │       │   ├── _bing_serp_converter.py
│   │   │       │   ├── _csv_converter.py
│   │   │       │   ├── _doc_intel_converter.py
│   │   │       │   ├── _docx_converter.py
│   │   │       │   ├── _epub_converter.py
│   │   │       │   ├── _exiftool.py
│   │   │       │   ├── _html_converter.py
│   │   │       │   ├── _image_converter.py
│   │   │       │   ├── _ipynb_converter.py
│   │   │       │   ├── _llm_caption.py
│   │   │       │   ├── _markdownify.py
│   │   │       │   ├── _outlook_msg_converter.py
│   │   │       │   ├── _pdf_converter.py
│   │   │       │   ├── _plain_text_converter.py
│   │   │       │   ├── _pptx_converter.py
│   │   │       │   ├── _rss_converter.py
│   │   │       │   ├── _transcribe_audio.py
│   │   │       │   ├── _wikipedia_converter.py
│   │   │       │   ├── _xlsx_converter.py
│   │   │       │   ├── _youtube_converter.py
│   │   │       │   └── _zip_converter.py
│   │   │       └── py.typed
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── _test_vectors.py
│   │   │   ├── test_cli_misc.py
│   │   │   ├── test_cli_vectors.py
│   │   │   ├── test_docintel_html.py
│   │   │   ├── test_files
│   │   │   │   ├── equations.docx
│   │   │   │   ├── random.bin
│   │   │   │   ├── test_blog.html
│   │   │   │   ├── test_files.zip
│   │   │   │   ├── test_llm.jpg
│   │   │   │   ├── test_mskanji.csv
│   │   │   │   ├── test_notebook.ipynb
│   │   │   │   ├── test_outlook_msg.msg
│   │   │   │   ├── test_rss.xml
│   │   │   │   ├── test_serp.html
│   │   │   │   ├── test_wikipedia.html
│   │   │   │   ├── test_with_comment.docx
│   │   │   │   ├── test.docx
│   │   │   │   ├── test.epub
│   │   │   │   ├── test.jpg
│   │   │   │   ├── test.json
│   │   │   │   ├── test.m4a
│   │   │   │   ├── test.mp3
│   │   │   │   ├── test.pdf
│   │   │   │   ├── test.pptx
│   │   │   │   ├── test.wav
│   │   │   │   ├── test.xls
│   │   │   │   └── test.xlsx
│   │   │   ├── test_module_misc.py
│   │   │   └── test_module_vectors.py
│   │   └── ThirdPartyNotices.md
│   ├── markitdown-mcp
│   │   ├── Dockerfile
│   │   ├── pyproject.toml
│   │   ├── README.md
│   │   ├── src
│   │   │   └── markitdown_mcp
│   │   │       ├── __about__.py
│   │   │       ├── __init__.py
│   │   │       ├── __main__.py
│   │   │       └── py.typed
│   │   └── tests
│   │       └── __init__.py
│   └── markitdown-sample-plugin
│       ├── pyproject.toml
│       ├── README.md
│       ├── src
│       │   └── markitdown_sample_plugin
│       │       ├── __about__.py
│       │       ├── __init__.py
│       │       ├── _plugin.py
│       │       └── py.typed
│       └── tests
│           ├── __init__.py
│           ├── test_files
│           │   └── test.rtf
│           └── test_sample_plugin.py
├── README.md
├── SECURITY.md
└── SUPPORT.md
```

# Files

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_cli_vectors.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3 -m pytest
  2 | import os
  3 | import time
  4 | import pytest
  5 | import subprocess
  6 | import locale
  7 | from typing import List
  8 | 
  9 | if __name__ == "__main__":
 10 |     from _test_vectors import (
 11 |         GENERAL_TEST_VECTORS,
 12 |         DATA_URI_TEST_VECTORS,
 13 |         FileTestVector,
 14 |     )
 15 | else:
 16 |     from ._test_vectors import (
 17 |         GENERAL_TEST_VECTORS,
 18 |         DATA_URI_TEST_VECTORS,
 19 |         FileTestVector,
 20 |     )
 21 | 
 22 | skip_remote = bool(
 23 |     os.environ.get("GITHUB_ACTIONS")
 24 | )  # Don't run these tests in CI
 25 | 
 26 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 27 | TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
 28 | 
 29 | 
 30 | # Prepare CLI test vectors (remove vectors that require mocking the URL)
 31 | CLI_TEST_VECTORS: List[FileTestVector] = []
 32 | for test_vector in GENERAL_TEST_VECTORS:
 33 |     if test_vector.url is not None:
 34 |         continue
 35 |     CLI_TEST_VECTORS.append(test_vector)
 36 | 
 37 | 
 38 | @pytest.fixture(scope="session")
 39 | def shared_tmp_dir(tmp_path_factory):
 40 |     return tmp_path_factory.mktemp("pytest_tmp")
 41 | 
 42 | 
 43 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
 44 | def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
 45 |     """Test that the CLI outputs to stdout correctly."""
 46 | 
 47 |     result = subprocess.run(
 48 |         [
 49 |             "python",
 50 |             "-m",
 51 |             "markitdown",
 52 |             os.path.join(TEST_FILES_DIR, test_vector.filename),
 53 |         ],
 54 |         capture_output=True,
 55 |         text=True,
 56 |     )
 57 | 
 58 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
 59 |     for test_string in test_vector.must_include:
 60 |         assert test_string in result.stdout
 61 |     for test_string in test_vector.must_not_include:
 62 |         assert test_string not in result.stdout
 63 | 
 64 | 
 65 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
 66 | def test_output_to_file(shared_tmp_dir, test_vector) -> None:
 67 |     """Test that the CLI outputs to a file correctly."""
 68 | 
 69 |     output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
 70 |     result = subprocess.run(
 71 |         [
 72 |             "python",
 73 |             "-m",
 74 |             "markitdown",
 75 |             "-o",
 76 |             output_file,
 77 |             os.path.join(TEST_FILES_DIR, test_vector.filename),
 78 |         ],
 79 |         capture_output=True,
 80 |         text=True,
 81 |     )
 82 | 
 83 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
 84 |     assert os.path.exists(output_file), f"Output file not created: {output_file}"
 85 | 
 86 |     with open(output_file, "r") as f:
 87 |         output_data = f.read()
 88 |         for test_string in test_vector.must_include:
 89 |             assert test_string in output_data
 90 |         for test_string in test_vector.must_not_include:
 91 |             assert test_string not in output_data
 92 | 
 93 |     os.remove(output_file)
 94 |     assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
 95 | 
 96 | 
 97 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
 98 | def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
 99 |     """Test that the CLI readds from stdin correctly."""
100 | 
101 |     test_input = b""
102 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
103 |         test_input = stream.read()
104 | 
105 |     # No filename argument: the CLI must detect the format from stdin alone
106 |     result = subprocess.run(
107 |         [
108 |             "python",
109 |             "-m",
110 |             "markitdown",
111 |         ],
112 |         input=test_input,
113 |         capture_output=True,
114 |         text=False,
115 |     )
116 | 
117 |     stdout = result.stdout.decode(locale.getpreferredencoding())
118 |     assert (
119 |         result.returncode == 0
120 |     ), f"CLI exited with error: {result.stderr.decode('utf-8')}"
121 |     for test_string in test_vector.must_include:
122 |         assert test_string in stdout
123 |     for test_string in test_vector.must_not_include:
124 |         assert test_string not in stdout
125 | 
126 | 
127 | @pytest.mark.skipif(
128 |     skip_remote,
129 |     reason="do not run tests that query external urls",
130 | )
131 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
132 | def test_convert_url(shared_tmp_dir, test_vector):
133 |     """Test the conversion of a stream with no stream info."""
134 |     # Note: shared_tmp_dir is not used here, but is needed to match the signature
135 | 
136 |     time.sleep(1)  # Ensure we don't hit rate limits
137 |     result = subprocess.run(
138 |         ["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
139 |         capture_output=True,
140 |         text=False,
141 |     )
142 | 
143 |     stdout = result.stdout.decode(locale.getpreferredencoding())
144 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr.decode('utf-8')}"
145 |     for test_string in test_vector.must_include:
146 |         assert test_string in stdout
147 |     for test_string in test_vector.must_not_include:
148 |         assert test_string not in stdout
149 | 
150 | 
151 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
152 | def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
153 |     """Test CLI functionality when keep_data_uris is enabled"""
154 | 
155 |     output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
156 |     result = subprocess.run(
157 |         [
158 |             "python",
159 |             "-m",
160 |             "markitdown",
161 |             "--keep-data-uris",
162 |             "-o",
163 |             output_file,
164 |             os.path.join(TEST_FILES_DIR, test_vector.filename),
165 |         ],
166 |         capture_output=True,
167 |         text=True,
168 |     )
169 | 
170 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
171 |     assert os.path.exists(output_file), f"Output file not created: {output_file}"
172 | 
173 |     with open(output_file, "r") as f:
174 |         output_data = f.read()
175 |         for test_string in test_vector.must_include:
176 |             assert test_string in output_data
177 |         for test_string in test_vector.must_not_include:
178 |             assert test_string not in output_data
179 | 
180 |     os.remove(output_file)
181 |     assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     import tempfile
186 | 
187 |     """Runs this file's tests from the command line."""
188 | 
189 |     with tempfile.TemporaryDirectory() as tmp_dir:
190 |         # General tests
191 |         for test_function in [
192 |             test_output_to_stdout,
193 |             test_output_to_file,
194 |             test_input_from_stdin_without_hints,
195 |             test_convert_url,
196 |         ]:
197 |             for test_vector in CLI_TEST_VECTORS:
198 |                 print(
199 |                     f"Running {test_function.__name__} on {test_vector.filename}...",
200 |                     end="",
201 |                 )
202 |                 test_function(tmp_dir, test_vector)
203 |                 print("OK")
204 | 
205 |         # Data URI tests
206 |         for test_function in [
207 |             test_output_to_file_with_data_uris,
208 |         ]:
209 |             for test_vector in DATA_URI_TEST_VECTORS:
210 |                 print(
211 |                     f"Running {test_function.__name__} on {test_vector.filename}...",
212 |                     end="",
213 |                 )
214 |                 test_function(tmp_dir, test_vector)
215 |                 print("OK")
216 | 
217 |     print("All tests passed!")
218 | 
```
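
A minimal sketch of driving the same CLI surface these vectors exercise, using `subprocess` as the tests above do; `sample.docx` is a placeholder path, not one of the repository's test files:

```python
import subprocess

# Convert a document by invoking the CLI module and capture the Markdown.
result = subprocess.run(
    ["python", "-m", "markitdown", "sample.docx"],
    capture_output=True,
    text=True,
)
if result.returncode == 0:
    print(result.stdout)  # the converted Markdown
else:
    print(result.stderr)
```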

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_module_vectors.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3 -m pytest
  2 | import os
  3 | import time
  4 | import pytest
  5 | import base64
  6 | 
  7 | from pathlib import Path
  8 | 
  9 | if __name__ == "__main__":
 10 |     from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
 11 | else:
 12 |     from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
 13 | 
 14 | from markitdown import (
 15 |     MarkItDown,
 16 |     StreamInfo,
 17 | )
 18 | 
 19 | skip_remote = bool(
 20 |     os.environ.get("GITHUB_ACTIONS")
 21 | )  # Don't run these tests in CI
 22 | 
 23 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 24 | TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
 25 | 
 26 | 
 27 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
 28 | def test_guess_stream_info(test_vector):
 29 |     """Test the ability to guess stream info."""
 30 |     markitdown = MarkItDown()
 31 | 
 32 |     local_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
 33 |     expected_extension = os.path.splitext(test_vector.filename)[1]
 34 | 
 35 |     with open(local_path, "rb") as stream:
 36 |         guesses = markitdown._get_stream_info_guesses(
 37 |             stream,
 38 |             base_guess=StreamInfo(
 39 |                 filename=os.path.basename(test_vector.filename),
 40 |                 local_path=local_path,
 41 |                 extension=expected_extension,
 42 |             ),
 43 |         )
 44 | 
 45 |         # For some limited exceptions, we can't guarantee the exact
 46 |         # mimetype or extension, so we'll special-case them here.
 47 |         if test_vector.filename in [
 48 |             "test_outlook_msg.msg",
 49 |         ]:
 50 |             return
 51 | 
 52 |         assert guesses[0].mimetype == test_vector.mimetype
 53 |         assert guesses[0].extension == expected_extension
 54 |         assert guesses[0].charset == test_vector.charset
 55 | 
 56 | 
 57 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
 58 | def test_convert_local(test_vector):
 59 |     """Test the conversion of a local file."""
 60 |     markitdown = MarkItDown()
 61 | 
 62 |     result = markitdown.convert(
 63 |         os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url
 64 |     )
 65 |     for string in test_vector.must_include:
 66 |         assert string in result.markdown
 67 |     for string in test_vector.must_not_include:
 68 |         assert string not in result.markdown
 69 | 
 70 | 
 71 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
 72 | def test_convert_stream_with_hints(test_vector):
 73 |     """Test the conversion of a stream with full stream info."""
 74 |     markitdown = MarkItDown()
 75 | 
 76 |     stream_info = StreamInfo(
 77 |         extension=os.path.splitext(test_vector.filename)[1],
 78 |         mimetype=test_vector.mimetype,
 79 |         charset=test_vector.charset,
 80 |     )
 81 | 
 82 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
 83 |         result = markitdown.convert(
 84 |             stream, stream_info=stream_info, url=test_vector.url
 85 |         )
 86 |         for string in test_vector.must_include:
 87 |             assert string in result.markdown
 88 |         for string in test_vector.must_not_include:
 89 |             assert string not in result.markdown
 90 | 
 91 | 
 92 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
 93 | def test_convert_stream_without_hints(test_vector):
 94 |     """Test the conversion of a stream with no stream info."""
 95 |     markitdown = MarkItDown()
 96 | 
 97 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
 98 |         result = markitdown.convert(stream, url=test_vector.url)
 99 |         for string in test_vector.must_include:
100 |             assert string in result.markdown
101 |         for string in test_vector.must_not_include:
102 |             assert string not in result.markdown
103 | 
104 | 
105 | @pytest.mark.skipif(
106 |     skip_remote,
107 |     reason="do not run tests that query external urls",
108 | )
109 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
110 | def test_convert_http_uri(test_vector):
111 |     """Test the conversion of an HTTP:// or HTTPS:// URI."""
112 |     markitdown = MarkItDown()
113 | 
114 |     time.sleep(1)  # Ensure we don't hit rate limits
115 | 
116 |     result = markitdown.convert(
117 |         TEST_FILES_URL + "/" + test_vector.filename,
118 |         url=test_vector.url,  # Mock where this file would be found
119 |     )
120 |     for string in test_vector.must_include:
121 |         assert string in result.markdown
122 |     for string in test_vector.must_not_include:
123 |         assert string not in result.markdown
124 | 
125 | 
126 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
127 | def test_convert_file_uri(test_vector):
128 |     """Test the conversion of a file:// URI."""
129 |     markitdown = MarkItDown()
130 | 
131 |     result = markitdown.convert(
132 |         Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(),
133 |         url=test_vector.url,
134 |     )
135 |     for string in test_vector.must_include:
136 |         assert string in result.markdown
137 |     for string in test_vector.must_not_include:
138 |         assert string not in result.markdown
139 | 
140 | 
141 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
142 | def test_convert_data_uri(test_vector):
143 |     """Test the conversion of a data URI."""
144 |     markitdown = MarkItDown()
145 | 
146 |     data = ""
147 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
148 |         data = base64.b64encode(stream.read()).decode("utf-8")
149 |     mimetype = test_vector.mimetype
150 |     data_uri = f"data:{mimetype};base64,{data}"
151 | 
152 |     result = markitdown.convert(
153 |         data_uri,
154 |         url=test_vector.url,
155 |     )
156 |     for string in test_vector.must_include:
157 |         assert string in result.markdown
158 |     for string in test_vector.must_not_include:
159 |         assert string not in result.markdown
160 | 
161 | 
162 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
163 | def test_convert_keep_data_uris(test_vector):
164 |     """Test API functionality when keep_data_uris is enabled"""
165 |     markitdown = MarkItDown()
166 | 
167 |     # Test local file conversion
168 |     result = markitdown.convert(
169 |         os.path.join(TEST_FILES_DIR, test_vector.filename),
170 |         keep_data_uris=True,
171 |         url=test_vector.url,
172 |     )
173 | 
174 |     for string in test_vector.must_include:
175 |         assert string in result.markdown
176 |     for string in test_vector.must_not_include:
177 |         assert string not in result.markdown
178 | 
179 | 
180 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
181 | def test_convert_stream_keep_data_uris(test_vector):
182 |     """Test the conversion of a stream with no stream info."""
183 |     markitdown = MarkItDown()
184 | 
185 |     stream_info = StreamInfo(
186 |         extension=os.path.splitext(test_vector.filename)[1],
187 |         mimetype=test_vector.mimetype,
188 |         charset=test_vector.charset,
189 |     )
190 | 
191 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
192 |         result = markitdown.convert(
193 |             stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
194 |         )
195 | 
196 |         for string in test_vector.must_include:
197 |             assert string in result.markdown
198 |         for string in test_vector.must_not_include:
199 |             assert string not in result.markdown
200 | 
201 | 
202 | if __name__ == "__main__":
203 |     """Runs this file's tests from the command line."""
204 | 
205 |     # General tests
206 |     for test_function in [
207 |         test_guess_stream_info,
208 |         test_convert_local,
209 |         test_convert_stream_with_hints,
210 |         test_convert_stream_without_hints,
211 |         test_convert_http_uri,
212 |         test_convert_file_uri,
213 |         test_convert_data_uri,
214 |     ]:
215 |         for test_vector in GENERAL_TEST_VECTORS:
216 |             print(
217 |                 f"Running {test_function.__name__} on {test_vector.filename}...", end=""
218 |             )
219 |             test_function(test_vector)
220 |             print("OK")
221 | 
222 |     # Data URI tests
223 |     for test_function in [
224 |         test_convert_keep_data_uris,
225 |         test_convert_stream_keep_data_uris,
226 |     ]:
227 |         for test_vector in DATA_URI_TEST_VECTORS:
228 |             print(
229 |                 f"Running {test_function.__name__} on {test_vector.filename}...", end=""
230 |             )
231 |             test_function(test_vector)
232 |             print("OK")
233 | 
234 |     print("All tests passed!")
235 | 
```
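
The vectors above reduce to two public entry points on `MarkItDown`. A minimal sketch of both, mirroring `test_convert_local` and `test_convert_stream_with_hints` (the `test.docx` path and MIME type are illustrative):

```python
from markitdown import MarkItDown, StreamInfo

md = MarkItDown()

# Path-based conversion; result.markdown holds the converted text.
result = md.convert("test.docx")
print(result.markdown)

# Stream-based conversion with hints, as the stream tests above do.
with open("test.docx", "rb") as stream:
    hints = StreamInfo(
        extension=".docx",
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
    result = md.convert(stream, stream_info=hints)
    print(result.markdown)
```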

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_youtube_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import time
  3 | import re
  4 | import bs4
  5 | from typing import Any, BinaryIO, Dict, List, Union
  6 | from urllib.parse import parse_qs, urlparse, unquote
  7 | 
  8 | from .._base_converter import DocumentConverter, DocumentConverterResult
  9 | from .._stream_info import StreamInfo
 10 | 
 11 | # Optional YouTube transcription support
 12 | try:
 13 |     # Suppress some warnings on library import
 14 |     import warnings
 15 | 
 16 |     with warnings.catch_warnings():
 17 |         warnings.filterwarnings("ignore", category=SyntaxWarning)
 18 |         # Patch submitted upstream to fix the SyntaxWarning
 19 |         from youtube_transcript_api import YouTubeTranscriptApi
 20 | 
 21 |     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 22 | except ModuleNotFoundError:
 23 |     IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
 24 | 
 25 | 
 26 | ACCEPTED_MIME_TYPE_PREFIXES = [
 27 |     "text/html",
 28 |     "application/xhtml",
 29 | ]
 30 | 
 31 | ACCEPTED_FILE_EXTENSIONS = [
 32 |     ".html",
 33 |     ".htm",
 34 | ]
 35 | 
 36 | 
 37 | class YouTubeConverter(DocumentConverter):
 38 |     """Handle YouTube specially, focusing on the video title, description, and transcript."""
 39 | 
 40 |     def accepts(
 41 |         self,
 42 |         file_stream: BinaryIO,
 43 |         stream_info: StreamInfo,
 44 |         **kwargs: Any,  # Options to pass to the converter
 45 |     ) -> bool:
 46 |         """
 47 |         Make sure we're dealing with HTML content *from* YouTube.
 48 |         """
 49 |         url = stream_info.url or ""
 50 |         mimetype = (stream_info.mimetype or "").lower()
 51 |         extension = (stream_info.extension or "").lower()
 52 | 
 53 |         url = unquote(url)
 54 |         url = url.replace(r"\?", "?").replace(r"\=", "=")
 55 | 
 56 |         if not url.startswith("https://www.youtube.com/watch?"):
 57 |             # Not a YouTube URL
 58 |             return False
 59 | 
 60 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 61 |             return True
 62 | 
 63 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 64 |             if mimetype.startswith(prefix):
 65 |                 return True
 66 | 
 67 |         # Not HTML content
 68 |         return False
 69 | 
 70 |     def convert(
 71 |         self,
 72 |         file_stream: BinaryIO,
 73 |         stream_info: StreamInfo,
 74 |         **kwargs: Any,  # Options to pass to the converter
 75 |     ) -> DocumentConverterResult:
 76 |         # Parse the stream
 77 |         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
 78 |         soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
 79 | 
 80 |         # Read the meta tags
 81 |         metadata: Dict[str, str] = {}
 82 | 
 83 |         if soup.title and soup.title.string:
 84 |             metadata["title"] = soup.title.string
 85 | 
 86 |         for meta in soup(["meta"]):
 87 |             if not isinstance(meta, bs4.Tag):
 88 |                 continue
 89 | 
 90 |             for a in meta.attrs:
 91 |                 if a in ["itemprop", "property", "name"]:
 92 |                     key = str(meta.get(a, ""))
 93 |                     content = str(meta.get("content", ""))
 94 |                     if key and content:  # Only add non-empty content
 95 |                         metadata[key] = content
 96 |                     break
 97 | 
 98 |         # Try reading the description
 99 |         try:
100 |             for script in soup(["script"]):
101 |                 if not isinstance(script, bs4.Tag):
102 |                     continue
103 |                 if not script.string:  # Skip empty scripts
104 |                     continue
105 |                 content = script.string
106 |                 if "ytInitialData" in content:
107 |                     match = re.search(r"var ytInitialData = ({.*?});", content)
108 |                     if match:
109 |                         data = json.loads(match.group(1))
110 |                         attrdesc = self._findKey(data, "attributedDescriptionBodyText")
111 |                         if attrdesc and isinstance(attrdesc, dict):
112 |                             metadata["description"] = str(attrdesc.get("content", ""))
113 |                     break
114 |         except Exception as e:
115 |             print(f"Error extracting description: {e}")
116 |             pass
117 | 
118 |         # Start preparing the page
119 |         webpage_text = "# YouTube\n"
120 | 
121 |         title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
122 |         assert isinstance(title, str)
123 | 
124 |         if title:
125 |             webpage_text += f"\n## {title}\n"
126 | 
127 |         stats = ""
128 |         views = self._get(metadata, ["interactionCount"])  # type: ignore
129 |         if views:
130 |             stats += f"- **Views:** {views}\n"
131 | 
132 |         keywords = self._get(metadata, ["keywords"])  # type: ignore
133 |         if keywords:
134 |             stats += f"- **Keywords:** {keywords}\n"
135 | 
136 |         runtime = self._get(metadata, ["duration"])  # type: ignore
137 |         if runtime:
138 |             stats += f"- **Runtime:** {runtime}\n"
139 | 
140 |         if len(stats) > 0:
141 |             webpage_text += f"\n### Video Metadata\n{stats}\n"
142 | 
143 |         description = self._get(metadata, ["description", "og:description"])  # type: ignore
144 |         if description:
145 |             webpage_text += f"\n### Description\n{description}\n"
146 | 
147 |         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
148 |             ytt_api = YouTubeTranscriptApi()
149 |             transcript_text = ""
150 |             parsed_url = urlparse(stream_info.url)  # type: ignore
151 |             params = parse_qs(parsed_url.query)  # type: ignore
152 |             if "v" in params and params["v"][0]:
153 |                 video_id = str(params["v"][0])
154 |                 transcript_list = ytt_api.list(video_id)
155 |                 languages = ["en"]
156 |                 for transcript in transcript_list:
157 |                     languages.append(transcript.language_code)
158 |                     break
159 |                 try:
160 |                     youtube_transcript_languages = kwargs.get(
161 |                         "youtube_transcript_languages", languages
162 |                     )
163 |                     # Retry the transcript fetching operation
164 |                     transcript = self._retry_operation(
165 |                         lambda: ytt_api.fetch(
166 |                             video_id, languages=youtube_transcript_languages
167 |                         ),
168 |                         retries=3,  # Retry 3 times
169 |                         delay=2,  # 2 seconds delay between retries
170 |                     )
171 | 
172 |                     if transcript:
173 |                         transcript_text = " ".join(
174 |                             [part.text for part in transcript]
175 |                         )  # type: ignore
176 |                 except Exception as e:
177 |                     # No transcript available
178 |                     if len(languages) == 1:
179 |                         print(f"Error fetching transcript: {e}")
180 |                     else:
181 |                         # Translate transcript into first kwarg
182 |                         transcript = (
183 |                             transcript_list.find_transcript(languages)
184 |                             .translate(youtube_transcript_languages[0])
185 |                             .fetch()
186 |                         )
187 |                         transcript_text = " ".join([part.text for part in transcript])
188 |             if transcript_text:
189 |                 webpage_text += f"\n### Transcript\n{transcript_text}\n"
190 | 
191 |         title = title if title else (soup.title.string if soup.title else "")
192 |         assert isinstance(title, str)
193 | 
194 |         return DocumentConverterResult(
195 |             markdown=webpage_text,
196 |             title=title,
197 |         )
198 | 
199 |     def _get(
200 |         self,
201 |         metadata: Dict[str, str],
202 |         keys: List[str],
203 |         default: Union[str, None] = None,
204 |     ) -> Union[str, None]:
205 |         """Get first non-empty value from metadata matching given keys."""
206 |         for k in keys:
207 |             if k in metadata:
208 |                 return metadata[k]
209 |         return default
210 | 
211 |     def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
212 |         """Recursively search for a key in nested dictionary/list structures."""
213 |         if isinstance(json, list):
214 |             for elm in json:
215 |                 ret = self._findKey(elm, key)
216 |                 if ret is not None:
217 |                     return ret
218 |         elif isinstance(json, dict):
219 |             for k, v in json.items():
220 |                 if k == key:
221 |                     return json[k]
222 |                 if result := self._findKey(v, key):
223 |                     return result
224 |         return None
225 | 
226 |     def _retry_operation(self, operation, retries=3, delay=2):
227 |         """Retries the operation if it fails."""
228 |         attempt = 0
229 |         while attempt < retries:
230 |             try:
231 |                 return operation()  # Attempt the operation
232 |             except Exception as e:
233 |                 print(f"Attempt {attempt + 1} failed: {e}")
234 |                 if attempt < retries - 1:
235 |                     time.sleep(delay)  # Wait before retrying
236 |                 attempt += 1
237 |         # If all attempts fail, raise the last exception
238 |         raise Exception(f"Operation failed after {retries} attempts.")
239 | 
```
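
For context, a minimal sketch of how this converter is reached through the top-level API; `VIDEO_ID` is a placeholder, and `youtube_transcript_languages` is the kwarg read in `convert()` above:

```python
from markitdown import MarkItDown

md = MarkItDown()

# accepts() only matches URLs starting with https://www.youtube.com/watch?
result = md.convert(
    "https://www.youtube.com/watch?v=VIDEO_ID",
    youtube_transcript_languages=["en"],
)
print(result.title)
print(result.markdown)
```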

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import sys
  2 | import re
  3 | import os
  4 | from typing import BinaryIO, Any, List
  5 | from enum import Enum
  6 | 
  7 | from .._base_converter import DocumentConverter, DocumentConverterResult
  8 | from .._stream_info import StreamInfo
  9 | from .._exceptions import MissingDependencyException
 10 | 
 11 | # Try loading optional (but in this case, required) dependencies
 12 | # Save reporting of any exceptions for later
 13 | _dependency_exc_info = None
 14 | try:
 15 |     from azure.ai.documentintelligence import DocumentIntelligenceClient
 16 |     from azure.ai.documentintelligence.models import (
 17 |         AnalyzeDocumentRequest,
 18 |         AnalyzeResult,
 19 |         DocumentAnalysisFeature,
 20 |     )
 21 |     from azure.core.credentials import AzureKeyCredential, TokenCredential
 22 |     from azure.identity import DefaultAzureCredential
 23 | except ImportError:
 24 |     # Preserve the error and stack trace for later
 25 |     _dependency_exc_info = sys.exc_info()
 26 | 
 27 |     # Define these types for type hinting when the package is not available
 28 |     class AzureKeyCredential:
 29 |         pass
 30 | 
 31 |     class TokenCredential:
 32 |         pass
 33 | 
 34 |     class DocumentIntelligenceClient:
 35 |         pass
 36 | 
 37 |     class AnalyzeDocumentRequest:
 38 |         pass
 39 | 
 40 |     class AnalyzeResult:
 41 |         pass
 42 | 
 43 |     class DocumentAnalysisFeature:
 44 |         pass
 45 | 
 46 |     class DefaultAzureCredential:
 47 |         pass
 48 | 
 49 | 
 50 | # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 51 | # This constant is a temporary fix until the bug is resolved.
 52 | CONTENT_FORMAT = "markdown"
 53 | 
 54 | 
 55 | class DocumentIntelligenceFileType(str, Enum):
 56 |     """Enum of file types supported by the Document Intelligence Converter."""
 57 | 
 58 |     # No OCR
 59 |     DOCX = "docx"
 60 |     PPTX = "pptx"
 61 |     XLSX = "xlsx"
 62 |     HTML = "html"
 63 |     # OCR
 64 |     PDF = "pdf"
 65 |     JPEG = "jpeg"
 66 |     PNG = "png"
 67 |     BMP = "bmp"
 68 |     TIFF = "tiff"
 69 | 
 70 | 
 71 | def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
 72 |     """Get the MIME type prefixes for the given file types."""
 73 |     prefixes: List[str] = []
 74 |     for type_ in types:
 75 |         if type_ == DocumentIntelligenceFileType.DOCX:
 76 |             prefixes.append(
 77 |                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 78 |             )
 79 |         elif type_ == DocumentIntelligenceFileType.PPTX:
 80 |             prefixes.append(
 81 |                 "application/vnd.openxmlformats-officedocument.presentationml"
 82 |             )
 83 |         elif type_ == DocumentIntelligenceFileType.XLSX:
 84 |             prefixes.append(
 85 |                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 86 |             )
 87 |         elif type_ == DocumentIntelligenceFileType.HTML:
 88 |             prefixes.append("text/html")
 89 |             prefixes.append("application/xhtml+xml")
 90 |         elif type_ == DocumentIntelligenceFileType.PDF:
 91 |             prefixes.append("application/pdf")
 92 |             prefixes.append("application/x-pdf")
 93 |         elif type_ == DocumentIntelligenceFileType.JPEG:
 94 |             prefixes.append("image/jpeg")
 95 |         elif type_ == DocumentIntelligenceFileType.PNG:
 96 |             prefixes.append("image/png")
 97 |         elif type_ == DocumentIntelligenceFileType.BMP:
 98 |             prefixes.append("image/bmp")
 99 |         elif type_ == DocumentIntelligenceFileType.TIFF:
100 |             prefixes.append("image/tiff")
101 |     return prefixes
102 | 
103 | 
104 | def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
105 |     """Get the file extensions for the given file types."""
106 |     extensions: List[str] = []
107 |     for type_ in types:
108 |         if type_ == DocumentIntelligenceFileType.DOCX:
109 |             extensions.append(".docx")
110 |         elif type_ == DocumentIntelligenceFileType.PPTX:
111 |             extensions.append(".pptx")
112 |         elif type_ == DocumentIntelligenceFileType.XLSX:
113 |             extensions.append(".xlsx")
114 |         elif type_ == DocumentIntelligenceFileType.PDF:
115 |             extensions.append(".pdf")
116 |         elif type_ == DocumentIntelligenceFileType.JPEG:
117 |             extensions.append(".jpg")
118 |             extensions.append(".jpeg")
119 |         elif type_ == DocumentIntelligenceFileType.PNG:
120 |             extensions.append(".png")
121 |         elif type_ == DocumentIntelligenceFileType.BMP:
122 |             extensions.append(".bmp")
123 |         elif type_ == DocumentIntelligenceFileType.TIFF:
124 |             extensions.append(".tiff")
125 |         elif type_ == DocumentIntelligenceFileType.HTML:
126 |             extensions.append(".html")
127 |     return extensions
128 | 
129 | 
130 | class DocumentIntelligenceConverter(DocumentConverter):
131 |     """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
132 | 
133 |     def __init__(
134 |         self,
135 |         *,
136 |         endpoint: str,
137 |         api_version: str = "2024-07-31-preview",
138 |         credential: AzureKeyCredential | TokenCredential | None = None,
139 |         file_types: List[DocumentIntelligenceFileType] = [
140 |             DocumentIntelligenceFileType.DOCX,
141 |             DocumentIntelligenceFileType.PPTX,
142 |             DocumentIntelligenceFileType.XLSX,
143 |             DocumentIntelligenceFileType.PDF,
144 |             DocumentIntelligenceFileType.JPEG,
145 |             DocumentIntelligenceFileType.PNG,
146 |             DocumentIntelligenceFileType.BMP,
147 |             DocumentIntelligenceFileType.TIFF,
148 |         ],
149 |     ):
150 |         """
151 |         Initialize the DocumentIntelligenceConverter.
152 | 
153 |         Args:
154 |             endpoint (str): The endpoint for the Document Intelligence service.
155 |             api_version (str): The API version to use. Defaults to "2024-07-31-preview".
156 |             credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
157 |             file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
158 |         """
159 | 
160 |         super().__init__()
161 |         self._file_types = file_types
162 | 
163 |         # Raise an error if the dependencies are not available.
164 |         # This is different than other converters since this one isn't even instantiated
165 |         # unless explicitly requested.
166 |         if _dependency_exc_info is not None:
167 |             raise MissingDependencyException(
168 |                 "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
169 |             ) from _dependency_exc_info[
170 |                 1
171 |             ].with_traceback(  # type: ignore[union-attr]
172 |                 _dependency_exc_info[2]
173 |             )
174 | 
175 |         if credential is None:
176 |             if os.environ.get("AZURE_API_KEY") is None:
177 |                 credential = DefaultAzureCredential()
178 |             else:
179 |                 credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
180 | 
181 |         self.endpoint = endpoint
182 |         self.api_version = api_version
183 |         self.doc_intel_client = DocumentIntelligenceClient(
184 |             endpoint=self.endpoint,
185 |             api_version=self.api_version,
186 |             credential=credential,
187 |         )
188 | 
189 |     def accepts(
190 |         self,
191 |         file_stream: BinaryIO,
192 |         stream_info: StreamInfo,
193 |         **kwargs: Any,  # Options to pass to the converter
194 |     ) -> bool:
195 |         mimetype = (stream_info.mimetype or "").lower()
196 |         extension = (stream_info.extension or "").lower()
197 | 
198 |         if extension in _get_file_extensions(self._file_types):
199 |             return True
200 | 
201 |         for prefix in _get_mime_type_prefixes(self._file_types):
202 |             if mimetype.startswith(prefix):
203 |                 return True
204 | 
205 |         return False
206 | 
207 |     def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
208 |         """
209 |         Helper needed to determine which analysis features to use.
210 |         Certain document analysis features are not available for
211 |         office filetypes (.xlsx, .pptx, .html, .docx)
212 |         """
213 |         mimetype = (stream_info.mimetype or "").lower()
214 |         extension = (stream_info.extension or "").lower()
215 | 
216 |         # Types that don't support OCR
217 |         no_ocr_types = [
218 |             DocumentIntelligenceFileType.DOCX,
219 |             DocumentIntelligenceFileType.PPTX,
220 |             DocumentIntelligenceFileType.XLSX,
221 |             DocumentIntelligenceFileType.HTML,
222 |         ]
223 | 
224 |         if extension in _get_file_extensions(no_ocr_types):
225 |             return []
226 | 
227 |         for prefix in _get_mime_type_prefixes(no_ocr_types):
228 |             if mimetype.startswith(prefix):
229 |                 return []
230 | 
231 |         return [
232 |             DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
233 |             DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
234 |             DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
235 |         ]
236 | 
237 |     def convert(
238 |         self,
239 |         file_stream: BinaryIO,
240 |         stream_info: StreamInfo,
241 |         **kwargs: Any,  # Options to pass to the converter
242 |     ) -> DocumentConverterResult:
243 |         # Extract the text using Azure Document Intelligence
244 |         poller = self.doc_intel_client.begin_analyze_document(
245 |             model_id="prebuilt-layout",
246 |             body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
247 |             features=self._analysis_features(stream_info),
248 |             output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
249 |         )
250 |         result: AnalyzeResult = poller.result()
251 | 
252 |         # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
253 |         markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
254 |         return DocumentConverterResult(markdown=markdown_text)
255 | 
```
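
Unlike most converters, this one is only instantiated when explicitly requested. A minimal sketch of enabling it via the top-level API, following the package README (the endpoint value is a placeholder; authentication falls back to `AZURE_API_KEY` or `DefaultAzureCredential`, as in `__init__` above):

```python
from markitdown import MarkItDown

# Requires the optional [az-doc-intel] extra:
#   pip install markitdown[az-doc-intel]
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.markdown)
```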

--------------------------------------------------------------------------------
/packages/markitdown/tests/_test_vectors.py:
--------------------------------------------------------------------------------

```python
  1 | import dataclasses
  2 | from typing import List
  3 | 
  4 | 
  5 | @dataclasses.dataclass(frozen=True, kw_only=True)
  6 | class FileTestVector(object):
  7 |     filename: str
  8 |     mimetype: str | None
  9 |     charset: str | None
 10 |     url: str | None
 11 |     must_include: List[str]
 12 |     must_not_include: List[str]
 13 | 
 14 | 
 15 | GENERAL_TEST_VECTORS = [
 16 |     FileTestVector(
 17 |         filename="test.docx",
 18 |         mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 19 |         charset=None,
 20 |         url=None,
 21 |         must_include=[
 22 |             "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
 23 |             "49e168b7-d2ae-407f-a055-2167576f39a1",
 24 |             "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
 25 |             "# Abstract",
 26 |             "# Introduction",
 27 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 28 |             "data:image/png;base64...",
 29 |         ],
 30 |         must_not_include=[
 31 |             "data:image/png;base64,iVBORw0KGgoAAAANSU",
 32 |         ],
 33 |     ),
 34 |     FileTestVector(
 35 |         filename="test.xlsx",
 36 |         mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 37 |         charset=None,
 38 |         url=None,
 39 |         must_include=[
 40 |             "## 09060124-b5e7-4717-9d07-3c046eb",
 41 |             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
 42 |             "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 43 |         ],
 44 |         must_not_include=[],
 45 |     ),
 46 |     FileTestVector(
 47 |         filename="test.xls",
 48 |         mimetype="application/vnd.ms-excel",
 49 |         charset=None,
 50 |         url=None,
 51 |         must_include=[
 52 |             "## 09060124-b5e7-4717-9d07-3c046eb",
 53 |             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
 54 |             "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 55 |         ],
 56 |         must_not_include=[],
 57 |     ),
 58 |     FileTestVector(
 59 |         filename="test.pptx",
 60 |         mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
 61 |         charset=None,
 62 |         url=None,
 63 |         must_include=[
 64 |             "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
 65 |             "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
 66 |             "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
 67 |             "1b92870d-e3b5-4e65-8153-919f4ff45592",
 68 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 69 |             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
 70 |             "2003",  # chart value
 71 |             "![This phrase of the caption is Human-written.](Picture4.jpg)",
 72 |         ],
 73 |         must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
 74 |     ),
 75 |     FileTestVector(
 76 |         filename="test_outlook_msg.msg",
 77 |         mimetype="application/vnd.ms-outlook",
 78 |         charset=None,
 79 |         url=None,
 80 |         must_include=[
 81 |             "# Email Message",
 82 |             "**From:** [email protected]",
 83 |             "**To:** [email protected]",
 84 |             "**Subject:** Test Email Message",
 85 |             "## Content",
 86 |             "This is the body of the test email message",
 87 |         ],
 88 |         must_not_include=[],
 89 |     ),
 90 |     FileTestVector(
 91 |         filename="test.pdf",
 92 |         mimetype="application/pdf",
 93 |         charset=None,
 94 |         url=None,
 95 |         must_include=[
 96 |             "While there is contemporaneous exploration of multi-agent approaches"
 97 |         ],
 98 |         must_not_include=[],
 99 |     ),
100 |     FileTestVector(
101 |         filename="test_blog.html",
102 |         mimetype="text/html",
103 |         charset="utf-8",
104 |         url="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math",
105 |         must_include=[
106 |             "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
107 |             "an example where high cost can easily prevent a generic complex",
108 |         ],
109 |         must_not_include=[],
110 |     ),
111 |     FileTestVector(
112 |         filename="test_wikipedia.html",
113 |         mimetype="text/html",
114 |         charset="utf-8",
115 |         url="https://en.wikipedia.org/wiki/Microsoft",
116 |         must_include=[
117 |             "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
118 |             'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
119 |         ],
120 |         must_not_include=[
121 |             "You are encouraged to create an account and log in",
122 |             "154 languages",
123 |             "move to sidebar",
124 |         ],
125 |     ),
126 |     FileTestVector(
127 |         filename="test_serp.html",
128 |         mimetype="text/html",
129 |         charset="utf-8",
130 |         url="https://www.bing.com/search?q=microsoft+wikipedia",
131 |         must_include=[
132 |             "](https://en.wikipedia.org/wiki/Microsoft",
133 |             "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
134 |             "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox",
135 |         ],
136 |         must_not_include=[
137 |             "https://www.bing.com/ck/a?!&&p=",
138 |             "data:image/svg+xml,%3Csvg%20width%3D",
139 |         ],
140 |     ),
141 |     FileTestVector(
142 |         filename="test_mskanji.csv",
143 |         mimetype="text/csv",
144 |         charset="cp932",
145 |         url=None,
146 |         must_include=[
147 |             "| 名前 | 年齢 | 住所 |",
148 |             "| --- | --- | --- |",
149 |             "| 佐藤太郎 | 30 | 東京 |",
150 |             "| 三木英子 | 25 | 大阪 |",
151 |             "| 髙橋淳 | 35 | 名古屋 |",
152 |         ],
153 |         must_not_include=[],
154 |     ),
155 |     FileTestVector(
156 |         filename="test.json",
157 |         mimetype="application/json",
158 |         charset="ascii",
159 |         url=None,
160 |         must_include=[
161 |             "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
162 |             "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
163 |         ],
164 |         must_not_include=[],
165 |     ),
166 |     FileTestVector(
167 |         filename="test_rss.xml",
168 |         mimetype="text/xml",
169 |         charset="utf-8",
170 |         url=None,
171 |         must_include=[
172 |             "# The Official Microsoft Blog",
173 |             "## Ignite 2024: Why nearly 70% of the Fortune 500 now use Microsoft 365 Copilot",
174 |             "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
175 |         ],
176 |         must_not_include=["<rss", "<feed"],
177 |     ),
178 |     FileTestVector(
179 |         filename="test_notebook.ipynb",
180 |         mimetype="application/json",
181 |         charset="ascii",
182 |         url=None,
183 |         must_include=[
184 |             "# Test Notebook",
185 |             "```python",
186 |             'print("markitdown")',
187 |             "```",
188 |             "## Code Cell Below",
189 |         ],
190 |         must_not_include=[
191 |             "nbformat",
192 |             "nbformat_minor",
193 |         ],
194 |     ),
195 |     FileTestVector(
196 |         filename="test_files.zip",
197 |         mimetype="application/zip",
198 |         charset=None,
199 |         url=None,
200 |         must_include=[
201 |             "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
202 |             "49e168b7-d2ae-407f-a055-2167576f39a1",
203 |             "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
204 |             "# Abstract",
205 |             "# Introduction",
206 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
207 |             "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
208 |             "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
209 |             "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
210 |             "1b92870d-e3b5-4e65-8153-919f4ff45592",
211 |             "## 09060124-b5e7-4717-9d07-3c046eb",
212 |             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
213 |             "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
214 |             "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
215 |             'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
216 |         ],
217 |         must_not_include=[],
218 |     ),
219 |     FileTestVector(
220 |         filename="test.epub",
221 |         mimetype="application/epub+zip",
222 |         charset=None,
223 |         url=None,
224 |         must_include=[
225 |             "**Authors:** Test Author",
226 |             "A test EPUB document for MarkItDown testing",
227 |             "# Chapter 1: Test Content",
228 |             "This is a **test** paragraph with some formatting",
229 |             "* A bullet point",
230 |             "* Another point",
231 |             "# Chapter 2: More Content",
232 |             "*different* style",
233 |             "> This is a blockquote for testing",
234 |         ],
235 |         must_not_include=[],
236 |     ),
237 | ]
238 | 
239 | 
240 | DATA_URI_TEST_VECTORS = [
241 |     FileTestVector(
242 |         filename="test.docx",
243 |         mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
244 |         charset=None,
245 |         url=None,
246 |         must_include=[
247 |             "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
248 |             "49e168b7-d2ae-407f-a055-2167576f39a1",
249 |             "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
250 |             "# Abstract",
251 |             "# Introduction",
252 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
253 |             "data:image/png;base64,iVBORw0KGgoAAAANSU",
254 |         ],
255 |         must_not_include=[
256 |             "data:image/png;base64...",
257 |         ],
258 |     ),
259 |     FileTestVector(
260 |         filename="test.pptx",
261 |         mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
262 |         charset=None,
263 |         url=None,
264 |         must_include=[
265 |             "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
266 |             "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
267 |             "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
268 |             "1b92870d-e3b5-4e65-8153-919f4ff45592",
269 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
270 |             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
271 |             "2003",  # chart value
272 |             "![This phrase of the caption is Human-written.]",  # image caption
273 |             "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
274 |         ],
275 |         must_not_include=[
276 |             "![This phrase of the caption is Human-written.](Picture4.jpg)",
277 |         ],
278 |     ),
279 | ]
280 | 
```
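
Extending the suite means appending another vector to one of these lists. A sketch of a hypothetical entry; the fixture filename and expected substrings are illustrative, not files that exist under `tests/test_files`:

```python
# FileTestVector is kw_only, so every field must be named. must_include and
# must_not_include are substring checks applied to the converted Markdown.
NEW_VECTOR = FileTestVector(
    filename="test_example.csv",  # hypothetical fixture
    mimetype="text/csv",
    charset="utf-8",
    url=None,
    must_include=["| col1 | col2 |"],
    must_not_include=[],
)
```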

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_pptx_converter.py:
--------------------------------------------------------------------------------

```python
  1 | import sys
  2 | import base64
  3 | import os
  4 | import io
  5 | import re
  6 | import html
  7 | 
  8 | from typing import BinaryIO, Any
  9 | from operator import attrgetter
 10 | 
 11 | from ._html_converter import HtmlConverter
 12 | from ._llm_caption import llm_caption
 13 | from .._base_converter import DocumentConverter, DocumentConverterResult
 14 | from .._stream_info import StreamInfo
 15 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 16 | 
 17 | # Try loading optional (but in this case, required) dependencies
 18 | # Save reporting of any exceptions for later
 19 | _dependency_exc_info = None
 20 | try:
 21 |     import pptx
 22 | except ImportError:
 23 |     # Preserve the error and stack trace for later
 24 |     _dependency_exc_info = sys.exc_info()
 25 | 
 26 | 
 27 | ACCEPTED_MIME_TYPE_PREFIXES = [
 28 |     "application/vnd.openxmlformats-officedocument.presentationml",
 29 | ]
 30 | 
 31 | ACCEPTED_FILE_EXTENSIONS = [".pptx"]
 32 | 
 33 | 
 34 | class PptxConverter(DocumentConverter):
 35 |     """
 36 |     Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
 37 |     """
 38 | 
 39 |     def __init__(self):
 40 |         super().__init__()
 41 |         self._html_converter = HtmlConverter()
 42 | 
 43 |     def accepts(
 44 |         self,
 45 |         file_stream: BinaryIO,
 46 |         stream_info: StreamInfo,
 47 |         **kwargs: Any,  # Options to pass to the converter
 48 |     ) -> bool:
 49 |         mimetype = (stream_info.mimetype or "").lower()
 50 |         extension = (stream_info.extension or "").lower()
 51 | 
 52 |         if extension in ACCEPTED_FILE_EXTENSIONS:
 53 |             return True
 54 | 
 55 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
 56 |             if mimetype.startswith(prefix):
 57 |                 return True
 58 | 
 59 |         return False
 60 | 
 61 |     def convert(
 62 |         self,
 63 |         file_stream: BinaryIO,
 64 |         stream_info: StreamInfo,
 65 |         **kwargs: Any,  # Options to pass to the converter
 66 |     ) -> DocumentConverterResult:
 67 |         # Check the dependencies
 68 |         if _dependency_exc_info is not None:
 69 |             raise MissingDependencyException(
 70 |                 MISSING_DEPENDENCY_MESSAGE.format(
 71 |                     converter=type(self).__name__,
 72 |                     extension=".pptx",
 73 |                     feature="pptx",
 74 |                 )
 75 |             ) from _dependency_exc_info[
 76 |                 1
 77 |             ].with_traceback(  # type: ignore[union-attr]
 78 |                 _dependency_exc_info[2]
 79 |             )
 80 | 
 81 |         # Perform the conversion
 82 |         presentation = pptx.Presentation(file_stream)
 83 |         md_content = ""
 84 |         slide_num = 0
 85 |         for slide in presentation.slides:
 86 |             slide_num += 1
 87 | 
 88 |             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
 89 | 
 90 |             title = slide.shapes.title
 91 | 
 92 |             def get_shape_content(shape, **kwargs):
 93 |                 nonlocal md_content
 94 |                 # Pictures
 95 |                 if self._is_picture(shape):
 96 |                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
 97 | 
 98 |                     llm_description = ""
 99 |                     alt_text = ""
100 | 
101 |                     # Potentially generate a description using an LLM
102 |                     llm_client = kwargs.get("llm_client")
103 |                     llm_model = kwargs.get("llm_model")
104 |                     if llm_client is not None and llm_model is not None:
105 |                         # Prepare a file_stream and stream_info for the image data
106 |                         image_filename = shape.image.filename
107 |                         image_extension = None
108 |                         if image_filename:
109 |                             image_extension = os.path.splitext(image_filename)[1]
110 |                         image_stream_info = StreamInfo(
111 |                             mimetype=shape.image.content_type,
112 |                             extension=image_extension,
113 |                             filename=image_filename,
114 |                         )
115 | 
116 |                         image_stream = io.BytesIO(shape.image.blob)
117 | 
118 |                         # Caption the image
119 |                         try:
120 |                             llm_description = llm_caption(
121 |                                 image_stream,
122 |                                 image_stream_info,
123 |                                 client=llm_client,
124 |                                 model=llm_model,
125 |                                 prompt=kwargs.get("llm_prompt"),
126 |                             )
127 |                         except Exception:
128 |                             # Unable to generate a description
129 |                             pass
130 | 
131 |                     # Also grab any description embedded in the deck
132 |                     try:
133 |                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
134 |                     except Exception:
135 |                         # Unable to get alt text
136 |                         pass
137 | 
138 |                     # Prepare the alt, escaping any special characters
139 |                     alt_text = "\n".join(filter(None, [llm_description, alt_text])) or shape.name
140 |                     alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
141 |                     alt_text = re.sub(r"\s+", " ", alt_text).strip()
142 | 
143 |                     # If keep_data_uris is True, use base64 encoding for images
144 |                     if kwargs.get("keep_data_uris", False):
145 |                         blob = shape.image.blob
146 |                         content_type = shape.image.content_type or "image/png"
147 |                         b64_string = base64.b64encode(blob).decode("utf-8")
148 |                         md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
149 |                     else:
150 |                         # A placeholder name
151 |                         filename = re.sub(r"\W", "", shape.name) + ".jpg"
152 |                         md_content += "\n![" + alt_text + "](" + filename + ")\n"
153 | 
154 |                 # Tables
155 |                 if self._is_table(shape):
156 |                     md_content += self._convert_table_to_markdown(shape.table, **kwargs)
157 | 
158 |                 # Charts
159 |                 if shape.has_chart:
160 |                     md_content += self._convert_chart_to_markdown(shape.chart)
161 | 
162 |                 # Text areas
163 |                 elif shape.has_text_frame:
164 |                     if shape == title:
165 |                         md_content += "# " + shape.text.lstrip() + "\n"
166 |                     else:
167 |                         md_content += shape.text + "\n"
168 | 
169 |                 # Group Shapes
170 |                 if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
171 |                     sorted_shapes = sorted(
172 |                         shape.shapes,
173 |                         key=lambda x: (
174 |                             float("-inf") if not x.top else x.top,
175 |                             float("-inf") if not x.left else x.left,
176 |                         ),
177 |                     )
178 |                     for subshape in sorted_shapes:
179 |                         get_shape_content(subshape, **kwargs)
180 | 
181 |             sorted_shapes = sorted(
182 |                 slide.shapes,
183 |                 key=lambda x: (
184 |                     float("-inf") if not x.top else x.top,
185 |                     float("-inf") if not x.left else x.left,
186 |                 ),
187 |             )
188 |             for shape in sorted_shapes:
189 |                 get_shape_content(shape, **kwargs)
190 | 
191 |             md_content = md_content.strip()
192 | 
193 |             if slide.has_notes_slide:
194 |                 md_content += "\n\n### Notes:\n"
195 |                 notes_frame = slide.notes_slide.notes_text_frame
196 |                 if notes_frame is not None:
197 |                     md_content += notes_frame.text
198 |                 md_content = md_content.strip()
199 | 
200 |         return DocumentConverterResult(markdown=md_content.strip())
201 | 
202 |     def _is_picture(self, shape):
203 |         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
204 |             return True
205 |         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
206 |             if hasattr(shape, "image"):
207 |                 return True
208 |         return False
209 | 
210 |     def _is_table(self, shape):
211 |         if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
212 |             return True
213 |         return False
214 | 
215 |     def _convert_table_to_markdown(self, table, **kwargs):
216 |         # Write the table as HTML, then convert it to Markdown
217 |         html_table = "<html><body><table>"
218 |         first_row = True
219 |         for row in table.rows:
220 |             html_table += "<tr>"
221 |             for cell in row.cells:
222 |                 if first_row:
223 |                     html_table += "<th>" + html.escape(cell.text) + "</th>"
224 |                 else:
225 |                     html_table += "<td>" + html.escape(cell.text) + "</td>"
226 |             html_table += "</tr>"
227 |             first_row = False
228 |         html_table += "</table></body></html>"
229 | 
230 |         return (
231 |             self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
232 |             + "\n"
233 |         )
234 | 
235 |     def _convert_chart_to_markdown(self, chart):
236 |         try:
237 |             md = "\n\n### Chart"
238 |             if chart.has_title:
239 |                 md += f": {chart.chart_title.text_frame.text}"
240 |             md += "\n\n"
241 |             data = []
242 |             category_names = [c.label for c in chart.plots[0].categories]
243 |             series_names = [s.name for s in chart.series]
244 |             data.append(["Category"] + series_names)
245 | 
246 |             for idx, category in enumerate(category_names):
247 |                 row = [category]
248 |                 for series in chart.series:
249 |                     row.append(series.values[idx])
250 |                 data.append(row)
251 | 
252 |             markdown_table = []
253 |             for row in data:
254 |                 markdown_table.append("| " + " | ".join(map(str, row)) + " |")
255 |             header = markdown_table[0]
256 |             separator = "|" + "|".join(["---"] * len(data[0])) + "|"
257 |             return md + "\n".join([header, separator] + markdown_table[1:])
258 |         except ValueError as e:
259 |             # Handle the specific error for unsupported chart types
260 |             if "unsupported plot type" in str(e):
261 |                 return "\n\n[unsupported chart]\n\n"
262 |             # Return the same placeholder for any other ValueError so the
263 |             # caller is never handed an implicit None
264 |             return "\n\n[unsupported chart]\n\n"
265 |         except Exception:
266 |             # Catch any other exceptions that might occur
267 |             return "\n\n[unsupported chart]\n\n"
268 | 
```
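
A brief usage sketch for the converter above ("deck.pptx" is a hypothetical local file): MarkItDown routes `.pptx` input to `PptxConverter` via the `accepts()` checks, and `keep_data_uris` toggles between placeholder image filenames and embedded base64 `data:` URIs in `get_shape_content()`.

```python
from markitdown import MarkItDown

md = MarkItDown()

# Default: images become placeholder links such as ![alt](Picture4.jpg)
result = md.convert("deck.pptx")  # hypothetical file

# keep_data_uris=True embeds images as data:<mime>;base64,... links instead
result = md.convert("deck.pptx", keep_data_uris=True)
print(result.markdown)  # slide markers, headings, tables, charts, notes
```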

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py:
--------------------------------------------------------------------------------

```python
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | Office Math Markup Language (OMML)
  5 | Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
  6 | On 25/03/2025
  7 | """
  8 | 
  9 | from defusedxml import ElementTree as ET
 10 | 
 11 | from .latex_dict import (
 12 |     CHARS,
 13 |     CHR,
 14 |     CHR_BO,
 15 |     CHR_DEFAULT,
 16 |     POS,
 17 |     POS_DEFAULT,
 18 |     SUB,
 19 |     SUP,
 20 |     F,
 21 |     F_DEFAULT,
 22 |     T,
 23 |     FUNC,
 24 |     D,
 25 |     D_DEFAULT,
 26 |     RAD,
 27 |     RAD_DEFAULT,
 28 |     ARR,
 29 |     LIM_FUNC,
 30 |     LIM_TO,
 31 |     LIM_UPP,
 32 |     M,
 33 |     BRK,
 34 |     BLANK,
 35 |     BACKSLASH,
 36 |     ALN,
 37 |     FUNC_PLACE,
 38 | )
 39 | 
 40 | OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
 41 | 
 42 | 
 43 | def load(stream):
 44 |     tree = ET.parse(stream)
 45 |     for omath in tree.findall(OMML_NS + "oMath"):
 46 |         yield oMath2Latex(omath)
 47 | 
 48 | 
 49 | def load_string(string):
 50 |     root = ET.fromstring(string)
 51 |     for omath in root.findall(OMML_NS + "oMath"):
 52 |         yield oMath2Latex(omath)
 53 | 
 54 | 
 55 | def escape_latex(strs):
 56 |     last = None
 57 |     new_chr = []
 58 |     strs = strs.replace(r"\\", "\\")
 59 |     for c in strs:
 60 |         if (c in CHARS) and (last != BACKSLASH):
 61 |             new_chr.append(BACKSLASH + c)
 62 |         else:
 63 |             new_chr.append(c)
 64 |         last = c
 65 |     return BLANK.join(new_chr)
 66 | 
 67 | 
 68 | def get_val(key, default=None, store=CHR):
 69 |     if key is not None:
 70 |         return key if not store else store.get(key, key)
 71 |     else:
 72 |         return default
 73 | 
 74 | 
 75 | class Tag2Method(object):
 76 |     def call_method(self, elm, stag=None):
 77 |         getmethod = self.tag2meth.get
 78 |         if stag is None:
 79 |             stag = elm.tag.replace(OMML_NS, "")
 80 |         method = getmethod(stag)
 81 |         if method:
 82 |             return method(self, elm)
 83 |         else:
 84 |             return None
 85 | 
 86 |     def process_children_list(self, elm, include=None):
 87 |         """
 88 |         process the children of elm, returning an iterable
 89 |         """
 90 |         for _e in list(elm):
 91 |             if OMML_NS not in _e.tag:
 92 |                 continue
 93 |             stag = _e.tag.replace(OMML_NS, "")
 94 |             if include and (stag not in include):
 95 |                 continue
 96 |             t = self.call_method(_e, stag=stag)
 97 |             if t is None:
 98 |                 t = self.process_unknow(_e, stag)
 99 |                 if t is None:
100 |                     continue
101 |             yield (stag, t, _e)
102 | 
103 |     def process_children_dict(self, elm, include=None):
104 |         """
105 |         process the children of elm, returning a dict
106 |         """
107 |         latex_chars = dict()
108 |         for stag, t, e in self.process_children_list(elm, include):
109 |             latex_chars[stag] = t
110 |         return latex_chars
111 | 
112 |     def process_children(self, elm, include=None):
113 |         """
114 |         process the children of elm, returning a string
115 |         """
116 |         return BLANK.join(
117 |             (
118 |                 t if not isinstance(t, Tag2Method) else str(t)
119 |                 for stag, t, e in self.process_children_list(elm, include)
120 |             )
121 |         )
122 | 
123 |     def process_unknow(self, elm, stag):
124 |         return None
125 | 
126 | 
127 | class Pr(Tag2Method):
128 |     text = ""
129 | 
130 |     __val_tags = ("chr", "pos", "begChr", "endChr", "type")
131 | 
132 |     __innerdict = None  # can't use the __dict__
133 | 
134 |     """ common properties of element"""
135 | 
136 |     def __init__(self, elm):
137 |         self.__innerdict = {}
138 |         self.text = self.process_children(elm)
139 | 
140 |     def __str__(self):
141 |         return self.text
142 | 
143 |     def __unicode__(self):
144 |         return self.__str__()
145 | 
146 |     def __getattr__(self, name):
147 |         return self.__innerdict.get(name, None)
148 | 
149 |     def do_brk(self, elm):
150 |         self.__innerdict["brk"] = BRK
151 |         return BRK
152 | 
153 |     def do_common(self, elm):
154 |         stag = elm.tag.replace(OMML_NS, "")
155 |         if stag in self.__val_tags:
156 |             t = elm.get("{0}val".format(OMML_NS))
157 |             self.__innerdict[stag] = t
158 |         return None
159 | 
160 |     tag2meth = {
161 |         "brk": do_brk,
162 |         "chr": do_common,
163 |         "pos": do_common,
164 |         "begChr": do_common,
165 |         "endChr": do_common,
166 |         "type": do_common,
167 |     }
168 | 
169 | 
170 | class oMath2Latex(Tag2Method):
171 |     """
172 |     Convert oMath element of omml to latex
173 |     """
174 | 
175 |     _t_dict = T
176 | 
177 |     __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
178 | 
179 |     def __init__(self, element):
180 |         self._latex = self.process_children(element)
181 | 
182 |     def __str__(self):
183 |         return self.latex
184 | 
185 |     def __unicode__(self):
186 |         return self.__str__()
187 | 
188 |     def process_unknow(self, elm, stag):
189 |         if stag in self.__direct_tags:
190 |             return self.process_children(elm)
191 |         elif stag[-2:] == "Pr":
192 |             return Pr(elm)
193 |         else:
194 |             return None
195 | 
196 |     @property
197 |     def latex(self):
198 |         return self._latex
199 | 
200 |     def do_acc(self, elm):
201 |         """
202 |         the accent function
203 |         """
204 |         c_dict = self.process_children_dict(elm)
205 |         latex_s = get_val(
206 |             c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
207 |         )
208 |         return latex_s.format(c_dict["e"])
209 | 
210 |     def do_bar(self, elm):
211 |         """
212 |         the bar function
213 |         """
214 |         c_dict = self.process_children_dict(elm)
215 |         pr = c_dict["barPr"]
216 |         latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
217 |         return pr.text + latex_s.format(c_dict["e"])
218 | 
219 |     def do_d(self, elm):
220 |         """
221 |         the delimiter object
222 |         """
223 |         c_dict = self.process_children_dict(elm)
224 |         pr = c_dict["dPr"]
225 |         null = D_DEFAULT.get("null")
226 |         s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
227 |         e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
228 |         return pr.text + D.format(
229 |             left=null if not s_val else escape_latex(s_val),
230 |             text=c_dict["e"],
231 |             right=null if not e_val else escape_latex(e_val),
232 |         )
233 | 
234 |     def do_spre(self, elm):
235 |         """
236 |         the Pre-Sub-Superscript object -- not supported yet
237 |         """
238 |         pass
239 | 
240 |     def do_sub(self, elm):
241 |         text = self.process_children(elm)
242 |         return SUB.format(text)
243 | 
244 |     def do_sup(self, elm):
245 |         text = self.process_children(elm)
246 |         return SUP.format(text)
247 | 
248 |     def do_f(self, elm):
249 |         """
250 |         the fraction object
251 |         """
252 |         c_dict = self.process_children_dict(elm)
253 |         pr = c_dict["fPr"]
254 |         latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
255 |         return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
256 | 
257 |     def do_func(self, elm):
258 |         """
259 |         the Function-Apply object (Examples:sin cos)
260 |         """
261 |         c_dict = self.process_children_dict(elm)
262 |         func_name = c_dict.get("fName")
263 |         return func_name.replace(FUNC_PLACE, c_dict.get("e"))
264 | 
265 |     def do_fname(self, elm):
266 |         """
267 |         the func name
268 |         """
269 |         latex_chars = []
270 |         for stag, t, e in self.process_children_list(elm):
271 |             if stag == "r":
272 |                 if FUNC.get(t):
273 |                     latex_chars.append(FUNC[t])
274 |                 else:
275 |                     raise NotImplementedError("Unsupported function: %s" % t)
276 |             else:
277 |                 latex_chars.append(t)
278 |         t = BLANK.join(latex_chars)
279 |         return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this
280 | 
281 |     def do_groupchr(self, elm):
282 |         """
283 |         the Group-Character object
284 |         """
285 |         c_dict = self.process_children_dict(elm)
286 |         pr = c_dict["groupChrPr"]
287 |         latex_s = get_val(pr.chr)
288 |         return pr.text + latex_s.format(c_dict["e"])
289 | 
290 |     def do_rad(self, elm):
291 |         """
292 |         the radical object
293 |         """
294 |         c_dict = self.process_children_dict(elm)
295 |         text = c_dict.get("e")
296 |         deg_text = c_dict.get("deg")
297 |         if deg_text:
298 |             return RAD.format(deg=deg_text, text=text)
299 |         else:
300 |             return RAD_DEFAULT.format(text=text)
301 | 
302 |     def do_eqarr(self, elm):
303 |         """
304 |         the Array object
305 |         """
306 |         return ARR.format(
307 |             text=BRK.join(
308 |                 [t for stag, t, e in self.process_children_list(elm, include=("e",))]
309 |             )
310 |         )
311 | 
312 |     def do_limlow(self, elm):
313 |         """
314 |         the Lower-Limit object
315 |         """
316 |         t_dict = self.process_children_dict(elm, include=("e", "lim"))
317 |         latex_s = LIM_FUNC.get(t_dict["e"])
318 |         if not latex_s:
319 |             raise NotImplementedError("Unsupported limit: %s" % t_dict["e"])
320 |         else:
321 |             return latex_s.format(lim=t_dict.get("lim"))
322 | 
323 |     def do_limupp(self, elm):
324 |         """
325 |         the Upper-Limit object
326 |         """
327 |         t_dict = self.process_children_dict(elm, include=("e", "lim"))
328 |         return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))
329 | 
330 |     def do_lim(self, elm):
331 |         """
332 |         the lower limit of the limLow object and the upper limit of the limUpp function
333 |         """
334 |         return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])
335 | 
336 |     def do_m(self, elm):
337 |         """
338 |         the Matrix object
339 |         """
340 |         rows = []
341 |         for stag, t, e in self.process_children_list(elm):
342 |             if stag == "mPr":
343 |                 pass
344 |             elif stag == "mr":
345 |                 rows.append(t)
346 |         return M.format(text=BRK.join(rows))
347 | 
348 |     def do_mr(self, elm):
349 |         """
350 |         a single row of the matrix m
351 |         """
352 |         return ALN.join(
353 |             [t for stag, t, e in self.process_children_list(elm, include=("e",))]
354 |         )
355 | 
356 |     def do_nary(self, elm):
357 |         """
358 |         the n-ary object
359 |         """
360 |         res = []
361 |         bo = ""
362 |         for stag, t, e in self.process_children_list(elm):
363 |             if stag == "naryPr":
364 |                 bo = get_val(t.chr, store=CHR_BO)
365 |             else:
366 |                 res.append(t)
367 |         return bo + BLANK.join(res)
368 | 
369 |     def do_r(self, elm):
370 |         """
371 |         Get text from the 'r' element, and try to convert it to latex symbols
372 |         @todo text style support , (sty)
373 |         @todo \text (latex pure text support)
374 |         """
375 |         _str = []
376 |         for s in elm.findtext("./{0}t".format(OMML_NS)):
377 |             # s = s if isinstance(s,unicode) else unicode(s,'utf-8')
378 |             _str.append(self._t_dict.get(s, s))
379 |         return escape_latex(BLANK.join(_str))
380 | 
381 |     tag2meth = {
382 |         "acc": do_acc,
383 |         "r": do_r,
384 |         "bar": do_bar,
385 |         "sub": do_sub,
386 |         "sup": do_sup,
387 |         "f": do_f,
388 |         "func": do_func,
389 |         "fName": do_fname,
390 |         "groupChr": do_groupchr,
391 |         "d": do_d,
392 |         "rad": do_rad,
393 |         "eqArr": do_eqarr,
394 |         "limLow": do_limlow,
395 |         "limUpp": do_limupp,
396 |         "lim": do_lim,
397 |         "m": do_m,
398 |         "mr": do_mr,
399 |         "nary": do_nary,
400 |     }
401 | 
```
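
A self-contained sketch of driving this module directly: `load_string()` yields one `oMath2Latex` object per `m:oMath` element, and `str()` / `.latex` returns the LaTeX. The OMML fragment below is hand-written for illustration (a simple fraction) and is not taken from a real document.

```python
from markitdown.converter_utils.docx.math.omml import load_string

# Hand-written OMML for the fraction 1/2 (illustrative only); in practice
# fragments come from word/document.xml inside a .docx archive
OMML_FRAGMENT = """
<m:oMathPara xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">
  <m:oMath>
    <m:f>
      <m:fPr/>
      <m:num><m:r><m:t>1</m:t></m:r></m:num>
      <m:den><m:r><m:t>2</m:t></m:r></m:den>
    </m:f>
  </m:oMath>
</m:oMathPara>
"""

for omath in load_string(OMML_FRAGMENT):
    print(omath.latex)  # expected output: \frac{1}{2}
```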

--------------------------------------------------------------------------------
/packages/markitdown/ThirdPartyNotices.md:
--------------------------------------------------------------------------------

```markdown
  1 | # THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
  2 | 
  3 | **Do Not Translate or Localize**
  4 | 
  5 | This project incorporates components from the projects listed below. The original copyright notices and the licenses 
  6 | under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly 
  7 | granted herein, whether by implication, estoppel or otherwise.
  8 | 
  9 | 1. dwml (https://github.com/xiilei/dwml)
 10 | 
 11 | dwml NOTICES AND INFORMATION BEGIN HERE
 12 | 
 13 | -----------------------------------------
 14 | 
 15 | NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including 
 16 | placeholders for the copyright owner and year.
 17 | 
 18 | NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented.  
 19 | The following section summarizes these changes. The full details are available in the MarkItDown source code 
 20 | repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160).
 21 | 
 22 | This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications
 23 | (they live under `packages/markitdown/src/markitdown/converter_utils/docx/math`). However, we have reformatted the code
 24 | with the `black` code formatter. From the `tests/docx.py` file, we have used only the `DOCXML_ROOT` XML namespaces; the
 25 | rest of the file is not used.
 26 | 
 27 | -----------------------------------------
 28 | 
 29 | Apache License
 30 |                            Version 2.0, January 2004
 31 |                         http://www.apache.org/licenses/
 32 | 
 33 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 34 | 
 35 |    1. Definitions.
 36 | 
 37 |       "License" shall mean the terms and conditions for use, reproduction,
 38 |       and distribution as defined by Sections 1 through 9 of this document.
 39 | 
 40 |       "Licensor" shall mean the copyright owner or entity authorized by
 41 |       the copyright owner that is granting the License.
 42 | 
 43 |       "Legal Entity" shall mean the union of the acting entity and all
 44 |       other entities that control, are controlled by, or are under common
 45 |       control with that entity. For the purposes of this definition,
 46 |       "control" means (i) the power, direct or indirect, to cause the
 47 |       direction or management of such entity, whether by contract or
 48 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 49 |       outstanding shares, or (iii) beneficial ownership of such entity.
 50 | 
 51 |       "You" (or "Your") shall mean an individual or Legal Entity
 52 |       exercising permissions granted by this License.
 53 | 
 54 |       "Source" form shall mean the preferred form for making modifications,
 55 |       including but not limited to software source code, documentation
 56 |       source, and configuration files.
 57 | 
 58 |       "Object" form shall mean any form resulting from mechanical
 59 |       transformation or translation of a Source form, including but
 60 |       not limited to compiled object code, generated documentation,
 61 |       and conversions to other media types.
 62 | 
 63 |       "Work" shall mean the work of authorship, whether in Source or
 64 |       Object form, made available under the License, as indicated by a
 65 |       copyright notice that is included in or attached to the work
 66 |       (an example is provided in the Appendix below).
 67 | 
 68 |       "Derivative Works" shall mean any work, whether in Source or Object
 69 |       form, that is based on (or derived from) the Work and for which the
 70 |       editorial revisions, annotations, elaborations, or other modifications
 71 |       represent, as a whole, an original work of authorship. For the purposes
 72 |       of this License, Derivative Works shall not include works that remain
 73 |       separable from, or merely link (or bind by name) to the interfaces of,
 74 |       the Work and Derivative Works thereof.
 75 | 
 76 |       "Contribution" shall mean any work of authorship, including
 77 |       the original version of the Work and any modifications or additions
 78 |       to that Work or Derivative Works thereof, that is intentionally
 79 |       submitted to Licensor for inclusion in the Work by the copyright owner
 80 |       or by an individual or Legal Entity authorized to submit on behalf of
 81 |       the copyright owner. For the purposes of this definition, "submitted"
 82 |       means any form of electronic, verbal, or written communication sent
 83 |       to the Licensor or its representatives, including but not limited to
 84 |       communication on electronic mailing lists, source code control systems,
 85 |       and issue tracking systems that are managed by, or on behalf of, the
 86 |       Licensor for the purpose of discussing and improving the Work, but
 87 |       excluding communication that is conspicuously marked or otherwise
 88 |       designated in writing by the copyright owner as "Not a Contribution."
 89 | 
 90 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 91 |       on behalf of whom a Contribution has been received by Licensor and
 92 |       subsequently incorporated within the Work.
 93 | 
 94 |    2. Grant of Copyright License. Subject to the terms and conditions of
 95 |       this License, each Contributor hereby grants to You a perpetual,
 96 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 97 |       copyright license to reproduce, prepare Derivative Works of,
 98 |       publicly display, publicly perform, sublicense, and distribute the
 99 |       Work and such Derivative Works in Source or Object form.
100 | 
101 |    3. Grant of Patent License. Subject to the terms and conditions of
102 |       this License, each Contributor hereby grants to You a perpetual,
103 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
104 |       (except as stated in this section) patent license to make, have made,
105 |       use, offer to sell, sell, import, and otherwise transfer the Work,
106 |       where such license applies only to those patent claims licensable
107 |       by such Contributor that are necessarily infringed by their
108 |       Contribution(s) alone or by combination of their Contribution(s)
109 |       with the Work to which such Contribution(s) was submitted. If You
110 |       institute patent litigation against any entity (including a
111 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
112 |       or a Contribution incorporated within the Work constitutes direct
113 |       or contributory patent infringement, then any patent licenses
114 |       granted to You under this License for that Work shall terminate
115 |       as of the date such litigation is filed.
116 | 
117 |    4. Redistribution. You may reproduce and distribute copies of the
118 |       Work or Derivative Works thereof in any medium, with or without
119 |       modifications, and in Source or Object form, provided that You
120 |       meet the following conditions:
121 | 
122 |       (a) You must give any other recipients of the Work or
123 |           Derivative Works a copy of this License; and
124 | 
125 |       (b) You must cause any modified files to carry prominent notices
126 |           stating that You changed the files; and
127 | 
128 |       (c) You must retain, in the Source form of any Derivative Works
129 |           that You distribute, all copyright, patent, trademark, and
130 |           attribution notices from the Source form of the Work,
131 |           excluding those notices that do not pertain to any part of
132 |           the Derivative Works; and
133 | 
134 |       (d) If the Work includes a "NOTICE" text file as part of its
135 |           distribution, then any Derivative Works that You distribute must
136 |           include a readable copy of the attribution notices contained
137 |           within such NOTICE file, excluding those notices that do not
138 |           pertain to any part of the Derivative Works, in at least one
139 |           of the following places: within a NOTICE text file distributed
140 |           as part of the Derivative Works; within the Source form or
141 |           documentation, if provided along with the Derivative Works; or,
142 |           within a display generated by the Derivative Works, if and
143 |           wherever such third-party notices normally appear. The contents
144 |           of the NOTICE file are for informational purposes only and
145 |           do not modify the License. You may add Your own attribution
146 |           notices within Derivative Works that You distribute, alongside
147 |           or as an addendum to the NOTICE text from the Work, provided
148 |           that such additional attribution notices cannot be construed
149 |           as modifying the License.
150 | 
151 |       You may add Your own copyright statement to Your modifications and
152 |       may provide additional or different license terms and conditions
153 |       for use, reproduction, or distribution of Your modifications, or
154 |       for any such Derivative Works as a whole, provided Your use,
155 |       reproduction, and distribution of the Work otherwise complies with
156 |       the conditions stated in this License.
157 | 
158 |    5. Submission of Contributions. Unless You explicitly state otherwise,
159 |       any Contribution intentionally submitted for inclusion in the Work
160 |       by You to the Licensor shall be under the terms and conditions of
161 |       this License, without any additional terms or conditions.
162 |       Notwithstanding the above, nothing herein shall supersede or modify
163 |       the terms of any separate license agreement you may have executed
164 |       with Licensor regarding such Contributions.
165 | 
166 |    6. Trademarks. This License does not grant permission to use the trade
167 |       names, trademarks, service marks, or product names of the Licensor,
168 |       except as required for reasonable and customary use in describing the
169 |       origin of the Work and reproducing the content of the NOTICE file.
170 | 
171 |    7. Disclaimer of Warranty. Unless required by applicable law or
172 |       agreed to in writing, Licensor provides the Work (and each
173 |       Contributor provides its Contributions) on an "AS IS" BASIS,
174 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
175 |       implied, including, without limitation, any warranties or conditions
176 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
177 |       PARTICULAR PURPOSE. You are solely responsible for determining the
178 |       appropriateness of using or redistributing the Work and assume any
179 |       risks associated with Your exercise of permissions under this License.
180 | 
181 |    8. Limitation of Liability. In no event and under no legal theory,
182 |       whether in tort (including negligence), contract, or otherwise,
183 |       unless required by applicable law (such as deliberate and grossly
184 |       negligent acts) or agreed to in writing, shall any Contributor be
185 |       liable to You for damages, including any direct, indirect, special,
186 |       incidental, or consequential damages of any character arising as a
187 |       result of this License or out of the use or inability to use the
188 |       Work (including but not limited to damages for loss of goodwill,
189 |       work stoppage, computer failure or malfunction, or any and all
190 |       other commercial damages or losses), even if such Contributor
191 |       has been advised of the possibility of such damages.
192 | 
193 |    9. Accepting Warranty or Additional Liability. While redistributing
194 |       the Work or Derivative Works thereof, You may choose to offer,
195 |       and charge a fee for, acceptance of support, warranty, indemnity,
196 |       or other liability obligations and/or rights consistent with this
197 |       License. However, in accepting such obligations, You may act only
198 |       on Your own behalf and on Your sole responsibility, not on behalf
199 |       of any other Contributor, and only if You agree to indemnify,
200 |       defend, and hold each Contributor harmless for any liability
201 |       incurred by, or claims asserted against, such Contributor by reason
202 |       of your accepting any such warranty or additional liability.
203 | 
204 |    END OF TERMS AND CONDITIONS
205 | 
206 |    APPENDIX: How to apply the Apache License to your work.
207 | 
208 |       To apply the Apache License to your work, attach the following
209 |       boilerplate notice, with the fields enclosed by brackets "{}"
210 |       replaced with your own identifying information. (Don't include
211 |       the brackets!)  The text should be enclosed in the appropriate
212 |       comment syntax for the file format. We also recommend that a
213 |       file or class name and description of purpose be included on the
214 |       same "printed page" as the copyright notice for easier
215 |       identification within third-party archives.
216 | 
217 |    Copyright {yyyy} {name of copyright owner}
218 | 
219 |    Licensed under the Apache License, Version 2.0 (the "License");
220 |    you may not use this file except in compliance with the License.
221 |    You may obtain a copy of the License at
222 | 
223 |        http://www.apache.org/licenses/LICENSE-2.0
224 | 
225 |    Unless required by applicable law or agreed to in writing, software
226 |    distributed under the License is distributed on an "AS IS" BASIS,
227 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
228 |    See the License for the specific language governing permissions and
229 |    limitations under the License.
230 | 
231 | -----------------------------------------
232 | END OF dwml NOTICES AND INFORMATION
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_module_misc.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3 -m pytest
  2 | import io
  3 | import os
  4 | import re
  5 | import shutil
  6 | import pytest
  7 | from unittest.mock import MagicMock
  8 | 
  9 | from markitdown._uri_utils import parse_data_uri, file_uri_to_path
 10 | 
 11 | from markitdown import (
 12 |     MarkItDown,
 13 |     UnsupportedFormatException,
 14 |     FileConversionException,
 15 |     StreamInfo,
 16 | )
 17 | 
 18 | # This file contains module tests that are not directly tested by the FileTestVectors.
 19 | # This includes things like helper functions and runtime conversion options
 20 | # (e.g., LLM clients, exiftool path, transcription services, etc.)
 21 | 
 22 | skip_remote = bool(
 23 |     os.environ.get("GITHUB_ACTIONS")
 24 | )  # Don't run these tests in CI
 25 | 
 26 | 
 27 | # Don't run the llm tests without a key and the client library
 28 | skip_llm = not os.environ.get("OPENAI_API_KEY")
 29 | try:
 30 |     import openai
 31 | except ModuleNotFoundError:
 32 |     skip_llm = True
 33 | 
 34 | # Skip exiftool tests if not installed
 35 | skip_exiftool = shutil.which("exiftool") is None
 36 | 
 37 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 38 | 
 39 | JPG_TEST_EXIFTOOL = {
 40 |     "Author": "AutoGen Authors",
 41 |     "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 42 |     "Description": "AutoGen enables diverse LLM-based applications",
 43 |     "ImageSize": "1615x1967",
 44 |     "DateTimeOriginal": "2024:03:14 22:10:00",
 45 | }
 46 | 
 47 | MP3_TEST_EXIFTOOL = {
 48 |     "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
 49 |     "Artist": "Artist Name Test String",
 50 |     "Album": "Album Name Test String",
 51 |     "SampleRate": "48000",
 52 | }
 53 | 
 54 | PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
 55 | PDF_TEST_STRINGS = [
 56 |     "While there is contemporaneous exploration of multi-agent approaches"
 57 | ]
 58 | 
 59 | YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
 60 | YOUTUBE_TEST_STRINGS = [
 61 |     "## AutoGen FULL Tutorial with Python (Step-By-Step)",
 62 |     "This is an intermediate tutorial for installing and using AutoGen locally",
 63 |     "PT15M4S",
 64 |     "the model we're going to be using today is GPT 3.5 turbo",  # From the transcript
 65 | ]
 66 | 
 67 | DOCX_COMMENT_TEST_STRINGS = [
 68 |     "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
 69 |     "49e168b7-d2ae-407f-a055-2167576f39a1",
 70 |     "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
 71 |     "# Abstract",
 72 |     "# Introduction",
 73 |     "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 74 |     "This is a test comment. 12df-321a",
 75 |     "Yet another comment in the doc. 55yiyi-asd09",
 76 | ]
 77 | 
 78 | BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
 79 | BLOG_TEST_STRINGS = [
 80 |     "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
 81 |     "an example where high cost can easily prevent a generic complex",
 82 | ]
 83 | 
 84 | LLM_TEST_STRINGS = [
 85 |     "5bda1dd6",
 86 | ]
 87 | 
 88 | PPTX_TEST_STRINGS = [
 89 |     "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
 90 |     "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
 91 |     "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
 92 |     "1b92870d-e3b5-4e65-8153-919f4ff45592",
 93 |     "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 94 |     "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
 95 |     "2003",  # chart value
 96 | ]
 97 | 
 98 | 
 99 | # --- Helper Functions ---
100 | def validate_strings(result, expected_strings, exclude_strings=None):
101 |     """Validate presence or absence of specific strings."""
102 |     text_content = result.text_content.replace("\\", "")
103 |     for string in expected_strings:
104 |         assert string in text_content
105 |     if exclude_strings:
106 |         for string in exclude_strings:
107 |             assert string not in text_content
108 | 
109 | 
110 | def test_stream_info_operations() -> None:
111 |     """Test operations performed on StreamInfo objects."""
112 | 
113 |     stream_info_original = StreamInfo(
114 |         mimetype="mimetype.1",
115 |         extension="extension.1",
116 |         charset="charset.1",
117 |         filename="filename.1",
118 |         local_path="local_path.1",
119 |         url="url.1",
120 |     )
121 | 
122 |     # Check updating all attributes by keyword
123 |     keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
124 |     for keyword in keywords:
125 |         updated_stream_info = stream_info_original.copy_and_update(
126 |             **{keyword: f"{keyword}.2"}
127 |         )
128 | 
129 |         # Make sure the targeted attribute is updated
130 |         assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
131 | 
132 |         # Make sure the other attributes are unchanged
133 |         for k in keywords:
134 |             if k != keyword:
135 |                 assert getattr(stream_info_original, k) == getattr(
136 |                     updated_stream_info, k
137 |                 )
138 | 
139 |     # Check updating all attributes by passing a new StreamInfo object
140 |     keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
141 |     for keyword in keywords:
142 |         updated_stream_info = stream_info_original.copy_and_update(
143 |             StreamInfo(**{keyword: f"{keyword}.2"})
144 |         )
145 | 
146 |         # Make sure the targeted attribute is updated
147 |         assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
148 | 
149 |         # Make sure the other attributes are unchanged
150 |         for k in keywords:
151 |             if k != keyword:
152 |                 assert getattr(stream_info_original, k) == getattr(
153 |                     updated_stream_info, k
154 |                 )
155 | 
156 |     # Check mixing and matching
157 |     updated_stream_info = stream_info_original.copy_and_update(
158 |         StreamInfo(extension="extension.2", filename="filename.2"),
159 |         mimetype="mimetype.3",
160 |         charset="charset.3",
161 |     )
162 |     assert updated_stream_info.extension == "extension.2"
163 |     assert updated_stream_info.filename == "filename.2"
164 |     assert updated_stream_info.mimetype == "mimetype.3"
165 |     assert updated_stream_info.charset == "charset.3"
166 |     assert updated_stream_info.local_path == "local_path.1"
167 |     assert updated_stream_info.url == "url.1"
168 | 
169 |     # Check multiple StreamInfo objects
170 |     updated_stream_info = stream_info_original.copy_and_update(
171 |         StreamInfo(extension="extension.4", filename="filename.5"),
172 |         StreamInfo(mimetype="mimetype.6", charset="charset.7"),
173 |     )
174 |     assert updated_stream_info.extension == "extension.4"
175 |     assert updated_stream_info.filename == "filename.5"
176 |     assert updated_stream_info.mimetype == "mimetype.6"
177 |     assert updated_stream_info.charset == "charset.7"
178 |     assert updated_stream_info.local_path == "local_path.1"
179 |     assert updated_stream_info.url == "url.1"
180 | 
181 | 
182 | def test_data_uris() -> None:
183 |     # Test basic parsing of data URIs
184 |     data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="
185 |     mime_type, attributes, data = parse_data_uri(data_uri)
186 |     assert mime_type == "text/plain"
187 |     assert len(attributes) == 0
188 |     assert data == b"Hello, World!"
189 | 
190 |     data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ=="
191 |     mime_type, attributes, data = parse_data_uri(data_uri)
192 |     assert mime_type is None
193 |     assert len(attributes) == 0
194 |     assert data == b"Hello, World!"
195 | 
196 |     data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
197 |     mime_type, attributes, data = parse_data_uri(data_uri)
198 |     assert mime_type == "text/plain"
199 |     assert len(attributes) == 1
200 |     assert attributes["charset"] == "utf-8"
201 |     assert data == b"Hello, World!"
202 | 
203 |     data_uri = "data:,Hello%2C%20World%21"
204 |     mime_type, attributes, data = parse_data_uri(data_uri)
205 |     assert mime_type is None
206 |     assert len(attributes) == 0
207 |     assert data == b"Hello, World!"
208 | 
209 |     data_uri = "data:text/plain,Hello%2C%20World%21"
210 |     mime_type, attributes, data = parse_data_uri(data_uri)
211 |     assert mime_type == "text/plain"
212 |     assert len(attributes) == 0
213 |     assert data == b"Hello, World!"
214 | 
215 |     data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21"
216 |     mime_type, attributes, data = parse_data_uri(data_uri)
217 |     assert mime_type == "text/plain"
218 |     assert len(attributes) == 1
219 |     assert attributes["charset"] == "utf-8"
220 |     assert data == b"Hello, World!"
221 | 
222 | 
223 | def test_file_uris() -> None:
224 |     # Test file URI with an empty host
225 |     file_uri = "file:///path/to/file.txt"
226 |     netloc, path = file_uri_to_path(file_uri)
227 |     assert netloc is None
228 |     assert path == "/path/to/file.txt"
229 | 
230 |     # Test file URI with no host
231 |     file_uri = "file:/path/to/file.txt"
232 |     netloc, path = file_uri_to_path(file_uri)
233 |     assert netloc is None
234 |     assert path == "/path/to/file.txt"
235 | 
236 |     # Test file URI with localhost
237 |     file_uri = "file://localhost/path/to/file.txt"
238 |     netloc, path = file_uri_to_path(file_uri)
239 |     assert netloc == "localhost"
240 |     assert path == "/path/to/file.txt"
241 | 
242 |     # Test file URI with query parameters
243 |     file_uri = "file:///path/to/file.txt?param=value"
244 |     netloc, path = file_uri_to_path(file_uri)
245 |     assert netloc is None
246 |     assert path == "/path/to/file.txt"
247 | 
248 |     # Test file URI with fragment
249 |     file_uri = "file:///path/to/file.txt#fragment"
250 |     netloc, path = file_uri_to_path(file_uri)
251 |     assert netloc is None
252 |     assert path == "/path/to/file.txt"
253 | 
254 | 
255 | def test_docx_comments() -> None:
256 |     # Test DOCX processing, with comments and setting style_map on init
257 |     markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
258 |     result = markitdown_with_style_map.convert(
259 |         os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
260 |     )
261 |     validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
262 | 
263 | 
264 | def test_docx_equations() -> None:
265 |     markitdown = MarkItDown()
266 |     docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")
267 |     result = markitdown.convert(docx_file)
268 | 
269 |     # Check for inline equation m=1 (wrapped with single $) is present
270 |     assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found"
271 | 
272 |     # Find block equations wrapped with double $$ and check if they are present
273 |     block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content)
274 |     assert block_equations, "No block equations found in the document."
275 | 
276 | 
277 | def test_input_as_strings() -> None:
278 |     markitdown = MarkItDown()
279 | 
280 |     # Test input from a stream
281 |     input_data = b"<html><body><h1>Test</h1></body></html>"
282 |     result = markitdown.convert_stream(io.BytesIO(input_data))
283 |     assert "# Test" in result.text_content
284 | 
285 |     # Test input with leading blank characters
286 |     input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
287 |     result = markitdown.convert_stream(io.BytesIO(input_data))
288 |     assert "# Test" in result.text_content
289 | 
290 | 
291 | @pytest.mark.skipif(
292 |     skip_remote,
293 |     reason="do not run tests that query external urls",
294 | )
295 | def test_markitdown_remote() -> None:
296 |     markitdown = MarkItDown()
297 | 
298 |     # By URL
299 |     result = markitdown.convert(PDF_TEST_URL)
300 |     for test_string in PDF_TEST_STRINGS:
301 |         assert test_string in result.text_content
302 | 
303 |     # Youtube
304 |     result = markitdown.convert(YOUTUBE_TEST_URL)
305 |     for test_string in YOUTUBE_TEST_STRINGS:
306 |         assert test_string in result.text_content
307 | 
308 | 
309 | @pytest.mark.skipif(
310 |     skip_remote,
311 |     reason="do not run remote speech transcription tests",
312 | )
313 | def test_speech_transcription() -> None:
314 |     markitdown = MarkItDown()
315 | 
316 |     # Test WAV files, MP3 and M4A files
317 |     for file_name in ["test.wav", "test.mp3", "test.m4a"]:
318 |         result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
319 |         result_lower = result.text_content.lower()
320 |         assert (
321 |             ("1" in result_lower or "one" in result_lower)
322 |             and ("2" in result_lower or "two" in result_lower)
323 |             and ("3" in result_lower or "three" in result_lower)
324 |             and ("4" in result_lower or "four" in result_lower)
325 |             and ("5" in result_lower or "five" in result_lower)
326 |         )
327 | 
328 | 
329 | def test_exceptions() -> None:
330 |     # Check that an exception is raised when trying to convert an unsupported format
331 |     markitdown = MarkItDown()
332 |     with pytest.raises(UnsupportedFormatException):
333 |         markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin"))
334 | 
335 |     # Check that an exception is raised when trying to convert a file that is corrupted
336 |     with pytest.raises(FileConversionException) as exc_info:
337 |         markitdown.convert(
338 |             os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx"
339 |         )
340 |     assert len(exc_info.value.attempts) == 1
341 |     assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"
342 | 
343 | 
344 | @pytest.mark.skipif(
345 |     skip_exiftool,
346 |     reason="do not run if exiftool is not installed",
347 | )
348 | def test_markitdown_exiftool() -> None:
349 |     which_exiftool = shutil.which("exiftool")
350 |     assert which_exiftool is not None
351 | 
352 |     # Test explicitly setting the location of exiftool
353 |     markitdown = MarkItDown(exiftool_path=which_exiftool)
354 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
355 |     for key in JPG_TEST_EXIFTOOL:
356 |         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
357 |         assert target in result.text_content
358 | 
359 |     # Test setting the exiftool path through an environment variable
360 |     os.environ["EXIFTOOL_PATH"] = which_exiftool
361 |     markitdown = MarkItDown()
362 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
363 |     for key in JPG_TEST_EXIFTOOL:
364 |         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
365 |         assert target in result.text_content
366 | 
367 |     # Test some other media types
368 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
369 |     for key in MP3_TEST_EXIFTOOL:
370 |         target = f"{key}: {MP3_TEST_EXIFTOOL[key]}"
371 |         assert target in result.text_content
372 | 
373 | 
374 | def test_markitdown_llm_parameters() -> None:
375 |     """Test that LLM parameters are correctly passed to the client."""
376 |     mock_client = MagicMock()
377 |     mock_response = MagicMock()
378 |     mock_response.choices = [
379 |         MagicMock(
380 |             message=MagicMock(
381 |                 content="Test caption with red circle and blue square 5bda1dd6"
382 |             )
383 |         )
384 |     ]
385 |     mock_client.chat.completions.create.return_value = mock_response
386 | 
387 |     test_prompt = "You are a professional test prompt."
388 |     markitdown = MarkItDown(
389 |         llm_client=mock_client, llm_model="gpt-4o", llm_prompt=test_prompt
390 |     )
391 | 
392 |     # Test image file
393 |     markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
394 | 
395 |     # Verify the prompt was passed to the OpenAI API
396 |     assert mock_client.chat.completions.create.called
397 |     call_args = mock_client.chat.completions.create.call_args
398 |     messages = call_args[1]["messages"]
399 |     assert len(messages) == 1
400 |     assert messages[0]["content"][0]["text"] == test_prompt
401 | 
402 |     # Reset the mock for the next test
403 |     mock_client.chat.completions.create.reset_mock()
404 | 
405 |     # TODO: reduce this to a single test once the duplicate llm caption method is removed:
406 |     # https://github.com/microsoft/markitdown/pull/1254
407 |     # Test PPTX file
408 |     markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
409 | 
410 |     # Verify the prompt was passed to the OpenAI API for PPTX images too
411 |     assert mock_client.chat.completions.create.called
412 |     call_args = mock_client.chat.completions.create.call_args
413 |     messages = call_args[1]["messages"]
414 |     assert len(messages) == 1
415 |     assert messages[0]["content"][0]["text"] == test_prompt
416 | 
417 | 
418 | @pytest.mark.skipif(
419 |     skip_llm,
420 |     reason="do not run llm tests without a key",
421 | )
422 | def test_markitdown_llm() -> None:
423 |     client = openai.OpenAI()
424 |     markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
425 | 
426 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
427 |     for test_string in LLM_TEST_STRINGS:
428 |         assert test_string in result.text_content
429 | 
430 |     # This is not super precise. It would also accept "red square", "blue circle",
431 |     # "the square is not blue", etc. But it's sufficient for this test.
432 |     for test_string in ["red", "circle", "blue", "square"]:
433 |         assert test_string in result.text_content.lower()
434 | 
435 |     # Images embedded in PPTX files
436 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
437 |     # LLM Captions are included
438 |     for test_string in LLM_TEST_STRINGS:
439 |         assert test_string in result.text_content
440 |     # Standard alt text is included
441 |     validate_strings(result, PPTX_TEST_STRINGS)
442 | 
443 | 
444 | if __name__ == "__main__":
445 |     """Runs this file's tests from the command line."""
446 |     for test in [
447 |         test_stream_info_operations,
448 |         test_data_uris,
449 |         test_file_uris,
450 |         test_docx_comments,
451 |         test_input_as_strings,
452 |         test_markitdown_remote,
453 |         test_speech_transcription,
454 |         test_exceptions,
455 |         test_markitdown_exiftool,
456 |         test_markitdown_llm_parameters,
457 |         test_markitdown_llm,
458 |     ]:
459 |         print(f"Running {test.__name__}...", end="")
460 |         test()
461 |         print("OK")
462 |     print("All tests passed!")
463 | 
```
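
The mocking pattern in `test_markitdown_llm_parameters` above is worth calling out: no OpenAI client is ever constructed; a `MagicMock` stands in for it, and only `chat.completions.create` is exercised. A self-contained sketch of the same pattern (independent of MarkItDown; the caption text and prompt are arbitrary):

```python
from unittest.mock import MagicMock

# Stand-in for an OpenAI-style client; the only attribute exercised is
# client.chat.completions.create(...).
client = MagicMock()
response = MagicMock()
response.choices = [MagicMock(message=MagicMock(content="mock caption"))]
client.chat.completions.create.return_value = response

# Code under test calls the mock as if it were the real API...
reply = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": [{"type": "text", "text": "Describe this image."}]}],
)
assert reply.choices[0].message.content == "mock caption"

# ...and the call can be inspected afterwards, as the test above does.
assert client.chat.completions.create.call_args.kwargs["model"] == "gpt-4o"
```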

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_files/test_blog.html:
--------------------------------------------------------------------------------

```html
 1 | <!doctype html>
 2 | <html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false">
 3 | <head>
 4 | <meta charset="UTF-8">
 5 | <meta name="generator" content="Docusaurus v3.1.1">
 6 | <title data-rh="true">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"><meta data-rh="true" name="description" content="level 2 algebra"><meta data-rh="true" property="og:description" content="level 2 algebra"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2023-04-21T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://www.linkedin.com/in/chi-wang-49b15b16/"><meta data-rh="true" property="article:tag" content="LLM,GPT,research"><link data-rh="true" rel="icon" href="/autogen/img/ag.ico"><link data-rh="true" rel="canonical" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="en"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="x-default"><link rel="alternate" type="application/rss+xml" href="/autogen/blog/rss.xml" title="AutoGen RSS Feed">
 7 | <link rel="alternate" type="application/atom+xml" href="/autogen/blog/atom.xml" title="AutoGen Atom Feed">
 8 | 
 9 | 
10 | 
11 | 
12 | 
13 | 
14 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc" crossorigin="anonymous">
15 | <script src="/autogen/js/custom.js" async defer="defer"></script><link rel="stylesheet" href="/autogen/assets/css/styles.ca10f300.css">
16 | <script src="/autogen/assets/js/runtime~main.83ab9fec.js" defer="defer"></script>
17 | <script src="/autogen/assets/js/main.5d28c826.js" defer="defer"></script>
18 | </head>
19 | <body class="navigation-with-keyboard">
20 | <script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"light")}(),function(){try{const a=new URLSearchParams(window.location.search).entries();for(var[t,e]of a)if(t.startsWith("docusaurus-data-")){var n=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(n,e)}}catch(t){}}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:#fafbfc;color:#091E42" role="banner"><div class="announcementBarPlaceholder_vyr4"></div><div class="content_knG7 announcementBarContent_xLdY">What's new in AutoGen? Read <a href="/autogen/blog/2024/03/03/AutoGen-Update">this blog</a> for an overview of updates</div><button type="button" aria-label="Close" class="clean-btn close closeButton_CVFx announcementBarClose_gvF7"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/autogen/"><div class="navbar__logo"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div><b class="navbar__title text--truncate">AutoGen</b></a><a class="navbar__item navbar__link" href="/autogen/docs/Getting-Started">Docs</a><a class="navbar__item navbar__link" href="/autogen/docs/reference/agentchat/conversable_agent">API</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/autogen/blog">Blog</a><a class="navbar__item navbar__link" href="/autogen/docs/FAQ">FAQ</a><a class="navbar__item navbar__link" href="/autogen/docs/Examples">Examples</a><a class="navbar__item navbar__link" href="/autogen/docs/notebooks">Notebooks</a><a class="navbar__item navbar__link" href="/autogen/docs/Gallery">Gallery</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Other Languages</a><ul class="dropdown__menu"><li><a href="https://microsoft.github.io/autogen-for-net/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Dotnet<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><a 
href="https://github.com/microsoft/autogen" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"><div class="navbar__search searchBarContainer_NW3z"><input placeholder="Search" aria-label="Search" class="navbar__search-input"><div class="loadingRing_RJI3 searchBarLoadingRing_YnHq"><div></div><div></div><div></div><div></div></div><div class="searchHintContainer_Pkmr"><kbd class="searchHint_iIMx">ctrl</kbd><kbd class="searchHint_iIMx">K</kbd></div></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/03/03/AutoGen-Update">What&#x27;s New in AutoGen?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/29/StateFlow">StateFlow - Build LLM Workflows with Customized State-Oriented Transition Function in GroupChat</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" 
href="/autogen/blog/2024/02/11/FSM-GroupChat">FSM Group Chat -- User-specified agent transitions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/02/AutoAnny">Anny: Assisting AutoGen Devs Via AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/26/Custom-Models">AutoGen with Custom Models: Empowering Users to Use Their Own Inference Mechanism</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/25/AutoGenBench">AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/23/Code-execution-in-docker">Code execution is now by default inside docker container</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/29/AgentDescriptions">All About Agent Descriptions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/23/AgentOptimizer">AgentOptimizer - An Agentic Way to Train Your LLM Agent</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/01/AutoGenStudio">AutoGen Studio: Interactively Explore Multi-Agent Workflows</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/26/Agent-AutoBuild">Agent AutoBuild - Automatically Building Multi-agent Systems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/20/AgentEval">How to Assess Utility of LLM-powered Applications?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/13/OAI-assistants">AutoGen Meets GPTs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/09/EcoAssistant">EcoAssistant - Using LLM Assistants More Accurately and Affordably</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/06/LMM-Agent">Multimodal with GPT-4V and LLaVA</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/26/TeachableAgent">AutoGen&#x27;s Teachable Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/18/RetrieveChat">Retrieval-Augmented Generation (RAG) Applications with AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/07/14/Local-LLMs">Use AutoGen for Local LLMs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/06/28/MathChat">MathChat - An Conversational Framework to Solve Math Problems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval">Achieve More, Pay Less - Use GPT-4 Smartly</a></li><li class="sidebarItem__DBe"><a aria-current="page" class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href="/autogen/blog/2023/04/21/LLM-tuning-math">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="level 2 algebra"><header><h1 class="title_f1Hy" itemprop="headline">Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH</h1><div class="container_mt6G margin-vert--md"><time datetime="2023-04-21T00:00:00.000Z" itemprop="datePublished">April 21, 2023</time> · <!-- -->6 min read</div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="https://github.com/sonichi.png" alt="Chi Wang" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Chi Wang</span></a></div><small class="avatar__subtitle" itemprop="description">Principal Researcher at Microsoft Research</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><p><img decoding="async" loading="lazy" alt="level 2 algebra" src="/autogen/assets/images/level2algebra-659ba95286432d9945fc89e84d606797.png" width="575" height="469" class="img_ev3q"></p>
21 | <p><strong>TL;DR:</strong></p>
22 | <ul>
23 | <li><strong>Just by tuning the inference parameters like model, number of responses, temperature etc. without changing any model weights or prompt, the baseline accuracy of untuned gpt-4 can be improved by 20% in high school math competition problems.</strong></li>
24 | <li><strong>For easy problems, the tuned gpt-3.5-turbo model vastly outperformed untuned gpt-4 in accuracy (e.g., 90% vs. 70%) and cost efficiency. For hard problems, the tuned gpt-4 is much more accurate (e.g., 35% vs. 20%) and less expensive than untuned gpt-4.</strong></li>
25 | <li><strong>AutoGen can help with model selection, parameter tuning, and cost-saving in LLM applications.</strong></li>
26 | </ul>
27 | <p>Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?</p>
28 | <p>In this blog post, we will explore how model and inference parameter matter in LLM applications, using a case study for <a href="https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html" target="_blank" rel="noopener noreferrer">MATH</a>, a benchmark for evaluating LLMs on advanced mathematical problem solving. MATH consists of 12K math competition problems from AMC-10, AMC-12 and AIME. Each problem is accompanied by a step-by-step solution.</p>
29 | <p>We will use AutoGen to automatically find the best model and inference parameter for LLMs on a given task and dataset given an inference budget, using a novel low-cost search &amp; pruning strategy. AutoGen currently supports all the LLMs from OpenAI, such as GPT-3.5 and GPT-4.</p>
30 | <p>We will use AutoGen to perform model selection and inference parameter tuning. Then we compare the performance and inference cost on solving algebra problems with the untuned gpt-4. We will also analyze how different difficulty levels affect the results.</p>
31 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-setup">Experiment Setup<a href="#experiment-setup" class="hash-link" aria-label="Direct link to Experiment Setup" title="Direct link to Experiment Setup">​</a></h2>
32 | <p>We use AutoGen to select between the following models with a target inference budget $0.02 per instance:</p>
33 | <ul>
34 | <li>gpt-3.5-turbo, a relatively cheap model that powers the popular ChatGPT app</li>
35 | <li>gpt-4, the state of the art LLM that costs more than 10 times of gpt-3.5-turbo</li>
36 | </ul>
37 | <p>We adapt the models using 20 examples in the train set, using the problem statement as the input and generating the solution as the output. We use the following inference parameters:</p>
38 | <ul>
39 | <li>temperature: The parameter that controls the randomness of the output text. A higher temperature means more diversity but less coherence. We search for the optimal temperature in the range of [0, 1].</li>
40 | <li>top_p: The parameter that controls the probability mass of the output tokens. Only tokens with a cumulative probability less than or equal to top-p are considered. A lower top-p means more diversity but less coherence. We search for the optimal top-p in the range of [0, 1].</li>
41 | <li>max_tokens: The maximum number of tokens that can be generated for each output. We search for the optimal max length in the range of [50, 1000].</li>
42 | <li>n: The number of responses to generate. We search for the optimal n in the range of [1, 100].</li>
43 | <li>prompt: We use the template: &quot;{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \boxed{{}}.&quot; where {problem} will be replaced by the math problem instance.</li>
44 | </ul>
45 | <p>In this experiment, when n &gt; 1, we find the answer with highest votes among all the responses and then select it as the final answer to compare with the ground truth. For example, if n = 5 and 3 of the responses contain a final answer 301 while 2 of the responses contain a final answer 159, we choose 301 as the final answer. This can help with resolving potential errors due to randomness. We use the average accuracy and average inference cost as the metric to evaluate the performance over a dataset. The inference cost of a particular instance is measured by the price per 1K tokens and the number of tokens consumed.</p>
46 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-results">Experiment Results<a href="#experiment-results" class="hash-link" aria-label="Direct link to Experiment Results" title="Direct link to Experiment Results">​</a></h2>
47 | <p>The first figure in this blog post shows the average accuracy and average inference cost of each configuration on the level 2 Algebra test set.</p>
48 | <p>Surprisingly, the tuned gpt-3.5-turbo model is selected as a better model and it vastly outperforms untuned gpt-4 in accuracy (92% vs. 70%) with equal or 2.5 times higher inference budget.
49 | The same observation can be obtained on the level 3 Algebra test set.</p>
50 | <p><img decoding="async" loading="lazy" alt="level 3 algebra" src="/autogen/assets/images/level3algebra-94e87a683ac8832ac7ae6f41f30131a4.png" width="575" height="469" class="img_ev3q"></p>
51 | <p>However, the selected model changes on level 4 Algebra.</p>
52 | <p><img decoding="async" loading="lazy" alt="level 4 algebra" src="/autogen/assets/images/level4algebra-492beb22490df30d6cc258f061912dcd.png" width="580" height="469" class="img_ev3q"></p>
53 | <p>This time gpt-4 is selected as the best model. The tuned gpt-4 achieves much higher accuracy (56% vs. 44%) and lower cost than the untuned gpt-4.
54 | On level 5 the result is similar.</p>
55 | <p><img decoding="async" loading="lazy" alt="level 5 algebra" src="/autogen/assets/images/level5algebra-8fba701551334296d08580b4b489fe56.png" width="575" height="469" class="img_ev3q"></p>
56 | <p>We can see that AutoGen has found different optimal model and inference parameters for each subset of a particular level, which shows that these parameters matter in cost-sensitive LLM applications and need to be carefully tuned or adapted.</p>
57 | <p>An example notebook to run these experiments can be found at: <a href="https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb" target="_blank" rel="noopener noreferrer">https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb</a>. The experiments were run when AutoGen was a subpackage in FLAML.</p>
58 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="analysis-and-discussion">Analysis and Discussion<a href="#analysis-and-discussion" class="hash-link" aria-label="Direct link to Analysis and Discussion" title="Direct link to Analysis and Discussion">​</a></h2>
59 | <p>While gpt-3.5-turbo demonstrates competitive accuracy with voted answers in relatively easy algebra problems under the same inference budget, gpt-4 is a better choice for the most difficult problems. In general, through parameter tuning and model selection, we can identify the opportunity to save the expensive model for more challenging tasks, and improve the overall effectiveness of a budget-constrained system.</p>
60 | <p>There are many other alternative ways of solving math problems, which we have not covered in this blog post. When there are choices beyond the inference parameters, they can be generally tuned via <a href="https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function" target="_blank" rel="noopener noreferrer"><code>flaml.tune</code></a>.</p>
61 | <p>The need for model selection, parameter tuning and cost saving is not specific to the math problems. The <a href="https://github.com/Significant-Gravitas/Auto-GPT" target="_blank" rel="noopener noreferrer">Auto-GPT</a> project is an example where high cost can easily prevent a generic complex task to be accomplished as it needs many LLM inference calls.</p>
62 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="for-further-reading">For Further Reading<a href="#for-further-reading" class="hash-link" aria-label="Direct link to For Further Reading" title="Direct link to For Further Reading">​</a></h2>
63 | <ul>
64 | <li><a href="https://arxiv.org/abs/2303.04673" target="_blank" rel="noopener noreferrer">Research paper about the tuning technique</a></li>
65 | <li><a href="/autogen/docs/Use-Cases/enhanced_inference">Documentation about inference tuning</a></li>
66 | </ul>
67 | <p><em>Do you have any experience to share about LLM applications? Do you like to see more support or research of LLM optimization or automation? Please join our <a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer">Discord</a> server for discussion.</em></p></div><footer class="row docusaurus-mt-lg blogPostFooterDetailsFull_mRVl"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/llm">LLM</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/gpt">GPT</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/research">research</a></li></ul></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">Achieve More, Pay Less - Use GPT-4 Smartly</div></a></nav></main><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#experiment-setup" class="table-of-contents__link toc-highlight">Experiment Setup</a></li><li><a href="#experiment-results" class="table-of-contents__link toc-highlight">Experiment Results</a></li><li><a href="#analysis-and-discussion" class="table-of-contents__link toc-highlight">Analysis and Discussion</a></li><li><a href="#for-further-reading" class="table-of-contents__link toc-highlight">For Further Reading</a></li></ul></div></div></div></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer" class="footer__link-item">Discord<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://twitter.com/pyautogen" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2024 AutoGen Authors |  <a target="_blank" style="color:#10adff" href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy and Cookies</a></div></div></div></footer></div>
68 | </body>
69 | </html>
70 | 
```
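
The blog post embedded in this fixture describes a majority-vote rule for n > 1 sampled responses: extract each response's final answer and keep the most common one (in its example, three votes for 301 beat two for 159). A minimal sketch of that rule; `extract_final_answer` is a hypothetical helper standing in for the `\boxed{}` parsing:

```python
import re
from collections import Counter
from typing import List

def extract_final_answer(response: str) -> str:
    """Hypothetical helper: pull the \\boxed{...} answer from a response."""
    m = re.search(r"\\boxed\{([^}]*)\}", response)
    return m.group(1) if m else response.strip()

def vote(responses: List[str]) -> str:
    """Return the most common final answer among the sampled responses."""
    answers = [extract_final_answer(r) for r in responses]
    return Counter(answers).most_common(1)[0][0]

# The post's example: three responses answer 301, two answer 159 -> pick 301.
samples = [r"... \boxed{301}", r"... \boxed{301}", r"... \boxed{301}",
           r"... \boxed{159}", r"... \boxed{159}"]
assert vote(samples) == "301"
```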

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_markitdown.py:
--------------------------------------------------------------------------------

```python
  1 | import mimetypes
  2 | import os
  3 | import re
  4 | import sys
  5 | import shutil
  6 | import traceback
  7 | import io
  8 | from dataclasses import dataclass
  9 | from importlib.metadata import entry_points
 10 | from typing import Any, List, Dict, Optional, Union, BinaryIO
 11 | from pathlib import Path
 12 | from urllib.parse import urlparse
 13 | from warnings import warn
 14 | import requests
 15 | import magika
 16 | import charset_normalizer
 17 | import codecs
 18 | 
 19 | from ._stream_info import StreamInfo
 20 | from ._uri_utils import parse_data_uri, file_uri_to_path
 21 | 
 22 | from .converters import (
 23 |     PlainTextConverter,
 24 |     HtmlConverter,
 25 |     RssConverter,
 26 |     WikipediaConverter,
 27 |     YouTubeConverter,
 28 |     IpynbConverter,
 29 |     BingSerpConverter,
 30 |     PdfConverter,
 31 |     DocxConverter,
 32 |     XlsxConverter,
 33 |     XlsConverter,
 34 |     PptxConverter,
 35 |     ImageConverter,
 36 |     AudioConverter,
 37 |     OutlookMsgConverter,
 38 |     ZipConverter,
 39 |     EpubConverter,
 40 |     DocumentIntelligenceConverter,
 41 |     CsvConverter,
 42 | )
 43 | 
 44 | from ._base_converter import DocumentConverter, DocumentConverterResult
 45 | 
 46 | from ._exceptions import (
 47 |     FileConversionException,
 48 |     UnsupportedFormatException,
 49 |     FailedConversionAttempt,
 50 | )
 51 | 
 52 | 
 53 | # Lower priority values are tried first.
 54 | PRIORITY_SPECIFIC_FILE_FORMAT = (
 55 |     0.0  # e.g., .docx, .pdf, .xlsx; or specific pages, e.g., wikipedia
 56 | )
 57 | PRIORITY_GENERIC_FILE_FORMAT = (
 58 |     10.0  # Near catch-all converters for mimetypes like text/*, etc.
 59 | )
 60 | 
 61 | 
 62 | _plugins: Union[None, List[Any]] = None  # If None, plugins have not been loaded yet.
 63 | 
 64 | 
 65 | def _load_plugins() -> Union[None, List[Any]]:
 66 |     """Lazy load plugins, exiting early if already loaded."""
 67 |     global _plugins
 68 | 
 69 |     # Skip if we've already loaded plugins
 70 |     if _plugins is not None:
 71 |         return _plugins
 72 | 
 73 |     # Load plugins
 74 |     _plugins = []
 75 |     for entry_point in entry_points(group="markitdown.plugin"):
 76 |         try:
 77 |             _plugins.append(entry_point.load())
 78 |         except Exception:
 79 |             tb = traceback.format_exc()
 80 |             warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
 81 | 
 82 |     return _plugins
 83 | 
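# Illustrative sketch (names hypothetical): a third-party package hooks into
# the loop above by exposing an entry point in the "markitdown.plugin" group,
# e.g., in its pyproject.toml:
#
#     [project.entry-points."markitdown.plugin"]
#     sample = "markitdown_sample_plugin"
#
# The loaded module is later handed to enable_plugins(), which calls its
# register_converters() hook:
#
#     def register_converters(markitdown, **kwargs):
#         markitdown.register_converter(SampleConverter(), priority=9.0)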
 84 | 
 85 | @dataclass(kw_only=True, frozen=True)
 86 | class ConverterRegistration:
 87 |     """A registration of a converter with its priority and other metadata."""
 88 | 
 89 |     converter: DocumentConverter
 90 |     priority: float
 91 | 
 92 | 
 93 | class MarkItDown:
 94 |     """(In preview) An extremely simple text-based document reader, suitable for LLM use.
 95 |     This reader converts common file types and webpages to Markdown."""
 96 | 
 97 |     def __init__(
 98 |         self,
 99 |         *,
100 |         enable_builtins: Union[None, bool] = None,
101 |         enable_plugins: Union[None, bool] = None,
102 |         **kwargs,
103 |     ):
104 |         self._builtins_enabled = False
105 |         self._plugins_enabled = False
106 | 
107 |         requests_session = kwargs.get("requests_session")
108 |         if requests_session is None:
109 |             self._requests_session = requests.Session()
110 |         else:
111 |             self._requests_session = requests_session
112 | 
113 |         self._magika = magika.Magika()
114 | 
115 |         # TODO - remove these (see enable_builtins)
116 |         self._llm_client: Any = None
117 |         self._llm_model: Optional[str] = None
118 |         self._llm_prompt: Optional[str] = None
119 |         self._exiftool_path: Optional[str] = None
120 |         self._style_map: Optional[str] = None
121 | 
122 |         # Register the converters
123 |         self._converters: List[ConverterRegistration] = []
124 | 
125 |         if (
126 |             enable_builtins is None or enable_builtins
127 |         ):  # Default to True when not specified
128 |             self.enable_builtins(**kwargs)
129 | 
130 |         if enable_plugins:
131 |             self.enable_plugins(**kwargs)
132 | 
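    # Illustrative construction (sketch; the file name is hypothetical):
    #
    #     md = MarkItDown()                     # built-ins on, plugins off (defaults)
    #     md = MarkItDown(enable_plugins=True)  # also load entry-point plugins
    #     result = md.convert("example.docx")
    #     print(result.text_content)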
133 |     def enable_builtins(self, **kwargs) -> None:
134 |         """
135 |         Enable and register built-in converters.
136 |         Built-in converters are enabled by default.
137 |         This method should be called at most once, and only if built-ins were initially disabled.
138 |         """
139 |         if not self._builtins_enabled:
140 |             # TODO: Move these into converter constructors
141 |             self._llm_client = kwargs.get("llm_client")
142 |             self._llm_model = kwargs.get("llm_model")
143 |             self._llm_prompt = kwargs.get("llm_prompt")
144 |             self._exiftool_path = kwargs.get("exiftool_path")
145 |             self._style_map = kwargs.get("style_map")
146 | 
147 |             if self._exiftool_path is None:
148 |                 self._exiftool_path = os.getenv("EXIFTOOL_PATH")
149 | 
150 |             # Still none? Check well-known paths
151 |             if self._exiftool_path is None:
152 |                 candidate = shutil.which("exiftool")
153 |                 if candidate:
154 |                     candidate = os.path.abspath(candidate)
155 |                     if any(
156 |                         d == os.path.dirname(candidate)
157 |                         for d in [
158 |                             "/usr/bin",
159 |                             "/usr/local/bin",
160 |                             "/opt",
161 |                             "/opt/bin",
162 |                             "/opt/local/bin",
163 |                             "/opt/homebrew/bin",
164 |                             "C:\\Windows\\System32",
165 |                             "C:\\Program Files",
166 |                             "C:\\Program Files (x86)",
167 |                         ]
168 |                     ):
169 |                         self._exiftool_path = candidate
170 | 
171 |             # Register converters for successful browsing operations
172 |             # Later registrations are tried first / take higher priority than earlier registrations
173 |             # To this end, the most specific converters should appear below the most generic converters
174 |             self.register_converter(
175 |                 PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
176 |             )
177 |             self.register_converter(
178 |                 ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
179 |             )
180 |             self.register_converter(
181 |                 HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
182 |             )
183 |             self.register_converter(RssConverter())
184 |             self.register_converter(WikipediaConverter())
185 |             self.register_converter(YouTubeConverter())
186 |             self.register_converter(BingSerpConverter())
187 |             self.register_converter(DocxConverter())
188 |             self.register_converter(XlsxConverter())
189 |             self.register_converter(XlsConverter())
190 |             self.register_converter(PptxConverter())
191 |             self.register_converter(AudioConverter())
192 |             self.register_converter(ImageConverter())
193 |             self.register_converter(IpynbConverter())
194 |             self.register_converter(PdfConverter())
195 |             self.register_converter(OutlookMsgConverter())
196 |             self.register_converter(EpubConverter())
197 |             self.register_converter(CsvConverter())
198 | 
199 |             # Register Document Intelligence converter at the top of the stack if endpoint is provided
200 |             docintel_endpoint = kwargs.get("docintel_endpoint")
201 |             if docintel_endpoint is not None:
202 |                 docintel_args: Dict[str, Any] = {}
203 |                 docintel_args["endpoint"] = docintel_endpoint
204 | 
205 |                 docintel_credential = kwargs.get("docintel_credential")
206 |                 if docintel_credential is not None:
207 |                     docintel_args["credential"] = docintel_credential
208 | 
209 |                 docintel_types = kwargs.get("docintel_file_types")
210 |                 if docintel_types is not None:
211 |                     docintel_args["file_types"] = docintel_types
212 | 
213 |                 docintel_version = kwargs.get("docintel_api_version")
214 |                 if docintel_version is not None:
215 |                     docintel_args["api_version"] = docintel_version
216 | 
217 |                 self.register_converter(
218 |                     DocumentIntelligenceConverter(**docintel_args),
219 |                 )
220 | 
221 |             self._builtins_enabled = True
222 |         else:
223 |             warn("Built-in converters are already enabled.", RuntimeWarning)
224 | 
225 |     def enable_plugins(self, **kwargs) -> None:
226 |         """
227 |         Enable and register converters provided by plugins.
228 |         Plugins are disabled by default.
229 |         This method should be called at most once, and only if plugins were initially disabled.
230 |         """
231 |         if not self._plugins_enabled:
232 |             # Load plugins
233 |             plugins = _load_plugins()
234 |             assert plugins is not None
235 |             for plugin in plugins:
236 |                 try:
237 |                     plugin.register_converters(self, **kwargs)
238 |                 except Exception:
239 |                     tb = traceback.format_exc()
240 |                     warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
241 |             self._plugins_enabled = True
242 |         else:
243 |             warn("Plugins converters are already enabled.", RuntimeWarning)
244 | 
245 |     def convert(
246 |         self,
247 |         source: Union[str, requests.Response, Path, BinaryIO],
248 |         *,
249 |         stream_info: Optional[StreamInfo] = None,
250 |         **kwargs: Any,
251 |     ) -> DocumentConverterResult:  # TODO: deal with kwargs
252 |         """
253 |         Args:
254 |             - source: a path (str or Path), a URL, a requests.Response object, or a binary file-like object
255 |             - stream_info: optional stream info to use for the conversion. If None, infer from source
256 |             - kwargs: additional arguments to pass to the converter
257 |         """
258 | 
259 |         # Local path or url
260 |         if isinstance(source, str):
261 |             if (
262 |                 source.startswith("http:")
263 |                 or source.startswith("https:")
264 |                 or source.startswith("file:")
265 |                 or source.startswith("data:")
266 |             ):
267 |                 # Rename the url argument to mock_url
268 |                 # (Deprecated -- use stream_info)
269 |                 _kwargs = {k: v for k, v in kwargs.items()}
270 |                 if "url" in _kwargs:
271 |                     _kwargs["mock_url"] = _kwargs["url"]
272 |                     del _kwargs["url"]
273 | 
274 |                 return self.convert_uri(source, stream_info=stream_info, **_kwargs)
275 |             else:
276 |                 return self.convert_local(source, stream_info=stream_info, **kwargs)
277 |         # Path object
278 |         elif isinstance(source, Path):
279 |             return self.convert_local(source, stream_info=stream_info, **kwargs)
280 |         # Request response
281 |         elif isinstance(source, requests.Response):
282 |             return self.convert_response(source, stream_info=stream_info, **kwargs)
283 |         # Binary stream
284 |         elif (
285 |             hasattr(source, "read")
286 |             and callable(source.read)
287 |             and not isinstance(source, io.TextIOBase)
288 |         ):
289 |             return self.convert_stream(source, stream_info=stream_info, **kwargs)
290 |         else:
291 |             raise TypeError(
292 |                 f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
293 |             )
294 | 
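    # Illustrative dispatch (sketch; the sources shown are hypothetical): each
    # accepted source type above routes to a dedicated method:
    #
    #     md.convert("docs/report.pdf")               # str path -> convert_local()
    #     md.convert(Path("docs/report.pdf"))         # Path     -> convert_local()
    #     md.convert("https://example.com/a.html")    # URI      -> convert_uri()
    #     md.convert(requests.get(url))               # Response -> convert_response()
    #     md.convert(open("docs/report.pdf", "rb"))   # BinaryIO -> convert_stream()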
295 |     def convert_local(
296 |         self,
297 |         path: Union[str, Path],
298 |         *,
299 |         stream_info: Optional[StreamInfo] = None,
300 |         file_extension: Optional[str] = None,  # Deprecated -- use stream_info
301 |         url: Optional[str] = None,  # Deprecated -- use stream_info
302 |         **kwargs: Any,
303 |     ) -> DocumentConverterResult:
304 |         if isinstance(path, Path):
305 |             path = str(path)
306 | 
307 |         # Build a base StreamInfo object from which to start guesses
308 |         base_guess = StreamInfo(
309 |             local_path=path,
310 |             extension=os.path.splitext(path)[1],
311 |             filename=os.path.basename(path),
312 |         )
313 | 
314 |         # Extend the base_guess with any additional info from the arguments
315 |         if stream_info is not None:
316 |             base_guess = base_guess.copy_and_update(stream_info)
317 | 
318 |         if file_extension is not None:
319 |             # Deprecated -- use stream_info
320 |             base_guess = base_guess.copy_and_update(extension=file_extension)
321 | 
322 |         if url is not None:
323 |             # Deprecated -- use stream_info
324 |             base_guess = base_guess.copy_and_update(url=url)
325 | 
326 |         with open(path, "rb") as fh:
327 |             guesses = self._get_stream_info_guesses(
328 |                 file_stream=fh, base_guess=base_guess
329 |             )
330 |             return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)
331 | 
332 |     def convert_stream(
333 |         self,
334 |         stream: BinaryIO,
335 |         *,
336 |         stream_info: Optional[StreamInfo] = None,
337 |         file_extension: Optional[str] = None,  # Deprecated -- use stream_info
338 |         url: Optional[str] = None,  # Deprecated -- use stream_info
339 |         **kwargs: Any,
340 |     ) -> DocumentConverterResult:
341 |         guesses: List[StreamInfo] = []
342 | 
343 |         # Do we have anything on which to base a guess?
344 |         base_guess = None
345 |         if stream_info is not None or file_extension is not None or url is not None:
346 |             # Start with a non-None base guess
347 |             if stream_info is None:
348 |                 base_guess = StreamInfo()
349 |             else:
350 |                 base_guess = stream_info
351 | 
352 |             if file_extension is not None:
353 |                 # Deprecated -- use stream_info
354 |                 assert base_guess is not None  # for mypy
355 |                 base_guess = base_guess.copy_and_update(extension=file_extension)
356 | 
357 |             if url is not None:
358 |                 # Deprecated -- use stream_info
359 |                 assert base_guess is not None  # for mypy
360 |                 base_guess = base_guess.copy_and_update(url=url)
361 | 
362 |         # Check if we have a seekable stream. If not, load the entire stream into memory.
363 |         if not stream.seekable():
364 |             buffer = io.BytesIO()
365 |             while True:
366 |                 chunk = stream.read(4096)
367 |                 if not chunk:
368 |                     break
369 |                 buffer.write(chunk)
370 |             buffer.seek(0)
371 |             stream = buffer
372 | 
373 |         # Add guesses based on stream content
374 |         guesses = self._get_stream_info_guesses(
375 |             file_stream=stream, base_guess=base_guess or StreamInfo()
376 |         )
377 |         return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)
378 | 
379 |     def convert_url(
380 |         self,
381 |         url: str,
382 |         *,
383 |         stream_info: Optional[StreamInfo] = None,
384 |         file_extension: Optional[str] = None,
385 |         mock_url: Optional[str] = None,
386 |         **kwargs: Any,
387 |     ) -> DocumentConverterResult:
388 |         """Alias for convert_uri()"""
389 |         # convert_url will likely be deprecated in the future in favor of convert_uri
390 |         return self.convert_uri(
391 |             url,
392 |             stream_info=stream_info,
393 |             file_extension=file_extension,
394 |             mock_url=mock_url,
395 |             **kwargs,
396 |         )
397 | 
398 |     def convert_uri(
399 |         self,
400 |         uri: str,
401 |         *,
402 |         stream_info: Optional[StreamInfo] = None,
403 |         file_extension: Optional[str] = None,  # Deprecated -- use stream_info
404 |         mock_url: Optional[
405 |             str
406 |         ] = None,  # Mock the request as if it came from a different URL
407 |         **kwargs: Any,
408 |     ) -> DocumentConverterResult:
409 |         uri = uri.strip()
410 | 
411 |         # File URIs
412 |         if uri.startswith("file:"):
413 |             netloc, path = file_uri_to_path(uri)
414 |             if netloc and netloc != "localhost":
415 |                 raise ValueError(
416 |                     f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
417 |                 )
418 |             return self.convert_local(
419 |                 path,
420 |                 stream_info=stream_info,
421 |                 file_extension=file_extension,
422 |                 url=mock_url,
423 |                 **kwargs,
424 |             )
425 |         # Data URIs
426 |         elif uri.startswith("data:"):
427 |             mimetype, attributes, data = parse_data_uri(uri)
428 | 
429 |             base_guess = StreamInfo(
430 |                 mimetype=mimetype,
431 |                 charset=attributes.get("charset"),
432 |             )
433 |             if stream_info is not None:
434 |                 base_guess = base_guess.copy_and_update(stream_info)
435 | 
436 |             return self.convert_stream(
437 |                 io.BytesIO(data),
438 |                 stream_info=base_guess,
439 |                 file_extension=file_extension,
440 |                 url=mock_url,
441 |                 **kwargs,
442 |             )
443 |         # HTTP/HTTPS URIs
444 |         elif uri.startswith("http:") or uri.startswith("https:"):
445 |             response = self._requests_session.get(uri, stream=True)
446 |             response.raise_for_status()
447 |             return self.convert_response(
448 |                 response,
449 |                 stream_info=stream_info,
450 |                 file_extension=file_extension,
451 |                 url=mock_url,
452 |                 **kwargs,
453 |             )
454 |         else:
455 |             raise ValueError(
456 |                 f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
457 |             )
458 | 
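    # Illustrative URIs for the three schemes handled above (sketch; the
    # addresses are hypothetical):
    #
    #     md.convert_uri("file:///home/user/notes.txt")
    #     md.convert_uri("data:text/plain;charset=utf-8;base64,SGVsbG8=")
    #     md.convert_uri("https://example.com/report.pdf")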
459 |     def convert_response(
460 |         self,
461 |         response: requests.Response,
462 |         *,
463 |         stream_info: Optional[StreamInfo] = None,
464 |         file_extension: Optional[str] = None,  # Deprecated -- use stream_info
465 |         url: Optional[str] = None,  # Deprecated -- use stream_info
466 |         **kwargs: Any,
467 |     ) -> DocumentConverterResult:
468 |         # If there is a content-type header, get the mimetype and charset (if present)
469 |         mimetype: Optional[str] = None
470 |         charset: Optional[str] = None
471 | 
472 |         if "content-type" in response.headers:
473 |             parts = response.headers["content-type"].split(";")
474 |             mimetype = parts.pop(0).strip()
475 |             for part in parts:
476 |                 if part.strip().startswith("charset="):
477 |                     _charset = part.split("=")[1].strip()
478 |                     if len(_charset) > 0:
479 |                         charset = _charset
480 | 
481 |         # If there is a content-disposition header, get the filename and possibly the extension
482 |         filename: Optional[str] = None
483 |         extension: Optional[str] = None
484 |         if "content-disposition" in response.headers:
485 |             m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
486 |             if m:
487 |                 filename = m.group(1).strip("\"'")
488 |                 _, _extension = os.path.splitext(filename)
489 |                 if len(_extension) > 0:
490 |                     extension = _extension
491 | 
492 |         # If there is still no filename, try to read it from the url
493 |         if filename is None:
494 |             parsed_url = urlparse(response.url)
495 |             _, _extension = os.path.splitext(parsed_url.path)
496 |             if len(_extension) > 0:  # Looks like this might be a file!
497 |                 filename = os.path.basename(parsed_url.path)
498 |                 extension = _extension
499 | 
500 |         # Create an initial guess from all this information
501 |         base_guess = StreamInfo(
502 |             mimetype=mimetype,
503 |             charset=charset,
504 |             filename=filename,
505 |             extension=extension,
506 |             url=response.url,
507 |         )
508 | 
509 |         # Update with any additional info from the arguments
510 |         if stream_info is not None:
511 |             base_guess = base_guess.copy_and_update(stream_info)
512 |         if file_extension is not None:
513 |             # Deprecated -- use stream_info
514 |             base_guess = base_guess.copy_and_update(extension=file_extension)
515 |         if url is not None:
516 |             # Deprecated -- use stream_info
517 |             base_guess = base_guess.copy_and_update(url=url)
518 | 
519 |         # Read into BytesIO
520 |         buffer = io.BytesIO()
521 |         for chunk in response.iter_content(chunk_size=512):
522 |             buffer.write(chunk)
523 |         buffer.seek(0)
524 | 
525 |         # Convert
526 |         guesses = self._get_stream_info_guesses(
527 |             file_stream=buffer, base_guess=base_guess
528 |         )
529 |         return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
530 | 
531 |     def _convert(
532 |         self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
533 |     ) -> DocumentConverterResult:
534 |         res: Union[None, DocumentConverterResult] = None
535 | 
536 |         # Keep track of which converters throw exceptions
537 |         failed_attempts: List[FailedConversionAttempt] = []
538 | 
539 |         # Create a copy of the page_converters list, sorted by priority.
540 |         # We do this with each call to _convert because the priority of converters may change between calls.
541 |         # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
542 |         sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
543 | 
544 |         # Remember the initial stream position so that we can return to it
545 |         cur_pos = file_stream.tell()
546 | 
547 |         for stream_info in stream_info_guesses + [StreamInfo()]:
548 |             for converter_registration in sorted_registrations:
549 |                 converter = converter_registration.converter
550 |                 # Sanity check -- make sure the cur_pos is still the same
551 |                 assert (
552 |                     cur_pos == file_stream.tell()
553 |                 ), "File stream position should NOT change between guess iterations"
554 | 
555 |                 _kwargs = {k: v for k, v in kwargs.items()}
556 | 
557 |                 # Copy any additional global options
558 |                 if "llm_client" not in _kwargs and self._llm_client is not None:
559 |                     _kwargs["llm_client"] = self._llm_client
560 | 
561 |                 if "llm_model" not in _kwargs and self._llm_model is not None:
562 |                     _kwargs["llm_model"] = self._llm_model
563 | 
564 |                 if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
565 |                     _kwargs["llm_prompt"] = self._llm_prompt
566 | 
567 |                 if "style_map" not in _kwargs and self._style_map is not None:
568 |                     _kwargs["style_map"] = self._style_map
569 | 
570 |                 if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
571 |                     _kwargs["exiftool_path"] = self._exiftool_path
572 | 
573 |                 # Add the list of converters for nested processing
574 |                 _kwargs["_parent_converters"] = self._converters
575 | 
576 |                 # Add legacy kwargs
577 |                 if stream_info is not None:
578 |                     if stream_info.extension is not None:
579 |                         _kwargs["file_extension"] = stream_info.extension
580 | 
581 |                     if stream_info.url is not None:
582 |                         _kwargs["url"] = stream_info.url
583 | 
584 |                 # Check if the converter will accept the file, and if so, try to convert it
585 |                 _accepts = False
586 |                 try:
587 |                     _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
588 |                 except NotImplementedError:
589 |                     pass
590 | 
591 |                 # accepts() should not have changed the file stream position
592 |                 assert (
593 |                     cur_pos == file_stream.tell()
594 |                 ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
595 | 
596 |                 # Attempt the conversion
597 |                 if _accepts:
598 |                     try:
599 |                         res = converter.convert(file_stream, stream_info, **_kwargs)
600 |                     except Exception:
601 |                         failed_attempts.append(
602 |                             FailedConversionAttempt(
603 |                                 converter=converter, exc_info=sys.exc_info()
604 |                             )
605 |                         )
606 |                     finally:
607 |                         file_stream.seek(cur_pos)
608 | 
609 |                 if res is not None:
610 |                     # Normalize the content
611 |                     res.text_content = "\n".join(
612 |                         [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
613 |                     )
614 |                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
615 |                     return res
616 | 
617 |         # If we got this far without success, report any exceptions
618 |         if len(failed_attempts) > 0:
619 |             raise FileConversionException(attempts=failed_attempts)
620 | 
621 |         # Nothing can handle it!
622 |         raise UnsupportedFormatException(
623 |             "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
624 |         )
625 | 
626 |     def register_page_converter(self, converter: DocumentConverter) -> None:
627 |         """DEPRECATED: User register_converter instead."""
628 |         warn(
629 |             "register_page_converter is deprecated. Use register_converter instead.",
630 |             DeprecationWarning,
631 |         )
632 |         self.register_converter(converter)
633 | 
634 |     def register_converter(
635 |         self,
636 |         converter: DocumentConverter,
637 |         *,
638 |         priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
639 |     ) -> None:
640 |         """
641 |         Register a DocumentConverter with a given priority.
642 | 
643 |         Priorities work as follows: By default, most converters get priority
644 |         PRIORITY_SPECIFIC_FILE_FORMAT (== 0.0). The exceptions
645 |         are the PlainTextConverter, HtmlConverter, and ZipConverter, which get
646 |         priority PRIORITY_GENERIC_FILE_FORMAT (== 10.0), with lower values
647 |         being tried first (i.e., taking precedence).
648 | 
649 |         Just prior to conversion, the converters are sorted by priority, using
650 |         a stable sort. This means that converters with the same priority will
651 |         remain in the same order, with the most recently registered converters
652 |         appearing first.
653 | 
654 |         We have tight control over the order of built-in converters, but
655 |         plugins can register converters in any order. The registration's priority
656 |         field reasserts some control over the order of converters.
657 | 
658 |         Plugins can register converters with any priority, to appear before or
659 |         after the built-ins. For example, a plugin converter with priority 9 will
660 |         run before the PlainTextConverter, but after the format-specific built-in converters.
661 |         """
662 |         self._converters.insert(
663 |             0, ConverterRegistration(converter=converter, priority=priority)
664 |         )
665 | 
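    # Illustrative registration (sketch; MyConverter is hypothetical): per the
    # docstring above, priority 9.0 sorts before the generic converters (10.0)
    # but after the format-specific ones (0.0):
    #
    #     md.register_converter(MyConverter(), priority=9.0)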
666 |     def _get_stream_info_guesses(
667 |         self, file_stream: BinaryIO, base_guess: StreamInfo
668 |     ) -> List[StreamInfo]:
669 |         """
670 |         Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika).
671 |         """
672 |         guesses: List[StreamInfo] = []
673 | 
674 |         # Enhance the base guess with information based on the extension or mimetype
675 |         enhanced_guess = base_guess.copy_and_update()
676 | 
677 |         # If there's an extension and no mimetype, try to guess the mimetype
678 |         if base_guess.mimetype is None and base_guess.extension is not None:
679 |             _m, _ = mimetypes.guess_type(
680 |                 "placeholder" + base_guess.extension, strict=False
681 |             )
682 |             if _m is not None:
683 |                 enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)
684 | 
685 |         # If there's a mimetype and no extension, try to guess the extension
686 |         if base_guess.mimetype is not None and base_guess.extension is None:
687 |             _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
688 |             if len(_e) > 0:
689 |                 enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])
690 | 
691 |         # Call magika to guess from the stream
692 |         cur_pos = file_stream.tell()
693 |         try:
694 |             result = self._magika.identify_stream(file_stream)
695 |             if result.status == "ok" and result.prediction.output.label != "unknown":
696 |                 # If it's text, also guess the charset
697 |                 charset = None
698 |                 if result.prediction.output.is_text:
699 |                     # Read the first 4k to guess the charset
700 |                     file_stream.seek(cur_pos)
701 |                     stream_page = file_stream.read(4096)
702 |                     charset_result = charset_normalizer.from_bytes(stream_page).best()
703 | 
704 |                     if charset_result is not None:
705 |                         charset = self._normalize_charset(charset_result.encoding)
706 | 
707 |                 # Normalize the first extension listed
708 |                 guessed_extension = None
709 |                 if len(result.prediction.output.extensions) > 0:
710 |                     guessed_extension = "." + result.prediction.output.extensions[0]
711 | 
712 |                 # Determine if the guess is compatible with the base guess
713 |                 compatible = True
714 |                 if (
715 |                     base_guess.mimetype is not None
716 |                     and base_guess.mimetype != result.prediction.output.mime_type
717 |                 ):
718 |                     compatible = False
719 | 
720 |                 if (
721 |                     base_guess.extension is not None
722 |                     and base_guess.extension.lstrip(".")
723 |                     not in result.prediction.output.extensions
724 |                 ):
725 |                     compatible = False
726 | 
727 |                 if (
728 |                     base_guess.charset is not None
729 |                     and self._normalize_charset(base_guess.charset) != charset
730 |                 ):
731 |                     compatible = False
732 | 
733 |                 if compatible:
734 |                     # Add the compatible base guess
735 |                     guesses.append(
736 |                         StreamInfo(
737 |                             mimetype=base_guess.mimetype
738 |                             or result.prediction.output.mime_type,
739 |                             extension=base_guess.extension or guessed_extension,
740 |                             charset=base_guess.charset or charset,
741 |                             filename=base_guess.filename,
742 |                             local_path=base_guess.local_path,
743 |                             url=base_guess.url,
744 |                         )
745 |                     )
746 |                 else:
747 |                     # The magika guess was incompatible with the base guess, so add both guesses
748 |                     guesses.append(enhanced_guess)
749 |                     guesses.append(
750 |                         StreamInfo(
751 |                             mimetype=result.prediction.output.mime_type,
752 |                             extension=guessed_extension,
753 |                             charset=charset,
754 |                             filename=base_guess.filename,
755 |                             local_path=base_guess.local_path,
756 |                             url=base_guess.url,
757 |                         )
758 |                     )
759 |             else:
760 |                 # Magika gave no usable guess, so fall back to the enhanced base guess
761 |                 guesses.append(enhanced_guess)
762 |         finally:
763 |             file_stream.seek(cur_pos)
764 | 
765 |         return guesses
766 | 
767 |     def _normalize_charset(self, charset: str | None) -> str | None:
768 |         """
769 |         Normalize a charset string to a canonical form.
770 |         """
771 |         if charset is None:
772 |             return None
773 |         try:
774 |             return codecs.lookup(charset).name
775 |         except LookupError:
776 |             return charset
777 | 
```
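
A few illustrative notes on this part of the file follow. First, the normalization pass after a successful conversion: the loop above trims trailing whitespace from every line and collapses runs of three or more newlines into a single blank line. A standalone sketch of those same two regex passes (the helper name here is ours, not the library's):

```python
import re

def normalize_markdown(text: str) -> str:
    # Split on \n or \r\n and strip trailing whitespace from each line.
    text = "\n".join(line.rstrip() for line in re.split(r"\r?\n", text))
    # Collapse three or more consecutive newlines into a single blank line.
    return re.sub(r"\n{3,}", "\n\n", text)

assert normalize_markdown("a  \r\nb\n\n\n\nc") == "a\nb\n\nc"
```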
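The loop's tail distinguishes two failure modes: converters that accepted the stream but raised (collected as `FailedConversionAttempt`s and re-raised as a `FileConversionException`), versus no converter accepting the stream at all (`UnsupportedFormatException`). A minimal sketch of handling both from calling code, assuming the exceptions are exported from the top-level `markitdown` package and that `FileConversionException` exposes its `attempts` list:

```python
from markitdown import (
    MarkItDown,
    FileConversionException,
    UnsupportedFormatException,
)

md = MarkItDown()
try:
    result = md.convert("example.xyz")  # hypothetical input file
    print(result.text_content)
except UnsupportedFormatException:
    # No registered converter accepted the stream at all.
    print("File type not supported.")
except FileConversionException as exc:
    # At least one converter accepted the stream but raised; each failed
    # attempt records the converter and its exc_info for diagnostics.
    print(f"Conversion failed after {len(exc.attempts)} attempt(s).")
```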
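`register_converter` inserts at index 0, and the converters are later stable-sorted by priority, so among converters with equal priority the most recently registered wins. A sketch of a plugin-style registration; the converter class is hypothetical, and `DocumentConverter`/`DocumentConverterResult` are assumed to be importable from the top-level package:

```python
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult

class UppercaseTextConverter(DocumentConverter):
    """Hypothetical converter, purely for illustration."""

    def accepts(self, file_stream, stream_info, **kwargs):
        return (stream_info.extension or "").lower() == ".txt"

    def convert(self, file_stream, stream_info, **kwargs):
        text = file_stream.read().decode(stream_info.charset or "utf-8")
        return DocumentConverterResult(markdown=text.upper())

md = MarkItDown()
# Priority 9 sorts after the format-specific built-ins (priority 0) but
# before the generic PlainTextConverter/HtmlConverter/ZipConverter (10).
md.register_converter(UppercaseTextConverter(), priority=9.0)
```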
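`_get_stream_info_guesses` first rounds out the base guess with the standard-library `mimetypes` module: a bare extension becomes a guessable filename by prepending a placeholder, and a missing extension is filled with the first result of `guess_all_extensions`. For example:

```python
import mimetypes

# Extension -> mimetype: guess_type() wants a filename, hence the placeholder.
mime, _ = mimetypes.guess_type("placeholder" + ".html", strict=False)
print(mime)  # "text/html"

# Mimetype -> extension: the first candidate is taken; ordering is
# platform-dependent.
extensions = mimetypes.guess_all_extensions("text/html", strict=False)
print(extensions[0])  # e.g. ".html"
```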
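For streams magika identifies as text, the charset is then sniffed from the first 4 KiB with `charset_normalizer`. The same call pattern in isolation:

```python
import charset_normalizer

sample = "héllo wörld".encode("utf-8")  # stand-in for the first 4 KiB
best = charset_normalizer.from_bytes(sample).best()
if best is not None:
    print(best.encoding)  # e.g. "utf_8"
```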
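Finally, `_normalize_charset` relies on `codecs.lookup` to resolve encoding aliases to one canonical name, so equivalent charsets spelled differently compare equal in the compatibility check, while labels the codec registry does not know pass through unchanged:

```python
import codecs

# Aliases resolve to the codec's canonical name...
assert codecs.lookup("UTF8").name == "utf-8"
assert codecs.lookup("latin-1").name == "iso8859-1"

# ...while unknown labels raise LookupError, which _normalize_charset
# catches, returning the original string as-is.
```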