This is page 2 of 5. Use http://codebase.md/microsoft/markitdown?page={x} to view the full context. # Directory Structure ``` ├── .devcontainer │ └── devcontainer.json ├── .dockerignore ├── .gitattributes ├── .github │ ├── dependabot.yml │ └── workflows │ ├── pre-commit.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── packages │ ├── markitdown │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── src │ │ │ └── markitdown │ │ │ ├── __about__.py │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── _base_converter.py │ │ │ ├── _exceptions.py │ │ │ ├── _markitdown.py │ │ │ ├── _stream_info.py │ │ │ ├── _uri_utils.py │ │ │ ├── converter_utils │ │ │ │ ├── __init__.py │ │ │ │ └── docx │ │ │ │ ├── __init__.py │ │ │ │ ├── math │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── latex_dict.py │ │ │ │ │ └── omml.py │ │ │ │ └── pre_process.py │ │ │ ├── converters │ │ │ │ ├── __init__.py │ │ │ │ ├── _audio_converter.py │ │ │ │ ├── _bing_serp_converter.py │ │ │ │ ├── _csv_converter.py │ │ │ │ ├── _doc_intel_converter.py │ │ │ │ ├── _docx_converter.py │ │ │ │ ├── _epub_converter.py │ │ │ │ ├── _exiftool.py │ │ │ │ ├── _html_converter.py │ │ │ │ ├── _image_converter.py │ │ │ │ ├── _ipynb_converter.py │ │ │ │ ├── _llm_caption.py │ │ │ │ ├── _markdownify.py │ │ │ │ ├── _outlook_msg_converter.py │ │ │ │ ├── _pdf_converter.py │ │ │ │ ├── _plain_text_converter.py │ │ │ │ ├── _pptx_converter.py │ │ │ │ ├── _rss_converter.py │ │ │ │ ├── _transcribe_audio.py │ │ │ │ ├── _wikipedia_converter.py │ │ │ │ ├── _xlsx_converter.py │ │ │ │ ├── _youtube_converter.py │ │ │ │ └── _zip_converter.py │ │ │ └── py.typed │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── _test_vectors.py │ │ │ ├── test_cli_misc.py │ │ │ ├── test_cli_vectors.py │ │ │ ├── test_docintel_html.py │ │ │ ├── test_files │ │ │ │ ├── equations.docx │ │ │ │ ├── random.bin │ │ │ │ ├── test_blog.html │ │ │ │ ├── test_files.zip │ │ │ │ ├── test_llm.jpg │ │ │ │ ├── test_mskanji.csv │ │ │ │ ├── test_notebook.ipynb │ │ │ │ ├── test_outlook_msg.msg │ │ │ │ ├── test_rss.xml │ │ │ │ ├── test_serp.html │ │ │ │ ├── test_wikipedia.html │ │ │ │ ├── test_with_comment.docx │ │ │ │ ├── test.docx │ │ │ │ ├── test.epub │ │ │ │ ├── test.jpg │ │ │ │ ├── test.json │ │ │ │ ├── test.m4a │ │ │ │ ├── test.mp3 │ │ │ │ ├── test.pdf │ │ │ │ ├── test.pptx │ │ │ │ ├── test.wav │ │ │ │ ├── test.xls │ │ │ │ └── test.xlsx │ │ │ ├── test_module_misc.py │ │ │ └── test_module_vectors.py │ │ └── ThirdPartyNotices.md │ ├── markitdown-mcp │ │ ├── Dockerfile │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── src │ │ │ └── markitdown_mcp │ │ │ ├── __about__.py │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ └── py.typed │ │ └── tests │ │ └── __init__.py │ └── markitdown-sample-plugin │ ├── pyproject.toml │ ├── README.md │ ├── src │ │ └── markitdown_sample_plugin │ │ ├── __about__.py │ │ ├── __init__.py │ │ ├── _plugin.py │ │ └── py.typed │ └── tests │ ├── __init__.py │ ├── test_files │ │ └── test.rtf │ └── test_sample_plugin.py ├── README.md ├── SECURITY.md └── SUPPORT.md ``` # Files -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py: -------------------------------------------------------------------------------- ```python # -*- coding: utf-8 -*- """ Office Math Markup Language (OMML) Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py On 25/03/2025 """ from defusedxml import ElementTree as ET from .latex_dict import ( CHARS, CHR, 
    CHR_BO,
    CHR_DEFAULT,
    POS,
    POS_DEFAULT,
    SUB,
    SUP,
    F,
    F_DEFAULT,
    T,
    FUNC,
    D,
    D_DEFAULT,
    RAD,
    RAD_DEFAULT,
    ARR,
    LIM_FUNC,
    LIM_TO,
    LIM_UPP,
    M,
    BRK,
    BLANK,
    BACKSLASH,
    ALN,
    FUNC_PLACE,
)

OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"


def load(stream):
    tree = ET.parse(stream)
    for omath in tree.findall(OMML_NS + "oMath"):
        yield oMath2Latex(omath)


def load_string(string):
    root = ET.fromstring(string)
    for omath in root.findall(OMML_NS + "oMath"):
        yield oMath2Latex(omath)


def escape_latex(strs):
    last = None
    new_chr = []
    strs = strs.replace(r"\\", "\\")
    for c in strs:
        if (c in CHARS) and (last != BACKSLASH):
            new_chr.append(BACKSLASH + c)
        else:
            new_chr.append(c)
        last = c
    return BLANK.join(new_chr)


def get_val(key, default=None, store=CHR):
    if key is not None:
        return key if not store else store.get(key, key)
    else:
        return default


class Tag2Method(object):
    def call_method(self, elm, stag=None):
        getmethod = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        method = getmethod(stag)
        if method:
            return method(self, elm)
        else:
            return None

    def process_children_list(self, elm, include=None):
        """Process the children of elm; return an iterable."""
        for _e in list(elm):
            if OMML_NS not in _e.tag:
                continue
            stag = _e.tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            t = self.call_method(_e, stag=stag)
            if t is None:
                t = self.process_unknow(_e, stag)
                if t is None:
                    continue
            yield (stag, t, _e)

    def process_children_dict(self, elm, include=None):
        """Process the children of elm; return a dict."""
        latex_chars = dict()
        for stag, t, e in self.process_children_list(elm, include):
            latex_chars[stag] = t
        return latex_chars

    def process_children(self, elm, include=None):
        """Process the children of elm; return a string."""
        return BLANK.join(
            (
                t if not isinstance(t, Tag2Method) else str(t)
                for stag, t, e in self.process_children_list(elm, include)
            )
        )

    def process_unknow(self, elm, stag):
        return None


class Pr(Tag2Method):
    """Common properties of an element."""

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        return self.__str__()

    def __getattr__(self, name):
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }


class oMath2Latex(Tag2Method):
    """Convert an oMath element of OMML to LaTeX."""

    _t_dict = T

    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        return self.__str__()

    def process_unknow(self, elm, stag):
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == "Pr":
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        return self._latex

    def do_acc(self, elm):
        """The accent function."""
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """The bar function."""
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """The delimiter object."""
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")
        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        return pr.text + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )

    def do_spre(self, elm):
        """The Pre-Sub-Superscript object -- not supported yet."""
        pass

    def do_sub(self, elm):
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """The fraction object."""
        c_dict = self.process_children_dict(elm)
        pr = c_dict["fPr"]
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

    def do_func(self, elm):
        """The Function-Apply object (examples: sin, cos)."""
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        return func_name.replace(FUNC_PLACE, c_dict.get("e"))

    def do_fname(self, elm):
        """The function name."""
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    raise NotImplementedError("Unsupported function: %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """The Group-Character object."""
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        latex_s = get_val(pr.chr)
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """The radical object."""
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """The Array object."""
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """The Lower-Limit object."""
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise NotImplementedError("Unsupported limit: %s" % t_dict["e"])
        else:
            return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """The Upper-Limit object."""
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """The lower limit of the limLow object and the upper limit of the limUpp function."""
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """The Matrix object."""
        rows = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "mPr":
                pass
            elif stag == "mr":
                rows.append(t)
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """A single row of the matrix m."""
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """The n-ary object."""
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                bo = get_val(t.chr, store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def do_r(self, elm):
        """
        Get text from the 'r' element, and try to convert it to LaTeX symbols.

        @todo text style support, (sty)
        @todo \text (latex pure text support)
        """
        _str = []
        for s in elm.findtext("./{0}t".format(OMML_NS)):
            # s = s if isinstance(s,unicode) else unicode(s,'utf-8')
            _str.append(self._t_dict.get(s, s))
        return escape_latex(BLANK.join(_str))

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }
```

--------------------------------------------------------------------------------
/packages/markitdown/ThirdPartyNotices.md:
--------------------------------------------------------------------------------

```markdown
# THIRD-PARTY SOFTWARE NOTICES AND INFORMATION

**Do Not Translate or Localize**

This project incorporates components from the projects listed below. The original copyright notices and the licenses under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise.

1. dwml (https://github.com/xiilei/dwml)

dwml NOTICES AND INFORMATION BEGIN HERE
-----------------------------------------

NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including placeholders for the copyright owner and year.

NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented. The following section summarizes these changes. The full details are available in the MarkItDown source code repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160).

This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications (they live under `packages/markitdown/src/markitdown/converter_utils/docx/math`). However, we have reformatted the code with the `black` code formatter. From the `tests/docx.py` file, we used only the `DOCXML_ROOT` XML namespaces; the rest of the file is not used.

-----------------------------------------

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ----------------------------------------- END OF dwml NOTICES AND INFORMATION ``` -------------------------------------------------------------------------------- /packages/markitdown/tests/test_module_misc.py: -------------------------------------------------------------------------------- ```python #!/usr/bin/env python3 -m pytest import io import os import re import shutil import pytest from unittest.mock import MagicMock from markitdown._uri_utils import parse_data_uri, file_uri_to_path from markitdown import ( MarkItDown, UnsupportedFormatException, FileConversionException, StreamInfo, ) # This file contains module tests that are not directly tested by the FileTestVectors. # This includes things like helper functions and runtime conversion options # (e.g., LLM clients, exiftool path, transcription services, etc.) 
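# NOTE: The skip flags below gate tests on the runtime environment (CI, API
# keys, installed binaries) rather than on the code under test.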
skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
)  # Don't run these tests in CI

# Don't run the LLM tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True

# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

JPG_TEST_EXIFTOOL = {
    "Author": "AutoGen Authors",
    "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "Description": "AutoGen enables diverse LLM-based applications",
    "ImageSize": "1615x1967",
    "DateTimeOriginal": "2024:03:14 22:10:00",
}

MP3_TEST_EXIFTOOL = {
    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
    "Artist": "Artist Name Test String",
    "Album": "Album Name Test String",
    "SampleRate": "48000",
}

PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
    "While there is contemporaneous exploration of multi-agent approaches"
]

YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
YOUTUBE_TEST_STRINGS = [
    "## AutoGen FULL Tutorial with Python (Step-By-Step)",
    "This is an intermediate tutorial for installing and using AutoGen locally",
    "PT15M4S",
    "the model we're going to be using today is GPT 3.5 turbo",  # From the transcript
]

DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]

BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
    "an example where high cost can easily prevent a generic complex",
]

LLM_TEST_STRINGS = [
    "5bda1dd6",
]

PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]


# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
    """Validate the presence or absence of specific strings."""
    text_content = result.text_content.replace("\\", "")
    for string in expected_strings:
        assert string in text_content
    if exclude_strings:
        for string in exclude_strings:
            assert string not in text_content


def test_stream_info_operations() -> None:
    """Test operations performed on StreamInfo objects."""
    stream_info_original = StreamInfo(
        mimetype="mimetype.1",
        extension="extension.1",
        charset="charset.1",
        filename="filename.1",
        local_path="local_path.1",
        url="url.1",
    )

    # Check updating all attributes by keyword
    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
    for keyword in keywords:
        updated_stream_info = stream_info_original.copy_and_update(
            **{keyword: f"{keyword}.2"}
        )

        # Make sure the targeted attribute is updated
        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

        # Make sure the other attributes are unchanged
        for k in keywords:
            if k != keyword:
                assert getattr(stream_info_original, k) == getattr(
                    updated_stream_info, k
                )

    # Check updating all attributes by passing a new StreamInfo object
    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
    for keyword in keywords:
        updated_stream_info = stream_info_original.copy_and_update(
            StreamInfo(**{keyword: f"{keyword}.2"})
        )

        # Make sure the targeted attribute is updated
        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

        # Make sure the other attributes are unchanged
        for k in keywords:
            if k != keyword:
                assert getattr(stream_info_original, k) == getattr(
                    updated_stream_info, k
                )

    # Check mixing and matching
    updated_stream_info = stream_info_original.copy_and_update(
        StreamInfo(extension="extension.2", filename="filename.2"),
        mimetype="mimetype.3",
        charset="charset.3",
    )
    assert updated_stream_info.extension == "extension.2"
    assert updated_stream_info.filename == "filename.2"
    assert updated_stream_info.mimetype == "mimetype.3"
    assert updated_stream_info.charset == "charset.3"
    assert updated_stream_info.local_path == "local_path.1"
    assert updated_stream_info.url == "url.1"

    # Check multiple StreamInfo objects
    updated_stream_info = stream_info_original.copy_and_update(
        StreamInfo(extension="extension.4", filename="filename.5"),
        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
    )
    assert updated_stream_info.extension == "extension.4"
    assert updated_stream_info.filename == "filename.5"
    assert updated_stream_info.mimetype == "mimetype.6"
    assert updated_stream_info.charset == "charset.7"
    assert updated_stream_info.local_path == "local_path.1"
    assert updated_stream_info.url == "url.1"


def test_data_uris() -> None:
    # Test basic parsing of data URIs
    data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="
    mime_type, attributes, data = parse_data_uri(data_uri)
    assert mime_type == "text/plain"
    assert len(attributes) == 0
    assert data == b"Hello, World!"
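    # The mimetype may be omitted entirely; the bare "base64" marker is still
    # honored, and parse_data_uri reports the mimetype as None.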
data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ==" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type is None assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 1 assert attributes["charset"] == "utf-8" assert data == b"Hello, World!" data_uri = "data:,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type is None assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:text/plain,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 1 assert attributes["charset"] == "utf-8" assert data == b"Hello, World!" def test_file_uris() -> None: # Test file URI with an empty host file_uri = "file:///path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" # Test file URI with no host file_uri = "file:/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" # Test file URI with localhost file_uri = "file://localhost/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc == "localhost" assert path == "/path/to/file.txt" # Test file URI with query parameters file_uri = "file:///path/to/file.txt?param=value" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" # Test file URI with fragment file_uri = "file:///path/to/file.txt#fragment" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" def test_docx_comments() -> None: # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") result = markitdown_with_style_map.convert( os.path.join(TEST_FILES_DIR, "test_with_comment.docx") ) validate_strings(result, DOCX_COMMENT_TEST_STRINGS) def test_docx_equations() -> None: markitdown = MarkItDown() docx_file = os.path.join(TEST_FILES_DIR, "equations.docx") result = markitdown.convert(docx_file) # Check for inline equation m=1 (wrapped with single $) is present assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found" # Find block equations wrapped with double $$ and check if they are present block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content) assert block_equations, "No block equations found in the document." 
def test_input_as_strings() -> None:
    markitdown = MarkItDown()

    # Test input from a stream
    input_data = b"<html><body><h1>Test</h1></body></html>"
    result = markitdown.convert_stream(io.BytesIO(input_data))
    assert "# Test" in result.text_content

    # Test input with leading blank characters
    input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
    result = markitdown.convert_stream(io.BytesIO(input_data))
    assert "# Test" in result.text_content


@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
def test_markitdown_remote() -> None:
    markitdown = MarkItDown()

    # By URL
    result = markitdown.convert(PDF_TEST_URL)
    for test_string in PDF_TEST_STRINGS:
        assert test_string in result.text_content

    # Youtube
    result = markitdown.convert(YOUTUBE_TEST_URL)
    for test_string in YOUTUBE_TEST_STRINGS:
        assert test_string in result.text_content


@pytest.mark.skipif(
    skip_remote,
    reason="do not run speech transcription tests that query remote services",
)
def test_speech_transcription() -> None:
    markitdown = MarkItDown()

    # Test WAV, MP3, and M4A files
    for file_name in ["test.wav", "test.mp3", "test.m4a"]:
        result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
        result_lower = result.text_content.lower()
        assert (
            ("1" in result_lower or "one" in result_lower)
            and ("2" in result_lower or "two" in result_lower)
            and ("3" in result_lower or "three" in result_lower)
            and ("4" in result_lower or "four" in result_lower)
            and ("5" in result_lower or "five" in result_lower)
        )


def test_exceptions() -> None:
    # Check that an exception is raised when trying to convert an unsupported format
    markitdown = MarkItDown()
    with pytest.raises(UnsupportedFormatException):
        markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin"))

    # Check that an exception is raised when trying to convert a corrupted file
    with pytest.raises(FileConversionException) as exc_info:
        markitdown.convert(
            os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx"
        )
    assert len(exc_info.value.attempts) == 1
    assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"


@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
    which_exiftool = shutil.which("exiftool")
    assert which_exiftool is not None

    # Test explicitly setting the location of exiftool
    markitdown = MarkItDown(exiftool_path=which_exiftool)
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content

    # Test setting the exiftool path through an environment variable
    os.environ["EXIFTOOL_PATH"] = which_exiftool
    markitdown = MarkItDown()
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content

    # Test some other media types
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3"))
    for key in MP3_TEST_EXIFTOOL:
        target = f"{key}: {MP3_TEST_EXIFTOOL[key]}"
        assert target in result.text_content


def test_markitdown_llm_parameters() -> None:
    """Test that LLM parameters are correctly passed to the client."""
    mock_client = MagicMock()
    mock_response = MagicMock()
    mock_response.choices = [
        MagicMock(
            message=MagicMock(
                content="Test caption with red circle and blue square 5bda1dd6"
            )
        )
    ]
    mock_client.chat.completions.create.return_value = mock_response

    test_prompt = "You are a professional test prompt."
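    # Wire the mocked client into MarkItDown: the conversions below should hit
    # the mock, never a real endpoint.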
markitdown = MarkItDown( llm_client=mock_client, llm_model="gpt-4o", llm_prompt=test_prompt ) # Test image file markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) # Verify the prompt was passed to the OpenAI API assert mock_client.chat.completions.create.called call_args = mock_client.chat.completions.create.call_args messages = call_args[1]["messages"] assert len(messages) == 1 assert messages[0]["content"][0]["text"] == test_prompt # Reset the mock for the next test mock_client.chat.completions.create.reset_mock() # TODO: may only use one test after the llm caption method duplicate has been removed: # https://github.com/microsoft/markitdown/pull/1254 # Test PPTX file markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) # Verify the prompt was passed to the OpenAI API for PPTX images too assert mock_client.chat.completions.create.called call_args = mock_client.chat.completions.create.call_args messages = call_args[1]["messages"] assert len(messages) == 1 assert messages[0]["content"][0]["text"] == test_prompt @pytest.mark.skipif( skip_llm, reason="do not run llm tests without a key", ) def test_markitdown_llm() -> None: client = openai.OpenAI() markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) for test_string in LLM_TEST_STRINGS: assert test_string in result.text_content # This is not super precise. It would also accept "red square", "blue circle", # "the square is not blue", etc. But it's sufficient for this test. for test_string in ["red", "circle", "blue", "square"]: assert test_string in result.text_content.lower() # Images embedded in PPTX files result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) # LLM Captions are included for test_string in LLM_TEST_STRINGS: assert test_string in result.text_content # Standard alt text is included validate_strings(result, PPTX_TEST_STRINGS) if __name__ == "__main__": """Runs this file's tests from the command line.""" for test in [ test_stream_info_operations, test_data_uris, test_file_uris, test_docx_comments, test_input_as_strings, test_markitdown_remote, test_speech_transcription, test_exceptions, test_markitdown_exiftool, test_markitdown_llm_parameters, test_markitdown_llm, ]: print(f"Running {test.__name__}...", end="") test() print("OK") print("All tests passed!") ``` -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_blog.html: -------------------------------------------------------------------------------- ```html <!doctype html> <html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false"> <head> <meta charset="UTF-8"> <meta name="generator" content="Docusaurus v3.1.1"> <title data-rh="true">Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH | AutoGen</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"><meta data-rh="true" name="description" content="level 2 algebra"><meta data-rh="true" property="og:description" content="level 2 algebra"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2023-04-21T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://www.linkedin.com/in/chi-wang-49b15b16/"><meta data-rh="true" property="article:tag" content="LLM,GPT,research"><link data-rh="true" rel="icon" href="/autogen/img/ag.ico"><link data-rh="true" rel="canonical" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="en"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="x-default"><link rel="alternate" type="application/rss+xml" href="/autogen/blog/rss.xml" title="AutoGen RSS Feed"> <link rel="alternate" type="application/atom+xml" href="/autogen/blog/atom.xml" title="AutoGen Atom Feed"> <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc" crossorigin="anonymous"> <script src="/autogen/js/custom.js" async defer="defer"></script><link rel="stylesheet" href="/autogen/assets/css/styles.ca10f300.css"> <script src="/autogen/assets/js/runtime~main.83ab9fec.js" defer="defer"></script> <script src="/autogen/assets/js/main.5d28c826.js" defer="defer"></script> </head> <body class="navigation-with-keyboard"> <script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"light")}(),function(){try{const a=new URLSearchParams(window.location.search).entries();for(var[t,e]of a)if(t.startsWith("docusaurus-data-")){var n=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(n,e)}}catch(t){}}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:#fafbfc;color:#091E42" role="banner"><div class="announcementBarPlaceholder_vyr4"></div><div class="content_knG7 announcementBarContent_xLdY">What's new in AutoGen? 
Read <a href="/autogen/blog/2024/03/03/AutoGen-Update">this blog</a> for an overview of updates</div><button type="button" aria-label="Close" class="clean-btn close closeButton_CVFx announcementBarClose_gvF7"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/autogen/"><div class="navbar__logo"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div><b class="navbar__title text--truncate">AutoGen</b></a><a class="navbar__item navbar__link" href="/autogen/docs/Getting-Started">Docs</a><a class="navbar__item navbar__link" href="/autogen/docs/reference/agentchat/conversable_agent">API</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/autogen/blog">Blog</a><a class="navbar__item navbar__link" href="/autogen/docs/FAQ">FAQ</a><a class="navbar__item navbar__link" href="/autogen/docs/Examples">Examples</a><a class="navbar__item navbar__link" href="/autogen/docs/notebooks">Notebooks</a><a class="navbar__item navbar__link" href="/autogen/docs/Gallery">Gallery</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Other Languages</a><ul class="dropdown__menu"><li><a href="https://microsoft.github.io/autogen-for-net/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Dotnet<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><a href="https://github.com/microsoft/autogen" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z 
M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"><div class="navbar__search searchBarContainer_NW3z"><input placeholder="Search" aria-label="Search" class="navbar__search-input"><div class="loadingRing_RJI3 searchBarLoadingRing_YnHq"><div></div><div></div><div></div><div></div></div><div class="searchHintContainer_Pkmr"><kbd class="searchHint_iIMx">ctrl</kbd><kbd class="searchHint_iIMx">K</kbd></div></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/03/03/AutoGen-Update">What's New in AutoGen?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/29/StateFlow">StateFlow - Build LLM Workflows with Customized State-Oriented Transition Function in GroupChat</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/11/FSM-GroupChat">FSM Group Chat -- User-specified agent transitions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/02/AutoAnny">Anny: Assisting AutoGen Devs Via AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/26/Custom-Models">AutoGen with Custom Models: Empowering Users to Use Their Own Inference Mechanism</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/25/AutoGenBench">AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/23/Code-execution-in-docker">Code execution is now by default inside docker container</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/29/AgentDescriptions">All About Agent Descriptions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/23/AgentOptimizer">AgentOptimizer - An Agentic Way to Train Your LLM Agent</a></li><li 
class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/01/AutoGenStudio">AutoGen Studio: Interactively Explore Multi-Agent Workflows</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/26/Agent-AutoBuild">Agent AutoBuild - Automatically Building Multi-agent Systems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/20/AgentEval">How to Assess Utility of LLM-powered Applications?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/13/OAI-assistants">AutoGen Meets GPTs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/09/EcoAssistant">EcoAssistant - Using LLM Assistants More Accurately and Affordably</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/06/LMM-Agent">Multimodal with GPT-4V and LLaVA</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/26/TeachableAgent">AutoGen's Teachable Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/18/RetrieveChat">Retrieval-Augmented Generation (RAG) Applications with AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/07/14/Local-LLMs">Use AutoGen for Local LLMs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/06/28/MathChat">MathChat - An Conversational Framework to Solve Math Problems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval">Achieve More, Pay Less - Use GPT-4 Smartly</a></li><li class="sidebarItem__DBe"><a aria-current="page" class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href="/autogen/blog/2023/04/21/LLM-tuning-math">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="level 2 algebra"><header><h1 class="title_f1Hy" itemprop="headline">Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH</h1><div class="container_mt6G margin-vert--md"><time datetime="2023-04-21T00:00:00.000Z" itemprop="datePublished">April 21, 2023</time> · <!-- -->6 min read</div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="https://github.com/sonichi.png" alt="Chi Wang" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Chi Wang</span></a></div><small class="avatar__subtitle" itemprop="description">Principal Researcher at Microsoft Research</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><p><img decoding="async" loading="lazy" alt="level 2 algebra" src="/autogen/assets/images/level2algebra-659ba95286432d9945fc89e84d606797.png" width="575" height="469" class="img_ev3q"></p> <p><strong>TL;DR:</strong></p> <ul> <li><strong>Just by tuning the inference parameters like model, number of responses, temperature etc. without changing any model weights or prompt, the baseline accuracy of untuned gpt-4 can be improved by 20% in high school math competition problems.</strong></li> <li><strong>For easy problems, the tuned gpt-3.5-turbo model vastly outperformed untuned gpt-4 in accuracy (e.g., 90% vs. 70%) and cost efficiency. For hard problems, the tuned gpt-4 is much more accurate (e.g., 35% vs. 20%) and less expensive than untuned gpt-4.</strong></li> <li><strong>AutoGen can help with model selection, parameter tuning, and cost-saving in LLM applications.</strong></li> </ul> <p>Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?</p> <p>In this blog post, we will explore how model and inference parameter matter in LLM applications, using a case study for <a href="https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html" target="_blank" rel="noopener noreferrer">MATH</a>, a benchmark for evaluating LLMs on advanced mathematical problem solving. MATH consists of 12K math competition problems from AMC-10, AMC-12 and AIME. Each problem is accompanied by a step-by-step solution.</p> <p>We will use AutoGen to automatically find the best model and inference parameter for LLMs on a given task and dataset given an inference budget, using a novel low-cost search & pruning strategy. AutoGen currently supports all the LLMs from OpenAI, such as GPT-3.5 and GPT-4.</p> <p>We will use AutoGen to perform model selection and inference parameter tuning. Then we compare the performance and inference cost on solving algebra problems with the untuned gpt-4. 
We will also analyze how different difficulty levels affect the results.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-setup">Experiment Setup<a href="#experiment-setup" class="hash-link" aria-label="Direct link to Experiment Setup" title="Direct link to Experiment Setup"></a></h2> <p>We use AutoGen to select between the following models with a target inference budget $0.02 per instance:</p> <ul> <li>gpt-3.5-turbo, a relatively cheap model that powers the popular ChatGPT app</li> <li>gpt-4, the state of the art LLM that costs more than 10 times of gpt-3.5-turbo</li> </ul> <p>We adapt the models using 20 examples in the train set, using the problem statement as the input and generating the solution as the output. We use the following inference parameters:</p> <ul> <li>temperature: The parameter that controls the randomness of the output text. A higher temperature means more diversity but less coherence. We search for the optimal temperature in the range of [0, 1].</li> <li>top_p: The parameter that controls the probability mass of the output tokens. Only tokens with a cumulative probability less than or equal to top-p are considered. A lower top-p means more diversity but less coherence. We search for the optimal top-p in the range of [0, 1].</li> <li>max_tokens: The maximum number of tokens that can be generated for each output. We search for the optimal max length in the range of [50, 1000].</li> <li>n: The number of responses to generate. We search for the optimal n in the range of [1, 100].</li> <li>prompt: We use the template: "{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \boxed{{}}." where {problem} will be replaced by the math problem instance.</li> </ul> <p>In this experiment, when n > 1, we find the answer with highest votes among all the responses and then select it as the final answer to compare with the ground truth. For example, if n = 5 and 3 of the responses contain a final answer 301 while 2 of the responses contain a final answer 159, we choose 301 as the final answer. This can help with resolving potential errors due to randomness. We use the average accuracy and average inference cost as the metric to evaluate the performance over a dataset. The inference cost of a particular instance is measured by the price per 1K tokens and the number of tokens consumed.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-results">Experiment Results<a href="#experiment-results" class="hash-link" aria-label="Direct link to Experiment Results" title="Direct link to Experiment Results"></a></h2> <p>The first figure in this blog post shows the average accuracy and average inference cost of each configuration on the level 2 Algebra test set.</p> <p>Surprisingly, the tuned gpt-3.5-turbo model is selected as a better model and it vastly outperforms untuned gpt-4 in accuracy (92% vs. 70%) with equal or 2.5 times higher inference budget. The same observation can be obtained on the level 3 Algebra test set.</p> <p><img decoding="async" loading="lazy" alt="level 3 algebra" src="/autogen/assets/images/level3algebra-94e87a683ac8832ac7ae6f41f30131a4.png" width="575" height="469" class="img_ev3q"></p> <p>However, the selected model changes on level 4 Algebra.</p> <p><img decoding="async" loading="lazy" alt="level 4 algebra" src="/autogen/assets/images/level4algebra-492beb22490df30d6cc258f061912dcd.png" width="580" height="469" class="img_ev3q"></p> <p>This time gpt-4 is selected as the best model. 
The tuned gpt-4 achieves much higher accuracy (56% vs. 44%) and lower cost than the untuned gpt-4. On level 5 the result is similar.</p> <p><img decoding="async" loading="lazy" alt="level 5 algebra" src="/autogen/assets/images/level5algebra-8fba701551334296d08580b4b489fe56.png" width="575" height="469" class="img_ev3q"></p> <p>We can see that AutoGen has found different optimal model and inference parameters for each subset of a particular level, which shows that these parameters matter in cost-sensitive LLM applications and need to be carefully tuned or adapted.</p> <p>An example notebook to run these experiments can be found at: <a href="https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb" target="_blank" rel="noopener noreferrer">https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb</a>. The experiments were run when AutoGen was a subpackage in FLAML.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="analysis-and-discussion">Analysis and Discussion<a href="#analysis-and-discussion" class="hash-link" aria-label="Direct link to Analysis and Discussion" title="Direct link to Analysis and Discussion"></a></h2> <p>While gpt-3.5-turbo demonstrates competitive accuracy with voted answers in relatively easy algebra problems under the same inference budget, gpt-4 is a better choice for the most difficult problems. In general, through parameter tuning and model selection, we can identify the opportunity to save the expensive model for more challenging tasks, and improve the overall effectiveness of a budget-constrained system.</p> <p>There are many other alternative ways of solving math problems, which we have not covered in this blog post. When there are choices beyond the inference parameters, they can be generally tuned via <a href="https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function" target="_blank" rel="noopener noreferrer"><code>flaml.tune</code></a>.</p> <p>The need for model selection, parameter tuning and cost saving is not specific to the math problems. The <a href="https://github.com/Significant-Gravitas/Auto-GPT" target="_blank" rel="noopener noreferrer">Auto-GPT</a> project is an example where high cost can easily prevent a generic complex task to be accomplished as it needs many LLM inference calls.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="for-further-reading">For Further Reading<a href="#for-further-reading" class="hash-link" aria-label="Direct link to For Further Reading" title="Direct link to For Further Reading"></a></h2> <ul> <li><a href="https://arxiv.org/abs/2303.04673" target="_blank" rel="noopener noreferrer">Research paper about the tuning technique</a></li> <li><a href="/autogen/docs/Use-Cases/enhanced_inference">Documentation about inference tuning</a></li> </ul> <p><em>Do you have any experience to share about LLM applications? Do you like to see more support or research of LLM optimization or automation? 
Please join our <a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer">Discord</a> server for discussion.</em></p></div></article></main></div></div></div></div> </body> </html>
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_markitdown.py:
--------------------------------------------------------------------------------

```python
import mimetypes
import os
import re
import sys
import shutil
import traceback
import io
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any, List, Dict, Optional, Union, BinaryIO
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn

import requests
import magika
import charset_normalizer
import codecs

from ._stream_info import StreamInfo
from ._uri_utils import parse_data_uri, file_uri_to_path

from .converters import (
    PlainTextConverter,
    HtmlConverter,
    RssConverter,
    WikipediaConverter,
    YouTubeConverter,
    IpynbConverter,
    BingSerpConverter,
    PdfConverter,
    DocxConverter,
    XlsxConverter,
    XlsConverter,
    PptxConverter,
    ImageConverter,
    AudioConverter,
    OutlookMsgConverter,
    ZipConverter,
    EpubConverter,
    DocumentIntelligenceConverter,
    CsvConverter,
)

from ._base_converter import DocumentConverter, DocumentConverterResult
from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
    FailedConversionAttempt,
)

# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
    0.0  # e.g., .docx, .pdf, .xlsx; or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
    10.0  # Near catch-all converters for mimetypes like text/*, etc.
)

_plugins: Union[None, List[Any]] = None  # If None, plugins have not been loaded yet.


def _load_plugins() -> Union[None, List[Any]]:
    """Lazy load plugins, exiting early if already loaded."""
    global _plugins

    # Skip if we've already loaded plugins
    if _plugins is not None:
        return _plugins

    # Load plugins
    _plugins = []
    for entry_point in entry_points(group="markitdown.plugin"):
        try:
            _plugins.append(entry_point.load())
        except Exception:
            tb = traceback.format_exc()
            warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
    return _plugins


@dataclass(kw_only=True, frozen=True)
class ConverterRegistration:
    """A registration of a converter with its priority and other metadata."""

    converter: DocumentConverter
    priority: float


class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""

    def __init__(
        self,
        *,
        enable_builtins: Union[None, bool] = None,
        enable_plugins: Union[None, bool] = None,
        **kwargs,
    ):
        self._builtins_enabled = False
        self._plugins_enabled = False

        requests_session = kwargs.get("requests_session")
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

        self._magika = magika.Magika()

        # TODO - remove these (see enable_builtins)
        self._llm_client: Any = None
        self._llm_model: Union[str | None] = None
        self._llm_prompt: Union[str | None] = None
        self._exiftool_path: Union[str | None] = None
        self._style_map: Union[str | None] = None

        # Register the converters
        self._converters: List[ConverterRegistration] = []

        if (
            enable_builtins is None or enable_builtins
        ):  # Default to True when not specified
            self.enable_builtins(**kwargs)

        if enable_plugins:
            self.enable_plugins(**kwargs)

    def enable_builtins(self, **kwargs) -> None:
        """
        Enable and register built-in converters.
        Built-in converters are enabled by default.
        This method should only be called once, if built-ins were initially disabled.
        """
        if not self._builtins_enabled:
            # TODO: Move these into converter constructors
            self._llm_client = kwargs.get("llm_client")
            self._llm_model = kwargs.get("llm_model")
            self._llm_prompt = kwargs.get("llm_prompt")
            self._exiftool_path = kwargs.get("exiftool_path")
            self._style_map = kwargs.get("style_map")

            if self._exiftool_path is None:
                self._exiftool_path = os.getenv("EXIFTOOL_PATH")

            # Still none? Check well-known paths
            if self._exiftool_path is None:
                candidate = shutil.which("exiftool")
                if candidate:
                    candidate = os.path.abspath(candidate)
                    if any(
                        d == os.path.dirname(candidate)
                        for d in [
                            "/usr/bin",
                            "/usr/local/bin",
                            "/opt",
                            "/opt/bin",
                            "/opt/local/bin",
                            "/opt/homebrew/bin",
                            "C:\\Windows\\System32",
                            "C:\\Program Files",
                            "C:\\Program Files (x86)",
                        ]
                    ):
                        self._exiftool_path = candidate

            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
            self.register_converter(
                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(
                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(
                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
            )
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
            self.register_converter(YouTubeConverter())
            self.register_converter(BingSerpConverter())
            self.register_converter(DocxConverter())
            self.register_converter(XlsxConverter())
            self.register_converter(XlsConverter())
            self.register_converter(PptxConverter())
            self.register_converter(AudioConverter())
            self.register_converter(ImageConverter())
            self.register_converter(IpynbConverter())
            self.register_converter(PdfConverter())
            self.register_converter(OutlookMsgConverter())
            self.register_converter(EpubConverter())
            self.register_converter(CsvConverter())

            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
            if docintel_endpoint is not None:
                docintel_args: Dict[str, Any] = {}
                docintel_args["endpoint"] = docintel_endpoint

                docintel_credential = kwargs.get("docintel_credential")
                if docintel_credential is not None:
                    docintel_args["credential"] = docintel_credential

                docintel_types = kwargs.get("docintel_file_types")
                if docintel_types is not None:
                    docintel_args["file_types"] = docintel_types

                docintel_version = kwargs.get("docintel_api_version")
                if docintel_version is not None:
                    docintel_args["api_version"] = docintel_version

                self.register_converter(
                    DocumentIntelligenceConverter(**docintel_args),
                )

            self._builtins_enabled = True
        else:
            warn("Built-in converters are already enabled.", RuntimeWarning)

    def enable_plugins(self, **kwargs) -> None:
        """
        Enable and register converters provided by plugins.
        Plugins are disabled by default.
        This method should only be called once, if plugins were initially disabled.
        """
        if not self._plugins_enabled:
            # Load plugins
            plugins = _load_plugins()
            assert plugins is not None
            for plugin in plugins:
                try:
                    plugin.register_converters(self, **kwargs)
                except Exception:
                    tb = traceback.format_exc()
                    warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
            self._plugins_enabled = True
        else:
            warn("Plugin converters are already enabled.", RuntimeWarning)

    def convert(
        self,
        source: Union[str, requests.Response, Path, BinaryIO],
        *,
        stream_info: Optional[StreamInfo] = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
            - source: can be a path (str or Path), a url, or a requests.Response object
            - stream_info: optional stream info to use for the conversion.
              If None, infer from source
            - kwargs: additional arguments to pass to the converter
        """
        # Local path or url
        if isinstance(source, str):
            if (
                source.startswith("http:")
                or source.startswith("https:")
                or source.startswith("file:")
                or source.startswith("data:")
            ):
                # Rename the url argument to mock_url
                # (Deprecated -- use stream_info)
                _kwargs = {k: v for k, v in kwargs.items()}
                if "url" in _kwargs:
                    _kwargs["mock_url"] = _kwargs["url"]
                    del _kwargs["url"]
                return self.convert_uri(source, stream_info=stream_info, **_kwargs)
            else:
                return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Path object
        elif isinstance(source, Path):
            return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, stream_info=stream_info, **kwargs)
        # Binary stream
        elif (
            hasattr(source, "read")
            and callable(source.read)
            and not isinstance(source, io.TextIOBase)
        ):
            return self.convert_stream(source, stream_info=stream_info, **kwargs)
        else:
            raise TypeError(
                f"Invalid source type: {type(source)}. Expected str, requests.Response, Path, or BinaryIO."
            )

    def convert_local(
        self,
        path: Union[str, Path],
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        url: Optional[str] = None,  # Deprecated -- use stream_info
        **kwargs: Any,
    ) -> DocumentConverterResult:
        if isinstance(path, Path):
            path = str(path)

        # Build a base StreamInfo object from which to start guesses
        base_guess = StreamInfo(
            local_path=path,
            extension=os.path.splitext(path)[1],
            filename=os.path.basename(path),
        )

        # Extend the base_guess with any additional info from the arguments
        if stream_info is not None:
            base_guess = base_guess.copy_and_update(stream_info)

        if file_extension is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(extension=file_extension)

        if url is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(url=url)

        with open(path, "rb") as fh:
            guesses = self._get_stream_info_guesses(
                file_stream=fh, base_guess=base_guess
            )
            return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)

    def convert_stream(
        self,
        stream: BinaryIO,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        url: Optional[str] = None,  # Deprecated -- use stream_info
        **kwargs: Any,
    ) -> DocumentConverterResult:
        guesses: List[StreamInfo] = []

        # Do we have anything on which to base a guess?
        base_guess = None
        if stream_info is not None or file_extension is not None or url is not None:
            # Start with a non-Null base guess
            if stream_info is None:
                base_guess = StreamInfo()
            else:
                base_guess = stream_info

            if file_extension is not None:
                # Deprecated -- use stream_info
                assert base_guess is not None  # for mypy
                base_guess = base_guess.copy_and_update(extension=file_extension)

            if url is not None:
                # Deprecated -- use stream_info
                assert base_guess is not None  # for mypy
                base_guess = base_guess.copy_and_update(url=url)

        # Check if we have a seekable stream. If not, load the entire stream into memory.
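        # (Converters need to rewind the stream while probing, so a non-seekable
        # source such as sys.stdin.buffer is first copied into a BytesIO buffer.)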
        if not stream.seekable():
            buffer = io.BytesIO()
            while True:
                chunk = stream.read(4096)
                if not chunk:
                    break
                buffer.write(chunk)
            buffer.seek(0)
            stream = buffer

        # Add guesses based on stream content
        guesses = self._get_stream_info_guesses(
            file_stream=stream, base_guess=base_guess or StreamInfo()
        )
        return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)

    def convert_url(
        self,
        url: str,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,
        mock_url: Optional[str] = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Alias for convert_uri()"""
        # convert_url will likely be deprecated in the future in favor of convert_uri
        return self.convert_uri(
            url,
            stream_info=stream_info,
            file_extension=file_extension,
            mock_url=mock_url,
            **kwargs,
        )

    def convert_uri(
        self,
        uri: str,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        mock_url: Optional[
            str
        ] = None,  # Mock the request as if it came from a different URL
        **kwargs: Any,
    ) -> DocumentConverterResult:
        uri = uri.strip()

        # File URIs
        if uri.startswith("file:"):
            netloc, path = file_uri_to_path(uri)
            if netloc and netloc != "localhost":
                raise ValueError(
                    f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
                )
            return self.convert_local(
                path,
                stream_info=stream_info,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        # Data URIs
        elif uri.startswith("data:"):
            mimetype, attributes, data = parse_data_uri(uri)

            base_guess = StreamInfo(
                mimetype=mimetype,
                charset=attributes.get("charset"),
            )
            if stream_info is not None:
                base_guess = base_guess.copy_and_update(stream_info)

            return self.convert_stream(
                io.BytesIO(data),
                stream_info=base_guess,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        # HTTP/HTTPS URIs
        elif uri.startswith("http:") or uri.startswith("https:"):
            response = self._requests_session.get(uri, stream=True)
            response.raise_for_status()
            return self.convert_response(
                response,
                stream_info=stream_info,
                file_extension=file_extension,
                url=mock_url,
                **kwargs,
            )
        else:
            raise ValueError(
                f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
            )

    def convert_response(
        self,
        response: requests.Response,
        *,
        stream_info: Optional[StreamInfo] = None,
        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
        url: Optional[str] = None,  # Deprecated -- use stream_info
        **kwargs: Any,
    ) -> DocumentConverterResult:
        # If there is a content-type header, get the mimetype and charset (if present)
        mimetype: Optional[str] = None
        charset: Optional[str] = None

        if "content-type" in response.headers:
            parts = response.headers["content-type"].split(";")
            mimetype = parts.pop(0).strip()
            for part in parts:
                if part.strip().startswith("charset="):
                    _charset = part.split("=")[1].strip()
                    if len(_charset) > 0:
                        charset = _charset

        # If there is a content-disposition header, get the filename and possibly the extension
        filename: Optional[str] = None
        extension: Optional[str] = None
        if "content-disposition" in response.headers:
            m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
            if m:
                filename = m.group(1).strip("\"'")
                _, _extension = os.path.splitext(filename)
                if len(_extension) > 0:
                    extension = _extension

        # If there is still no filename, try to read it from the url
        if filename is None:
            parsed_url = urlparse(response.url)
            _, _extension = os.path.splitext(parsed_url.path)
            if len(_extension) > 0:
                # Looks like this might be a file!
                filename = os.path.basename(parsed_url.path)
                extension = _extension

        # Create an initial guess from all this information
        base_guess = StreamInfo(
            mimetype=mimetype,
            charset=charset,
            filename=filename,
            extension=extension,
            url=response.url,
        )

        # Update with any additional info from the arguments
        if stream_info is not None:
            base_guess = base_guess.copy_and_update(stream_info)
        if file_extension is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(extension=file_extension)
        if url is not None:
            # Deprecated -- use stream_info
            base_guess = base_guess.copy_and_update(url=url)

        # Read into BytesIO
        buffer = io.BytesIO()
        for chunk in response.iter_content(chunk_size=512):
            buffer.write(chunk)
        buffer.seek(0)

        # Convert
        guesses = self._get_stream_info_guesses(
            file_stream=buffer, base_guess=base_guess
        )
        return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)

    def _convert(
        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
        res: Union[None, DocumentConverterResult] = None

        # Keep track of which converters throw exceptions
        failed_attempts: List[FailedConversionAttempt] = []

        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)

        # Remember the initial stream position so that we can return to it
        cur_pos = file_stream.tell()

        for stream_info in stream_info_guesses + [StreamInfo()]:
            for converter_registration in sorted_registrations:
                converter = converter_registration.converter

                # Sanity check -- make sure the cur_pos is still the same
                assert (
                    cur_pos == file_stream.tell()
                ), "File stream position should NOT change between guess iterations"

                _kwargs = {k: v for k, v in kwargs.items()}

                # Copy any additional global options
                if "llm_client" not in _kwargs and self._llm_client is not None:
                    _kwargs["llm_client"] = self._llm_client
                if "llm_model" not in _kwargs and self._llm_model is not None:
                    _kwargs["llm_model"] = self._llm_model
                if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
                    _kwargs["llm_prompt"] = self._llm_prompt
                if "style_map" not in _kwargs and self._style_map is not None:
                    _kwargs["style_map"] = self._style_map
                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                    _kwargs["exiftool_path"] = self._exiftool_path

                # Add the list of converters for nested processing
                _kwargs["_parent_converters"] = self._converters

                # Add legacy kwargs
                if stream_info is not None:
                    if stream_info.extension is not None:
                        _kwargs["file_extension"] = stream_info.extension
                    if stream_info.url is not None:
                        _kwargs["url"] = stream_info.url

                # Check if the converter will accept the file, and if so, try to convert it
                _accepts = False
                try:
                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
                except NotImplementedError:
                    pass

                # accepts() should not have changed the file stream position
                assert (
                    cur_pos == file_stream.tell()
                ), f"{type(converter).__name__}.accepts() should NOT change the file_stream position"

                # Attempt the conversion
                if _accepts:
                    try:
                        res = converter.convert(file_stream, stream_info, **_kwargs)
                    except Exception:
                        failed_attempts.append(
                            FailedConversionAttempt(
                                converter=converter, exc_info=sys.exc_info()
                            )
                        )
                    finally:
                        file_stream.seek(cur_pos)

                if res is not None:
                    # Normalize the content
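                    # (Trim trailing whitespace from each line, then collapse runs
                    # of three or more newlines into a single blank line.)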
                    res.text_content = "\n".join(
                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                    return res

        # If we got this far without success, report any exceptions
        if len(failed_attempts) > 0:
            raise FileConversionException(attempts=failed_attempts)

        # Nothing can handle it!
        raise UnsupportedFormatException(
            "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )

    def register_page_converter(self, converter: DocumentConverter) -> None:
        """DEPRECATED: Use register_converter instead."""
        warn(
            "register_page_converter is deprecated. Use register_converter instead.",
            DeprecationWarning,
        )
        self.register_converter(converter)

    def register_converter(
        self,
        converter: DocumentConverter,
        *,
        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
    ) -> None:
        """
        Register a DocumentConverter with a given priority.

        Priorities work as follows: By default, most converters get priority
        PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions are the
        PlainTextConverter, HtmlConverter, and ZipConverter, which get priority
        PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values being tried
        first (i.e., higher priority).

        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.

        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. The registration's
        priority field reasserts some control over the order of converters.

        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the specific built-in
        converters (priority 0).
        """
        self._converters.insert(
            0, ConverterRegistration(converter=converter, priority=priority)
        )

    def _get_stream_info_guesses(
        self, file_stream: BinaryIO, base_guess: StreamInfo
    ) -> List[StreamInfo]:
        """
        Given a base guess, attempt to guess or expand on the stream info
        using the stream content (via magika).
        """
        guesses: List[StreamInfo] = []

        # Enhance the base guess with information based on the extension or mimetype
        enhanced_guess = base_guess.copy_and_update()

        # If there's an extension and no mimetype, try to guess the mimetype
        if base_guess.mimetype is None and base_guess.extension is not None:
            _m, _ = mimetypes.guess_type(
                "placeholder" + base_guess.extension, strict=False
            )
            if _m is not None:
                enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)

        # If there's a mimetype and no extension, try to guess the extension
        if base_guess.mimetype is not None and base_guess.extension is None:
            _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
            if len(_e) > 0:
                enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])

        # Call magika to guess from the stream
        cur_pos = file_stream.tell()
        try:
            result = self._magika.identify_stream(file_stream)
            if result.status == "ok" and result.prediction.output.label != "unknown":
                # If it's text, also guess the charset
                charset = None
                if result.prediction.output.is_text:
                    # Read the first 4k to guess the charset
                    file_stream.seek(cur_pos)
                    stream_page = file_stream.read(4096)
                    charset_result = charset_normalizer.from_bytes(stream_page).best()

                    if charset_result is not None:
                        charset = self._normalize_charset(charset_result.encoding)

                # Normalize the first extension listed
                guessed_extension = None
                if len(result.prediction.output.extensions) > 0:
                    guessed_extension = "." + result.prediction.output.extensions[0]

                # Determine if the guess is compatible with the base guess
                compatible = True
                if (
                    base_guess.mimetype is not None
                    and base_guess.mimetype != result.prediction.output.mime_type
                ):
                    compatible = False

                if (
                    base_guess.extension is not None
                    and base_guess.extension.lstrip(".")
                    not in result.prediction.output.extensions
                ):
                    compatible = False

                if (
                    base_guess.charset is not None
                    and self._normalize_charset(base_guess.charset) != charset
                ):
                    compatible = False

                if compatible:
                    # Add the compatible base guess
                    guesses.append(
                        StreamInfo(
                            mimetype=base_guess.mimetype
                            or result.prediction.output.mime_type,
                            extension=base_guess.extension or guessed_extension,
                            charset=base_guess.charset or charset,
                            filename=base_guess.filename,
                            local_path=base_guess.local_path,
                            url=base_guess.url,
                        )
                    )
                else:
                    # The magika guess was incompatible with the base guess, so add both guesses
                    guesses.append(enhanced_guess)
                    guesses.append(
                        StreamInfo(
                            mimetype=result.prediction.output.mime_type,
                            extension=guessed_extension,
                            charset=charset,
                            filename=base_guess.filename,
                            local_path=base_guess.local_path,
                            url=base_guess.url,
                        )
                    )
            else:
                # Magika did not produce a usable guess, so just add the base guess
                guesses.append(enhanced_guess)
        finally:
            file_stream.seek(cur_pos)

        return guesses

    def _normalize_charset(self, charset: str | None) -> str | None:
        """
        Normalize a charset string to a canonical form.
        """
        if charset is None:
            return None
        try:
            return codecs.lookup(charset).name
        except LookupError:
            return charset
```