This is page 5 of 17. Use http://codebase.md/stanfordnlp/dspy?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── .internal_dspyai │ │ ├── internals │ │ │ ├── build-and-release.md │ │ │ └── release-checklist.md │ │ └── pyproject.toml │ ├── .tmp │ │ └── .generated-actions │ │ └── run-pypi-publish-in-docker-container │ │ └── action.yml │ ├── ISSUE_TEMPLATE │ │ ├── bug_report.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE │ │ └── pull_request_template.md │ ├── workflow_scripts │ │ └── install_testpypi_pkg.sh │ └── workflows │ ├── build_and_release.yml │ ├── build_utils │ │ └── test_version.py │ ├── docs-push.yml │ ├── precommits_check.yml │ └── run_tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── docs │ ├── .gitignore │ ├── docs │ │ ├── api │ │ │ ├── adapters │ │ │ │ ├── Adapter.md │ │ │ │ ├── ChatAdapter.md │ │ │ │ ├── JSONAdapter.md │ │ │ │ └── TwoStepAdapter.md │ │ │ ├── evaluation │ │ │ │ ├── answer_exact_match.md │ │ │ │ ├── answer_passage_match.md │ │ │ │ ├── CompleteAndGrounded.md │ │ │ │ ├── Evaluate.md │ │ │ │ ├── EvaluationResult.md │ │ │ │ └── SemanticF1.md │ │ │ ├── experimental │ │ │ │ ├── Citations.md │ │ │ │ └── Document.md │ │ │ ├── index.md │ │ │ ├── models │ │ │ │ ├── Embedder.md │ │ │ │ └── LM.md │ │ │ ├── modules │ │ │ │ ├── BestOfN.md │ │ │ │ ├── ChainOfThought.md │ │ │ │ ├── CodeAct.md │ │ │ │ ├── Module.md │ │ │ │ ├── MultiChainComparison.md │ │ │ │ ├── Parallel.md │ │ │ │ ├── Predict.md │ │ │ │ ├── ProgramOfThought.md │ │ │ │ ├── ReAct.md │ │ │ │ └── Refine.md │ │ │ ├── optimizers │ │ │ │ ├── BetterTogether.md │ │ │ │ ├── BootstrapFewShot.md │ │ │ │ ├── BootstrapFewShotWithRandomSearch.md │ │ │ │ ├── BootstrapFinetune.md │ │ │ │ ├── BootstrapRS.md │ │ │ │ ├── COPRO.md │ │ │ │ ├── Ensemble.md │ │ │ │ ├── GEPA │ │ │ │ │ ├── GEPA_Advanced.md │ │ │ │ │ └── overview.md │ │ │ │ ├── InferRules.md │ │ │ │ ├── KNN.md │ │ │ │ ├── KNNFewShot.md │ │ │ │ ├── LabeledFewShot.md │ │ │ │ ├── MIPROv2.md │ │ │ │ └── SIMBA.md │ │ │ ├── primitives │ │ │ │ ├── Audio.md │ │ │ │ ├── Code.md │ │ │ │ ├── Example.md │ │ │ │ ├── History.md │ │ │ │ ├── Image.md │ │ │ │ ├── Prediction.md │ │ │ │ ├── Tool.md │ │ │ │ └── ToolCalls.md │ │ │ ├── signatures │ │ │ │ ├── InputField.md │ │ │ │ ├── OutputField.md │ │ │ │ └── Signature.md │ │ │ ├── tools │ │ │ │ ├── ColBERTv2.md │ │ │ │ ├── Embeddings.md │ │ │ │ └── PythonInterpreter.md │ │ │ └── utils │ │ │ ├── asyncify.md │ │ │ ├── configure_cache.md │ │ │ ├── disable_litellm_logging.md │ │ │ ├── disable_logging.md │ │ │ ├── enable_litellm_logging.md │ │ │ ├── enable_logging.md │ │ │ ├── inspect_history.md │ │ │ ├── load.md │ │ │ ├── StatusMessage.md │ │ │ ├── StatusMessageProvider.md │ │ │ ├── streamify.md │ │ │ └── StreamListener.md │ │ ├── cheatsheet.md │ │ ├── community │ │ │ ├── community-resources.md │ │ │ ├── how-to-contribute.md │ │ │ └── use-cases.md │ │ ├── deep-dive │ │ │ └── data-handling │ │ │ ├── built-in-datasets.md │ │ │ ├── examples.md │ │ │ ├── img │ │ │ │ └── data-loading.png │ │ │ └── loading-custom-data.md │ │ ├── faqs.md │ │ ├── index.md │ │ ├── js │ │ │ └── runllm-widget.js │ │ ├── learn │ │ │ ├── evaluation │ │ │ │ ├── data.md │ │ │ │ ├── metrics.md │ │ │ │ └── overview.md │ │ │ ├── figures │ │ │ │ ├── native_tool_call.png │ │ │ │ └── teleprompter-classes.png │ │ │ ├── index.md │ │ │ ├── optimization │ │ │ │ ├── optimizers.md │ │ │ │ └── overview.md │ │ │ └── programming │ │ │ ├── 7-assertions.md │ │ │ ├── adapters.md │ │ │ ├── language_models.md │ │ │ 
├── mcp.md │ │ │ ├── modules.md │ │ │ ├── overview.md │ │ │ ├── signatures.md │ │ │ └── tools.md │ │ ├── production │ │ │ └── index.md │ │ ├── roadmap.md │ │ ├── static │ │ │ ├── .nojekyll │ │ │ └── img │ │ │ ├── dspy_logo.png │ │ │ ├── logo.png │ │ │ ├── mlflow-tracing-rag.png │ │ │ ├── modular.png │ │ │ ├── optimize.png │ │ │ ├── undraw_docusaurus_mountain.svg │ │ │ ├── undraw_docusaurus_react.svg │ │ │ ├── undraw_docusaurus_tree.svg │ │ │ └── universal_compatibility.png │ │ ├── stylesheets │ │ │ └── extra.css │ │ └── tutorials │ │ ├── agents │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-agent.png │ │ ├── ai_text_game │ │ │ └── index.md │ │ ├── async │ │ │ └── index.md │ │ ├── audio │ │ │ └── index.ipynb │ │ ├── build_ai_program │ │ │ └── index.md │ │ ├── cache │ │ │ └── index.md │ │ ├── classification │ │ │ └── index.md │ │ ├── classification_finetuning │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-classification.png │ │ ├── conversation_history │ │ │ └── index.md │ │ ├── core_development │ │ │ └── index.md │ │ ├── custom_module │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-custom-module.png │ │ ├── customer_service_agent │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-customer-service-agent.png │ │ ├── deployment │ │ │ ├── dspy_mlflow_ui.png │ │ │ └── index.md │ │ ├── email_extraction │ │ │ ├── index.md │ │ │ └── mlflow-tracing-email-extraction.png │ │ ├── entity_extraction │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-entity-extraction.png │ │ ├── games │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-agent.png │ │ ├── gepa_ai_program │ │ │ └── index.md │ │ ├── gepa_aime │ │ │ ├── index.ipynb │ │ │ ├── mlflow-tracing-gepa-aime.png │ │ │ └── mlflow-tracking-gepa-aime-optimization.png │ │ ├── gepa_facilitysupportanalyzer │ │ │ ├── index.ipynb │ │ │ ├── mlflow-tracing-gepa-support.png │ │ │ └── mlflow-tracking-gepa-support-optimization.png │ │ ├── gepa_papillon │ │ │ ├── index.ipynb │ │ │ ├── mlflow-tracing-gepa-papilon.png │ │ │ └── mlflow-tracking-gepa-papilon-optimization.png │ │ ├── image_generation_prompting │ │ │ └── index.ipynb │ │ ├── index.md │ │ ├── llms_txt_generation │ │ │ └── index.md │ │ ├── math │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-math.png │ │ ├── mcp │ │ │ └── index.md │ │ ├── mem0_react_agent │ │ │ └── index.md │ │ ├── multihop_search │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-multi-hop.png │ │ ├── observability │ │ │ ├── index.md │ │ │ ├── mlflow_trace_ui_navigation.gif │ │ │ ├── mlflow_trace_ui.png │ │ │ └── mlflow_trace_view.png │ │ ├── optimize_ai_program │ │ │ └── index.md │ │ ├── optimizer_tracking │ │ │ ├── child_run.png │ │ │ ├── experiment.png │ │ │ ├── index.md │ │ │ └── parent_run.png │ │ ├── output_refinement │ │ │ └── best-of-n-and-refine.md │ │ ├── papillon │ │ │ └── index.md │ │ ├── program_of_thought │ │ │ └── index.ipynb │ │ ├── rag │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-rag.png │ │ ├── real_world_examples │ │ │ └── index.md │ │ ├── rl_ai_program │ │ │ └── index.md │ │ ├── rl_multihop │ │ │ └── index.ipynb │ │ ├── rl_papillon │ │ │ └── index.ipynb │ │ ├── sample_code_generation │ │ │ └── index.md │ │ ├── saving │ │ │ └── index.md │ │ ├── streaming │ │ │ └── index.md │ │ ├── tool_use │ │ │ └── index.ipynb │ │ └── yahoo_finance_react │ │ └── index.md │ ├── mkdocs.yml │ ├── overrides │ │ ├── home.html │ │ ├── main.html │ │ └── partials │ │ └── tabs.html │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── requirements.txt │ ├── scripts │ │ ├── generate_api_docs.py │ │ └── generate_api_summary.py │ └── vercel.json ├── dspy │ ├── 
__init__.py │ ├── __metadata__.py │ ├── adapters │ │ ├── __init__.py │ │ ├── baml_adapter.py │ │ ├── base.py │ │ ├── chat_adapter.py │ │ ├── json_adapter.py │ │ ├── two_step_adapter.py │ │ ├── types │ │ │ ├── __init__.py │ │ │ ├── audio.py │ │ │ ├── base_type.py │ │ │ ├── citation.py │ │ │ ├── code.py │ │ │ ├── document.py │ │ │ ├── history.py │ │ │ ├── image.py │ │ │ └── tool.py │ │ ├── utils.py │ │ └── xml_adapter.py │ ├── clients │ │ ├── __init__.py │ │ ├── base_lm.py │ │ ├── cache.py │ │ ├── databricks.py │ │ ├── embedding.py │ │ ├── lm_local_arbor.py │ │ ├── lm_local.py │ │ ├── lm.py │ │ ├── openai.py │ │ ├── provider.py │ │ └── utils_finetune.py │ ├── datasets │ │ ├── __init__.py │ │ ├── alfworld │ │ │ ├── __init__.py │ │ │ ├── alfworld.py │ │ │ └── base_config.yml │ │ ├── colors.py │ │ ├── dataloader.py │ │ ├── dataset.py │ │ ├── gsm8k.py │ │ ├── hotpotqa.py │ │ └── math.py │ ├── dsp │ │ ├── __init__.py │ │ ├── colbertv2.py │ │ └── utils │ │ ├── __init__.py │ │ ├── dpr.py │ │ ├── settings.py │ │ └── utils.py │ ├── evaluate │ │ ├── __init__.py │ │ ├── auto_evaluation.py │ │ ├── evaluate.py │ │ └── metrics.py │ ├── experimental │ │ └── __init__.py │ ├── predict │ │ ├── __init__.py │ │ ├── aggregation.py │ │ ├── avatar │ │ │ ├── __init__.py │ │ │ ├── avatar.py │ │ │ ├── models.py │ │ │ └── signatures.py │ │ ├── best_of_n.py │ │ ├── chain_of_thought.py │ │ ├── code_act.py │ │ ├── knn.py │ │ ├── multi_chain_comparison.py │ │ ├── parallel.py │ │ ├── parameter.py │ │ ├── predict.py │ │ ├── program_of_thought.py │ │ ├── react.py │ │ ├── refine.py │ │ └── retry.py │ ├── primitives │ │ ├── __init__.py │ │ ├── base_module.py │ │ ├── example.py │ │ ├── module.py │ │ ├── prediction.py │ │ ├── python_interpreter.py │ │ └── runner.js │ ├── propose │ │ ├── __init__.py │ │ ├── dataset_summary_generator.py │ │ ├── grounded_proposer.py │ │ ├── propose_base.py │ │ └── utils.py │ ├── retrievers │ │ ├── __init__.py │ │ ├── databricks_rm.py │ │ ├── embeddings.py │ │ ├── retrieve.py │ │ └── weaviate_rm.py │ ├── signatures │ │ ├── __init__.py │ │ ├── field.py │ │ ├── signature.py │ │ └── utils.py │ ├── streaming │ │ ├── __init__.py │ │ ├── messages.py │ │ ├── streamify.py │ │ └── streaming_listener.py │ ├── teleprompt │ │ ├── __init__.py │ │ ├── avatar_optimizer.py │ │ ├── bettertogether.py │ │ ├── bootstrap_finetune.py │ │ ├── bootstrap_trace.py │ │ ├── bootstrap.py │ │ ├── copro_optimizer.py │ │ ├── ensemble.py │ │ ├── gepa │ │ │ ├── __init__.py │ │ │ ├── gepa_utils.py │ │ │ ├── gepa.py │ │ │ └── instruction_proposal.py │ │ ├── grpo.py │ │ ├── infer_rules.py │ │ ├── knn_fewshot.py │ │ ├── mipro_optimizer_v2.py │ │ ├── random_search.py │ │ ├── signature_opt.py │ │ ├── simba_utils.py │ │ ├── simba.py │ │ ├── teleprompt_optuna.py │ │ ├── teleprompt.py │ │ ├── utils.py │ │ └── vanilla.py │ └── utils │ ├── __init__.py │ ├── annotation.py │ ├── asyncify.py │ ├── caching.py │ ├── callback.py │ ├── dummies.py │ ├── exceptions.py │ ├── hasher.py │ ├── inspect_history.py │ ├── langchain_tool.py │ ├── logging_utils.py │ ├── mcp.py │ ├── parallelizer.py │ ├── saving.py │ ├── syncify.py │ ├── unbatchify.py │ └── usage_tracker.py ├── LICENSE ├── pyproject.toml ├── README.md ├── tests │ ├── __init__.py │ ├── adapters │ │ ├── test_adapter_utils.py │ │ ├── test_baml_adapter.py │ │ ├── test_base_type.py │ │ ├── test_chat_adapter.py │ │ ├── test_citation.py │ │ ├── test_code.py │ │ ├── test_document.py │ │ ├── test_json_adapter.py │ │ ├── test_tool.py │ │ ├── test_two_step_adapter.py │ │ └── test_xml_adapter.py │ ├── 
callback │ │ └── test_callback.py │ ├── clients │ │ ├── test_cache.py │ │ ├── test_databricks.py │ │ ├── test_embedding.py │ │ ├── test_inspect_global_history.py │ │ └── test_lm.py │ ├── conftest.py │ ├── datasets │ │ └── test_dataset.py │ ├── docs │ │ └── test_mkdocs_links.py │ ├── evaluate │ │ ├── test_evaluate.py │ │ └── test_metrics.py │ ├── examples │ │ └── test_baleen.py │ ├── metadata │ │ └── test_metadata.py │ ├── predict │ │ ├── test_aggregation.py │ │ ├── test_best_of_n.py │ │ ├── test_chain_of_thought.py │ │ ├── test_code_act.py │ │ ├── test_knn.py │ │ ├── test_multi_chain_comparison.py │ │ ├── test_parallel.py │ │ ├── test_predict.py │ │ ├── test_program_of_thought.py │ │ ├── test_react.py │ │ ├── test_refine.py │ │ └── test_retry.py │ ├── primitives │ │ ├── resources │ │ │ └── saved_program.json │ │ ├── test_base_module.py │ │ ├── test_example.py │ │ ├── test_module.py │ │ └── test_python_interpreter.py │ ├── propose │ │ └── test_grounded_proposer.py │ ├── README.md │ ├── reliability │ │ ├── __init__.py │ │ ├── complex_types │ │ │ └── generated │ │ │ ├── test_many_types_1 │ │ │ │ ├── inputs │ │ │ │ │ ├── input1.json │ │ │ │ │ └── input2.json │ │ │ │ ├── program.py │ │ │ │ └── schema.json │ │ │ ├── test_nesting_1 │ │ │ │ ├── inputs │ │ │ │ │ ├── input1.json │ │ │ │ │ └── input2.json │ │ │ │ ├── program.py │ │ │ │ └── schema.json │ │ │ └── test_nesting_2 │ │ │ ├── inputs │ │ │ │ └── input1.json │ │ │ ├── program.py │ │ │ └── schema.json │ │ ├── conftest.py │ │ ├── generate │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ └── utils.py │ │ ├── input_formats │ │ │ └── generated │ │ │ └── test_markdown_1 │ │ │ ├── inputs │ │ │ │ ├── input1.json │ │ │ │ └── input2.json │ │ │ ├── program.py │ │ │ └── schema.json │ │ ├── README.md │ │ ├── reliability_conf.yaml │ │ ├── test_generated.py │ │ ├── test_pydantic_models.py │ │ └── utils.py │ ├── retrievers │ │ └── test_embeddings.py │ ├── signatures │ │ ├── test_adapter_image.py │ │ ├── test_custom_types.py │ │ └── test_signature.py │ ├── streaming │ │ └── test_streaming.py │ ├── teleprompt │ │ ├── gepa_dummy_lm_custom_component_selector_custom_instruction_proposer.json │ │ ├── gepa_dummy_lm.json │ │ ├── test_bootstrap_finetune.py │ │ ├── test_bootstrap_trace.py │ │ ├── test_bootstrap.py │ │ ├── test_copro_optimizer.py │ │ ├── test_ensemble.py │ │ ├── test_finetune.py │ │ ├── test_gepa_instruction_proposer.py │ │ ├── test_gepa.py │ │ ├── test_grpo.py │ │ ├── test_knn_fewshot.py │ │ ├── test_random_search.py │ │ ├── test_teleprompt.py │ │ └── test_utils.py │ ├── test_utils │ │ ├── __init__.py │ │ └── server │ │ ├── __init__.py │ │ ├── litellm_server_config.yaml │ │ └── litellm_server.py │ └── utils │ ├── __init__.py │ ├── resources │ │ └── mcp_server.py │ ├── test_annotation.py │ ├── test_asyncify.py │ ├── test_exceptions.py │ ├── test_langchain_tool.py │ ├── test_mcp.py │ ├── test_parallelizer.py │ ├── test_saving.py │ ├── test_settings.py │ ├── test_syncify.py │ ├── test_unbatchify.py │ └── test_usage_tracker.py └── uv.lock ``` # Files -------------------------------------------------------------------------------- /dspy/primitives/prediction.py: -------------------------------------------------------------------------------- ```python 1 | from dspy.primitives.example import Example 2 | 3 | 4 | class Prediction(Example): 5 | """A prediction object that contains the output of a DSPy module. 6 | 7 | Prediction inherits from Example. 
8 | 9 | To allow feedback-augmented scores, Prediction supports comparison operations 10 | (<, >, <=, >=) for Predictions with a `score` field. The comparison operations 11 | compare the 'score' values as floats. For equality comparison, Predictions are equal 12 | if their underlying data stores are equal (inherited from Example). 13 | 14 | Arithmetic operations (+, /, etc.) are also supported for Predictions with a 'score' 15 | field, operating on the score value. 16 | """ 17 | 18 | def __init__(self, *args, **kwargs): 19 | super().__init__(*args, **kwargs) 20 | 21 | del self._demos 22 | del self._input_keys 23 | 24 | self._completions = None 25 | self._lm_usage = None 26 | 27 | def get_lm_usage(self): 28 | return self._lm_usage 29 | 30 | def set_lm_usage(self, value): 31 | self._lm_usage = value 32 | 33 | @classmethod 34 | def from_completions(cls, list_or_dict, signature=None): 35 | obj = cls() 36 | obj._completions = Completions(list_or_dict, signature=signature) 37 | obj._store = {k: v[0] for k, v in obj._completions.items()} 38 | 39 | return obj 40 | 41 | def __repr__(self): 42 | store_repr = ",\n ".join(f"{k}={v!r}" for k, v in self._store.items()) 43 | 44 | if self._completions is None or len(self._completions) == 1: 45 | return f"Prediction(\n {store_repr}\n)" 46 | 47 | num_completions = len(self._completions) 48 | return f"Prediction(\n {store_repr},\n completions=Completions(...)\n) ({num_completions-1} completions omitted)" 49 | 50 | def __str__(self): 51 | return self.__repr__() 52 | 53 | def __float__(self): 54 | if "score" not in self._store: 55 | raise ValueError("Prediction object does not have a 'score' field to convert to float.") 56 | return float(self._store["score"]) 57 | 58 | def __add__(self, other): 59 | if isinstance(other, (float, int)): 60 | return self.__float__() + other 61 | elif isinstance(other, Prediction): 62 | return self.__float__() + float(other) 63 | raise TypeError(f"Unsupported type for addition: {type(other)}") 64 | 65 | def __radd__(self, other): 66 | if isinstance(other, (float, int)): 67 | return other + self.__float__() 68 | elif isinstance(other, Prediction): 69 | return float(other) + self.__float__() 70 | raise TypeError(f"Unsupported type for addition: {type(other)}") 71 | 72 | def __truediv__(self, other): 73 | if isinstance(other, (float, int)): 74 | return self.__float__() / other 75 | elif isinstance(other, Prediction): 76 | return self.__float__() / float(other) 77 | raise TypeError(f"Unsupported type for division: {type(other)}") 78 | 79 | def __rtruediv__(self, other): 80 | if isinstance(other, (float, int)): 81 | return other / self.__float__() 82 | elif isinstance(other, Prediction): 83 | return float(other) / self.__float__() 84 | raise TypeError(f"Unsupported type for division: {type(other)}") 85 | 86 | def __lt__(self, other): 87 | if isinstance(other, (float, int)): 88 | return self.__float__() < other 89 | elif isinstance(other, Prediction): 90 | return self.__float__() < float(other) 91 | raise TypeError(f"Unsupported type for comparison: {type(other)}") 92 | 93 | def __le__(self, other): 94 | if isinstance(other, (float, int)): 95 | return self.__float__() <= other 96 | elif isinstance(other, Prediction): 97 | return self.__float__() <= float(other) 98 | raise TypeError(f"Unsupported type for comparison: {type(other)}") 99 | 100 | def __gt__(self, other): 101 | if isinstance(other, (float, int)): 102 | return self.__float__() > other 103 | elif isinstance(other, Prediction): 104 | return self.__float__() > float(other) 105 | 
raise TypeError(f"Unsupported type for comparison: {type(other)}") 106 | 107 | def __ge__(self, other): 108 | if isinstance(other, (float, int)): 109 | return self.__float__() >= other 110 | elif isinstance(other, Prediction): 111 | return self.__float__() >= float(other) 112 | raise TypeError(f"Unsupported type for comparison: {type(other)}") 113 | 114 | @property 115 | def completions(self): 116 | return self._completions 117 | 118 | 119 | class Completions: 120 | def __init__(self, list_or_dict, signature=None): 121 | self.signature = signature 122 | 123 | if isinstance(list_or_dict, list): 124 | kwargs = {} 125 | for arg in list_or_dict: 126 | for k, v in arg.items(): 127 | kwargs.setdefault(k, []).append(v) 128 | else: 129 | kwargs = list_or_dict 130 | 131 | assert all(isinstance(v, list) for v in kwargs.values()), "All values must be lists" 132 | 133 | if kwargs: 134 | length = len(next(iter(kwargs.values()))) 135 | assert all(len(v) == length for v in kwargs.values()), "All lists must have the same length" 136 | 137 | self._completions = kwargs 138 | 139 | def items(self): 140 | return self._completions.items() 141 | 142 | def __getitem__(self, key): 143 | if isinstance(key, int): 144 | if key < 0 or key >= len(self): 145 | raise IndexError("Index out of range") 146 | 147 | return Prediction(**{k: v[key] for k, v in self._completions.items()}) 148 | 149 | return self._completions[key] 150 | 151 | def __getattr__(self, name): 152 | if name == "_completions": 153 | raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") 154 | if name in self._completions: 155 | return self._completions[name] 156 | 157 | raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") 158 | 159 | def __len__(self): 160 | # Return the length of the list for one of the keys 161 | # It assumes all lists have the same length 162 | return len(next(iter(self._completions.values()))) 163 | 164 | def __contains__(self, key): 165 | return key in self._completions 166 | 167 | def __repr__(self): 168 | items_repr = ",\n ".join(f"{k}={v!r}" for k, v in self._completions.items()) 169 | return f"Completions(\n {items_repr}\n)" 170 | 171 | def __str__(self): 172 | # return str(self._completions) 173 | return self.__repr__() 174 | ``` -------------------------------------------------------------------------------- /tests/teleprompt/test_copro_optimizer.py: -------------------------------------------------------------------------------- ```python 1 | import dspy 2 | from dspy import Example 3 | from dspy.teleprompt.signature_opt import COPRO 4 | from dspy.utils.dummies import DummyLM 5 | 6 | 7 | # Define a simple metric function for testing 8 | def simple_metric(example, prediction): 9 | # Simplified metric for testing: true if prediction matches expected output 10 | return example.output == prediction.output 11 | 12 | 13 | # Example training and validation sets 14 | trainset = [ 15 | Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), 16 | Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs( 17 | "input" 18 | ), 19 | ] 20 | 21 | 22 | def test_signature_optimizer_initialization(): 23 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 24 | assert optimizer.metric == simple_metric, "Metric not correctly initialized" 25 | assert optimizer.breadth == 2, "Breadth not correctly initialized" 26 | assert optimizer.depth == 1, "Depth not correctly 
initialized" 27 | assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" 28 | 29 | 30 | class SimpleModule(dspy.Module): 31 | def __init__(self, signature): 32 | super().__init__() 33 | # COPRO doesn't work with dspy.Predict 34 | self.predictor = dspy.ChainOfThought(signature) 35 | 36 | def forward(self, **kwargs): 37 | return self.predictor(**kwargs) 38 | 39 | 40 | def test_signature_optimizer_optimization_process(): 41 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 42 | dspy.settings.configure( 43 | lm=DummyLM( 44 | [ 45 | { 46 | "proposed_instruction": "Optimized instruction 1", 47 | "proposed_prefix_for_output_field": "Optimized instruction 2", 48 | }, 49 | ] 50 | ) 51 | ) 52 | 53 | student = SimpleModule("input -> output") 54 | 55 | # Assuming the compile method of COPRO requires a student module, a development set, and evaluation kwargs 56 | optimized_student = optimizer.compile( 57 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 58 | ) 59 | 60 | # Check that the optimized student has been modified from the original 61 | # This check can be more specific based on how the optimization modifies the student 62 | assert optimized_student is not student, "Optimization did not modify the student" 63 | 64 | # Further tests can be added to verify the specifics of the optimization process, 65 | # such as checking the instructions of the optimized student's predictors. 66 | 67 | 68 | def test_signature_optimizer_statistics_tracking(): 69 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 70 | optimizer.track_stats = True # Enable statistics tracking 71 | 72 | dspy.settings.configure( 73 | lm=DummyLM( 74 | [ 75 | { 76 | "proposed_instruction": "Optimized instruction 1", 77 | "proposed_prefix_for_output_field": "Optimized instruction 2", 78 | }, 79 | ] 80 | ) 81 | ) 82 | student = SimpleModule("input -> output") 83 | optimized_student = optimizer.compile( 84 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 85 | ) 86 | 87 | # Verify that statistics have been tracked and attached to the optimized student 88 | assert hasattr(optimized_student, "total_calls"), "Total calls statistic not tracked" 89 | assert hasattr(optimized_student, "results_best"), "Best results statistics not tracked" 90 | 91 | 92 | # Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before 93 | 94 | 95 | def test_optimization_and_output_verification(): 96 | lm = DummyLM( 97 | [ 98 | {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, 99 | {"reasoning": "france", "output": "Paris"}, 100 | {"reasoning": "france", "output": "Paris"}, 101 | {"reasoning": "france", "output": "Paris"}, 102 | {"reasoning": "france", "output": "Paris"}, 103 | {"reasoning": "france", "output": "Paris"}, 104 | {"reasoning": "france", "output": "Paris"}, 105 | {"reasoning": "france", "output": "Paris"}, 106 | ] 107 | ) 108 | dspy.settings.configure(lm=lm) 109 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 110 | 111 | student = SimpleModule("input -> output") 112 | 113 | # Compile the student with the optimizer 114 | optimized_student = optimizer.compile( 115 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 116 | ) 117 | 118 | # Simulate calling the optimized student with a new input 119 | test_input = "What is the 
capital of France?" 120 | prediction = optimized_student(input=test_input) 121 | 122 | print(lm.get_convo(-1)) 123 | 124 | assert prediction.output == "Paris" 125 | 126 | 127 | def test_statistics_tracking_during_optimization(): 128 | dspy.settings.configure( 129 | lm=DummyLM( 130 | [ 131 | {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, 132 | ] 133 | ) 134 | ) 135 | 136 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 137 | optimizer.track_stats = True # Enable statistics tracking 138 | 139 | student = SimpleModule("input -> output") 140 | optimized_student = optimizer.compile( 141 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 142 | ) 143 | 144 | # Verify that statistics have been tracked 145 | assert hasattr(optimized_student, "total_calls"), "Optimizer did not track total metric calls" 146 | assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" 147 | 148 | # Check if the results_best and results_latest contain valid statistics 149 | assert "results_best" in optimized_student.__dict__, "Optimizer did not track the best results" 150 | assert "results_latest" in optimized_student.__dict__, "Optimizer did not track the latest results" 151 | assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" 152 | assert ( 153 | len(optimized_student.results_latest) > 0 154 | ), "Optimizer did not properly populate the latest results statistics" 155 | 156 | # Additional detailed checks can be added here to verify the contents of the tracked statistics 157 | ``` -------------------------------------------------------------------------------- /docs/docs/learn/evaluation/metrics.md: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | sidebar_position: 5 3 | --- 4 | 5 | # Metrics 6 | 7 | DSPy is a machine learning framework, so you must think about your **automatic metrics** for evaluation (to track your progress) and optimization (so DSPy can make your programs more effective). 8 | 9 | 10 | ## What is a metric and how do I define a metric for my task? 11 | 12 | A metric is just a function that will take examples from your data and the output of your system and return a score that quantifies how good the output is. What makes outputs from your system good or bad? 13 | 14 | For simple tasks, this could be just "accuracy" or "exact match" or "F1 score". This may be the case for simple classification or short-form QA tasks. 15 | 16 | However, for most applications, your system will output long-form outputs. There, your metric should probably be a smaller DSPy program that checks multiple properties of the output (quite possibly using AI feedback from LMs). 17 | 18 | Getting this right on the first try is unlikely, but you should start with something simple and iterate. 19 | 20 | 21 | ## Simple metrics 22 | 23 | A DSPy metric is just a function in Python that takes `example` (e.g., from your training or dev set) and the output `pred` from your DSPy program, and outputs a `float` (or `int` or `bool`) score. 24 | 25 | Your metric should also accept an optional third argument called `trace`. You can ignore this for a moment, but it will enable some powerful tricks if you want to use your metric for optimization. 26 | 27 | Here's a simple example of a metric that's comparing `example.answer` and `pred.answer`. 
This particular metric will return a `bool`. 28 | 29 | ```python 30 | def validate_answer(example, pred, trace=None): 31 | return example.answer.lower() == pred.answer.lower() 32 | ``` 33 | 34 | Some people find these utilities (built-in) convenient: 35 | 36 | - `dspy.evaluate.metrics.answer_exact_match` 37 | - `dspy.evaluate.metrics.answer_passage_match` 38 | 39 | Your metrics could be more complex, e.g. check for multiple properties. The metric below will return a `float` if `trace is None` (i.e., if it's used for evaluation or optimization), and will return a `bool` otherwise (i.e., if it's used to bootstrap demonstrations). 40 | 41 | ```python 42 | def validate_context_and_answer(example, pred, trace=None): 43 | # check the gold label and the predicted answer are the same 44 | answer_match = example.answer.lower() == pred.answer.lower() 45 | 46 | # check the predicted answer comes from one of the retrieved contexts 47 | context_match = any((pred.answer.lower() in c) for c in pred.context) 48 | 49 | if trace is None: # if we're doing evaluation or optimization 50 | return (answer_match + context_match) / 2.0 51 | else: # if we're doing bootstrapping, i.e. self-generating good demonstrations of each step 52 | return answer_match and context_match 53 | ``` 54 | 55 | Defining a good metric is an iterative process, so doing some initial evaluations and looking at your data and outputs is key. 56 | 57 | 58 | ## Evaluation 59 | 60 | Once you have a metric, you can run evaluations in a simple Python loop. 61 | 62 | ```python 63 | scores = [] 64 | for x in devset: 65 | pred = program(**x.inputs()) 66 | score = metric(x, pred) 67 | scores.append(score) 68 | ``` 69 | 70 | If you need some utilities, you can also use the built-in `Evaluate` utility. It can help with things like parallel evaluation (multiple threads) or showing you a sample of inputs/outputs and the metric scores. 71 | 72 | ```python 73 | from dspy.evaluate import Evaluate 74 | 75 | # Set up the evaluator, which can be re-used in your code. 76 | evaluator = Evaluate(devset=YOUR_DEVSET, num_threads=1, display_progress=True, display_table=5) 77 | 78 | # Launch evaluation. 79 | evaluator(YOUR_PROGRAM, metric=YOUR_METRIC) 80 | ``` 81 | 82 | 83 | ## Intermediate: Using AI feedback for your metric 84 | 85 | For most applications, your system will output long-form outputs, so your metric should check multiple dimensions of the output using AI feedback from LMs. 86 | 87 | This simple signature could come in handy. 88 | 89 | ```python 90 | # Define the signature for automatic assessments. 91 | class Assess(dspy.Signature): 92 | """Assess the quality of a tweet along the specified dimension.""" 93 | 94 | assessed_text = dspy.InputField() 95 | assessment_question = dspy.InputField() 96 | assessment_answer: bool = dspy.OutputField() 97 | ``` 98 | 99 | For example, below is a simple metric that checks a generated tweet (1) answers a given question correctly and (2) whether it's also engaging. We also check that (3) `len(tweet) <= 280` characters. 100 | 101 | ```python 102 | def metric(gold, pred, trace=None): 103 | question, answer, tweet = gold.question, gold.answer, pred.output 104 | 105 | engaging = "Does the assessed text make for a self-contained, engaging tweet?" 106 | correct = f"The text should answer `{question}` with `{answer}`. Does the assessed text contain this answer?" 
107 | 108 | correct = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=correct) 109 | engaging = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=engaging) 110 | 111 | correct, engaging = [m.assessment_answer for m in [correct, engaging]] 112 | score = (correct + engaging) if correct and (len(tweet) <= 280) else 0 113 | 114 | if trace is not None: return score >= 2 115 | return score / 2.0 116 | ``` 117 | 118 | When compiling, `trace is not None`, and we want to be strict about judging things, so we will only return `True` if `score >= 2`. Otherwise, we return a score out of 1.0 (i.e., `score / 2.0`). 119 | 120 | 121 | ## Advanced: Using a DSPy program as your metric 122 | 123 | If your metric is itself a DSPy program, one of the most powerful ways to iterate is to compile (optimize) your metric itself. That's usually easy because the output of the metric is usually a simple value (e.g., a score out of 5) so the metric's metric is easy to define and optimize by collecting a few examples. 124 | 125 | 126 | 127 | ### Advanced: Accessing the `trace` 128 | 129 | When your metric is used during evaluation runs, DSPy will not try to track the steps of your program. 130 | 131 | But during compiling (optimization), DSPy will trace your LM calls. The trace will contain inputs/outputs to each DSPy predictor and you can leverage that to validate intermediate steps for optimization. 132 | 133 | 134 | ```python 135 | def validate_hops(example, pred, trace=None): 136 | hops = [example.question] + [outputs.query for *_, outputs in trace if 'query' in outputs] 137 | 138 | if max([len(h) for h in hops]) > 100: return False 139 | if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): return False 140 | 141 | return True 142 | ``` 143 | ``` -------------------------------------------------------------------------------- /tests/predict/test_parallel.py: -------------------------------------------------------------------------------- ```python 1 | import dspy 2 | from dspy.utils.dummies import DummyLM 3 | 4 | 5 | def test_parallel_module(): 6 | lm = DummyLM( 7 | [ 8 | {"output": "test output 1"}, 9 | {"output": "test output 2"}, 10 | {"output": "test output 3"}, 11 | {"output": "test output 4"}, 12 | {"output": "test output 5"}, 13 | ] 14 | ) 15 | dspy.settings.configure(lm=lm) 16 | 17 | class MyModule(dspy.Module): 18 | def __init__(self): 19 | super().__init__() 20 | self.predictor = dspy.Predict("input -> output") 21 | self.predictor2 = dspy.Predict("input -> output") 22 | 23 | self.parallel = dspy.Parallel(num_threads=2) 24 | 25 | def forward(self, input): 26 | return self.parallel( 27 | [ 28 | (self.predictor, input), 29 | (self.predictor2, input), 30 | (self.predictor, input), 31 | (self.predictor2, input), 32 | (self.predictor, input), 33 | ] 34 | ) 35 | 36 | output = MyModule()(dspy.Example(input="test input").with_inputs("input")) 37 | 38 | expected_outputs = {f"test output {i}" for i in range(1, 6)} 39 | assert {r.output for r in output} == expected_outputs 40 | 41 | 42 | def test_batch_module(): 43 | lm = DummyLM( 44 | [ 45 | {"output": "test output 1"}, 46 | {"output": "test output 2"}, 47 | {"output": "test output 3"}, 48 | {"output": "test output 4"}, 49 | {"output": "test output 5"}, 50 | ] 51 | ) 52 | res_lm = DummyLM( 53 | [ 54 | {"output": "test output 1", "reasoning": "test reasoning 1"}, 55 | {"output": "test output 2", "reasoning": "test reasoning 2"}, 56 | {"output": "test output 3", "reasoning": "test 
reasoning 3"}, 57 | {"output": "test output 4", "reasoning": "test reasoning 4"}, 58 | {"output": "test output 5", "reasoning": "test reasoning 5"}, 59 | ] 60 | ) 61 | 62 | class MyModule(dspy.Module): 63 | def __init__(self): 64 | super().__init__() 65 | self.predictor = dspy.Predict("input -> output") 66 | self.predictor2 = dspy.Predict("input -> output, reasoning") 67 | 68 | self.parallel = dspy.Parallel(num_threads=2) 69 | 70 | def forward(self, input): 71 | with dspy.context(lm=lm): 72 | res1 = self.predictor.batch([input] * 5) 73 | 74 | with dspy.context(lm=res_lm): 75 | res2 = self.predictor2.batch([input] * 5) 76 | 77 | return (res1, res2) 78 | 79 | result, reason_result = MyModule()(dspy.Example(input="test input").with_inputs("input")) 80 | 81 | # Check that we got all expected outputs without caring about order 82 | expected_outputs = {f"test output {i}" for i in range(1, 6)} 83 | assert {r.output for r in result} == expected_outputs 84 | assert {r.output for r in reason_result} == expected_outputs 85 | 86 | # Check that reasoning matches outputs for reason_result 87 | for r in reason_result: 88 | num = r.output.split()[-1] # get the number from "test output X" 89 | assert r.reasoning == f"test reasoning {num}" 90 | 91 | 92 | def test_nested_parallel_module(): 93 | lm = DummyLM( 94 | [ 95 | {"output": "test output 1"}, 96 | {"output": "test output 2"}, 97 | {"output": "test output 3"}, 98 | {"output": "test output 4"}, 99 | {"output": "test output 5"}, 100 | ] 101 | ) 102 | dspy.settings.configure(lm=lm) 103 | 104 | class MyModule(dspy.Module): 105 | def __init__(self): 106 | super().__init__() 107 | self.predictor = dspy.Predict("input -> output") 108 | self.predictor2 = dspy.Predict("input -> output") 109 | 110 | self.parallel = dspy.Parallel(num_threads=2) 111 | 112 | def forward(self, input): 113 | return self.parallel( 114 | [ 115 | (self.predictor, input), 116 | (self.predictor2, input), 117 | ( 118 | self.parallel, 119 | [ 120 | (self.predictor2, input), 121 | (self.predictor, input), 122 | ], 123 | ), 124 | ] 125 | ) 126 | 127 | output = MyModule()(dspy.Example(input="test input").with_inputs("input")) 128 | 129 | # For nested structure, check first two outputs and nested outputs separately 130 | assert {output[0].output, output[1].output} <= {f"test output {i}" for i in range(1, 5)} 131 | assert {output[2][0].output, output[2][1].output} <= {f"test output {i}" for i in range(1, 5)} 132 | all_outputs = {output[0].output, output[1].output, output[2][0].output, output[2][1].output} 133 | assert len(all_outputs) == 4 134 | 135 | 136 | def test_nested_batch_method(): 137 | lm = DummyLM( 138 | [ 139 | {"output": "test output 1"}, 140 | {"output": "test output 2"}, 141 | {"output": "test output 3"}, 142 | {"output": "test output 4"}, 143 | {"output": "test output 5"}, 144 | ] 145 | ) 146 | dspy.settings.configure(lm=lm) 147 | 148 | class MyModule(dspy.Module): 149 | def __init__(self): 150 | super().__init__() 151 | self.predictor = dspy.Predict("input -> output") 152 | 153 | def forward(self, input): 154 | res = self.predictor.batch([dspy.Example(input=input).with_inputs("input")] * 2) 155 | 156 | return res 157 | 158 | result = MyModule().batch([dspy.Example(input="test input").with_inputs("input")] * 2) 159 | 160 | assert {result[0][0].output, result[0][1].output, result[1][0].output, result[1][1].output} == { 161 | "test output 1", 162 | "test output 2", 163 | "test output 3", 164 | "test output 4", 165 | } 166 | 167 | 168 | def test_batch_with_failed_examples(): 169 | 
class FailingModule(dspy.Module): 170 | def forward(self, value: int) -> str: 171 | if value == 42: 172 | raise ValueError("test error") 173 | return f"success-{value}" 174 | 175 | module = FailingModule() 176 | 177 | examples = [ 178 | dspy.Example(value=1).with_inputs("value"), 179 | dspy.Example(value=42).with_inputs("value"), # This will fail 180 | dspy.Example(value=3).with_inputs("value"), 181 | ] 182 | 183 | results, failed_examples, exceptions = module.batch( 184 | examples, 185 | return_failed_examples=True, 186 | provide_traceback=True, 187 | ) 188 | 189 | assert results == ["success-1", None, "success-3"] 190 | 191 | assert len(failed_examples) == 1 192 | assert failed_examples[0].inputs()["value"] == 42 193 | 194 | assert len(exceptions) == 1 195 | assert isinstance(exceptions[0], ValueError) 196 | assert str(exceptions[0]) == "test error" 197 | ``` -------------------------------------------------------------------------------- /.github/workflows/build_and_release.yml: -------------------------------------------------------------------------------- ```yaml 1 | --- 2 | name: Publish Python 🐍 distributions 📦 to PyPI 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | jobs: 8 | extract-tag: 9 | runs-on: ubuntu-latest 10 | outputs: 11 | version: ${{ steps.extract_tag.outputs.tag }} 12 | steps: 13 | - uses: actions/checkout@v4 14 | - id: extract_tag 15 | name: Extract tag name 16 | run: echo "tag=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT" 17 | 18 | build-and-publish-test-pypi: 19 | needs: extract-tag 20 | runs-on: ubuntu-latest 21 | environment: 22 | name: pypi 23 | permissions: 24 | id-token: write # IMPORTANT: mandatory for trusted publishing 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Python 3.11 29 | uses: actions/setup-python@v3 30 | with: 31 | python-version: "3.11" 32 | - name: Install dependencies 33 | run: python3 -m pip install --upgrade setuptools wheel twine build semver packaging 34 | - name: Get correct version for TestPyPI release 35 | id: check_version 36 | run: | 37 | VERSION=${{ needs.extract-tag.outputs.version }} 38 | PACKAGE_NAME="dspy-ai-test" 39 | echo "Checking if $VERSION for $PACKAGE_NAME exists on TestPyPI" 40 | NEW_VERSION=$(python3 .github/workflows/build_utils/test_version.py $PACKAGE_NAME $VERSION) 41 | echo "Version to be used for TestPyPI release: $NEW_VERSION" 42 | echo "version=$NEW_VERSION" >> "$GITHUB_OUTPUT" 43 | - name: Update version in pyproject.toml 44 | run: sed -i '/#replace_package_version_marker/{n;s/version="[^"]*"/version="${{ steps.check_version.outputs.version }}"/;}' pyproject.toml 45 | - name: Update package name in pyproject.toml 46 | run: sed -i '/#replace_package_name_marker/{n;s/name="[^"]*"/name="dspy-ai-test"/;}' pyproject.toml 47 | - name: Build a binary wheel 48 | run: python3 -m build 49 | # Test the locally built wheel 50 | - name: Create test environment 51 | run: python -m venv test_before_testpypi 52 | - name: Test package installation and functionality 53 | run: | 54 | source test_before_testpypi/bin/activate 55 | # Install the locally built wheel and testing dependencies 56 | pip install dist/*.whl pytest pytest-asyncio 57 | pytest tests/metadata/test_metadata.py tests/predict 58 | deactivate 59 | # Publish to test-PyPI 60 | - name: Publish distribution 📦 to test-PyPI 61 | uses: pypa/gh-action-pypi-publish@release/v1 # This requires a trusted publisher to be setup in pypi/testpypi 62 | with: 63 | repository-url: https://test.pypi.org/legacy/ 64 | 65 | # TODO: Add tests 
using dspy-ai-test 66 | 67 | build-and-publish-pypi: 68 | needs: [extract-tag, build-and-publish-test-pypi] 69 | # Only publish to PyPI if the repository owner is stanfordnlp 70 | if: github.repository_owner == 'stanfordnlp' 71 | runs-on: ubuntu-latest 72 | environment: 73 | name: pypi 74 | permissions: 75 | id-token: write # IMPORTANT: mandatory for trusted publishing 76 | contents: write 77 | steps: 78 | - uses: actions/checkout@v4 79 | - name: Set up Python 3.11 80 | uses: actions/setup-python@v3 81 | with: 82 | python-version: "3.11" 83 | - name: Install dependencies 84 | run: python3 -m pip install --upgrade setuptools wheel twine build 85 | - name: Update version in pyproject.toml 86 | run: sed -i '/#replace_package_version_marker/{n;s/version *= *"[^"]*"/version="${{ needs.extract-tag.outputs.version }}"/;}' pyproject.toml 87 | - name: Update version in __metadata__.py 88 | run: sed -i '/#replace_package_version_marker/{n;s/__version__ *= *"[^"]*"/__version__="${{ needs.extract-tag.outputs.version }}"/;}' ./dspy/__metadata__.py 89 | # Publish to dspy 90 | - name: Update package name in pyproject.toml 91 | run: sed -i '/#replace_package_name_marker/{n;s/name *= *"[^"]*"/name="dspy"/;}' pyproject.toml 92 | - name: Update package name in metadata.py 93 | run: sed -i '/#replace_package_name_marker/{n;s/__name__ *= *"[^"]*"/__name__="dspy"/;}' ./dspy/__metadata__.py 94 | - name: Build a binary wheel 95 | run: python3 -m build 96 | # Test the locally built wheel before publishing to pypi 97 | - name: Create test environment 98 | run: python -m venv test_before_pypi 99 | - name: Test package installation and functionality 100 | run: | 101 | source test_before_pypi/bin/activate 102 | # Install the locally built wheel and testing dependencies 103 | pip install dist/*.whl pytest pytest-asyncio 104 | pytest tests/metadata/test_metadata.py tests/predict 105 | deactivate 106 | rm -r test_before_pypi 107 | - name: Publish distribution 📦 to PyPI (dspy) 108 | uses: pypa/gh-action-pypi-publish@release/v1 109 | with: 110 | attestations: false 111 | # Publish to dspy-ai 112 | - name: Update package name in pyproject.toml 113 | run: sed -i '/#replace_package_name_marker/{n;s/name *= *"[^"]*"/name="dspy-ai"/;}' .github/.internal_dspyai/pyproject.toml 114 | - name: Update version for dspy-ai release 115 | run: sed -i '/#replace_package_version_marker/{n;s/version *= *"[^"]*"/version="${{ needs.extract-tag.outputs.version }}"/;}' .github/.internal_dspyai/pyproject.toml 116 | - name: Update dspy dependency for dspy-ai release 117 | run: | 118 | sed -i '/#replace_dspy_version_marker/{n;s/dspy>=[^"]*/dspy>=${{ needs.extract-tag.outputs.version }}/;}' .github/.internal_dspyai/pyproject.toml 119 | - name: Build a binary wheel (dspy-ai) 120 | run: python3 -m build .github/.internal_dspyai/ 121 | - name: Publish distribution 📦 to PyPI (dspy-ai) 122 | uses: pypa/gh-action-pypi-publish@release/v1 123 | with: 124 | attestations: false 125 | packages-dir: .github/.internal_dspyai/dist/ 126 | - uses: stefanzweifel/git-auto-commit-action@v5 # auto commit changes to main 127 | with: 128 | commit_message: Update versions 129 | create_branch: true 130 | branch: release-${{ needs.extract-tag.outputs.version }} 131 | - name: Checkout main branch 132 | run: | 133 | git fetch origin 134 | git checkout main 135 | - name: Configure git user 136 | run: | 137 | git config --global user.email "[email protected]" 138 | git config --global user.name "Github Actions" 139 | - name: Merge release branch into main 140 | run: | 141 | 
git merge --no-ff release-${{ needs.extract-tag.outputs.version }} 142 | - name: Push changes to main 143 | run: | 144 | git push origin main 145 | ``` -------------------------------------------------------------------------------- /dspy/teleprompt/infer_rules.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import random 3 | 4 | import numpy as np 5 | 6 | import dspy 7 | from dspy.evaluate.evaluate import Evaluate 8 | from dspy.teleprompt import BootstrapFewShot 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class InferRules(BootstrapFewShot): 14 | def __init__(self, num_candidates=10, num_rules=10, num_threads=None, teacher_settings=None, **kwargs): 15 | super().__init__(teacher_settings=teacher_settings, **kwargs) 16 | 17 | self.num_candidates = num_candidates 18 | self.num_rules = num_rules 19 | self.num_threads = num_threads 20 | self.rules_induction_program = RulesInductionProgram(num_rules, teacher_settings=teacher_settings) 21 | self.metric = kwargs.get("metric") 22 | self.max_errors = kwargs.get("max_errors") 23 | 24 | def compile(self, student, *, teacher=None, trainset, valset=None): 25 | if valset is None: 26 | train_size = int(0.5 * len(trainset)) 27 | trainset, valset = trainset[:train_size], trainset[train_size:] 28 | 29 | super().compile(student, teacher=teacher, trainset=trainset) 30 | 31 | original_program = self.student.deepcopy() 32 | all_predictors = [p for p in original_program.predictors() if hasattr(p, "signature")] 33 | instructions_list = [p.signature.instructions for p in all_predictors] 34 | 35 | best_score = -np.inf 36 | best_program = None 37 | 38 | for candidate_idx in range(self.num_candidates): 39 | candidate_program = original_program.deepcopy() 40 | candidate_predictors = [p for p in candidate_program.predictors() if hasattr(p, "signature")] 41 | 42 | for i, predictor in enumerate(candidate_predictors): 43 | predictor.signature.instructions = instructions_list[i] 44 | 45 | for i, predictor in enumerate(candidate_predictors): 46 | rules = self.induce_natural_language_rules(predictor, trainset) 47 | predictor.signature.instructions = instructions_list[i] 48 | self.update_program_instructions(predictor, rules) 49 | 50 | score = self.evaluate_program(candidate_program, valset) 51 | 52 | if score > best_score: 53 | best_score = score 54 | best_program = candidate_program 55 | 56 | logger.info(f"Evaluated Candidate {candidate_idx + 1} with score {score}. Current best score: {best_score}") 57 | 58 | logger.info(f"Final best score: {best_score}") 59 | 60 | return best_program 61 | 62 | def induce_natural_language_rules(self, predictor, trainset): 63 | demos = self.get_predictor_demos(trainset, predictor) 64 | signature = predictor.signature 65 | while True: 66 | examples_text = self.format_examples(demos, signature) 67 | try: 68 | return self.rules_induction_program(examples_text) 69 | except Exception as e: 70 | assert ( 71 | isinstance(e, ValueError) 72 | or e.__class__.__name__ == "BadRequestError" 73 | or "ContextWindowExceededError" in str(e) 74 | ) 75 | if len(demos) > 1: 76 | demos = demos[:-1] 77 | else: 78 | raise RuntimeError( 79 | "Failed to generate natural language rules since a single example couldn't fit in the model's " 80 | "context window." 
81 | ) from e 82 | 83 | def update_program_instructions(self, predictor, natural_language_rules): 84 | predictor.signature.instructions = ( 85 | f"{predictor.signature.instructions}\n\n" 86 | f"Please adhere to the following rules when making your prediction:\n{natural_language_rules}" 87 | ) 88 | 89 | def format_examples(self, demos, signature): 90 | examples_text = "" 91 | for demo in demos: 92 | input_fields = {k: v for k, v in demo.items() if k in signature.input_fields} 93 | output_fields = {k: v for k, v in demo.items() if k in signature.output_fields} 94 | input_text = "\n".join(f"{k}: {v}" for k, v in input_fields.items()) 95 | output_text = "\n".join(f"{k}: {v}" for k, v in output_fields.items()) 96 | examples_text += f"Input Fields:\n{input_text}\n\n=========\nOutput Fields:\n{output_text}\n\n" 97 | return examples_text 98 | 99 | def get_predictor_demos(self, trainset, predictor): 100 | # TODO: Consider how this handled "incomplete" demos. 101 | signature = predictor.signature 102 | return [ 103 | { 104 | key: value 105 | for key, value in example.items() 106 | if key in signature.input_fields or key in signature.output_fields 107 | } 108 | for example in trainset 109 | ] 110 | 111 | def evaluate_program(self, program, dataset): 112 | effective_max_errors = ( 113 | self.max_errors if self.max_errors is not None else dspy.settings.max_errors 114 | ) 115 | evaluate = Evaluate( 116 | devset=dataset, 117 | metric=self.metric, 118 | num_threads=self.num_threads, 119 | max_errors=effective_max_errors, 120 | display_table=False, 121 | display_progress=True, 122 | ) 123 | score = evaluate(program, metric=self.metric).score 124 | return score 125 | 126 | 127 | class RulesInductionProgram(dspy.Module): 128 | def __init__(self, num_rules, teacher_settings=None): 129 | super().__init__() 130 | 131 | class CustomRulesInduction(dspy.Signature): 132 | __doc__ = ( 133 | f"Given a set of examples, extract a list of {num_rules} concise and non-redundant natural language " 134 | "rules that provide clear guidance for performing the task. All rules should be actionable for a " 135 | "well-specified scope of examples of this general kind of task." 136 | ) 137 | examples_text = dspy.InputField(desc="Text containing examples") 138 | natural_language_rules = dspy.OutputField(desc="Induced natural language rules") 139 | 140 | self.rules_induction = dspy.ChainOfThought(CustomRulesInduction) 141 | self.teacher_settings = teacher_settings or {} 142 | self.rng = random.Random(0) 143 | 144 | def forward(self, examples_text): 145 | with dspy.settings.context(**self.teacher_settings): 146 | # Generate rules with a fresh rollout and non-zero temperature. 147 | lm = dspy.settings.lm.copy( 148 | rollout_id=self.rng.randint(0, 10**9), temperature=1.0 149 | ) 150 | with dspy.settings.context(lm=lm): 151 | rules = self.rules_induction(examples_text=examples_text).natural_language_rules 152 | 153 | return rules.strip() 154 | ``` -------------------------------------------------------------------------------- /tests/primitives/test_python_interpreter.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import random 3 | import shutil 4 | 5 | import pytest 6 | 7 | from dspy.primitives.python_interpreter import InterpreterError, PythonInterpreter 8 | 9 | # This test suite requires deno to be installed. 
Please install deno following https://docs.deno.com/runtime/getting_started/installation/ 10 | if shutil.which("deno") is None: 11 | pytest.skip(reason="Deno is not installed or not in PATH", allow_module_level=True) 12 | 13 | 14 | def test_execute_simple_code(): 15 | with PythonInterpreter() as interpreter: 16 | code = "print('Hello, World!')" 17 | result = interpreter.execute(code) 18 | assert result == "Hello, World!\n", "Simple print statement should return 'Hello World!\n'" 19 | 20 | 21 | def test_import(): 22 | with PythonInterpreter() as interpreter: 23 | code = "import math\nresult = math.sqrt(4)\nresult" 24 | result = interpreter.execute(code) 25 | assert result == 2, "Should be able to import and use math.sqrt" 26 | 27 | 28 | def test_user_variable_definitions(): 29 | with PythonInterpreter() as interpreter: 30 | code = "result = number + 1\nresult" 31 | result = interpreter.execute(code, variables={"number": 4}) 32 | assert result == 5, "User variable assignment should work" 33 | 34 | 35 | def test_failure_syntax_error(): 36 | with PythonInterpreter() as interpreter: 37 | code = "+++" 38 | with pytest.raises(SyntaxError, match="Invalid Python syntax"): 39 | interpreter.execute(code) 40 | 41 | 42 | def test_failure_zero_division(): 43 | with PythonInterpreter() as interpreter: 44 | code = "1+0/0" 45 | with pytest.raises(InterpreterError, match="ZeroDivisionError"): 46 | interpreter.execute(code) 47 | 48 | 49 | def test_exception_args(): 50 | with PythonInterpreter() as interpreter: 51 | token = random.randint(1, 10**9) 52 | code = f"raise ValueError({token})" 53 | with pytest.raises(InterpreterError, match=rf"ValueError: \[{token}\]"): 54 | interpreter.execute(code) 55 | 56 | 57 | def test_final_answer_trick(): 58 | with PythonInterpreter() as interpreter: 59 | token = random.randint(1, 10**9) 60 | code = f"final_answer('The result is', {token})" 61 | result = interpreter(code) 62 | 63 | # They should maintain the same order 64 | assert result == ["The result is", token], "The returned results are differ, `final_answer` trick doesn't work" 65 | 66 | def test_enable_env_vars_flag(): 67 | os.environ["FOO_TEST_ENV"] = "test_value" 68 | 69 | with PythonInterpreter(enable_env_vars=None) as interpreter: 70 | code = "import os\nresult = os.getenv('FOO_TEST_ENV')\nresult" 71 | result = interpreter.execute(code) 72 | assert result == "", "Environment variables should be inaccessible without allow-env" 73 | 74 | with PythonInterpreter(enable_env_vars=["FOO_TEST_ENV"]) as interpreter: 75 | code = "import os\nresult = os.getenv('FOO_TEST_ENV')\nresult" 76 | result = interpreter.execute(code) 77 | assert result == "test_value", "Environment variables should be accessible with allow-env" 78 | 79 | 80 | 81 | def test_read_file_access_control(tmp_path): 82 | testfile_path = tmp_path / "test_temp_file.txt" 83 | virtual_path = f"/sandbox/{testfile_path.name}" 84 | with open(testfile_path, "w") as f: 85 | f.write("test content") 86 | 87 | with PythonInterpreter(enable_read_paths=[str(testfile_path)]) as interpreter: 88 | code = ( 89 | f"with open({virtual_path!r}, 'r') as f:\n" 90 | f" data = f.read()\n" 91 | f"data" 92 | ) 93 | result = interpreter.execute(code) 94 | assert result == "test content", "Test file should be accessible with enable_read_paths and specified file" 95 | 96 | with PythonInterpreter(enable_read_paths=None) as interpreter: 97 | code = ( 98 | f"try:\n" 99 | f" with open({virtual_path!r}, 'r') as f:\n" 100 | f" data = f.read()\n" 101 | f"except Exception as e:\n" 102 | f" data 
= str(e)\n" 103 | f"data" 104 | ) 105 | result = interpreter.execute(code) 106 | assert ("PermissionDenied" in result or "denied" in result.lower() or "no such file" in result.lower()), "Test file should not be accessible without enable_read_paths" 107 | 108 | def test_enable_write_flag(tmp_path): 109 | testfile_path = tmp_path / "test_temp_output.txt" 110 | virtual_path = f"/sandbox/{testfile_path.name}" 111 | 112 | with PythonInterpreter(enable_write_paths=None) as interpreter: 113 | code = ( 114 | f"try:\n" 115 | f" with open({virtual_path!r}, 'w') as f:\n" 116 | f" f.write('blocked')\n" 117 | f" result = 'wrote'\n" 118 | f"except Exception as e:\n" 119 | f" result = str(e)\n" 120 | f"result" 121 | ) 122 | result = interpreter.execute(code) 123 | assert ("PermissionDenied" in result or "denied" in result.lower() or "no such file" in result.lower()), "Test file should not be writable without enable_write_paths" 124 | 125 | with PythonInterpreter(enable_write_paths=[str(testfile_path)]) as interpreter: 126 | code = ( 127 | f"with open({virtual_path!r}, 'w') as f:\n" 128 | f" f.write('allowed')\n" 129 | f"'ok'" 130 | ) 131 | result = interpreter.execute(code) 132 | assert result == "ok", "Test file should be writable with enable_write_paths" 133 | assert testfile_path.exists() 134 | with open(testfile_path) as f: 135 | assert f.read() == "allowed", "Test file outputs should match content written during execution" 136 | 137 | with open(testfile_path, "w") as f: 138 | f.write("original_content") 139 | with PythonInterpreter(enable_write_paths=[str(testfile_path)], sync_files=False) as interpreter: 140 | code = ( 141 | f"with open({virtual_path!r}, 'w') as f:\n" 142 | f" f.write('should_not_sync')\n" 143 | f"'done_no_sync'" 144 | ) 145 | result = interpreter.execute(code) 146 | assert result == "done_no_sync" 147 | with open(testfile_path) as f: 148 | assert f.read() == "original_content", "File should not be changed when sync_files is False" 149 | 150 | 151 | 152 | def test_enable_net_flag(): 153 | test_url = "https://example.com" 154 | 155 | with PythonInterpreter(enable_network_access=None) as interpreter: 156 | code = ( 157 | "import js\n" 158 | f"resp = await js.fetch({test_url!r})\n" 159 | "resp.status" 160 | ) 161 | with pytest.raises(InterpreterError, match="PythonError"): 162 | interpreter.execute(code) 163 | 164 | with PythonInterpreter(enable_network_access=["example.com"]) as interpreter: 165 | code = ( 166 | "import js\n" 167 | f"resp = await js.fetch({test_url!r})\n" 168 | "resp.status" 169 | ) 170 | result = interpreter.execute(code) 171 | assert int(result) == 200, "Network access is permitted with enable_network_access" 172 | ``` -------------------------------------------------------------------------------- /dspy/adapters/types/base_type.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import re 3 | from typing import Any, Optional, get_args, get_origin 4 | 5 | import json_repair 6 | import pydantic 7 | from litellm import ModelResponseStream 8 | 9 | CUSTOM_TYPE_START_IDENTIFIER = "<<CUSTOM-TYPE-START-IDENTIFIER>>" 10 | CUSTOM_TYPE_END_IDENTIFIER = "<<CUSTOM-TYPE-END-IDENTIFIER>>" 11 | 12 | 13 | class Type(pydantic.BaseModel): 14 | """Base class to support creating custom types for DSPy signatures. 15 | 16 | This is the parent class of DSPy custom types, e.g, dspy.Image. 
Subclasses must implement the `format` method to 17 | return a list of dictionaries (same as the Array of content parts in the OpenAI API user message's content field). 18 | 19 | Example: 20 | 21 | ```python 22 | class Image(Type): 23 | url: str 24 | 25 | def format(self) -> list[dict[str, Any]]: 26 | return [{"type": "image_url", "image_url": {"url": self.url}}] 27 | ``` 28 | """ 29 | 30 | def format(self) -> list[dict[str, Any]] | str: 31 | raise NotImplementedError 32 | 33 | @classmethod 34 | def description(cls) -> str: 35 | """Description of the custom type""" 36 | return "" 37 | 38 | @classmethod 39 | def extract_custom_type_from_annotation(cls, annotation): 40 | """Extract all custom types from the annotation. 41 | 42 | This is used to extract all custom types from the annotation of a field, while the annotation can 43 | have arbitrary level of nesting. For example, we detect `Tool` is in `list[dict[str, Tool]]`. 44 | """ 45 | # Direct match. Nested type like `list[dict[str, Event]]` passes `isinstance(annotation, type)` in python 3.10 46 | # while fails in python 3.11. To accommodate users using python 3.10, we need to capture the error and ignore it. 47 | try: 48 | if isinstance(annotation, type) and issubclass(annotation, cls): 49 | return [annotation] 50 | except TypeError: 51 | pass 52 | 53 | origin = get_origin(annotation) 54 | if origin is None: 55 | return [] 56 | 57 | result = [] 58 | # Recurse into all type args 59 | for arg in get_args(annotation): 60 | result.extend(cls.extract_custom_type_from_annotation(arg)) 61 | 62 | return result 63 | 64 | @pydantic.model_serializer() 65 | def serialize_model(self): 66 | formatted = self.format() 67 | if isinstance(formatted, list): 68 | return ( 69 | f"{CUSTOM_TYPE_START_IDENTIFIER}{json.dumps(formatted, ensure_ascii=False)}{CUSTOM_TYPE_END_IDENTIFIER}" 70 | ) 71 | return formatted 72 | 73 | @classmethod 74 | def is_streamable(cls) -> bool: 75 | """Whether the custom type is streamable.""" 76 | return False 77 | 78 | @classmethod 79 | def parse_stream_chunk(cls, chunk: ModelResponseStream) -> Optional["Type"]: 80 | """ 81 | Parse a stream chunk into the custom type. 82 | 83 | Args: 84 | chunk: A stream chunk. 85 | 86 | Returns: 87 | A custom type object or None if the chunk is not for this custom type. 88 | """ 89 | return None 90 | 91 | 92 | @classmethod 93 | def parse_lm_response(cls, response: str | dict[str, Any]) -> Optional["Type"]: 94 | """Parse a LM response into the custom type. 95 | 96 | Args: 97 | response: A LM response. 98 | 99 | Returns: 100 | A custom type object. 101 | """ 102 | return None 103 | 104 | def split_message_content_for_custom_types(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: 105 | """Split user message content into a list of content blocks. 106 | 107 | This method splits each user message's content in the `messages` list to be a list of content block, so that 108 | the custom types like `dspy.Image` can be properly formatted for better quality. For example, the split content 109 | may look like below if the user message has a `dspy.Image` object: 110 | 111 | ``` 112 | [ 113 | {"type": "text", "text": "{text_before_image}"}, 114 | {"type": "image_url", "image_url": {"url": "{image_url}"}}, 115 | {"type": "text", "text": "{text_after_image}"}, 116 | ] 117 | ``` 118 | 119 | This is implemented by finding the `<<CUSTOM-TYPE-START-IDENTIFIER>>` and `<<CUSTOM-TYPE-END-IDENTIFIER>>` 120 | in the user message content and splitting the content around them. 
The `<<CUSTOM-TYPE-START-IDENTIFIER>>` 121 | and `<<CUSTOM-TYPE-END-IDENTIFIER>>` are the reserved identifiers for the custom types as in `dspy.Type`. 122 | 123 | Args: 124 | messages: a list of messages sent to the LM. The format is the same as [OpenAI API's messages 125 | format](https://platform.openai.com/docs/guides/chat-completions/response-format). 126 | 127 | Returns: 128 | A list of messages with the content split into a list of content blocks around custom types content. 129 | """ 130 | for message in messages: 131 | if message["role"] != "user": 132 | # Custom type messages are only in user messages 133 | continue 134 | 135 | pattern = rf"{CUSTOM_TYPE_START_IDENTIFIER}(.*?){CUSTOM_TYPE_END_IDENTIFIER}" 136 | result = [] 137 | last_end = 0 138 | # DSPy adapter always formats user input into a string content before custom type splitting 139 | content: str = message["content"] 140 | 141 | for match in re.finditer(pattern, content, re.DOTALL): 142 | start, end = match.span() 143 | 144 | # Add text before the current block 145 | if start > last_end: 146 | result.append({"type": "text", "text": content[last_end:start]}) 147 | 148 | # Parse the JSON inside the block 149 | custom_type_content = match.group(1).strip() 150 | parsed = None 151 | 152 | for parse_fn in [json.loads, _parse_doubly_quoted_json, json_repair.loads]: 153 | try: 154 | parsed = parse_fn(custom_type_content) 155 | break 156 | except json.JSONDecodeError: 157 | continue 158 | 159 | if parsed: 160 | for custom_type_content in parsed: 161 | result.append(custom_type_content) 162 | else: 163 | # fallback to raw string if it's not valid JSON 164 | result.append({"type": "text", "text": custom_type_content}) 165 | 166 | last_end = end 167 | 168 | if last_end == 0: 169 | # No custom type found, return the original message 170 | continue 171 | 172 | # Add any remaining text after the last match 173 | if last_end < len(content): 174 | result.append({"type": "text", "text": content[last_end:]}) 175 | 176 | message["content"] = result 177 | 178 | return messages 179 | 180 | 181 | def _parse_doubly_quoted_json(json_str: str) -> Any: 182 | """ 183 | Parse a doubly quoted JSON string into a Python dict. 184 | `dspy.Type` can be json-encoded twice if included in either list or dict, e.g., `list[dspy.experimental.Document]` 185 | """ 186 | return json.loads(json.loads(f'"{json_str}"')) 187 | ``` -------------------------------------------------------------------------------- /docs/docs/tutorials/saving/index.md: -------------------------------------------------------------------------------- ```markdown 1 | # Tutorial: Saving and Loading your DSPy program 2 | 3 | This guide demonstrates how to save and load your DSPy program. At a high level, there are two ways to save your DSPy program: 4 | 5 | 1. Save the state of the program only, similar to weights-only saving in PyTorch. 6 | 2. Save the whole program, including both the architecture and the state, which is supported by `dspy>=2.6.0`. 7 | 8 | ## State-only Saving 9 | 10 | State represents the DSPy program's internal state, including the signature, demos (few-shot examples), and other information like 11 | the `lm` to use for each `dspy.Predict` in the program. It also includes configurable attributes of other DSPy modules like 12 | `k` for `dspy.retrievers.Retriever`. To save the state of a program, use the `save` method and set `save_program=False`. You can 13 | choose to save the state to a JSON file or a pickle file. 
We recommend saving the state to a JSON file because it is safer and readable. 14 | But sometimes your program contains non-serializable objects like `dspy.Image` or `datetime.datetime`, in which case you should save 15 | the state to a pickle file. 16 | 17 | Let's say we have compiled a program with some data, and we want to save the program for future usage: 18 | 19 | ```python 20 | import dspy 21 | from dspy.datasets.gsm8k import GSM8K, gsm8k_metric 22 | 23 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini")) 24 | 25 | gsm8k = GSM8K() 26 | gsm8k_trainset = gsm8k.train[:10] 27 | dspy_program = dspy.ChainOfThought("question -> answer") 28 | 29 | optimizer = dspy.BootstrapFewShot(metric=gsm8k_metric, max_bootstrapped_demos=4, max_labeled_demos=4, max_rounds=5) 30 | compiled_dspy_program = optimizer.compile(dspy_program, trainset=gsm8k_trainset) 31 | ``` 32 | 33 | To save the state of your program to json file: 34 | 35 | ```python 36 | compiled_dspy_program.save("./dspy_program/program.json", save_program=False) 37 | ``` 38 | 39 | To save the state of your program to a pickle file: 40 | 41 | ```python 42 | compiled_dspy_program.save("./dspy_program/program.pkl", save_program=False) 43 | ``` 44 | 45 | To load your saved state, you need to **recreate the same program**, then load the state using the `load` method. 46 | 47 | ```python 48 | loaded_dspy_program = dspy.ChainOfThought("question -> answer") # Recreate the same program. 49 | loaded_dspy_program.load("./dspy_program/program.json") 50 | 51 | assert len(compiled_dspy_program.demos) == len(loaded_dspy_program.demos) 52 | for original_demo, loaded_demo in zip(compiled_dspy_program.demos, loaded_dspy_program.demos): 53 | # Loaded demo is a dict, while the original demo is a dspy.Example. 54 | assert original_demo.toDict() == loaded_demo 55 | assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature) 56 | ``` 57 | 58 | Or load the state from a pickle file: 59 | 60 | ```python 61 | loaded_dspy_program = dspy.ChainOfThought("question -> answer") # Recreate the same program. 62 | loaded_dspy_program.load("./dspy_program/program.pkl") 63 | 64 | assert len(compiled_dspy_program.demos) == len(loaded_dspy_program.demos) 65 | for original_demo, loaded_demo in zip(compiled_dspy_program.demos, loaded_dspy_program.demos): 66 | # Loaded demo is a dict, while the original demo is a dspy.Example. 67 | assert original_demo.toDict() == loaded_demo 68 | assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature) 69 | ``` 70 | 71 | ## Whole Program Saving 72 | 73 | Starting from `dspy>=2.6.0`, DSPy supports saving the whole program, including the architecture and the state. This feature 74 | is powered by `cloudpickle`, which is a library for serializing and deserializing Python objects. 75 | 76 | To save the whole program, use the `save` method and set `save_program=True`, and specify a directory path to save the program 77 | instead of a file name. We require a directory path because we also save some metadata, e.g., the dependency versions along 78 | with the program itself. 
79 | 80 | ```python 81 | compiled_dspy_program.save("./dspy_program/", save_program=True) 82 | ``` 83 | 84 | To load the saved program, directly use `dspy.load` method: 85 | 86 | ```python 87 | loaded_dspy_program = dspy.load("./dspy_program/") 88 | 89 | assert len(compiled_dspy_program.demos) == len(loaded_dspy_program.demos) 90 | for original_demo, loaded_demo in zip(compiled_dspy_program.demos, loaded_dspy_program.demos): 91 | # Loaded demo is a dict, while the original demo is a dspy.Example. 92 | assert original_demo.toDict() == loaded_demo 93 | assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature) 94 | ``` 95 | 96 | With whole program saving, you don't need to recreate the program, but can directly load the architecture along with the state. 97 | You can pick the suitable saving approach based on your needs. 98 | 99 | ### Serializing Imported Modules 100 | 101 | When saving a program with `save_program=True`, you might need to include custom modules that your program depends on. This is 102 | necessary if your program depends on these modules, but at loading time these modules are not imported before calling `dspy.load`. 103 | 104 | You can specify which custom modules should be serialized with your program by passing them to the `modules_to_serialize` 105 | parameter when calling `save`. This ensures that any dependencies your program relies on are included during serialization and 106 | available when loading the program later. 107 | 108 | Under the hood this uses cloudpickle's `cloudpickle.register_pickle_by_value` function to register a module as picklable by value. 109 | When a module is registered this way, cloudpickle will serialize the module by value rather than by reference, ensuring that the 110 | module contents are preserved with the saved program. 111 | 112 | For example, if your program uses custom modules: 113 | 114 | ```python 115 | import dspy 116 | import my_custom_module 117 | 118 | compiled_dspy_program = dspy.ChainOfThought(my_custom_module.custom_signature) 119 | 120 | # Save the program with the custom module 121 | compiled_dspy_program.save( 122 | "./dspy_program/", 123 | save_program=True, 124 | modules_to_serialize=[my_custom_module] 125 | ) 126 | ``` 127 | 128 | This ensures that the required modules are properly serialized and available when loading the program later. Any number of 129 | modules can be passed to `modules_to_serialize`. If you don't specify `modules_to_serialize`, no additional modules will be 130 | registered for serialization. 131 | 132 | ## Backward Compatibility 133 | 134 | As of `dspy<3.0.0`, we don't guarantee the backward compatibility of the saved program. For example, if you save the program with `dspy==2.5.35`, 135 | at loading time please make sure to use the same version of DSPy to load the program, otherwise the program may not work as expected. Chances 136 | are that loading a saved file in a different version of DSPy will not raise an error, but the performance could be different from when 137 | the program was saved. 138 | 139 | Starting from `dspy>=3.0.0`, we will guarantee the backward compatibility of the saved program in major releases, i.e., programs saved in `dspy==3.0.0` 140 | should be loadable in `dspy==3.7.10`. 
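If you need to guard against version drift, one lightweight option is to record the DSPy version next to the saved state and compare it at load time. The sketch below is only an illustration; it assumes `dspy.__version__` is exposed by your install, and the sidecar file name is arbitrary:

```python
import json
import warnings

import dspy

# At save time: store the state plus the DSPy version that produced it.
compiled_dspy_program.save("./dspy_program/program.json", save_program=False)
with open("./dspy_program/version.json", "w") as f:
    json.dump({"dspy_version": dspy.__version__}, f)

# At load time: warn when the installed DSPy version differs from the recorded one.
with open("./dspy_program/version.json") as f:
    saved_version = json.load(f)["dspy_version"]
if saved_version != dspy.__version__:
    warnings.warn(f"State was saved with dspy=={saved_version}, but dspy=={dspy.__version__} is installed.")

loaded_dspy_program = dspy.ChainOfThought("question -> answer")  # Recreate the same program.
loaded_dspy_program.load("./dspy_program/program.json")
```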
141 | ``` -------------------------------------------------------------------------------- /dspy/dsp/utils/settings.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import contextvars 3 | import copy 4 | import threading 5 | from contextlib import contextmanager 6 | 7 | from dspy.dsp.utils.utils import dotdict 8 | 9 | DEFAULT_CONFIG = dotdict( 10 | lm=None, 11 | adapter=None, 12 | rm=None, 13 | branch_idx=0, 14 | trace=[], 15 | callbacks=[], 16 | async_max_workers=8, 17 | send_stream=None, 18 | disable_history=False, 19 | track_usage=False, 20 | usage_tracker=None, 21 | caller_predict=None, 22 | caller_modules=None, 23 | stream_listeners=[], 24 | provide_traceback=False, # Whether to include traceback information in error logs. 25 | num_threads=8, # Number of threads to use for parallel processing. 26 | max_errors=10, # Maximum errors before halting operations. 27 | # If true, async tools can be called in sync mode by getting converted to sync. 28 | allow_tool_async_sync_conversion=False, 29 | max_history_size=10000, 30 | max_trace_size=10000, 31 | ) 32 | 33 | # Global base configuration and owner tracking 34 | main_thread_config = copy.deepcopy(DEFAULT_CONFIG) 35 | config_owner_thread_id = None 36 | config_owner_async_task = None 37 | 38 | # Global lock for settings configuration 39 | global_lock = threading.Lock() 40 | 41 | thread_local_overrides = contextvars.ContextVar("context_overrides", default=dotdict()) 42 | 43 | 44 | class Settings: 45 | """ 46 | A singleton class for DSPy configuration settings. 47 | Thread-safe global configuration. 48 | - 'configure' can be called by only one 'owner' thread (the first thread that calls it). 49 | - Other threads see the configured global values from 'main_thread_config'. 50 | - 'context' sets thread-local overrides. These overrides propagate to threads spawned 51 | inside that context block, when (and only when!) using a ParallelExecutor that copies overrides. 52 | 53 | 1. Only one unique thread (which can be any thread!) can call dspy.configure. 54 | 2. It affects a global state, visible to all. As a result, user threads work, but they shouldn't be 55 | mixed with concurrent changes to dspy.configure from the "main" thread. 56 | (TODO: In the future, add warnings: if there are near-in-time user-thread reads followed by .configure calls.) 57 | 3. Any thread can use dspy.context. It propagates to child threads created with DSPy primitives: Parallel, asyncify, etc. 
58 | """ 59 | 60 | _instance = None 61 | 62 | def __new__(cls): 63 | if cls._instance is None: 64 | cls._instance = super().__new__(cls) 65 | return cls._instance 66 | 67 | @property 68 | def lock(self): 69 | return global_lock 70 | 71 | def __getattr__(self, name): 72 | overrides = thread_local_overrides.get() 73 | if name in overrides: 74 | return overrides[name] 75 | elif name in main_thread_config: 76 | return main_thread_config[name] 77 | else: 78 | raise AttributeError(f"'Settings' object has no attribute '{name}'") 79 | 80 | def __setattr__(self, name, value): 81 | if name in ("_instance",): 82 | super().__setattr__(name, value) 83 | else: 84 | self.configure(**{name: value}) 85 | 86 | def __getitem__(self, key): 87 | return self.__getattr__(key) 88 | 89 | def __setitem__(self, key, value): 90 | self.__setattr__(key, value) 91 | 92 | def __contains__(self, key): 93 | overrides = thread_local_overrides.get() 94 | return key in overrides or key in main_thread_config 95 | 96 | def get(self, key, default=None): 97 | try: 98 | return self[key] 99 | except AttributeError: 100 | return default 101 | 102 | def copy(self): 103 | overrides = thread_local_overrides.get() 104 | return dotdict({**main_thread_config, **overrides}) 105 | 106 | @property 107 | def config(self): 108 | return self.copy() 109 | 110 | def _ensure_configure_allowed(self): 111 | global main_thread_config, config_owner_thread_id, config_owner_async_task 112 | current_thread_id = threading.get_ident() 113 | 114 | if config_owner_thread_id is None: 115 | # First `configure` call assigns the owner thread id. 116 | config_owner_thread_id = current_thread_id 117 | 118 | if config_owner_thread_id != current_thread_id: 119 | # Disallow a second `configure` calls from other threads. 120 | raise RuntimeError("dspy.settings can only be changed by the thread that initially configured it.") 121 | 122 | # Async task doesn't allow a second `configure` call, must use dspy.context(...) instead. 123 | is_async_task = False 124 | try: 125 | if asyncio.current_task() is not None: 126 | is_async_task = True 127 | except RuntimeError: 128 | # This exception (e.g., "no current task") means we are not in an async loop/task, 129 | # or asyncio module itself is not fully functional in this specific sub-thread context. 130 | is_async_task = False 131 | 132 | if not is_async_task: 133 | return 134 | 135 | if config_owner_async_task is None: 136 | # First `configure` call assigns the owner async task. 137 | config_owner_async_task = asyncio.current_task() 138 | return 139 | 140 | # We are in an async task. Now check for IPython and allow calling `configure` from IPython. 141 | in_ipython = False 142 | try: 143 | from IPython import get_ipython 144 | 145 | # get_ipython is a global injected by IPython environments. 146 | # We check its existence and type to be more robust. 147 | in_ipython = get_ipython() is not None 148 | except Exception: 149 | # If `IPython` is not installed or `get_ipython` failed, we are not in an IPython environment. 150 | in_ipython = False 151 | 152 | if not in_ipython and config_owner_async_task != asyncio.current_task(): 153 | raise RuntimeError( 154 | "dspy.settings.configure(...) can only be called from the same async task that called it first. Please " 155 | "use `dspy.context(...)` in other async tasks instead." 156 | ) 157 | 158 | def configure(self, **kwargs): 159 | # If no exception is raised, the `configure` call is allowed. 
160 | self._ensure_configure_allowed() 161 | 162 | # Update global config 163 | for k, v in kwargs.items(): 164 | main_thread_config[k] = v 165 | 166 | @contextmanager 167 | def context(self, **kwargs): 168 | """ 169 | Context manager for temporary configuration changes at the thread level. 170 | Does not affect global configuration. Changes only apply to the current thread. 171 | If threads are spawned inside this block using ParallelExecutor, they will inherit these overrides. 172 | """ 173 | 174 | original_overrides = thread_local_overrides.get().copy() 175 | new_overrides = dotdict({**main_thread_config, **original_overrides, **kwargs}) 176 | token = thread_local_overrides.set(new_overrides) 177 | 178 | try: 179 | yield 180 | finally: 181 | thread_local_overrides.reset(token) 182 | 183 | def __repr__(self): 184 | overrides = thread_local_overrides.get() 185 | combined_config = {**main_thread_config, **overrides} 186 | return repr(combined_config) 187 | 188 | 189 | settings = Settings() 190 | ``` -------------------------------------------------------------------------------- /dspy/teleprompt/random_search.py: -------------------------------------------------------------------------------- ```python 1 | import random 2 | 3 | import dspy 4 | from dspy.evaluate.evaluate import Evaluate 5 | from dspy.teleprompt.teleprompt import Teleprompter 6 | 7 | from .bootstrap import BootstrapFewShot 8 | from .vanilla import LabeledFewShot 9 | 10 | # TODO: Don't forget dealing with the raw demos. 11 | # TODO: Deal with the (pretty common) case of having a metric for filtering and a separate metric for eval. 12 | # The metric itself may tell though by the presence of trace. 13 | 14 | # TODO: This function should take a max_budget and max_teacher_budget. That's in the number of program calls. 15 | # In this case, max_student_budget is max_budget - max_teacher_budget. 16 | # For max_teacher_budget, this will just limit the total number of things we bootstrap. 17 | # This can end up implicitly defining the number of candidate programs (i.e., stop when runs out). Cap at 16. 18 | # For max_student_budget, this will be a more upfront calculation. 19 | # Right now, it can also just induce the number of candidate programs. Later, it could be used more interestingly 20 | # for selective early stopping. 21 | # Progressive elimination sounds about right: after 50 examples, drop bottom third, after 100, another third, etc. 22 | # until only 3--5 are left for the end. Could also be systematic and add (earlier) stopping based on error bounds. 23 | # In general, though, the early filtering is just saying: either there are some really bad ones, or some really really 24 | # good ones, or most things are pretty close. In all of these cases, dropping the bottom third is not going to hurt. 
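# Editor's illustrative sketch (hypothetical, not part of this module): one shape the
# progressive-elimination idea described above could take. It assumes a callable
# `evaluate_on(program, n)` that scores a candidate program on the first `n` devset examples.
def _progressively_eliminate(candidates, evaluate_on, checkpoints=(50, 100, 200), keep_min=4):
    survivors = list(candidates)
    for n in checkpoints:
        if len(survivors) <= keep_min:
            break
        # Rank by score on a growing prefix of the devset, then drop the bottom third.
        ranked = sorted(survivors, key=lambda prog: evaluate_on(prog, n), reverse=True)
        survivors = ranked[: max(keep_min, (2 * len(ranked)) // 3)]
    return survivors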
25 | 26 | 27 | class BootstrapFewShotWithRandomSearch(Teleprompter): 28 | def __init__( 29 | self, 30 | metric, 31 | teacher_settings=None, 32 | max_bootstrapped_demos=4, 33 | max_labeled_demos=16, 34 | max_rounds=1, 35 | num_candidate_programs=16, 36 | num_threads=None, 37 | max_errors=None, 38 | stop_at_score=None, 39 | metric_threshold=None, 40 | ): 41 | self.metric = metric 42 | self.teacher_settings = teacher_settings or {} 43 | self.max_rounds = max_rounds 44 | 45 | self.num_threads = num_threads 46 | self.stop_at_score = stop_at_score 47 | self.metric_threshold = metric_threshold 48 | self.min_num_samples = 1 49 | self.max_num_samples = max_bootstrapped_demos 50 | self.max_errors = max_errors 51 | self.num_candidate_sets = num_candidate_programs 52 | self.max_labeled_demos = max_labeled_demos 53 | 54 | print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") 55 | print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") 56 | 57 | def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): 58 | self.trainset = trainset 59 | self.valset = valset or trainset # TODO: FIXME: Note this choice. 60 | 61 | effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors 62 | 63 | scores = [] 64 | all_subscores = [] 65 | score_data = [] 66 | 67 | for seed in range(-3, self.num_candidate_sets): 68 | if (restrict is not None) and (seed not in restrict): 69 | continue 70 | 71 | trainset_copy = list(self.trainset) 72 | 73 | if seed == -3: 74 | # zero-shot 75 | program = student.reset_copy() 76 | 77 | elif seed == -2: 78 | # labels only 79 | teleprompter = LabeledFewShot(k=self.max_labeled_demos) 80 | program = teleprompter.compile(student, trainset=trainset_copy, sample=labeled_sample) 81 | 82 | elif seed == -1: 83 | # unshuffled few-shot 84 | optimizer = BootstrapFewShot( 85 | metric=self.metric, 86 | metric_threshold=self.metric_threshold, 87 | max_bootstrapped_demos=self.max_num_samples, 88 | max_labeled_demos=self.max_labeled_demos, 89 | teacher_settings=self.teacher_settings, 90 | max_rounds=self.max_rounds, 91 | max_errors=effective_max_errors, 92 | ) 93 | program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) 94 | 95 | else: 96 | assert seed >= 0, seed 97 | 98 | random.Random(seed).shuffle(trainset_copy) 99 | size = random.Random(seed).randint(self.min_num_samples, self.max_num_samples) 100 | 101 | optimizer = BootstrapFewShot( 102 | metric=self.metric, 103 | metric_threshold=self.metric_threshold, 104 | max_bootstrapped_demos=size, 105 | max_labeled_demos=self.max_labeled_demos, 106 | teacher_settings=self.teacher_settings, 107 | max_rounds=self.max_rounds, 108 | max_errors=effective_max_errors, 109 | ) 110 | 111 | program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) 112 | 113 | evaluate = Evaluate( 114 | devset=self.valset, 115 | metric=self.metric, 116 | num_threads=self.num_threads, 117 | max_errors=effective_max_errors, 118 | display_table=False, 119 | display_progress=True, 120 | ) 121 | 122 | result = evaluate(program) 123 | 124 | score, subscores = result.score, [output[2] for output in result.results] 125 | 126 | all_subscores.append(subscores) 127 | 128 | if len(scores) == 0 or score > max(scores): 129 | print("New best score:", score, "for seed", seed) 130 | best_program = program 131 | 132 | scores.append(score) 133 | print(f"Scores so far: {scores}") 134 | print(f"Best score so far: 
{max(scores)}") 135 | 136 | score_data.append({"score": score, "subscores": subscores, "seed": seed, "program": program}) 137 | 138 | if self.stop_at_score is not None and score >= self.stop_at_score: 139 | print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}") 140 | break 141 | 142 | # To best program, attach all program candidates in decreasing average score 143 | best_program.candidate_programs = score_data 144 | best_program.candidate_programs = sorted( 145 | best_program.candidate_programs, key=lambda x: x["score"], reverse=True 146 | ) 147 | 148 | print(f"{len(best_program.candidate_programs)} candidate programs found.") 149 | 150 | return best_program 151 | 152 | 153 | # sample between 4 and 10 examples from traces 154 | # TODO: FIXME: The max number of demos should be determined in part by the LM's tokenizer + max_length. 155 | # This does require executing the program, or at least the predictor. 156 | # # # # # # (Actually we can just combine the token counts of the traces, when formatted via signature/adapter). 157 | # Alternatively, we can keep track of the (zero-shot) number of tokens when we bootstrap. 158 | # As another option, we can just try a wide range and handle failures as penalties on the score. 159 | # The number "24" of traces to collect can also be affected. If we only need 3x10, some overlap is ok. 160 | # We can also consider having short_demos and long_demos. 161 | ``` -------------------------------------------------------------------------------- /tests/primitives/test_module.py: -------------------------------------------------------------------------------- ```python 1 | from pathlib import Path 2 | 3 | import dspy 4 | from dspy.primitives.module import Module, set_attribute_by_name # Adjust the import based on your file structure 5 | from dspy.utils import DummyLM 6 | 7 | 8 | class HopModule(dspy.Module): 9 | def __init__(self): 10 | super().__init__() 11 | self.predict1 = dspy.Predict("question -> query") 12 | self.predict2 = dspy.Predict("query -> answer") 13 | 14 | def forward(self, question): 15 | query = self.predict1(question=question).query 16 | return self.predict2(query=query) 17 | 18 | 19 | def test_module_initialization(): 20 | module = Module() 21 | assert module._compiled is False, "Module _compiled attribute should be False upon initialization" 22 | 23 | 24 | def test_named_predictors(): 25 | module = HopModule() 26 | named_preds = module.named_predictors() 27 | assert len(named_preds) == 2, "Should identify correct number of Predict instances" 28 | names, preds = zip(*named_preds, strict=False) 29 | assert "predict1" in names and "predict2" in names, "Named predictors should include 'predict1' and 'predict2'" 30 | 31 | 32 | def test_predictors(): 33 | module = HopModule() 34 | preds = module.predictors() 35 | assert len(preds) == 2, "Should return correct number of Predict instances" 36 | assert all(isinstance(p, dspy.Predict) for p in preds), "All returned items should be instances of PredictMock" 37 | 38 | 39 | def test_forward(): 40 | program = HopModule() 41 | dspy.settings.configure( 42 | lm=DummyLM( 43 | { 44 | "What is 1+1?": {"query": "let me check"}, 45 | "let me check": {"answer": "2"}, 46 | } 47 | ) 48 | ) 49 | result = program(question="What is 1+1?").answer 50 | assert result == "2" 51 | 52 | 53 | def test_nested_named_predictors(): 54 | class Hop2Module(dspy.Module): 55 | def __init__(self): 56 | super().__init__() 57 | self.hop = HopModule() 58 | 59 | module = Hop2Module() 60 | named_preds = 
module.named_predictors() 61 | assert len(named_preds) == 2 62 | names, _preds = zip(*named_preds, strict=False) 63 | assert "hop.predict1" in names 64 | assert "hop.predict2" in names 65 | 66 | 67 | def test_empty_module(): 68 | module = Module() 69 | assert list(module.named_sub_modules()) == [("self", module)] 70 | 71 | 72 | def test_single_level(): 73 | module = Module() 74 | module.sub = Module() 75 | expected = [("self", module), ("self.sub", module.sub)] 76 | assert list(module.named_sub_modules()) == expected 77 | 78 | 79 | def test_multiple_levels(): 80 | module = Module() 81 | module.sub = Module() 82 | module.sub.subsub = Module() 83 | expected = [("self", module), ("self.sub", module.sub), ("self.sub.subsub", module.sub.subsub)] 84 | assert list(module.named_sub_modules()) == expected 85 | 86 | 87 | def test_multiple_sub_modules(): 88 | module = Module() 89 | module.sub1 = Module() 90 | module.sub2 = Module() 91 | expected = [("self", module), ("self.sub1", module.sub1), ("self.sub2", module.sub2)] 92 | assert sorted(module.named_sub_modules()) == sorted(expected) 93 | 94 | 95 | def test_non_base_module_attributes(): 96 | module = Module() 97 | module.sub = Module() 98 | module.not_a_sub = "Not a self" 99 | expected = [("self", module), ("self.sub", module.sub)] 100 | assert list(module.named_sub_modules()) == expected 101 | 102 | 103 | def test_complex_module_traversal(): 104 | root = Module() 105 | root.sub_module = Module() 106 | root.sub_module.nested_list = [Module(), {"key": Module()}] 107 | root.sub_module.nested_tuple = (Module(), [Module(), Module()]) 108 | expected_names = { 109 | "self", 110 | "self.sub_module", 111 | "self.sub_module.nested_list[0]", 112 | "self.sub_module.nested_list[1][key]", 113 | "self.sub_module.nested_tuple[0]", 114 | "self.sub_module.nested_tuple[1][0]", 115 | "self.sub_module.nested_tuple[1][1]", 116 | } 117 | found_names = {name for name, _ in root.named_sub_modules()} 118 | 119 | assert found_names == expected_names, ( 120 | f"Missing or extra modules found. Missing: {expected_names - found_names}, Extra: {found_names - expected_names}" 121 | ) 122 | 123 | 124 | def test_complex_module_traversal_with_same_module(): 125 | root = Module() 126 | root.sub_module = Module() 127 | root.sub_module.nested_list = [Module(), {"key": Module()}] 128 | same_module = Module() 129 | root.sub_module.nested_tuple = (Module(), [same_module, same_module]) 130 | expected_names = { 131 | "self", 132 | "self.sub_module", 133 | "self.sub_module.nested_list[0]", 134 | "self.sub_module.nested_list[1][key]", # NOTE: named_sub_modules allows recursive structures 135 | "self.sub_module.nested_tuple[0]", 136 | "self.sub_module.nested_tuple[1][0]", # NEW: named_sub_modules allows recursive structures, but named_parameters does not 137 | } 138 | found_names = {name for name, _ in root.named_sub_modules()} 139 | 140 | assert found_names == expected_names, ( 141 | f"Missing or extra modules found. 
Missing: {expected_names - found_names}, Extra: {found_names - expected_names}" 142 | ) 143 | 144 | 145 | def test_complex_module_set_attribute_by_name(): 146 | root = Module() 147 | root.sub_module = Module() 148 | root.sub_module.nested_list = [Module(), {"key": Module()}] 149 | same_module = Module() 150 | root.sub_module.nested_tuple = (Module(), [same_module, same_module]) 151 | 152 | set_attribute_by_name(root, "test_attrib", True) 153 | assert root.test_attrib is True 154 | set_attribute_by_name(root, "sub_module.test_attrib", True) 155 | assert root.sub_module.test_attrib is True 156 | set_attribute_by_name(root, "sub_module.nested_list[0].test_attrib", True) 157 | assert root.sub_module.nested_list[0].test_attrib is True 158 | set_attribute_by_name(root, "sub_module.nested_list[1]['key'].test_attrib", True) 159 | assert root.sub_module.nested_list[1]["key"].test_attrib is True 160 | set_attribute_by_name(root, "sub_module.nested_tuple[0].test_attrib", True) 161 | assert root.sub_module.nested_tuple[0].test_attrib is True 162 | set_attribute_by_name(root, "sub_module.nested_tuple[1][0].test_attrib", True) 163 | assert root.sub_module.nested_tuple[1][0].test_attrib is True 164 | assert root.sub_module.nested_tuple[1][1].test_attrib is True 165 | 166 | 167 | class DuplicateModule(Module): 168 | def __init__(self): 169 | super().__init__() 170 | self.p0 = dspy.Predict("question -> answer") 171 | self.p1 = self.p0 172 | 173 | 174 | def test_named_parameters_duplicate_references(): 175 | module = DuplicateModule() 176 | # Only testing for whether exceptions are thrown or not 177 | # As Module.named_parameters() is recursive, this is mainly for catching infinite recursion 178 | module.named_parameters() 179 | 180 | 181 | def test_load_dspy_program_cross_version(): 182 | """ 183 | Test backward compatibility for loading a saved DSPy program. 184 | 185 | This test verifies that DSPy can load a program saved in version 3.0.1, ensuring compatibility with older versions. 186 | The saved state is located in 'test/primitives/resources/saved_program.json' and represents an optimized 187 | `dspy.ReAct` program. 
188 | """ 189 | path = Path(__file__).parent / "resources" / "saved_program.json" 190 | loaded_react = dspy.ReAct("question->answer", tools=[]) 191 | loaded_react.load(path) 192 | assert ( 193 | "Imagine you are a detective racing against time to solve a high-profile" 194 | in loaded_react.react.signature.instructions 195 | ) 196 | assert "Given the very verbose fields `question`" in loaded_react.extract.predict.signature.instructions 197 | 198 | assert len(loaded_react.react.demos) == 2 199 | assert len(loaded_react.extract.predict.demos) == 2 200 | ``` -------------------------------------------------------------------------------- /dspy/dsp/utils/dpr.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Source: DPR Implementation from Facebook Research 3 | https://github.com/facebookresearch/DPR/tree/master/dpr 4 | Original license: https://github.com/facebookresearch/DPR/blob/main/LICENSE 5 | """ 6 | 7 | import copy 8 | import logging 9 | import unicodedata 10 | 11 | import regex 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class Tokens: 17 | """A class to represent a list of tokenized text.""" 18 | 19 | TEXT = 0 20 | TEXT_WS = 1 21 | SPAN = 2 22 | POS = 3 23 | LEMMA = 4 24 | NER = 5 25 | 26 | def __init__(self, data, annotators, opts=None): 27 | self.data = data 28 | self.annotators = annotators 29 | self.opts = opts or {} 30 | 31 | def __len__(self): 32 | """The number of tokens.""" 33 | return len(self.data) 34 | 35 | def slice(self, i=None, j=None): 36 | """Return a view of the list of tokens from [i, j).""" 37 | new_tokens = copy.copy(self) 38 | new_tokens.data = self.data[i:j] 39 | return new_tokens 40 | 41 | def untokenize(self): 42 | """Returns the original text (with whitespace reinserted).""" 43 | return "".join([t[self.TEXT_WS] for t in self.data]).strip() 44 | 45 | def words(self, uncased=False): 46 | """Returns a list of the text of each token 47 | 48 | Args: 49 | uncased: lower cases text 50 | """ 51 | if uncased: 52 | return [t[self.TEXT].lower() for t in self.data] 53 | else: 54 | return [t[self.TEXT] for t in self.data] 55 | 56 | def offsets(self): 57 | """Returns a list of [start, end) character offsets of each token.""" 58 | return [t[self.SPAN] for t in self.data] 59 | 60 | def pos(self): 61 | """Returns a list of part-of-speech tags of each token. 62 | Returns None if this annotation was not included. 63 | """ 64 | if "pos" not in self.annotators: 65 | return None 66 | return [t[self.POS] for t in self.data] 67 | 68 | def lemmas(self): 69 | """Returns a list of the lemmatized text of each token. 70 | Returns None if this annotation was not included. 71 | """ 72 | if "lemma" not in self.annotators: 73 | return None 74 | return [t[self.LEMMA] for t in self.data] 75 | 76 | def entities(self): 77 | """Returns a list of named-entity-recognition tags of each token. 78 | Returns None if this annotation was not included. 79 | """ 80 | if "ner" not in self.annotators: 81 | return None 82 | return [t[self.NER] for t in self.data] 83 | 84 | def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): 85 | """Returns a list of all ngrams from length 1 to n. 
86 | 87 | Args: 88 | n: upper limit of ngram length 89 | uncased: lower cases text 90 | filter_fn: user function that takes in an ngram list and returns 91 | True or False to keep or not keep the ngram 92 | as_string: return the ngram as a string vs list 93 | """ 94 | 95 | def _skip(gram): 96 | if not filter_fn: 97 | return False 98 | return filter_fn(gram) 99 | 100 | words = self.words(uncased) 101 | ngrams = [ 102 | (s, e + 1) 103 | for s in range(len(words)) 104 | for e in range(s, min(s + n, len(words))) 105 | if not _skip(words[s : e + 1]) 106 | ] 107 | 108 | # Concatenate into strings 109 | if as_strings: 110 | ngrams = ["{}".format(" ".join(words[s:e])) for (s, e) in ngrams] 111 | 112 | return ngrams 113 | 114 | def entity_groups(self): 115 | """Group consecutive entity tokens with the same NER tag.""" 116 | entities = self.entities() 117 | if not entities: 118 | return None 119 | non_ent = self.opts.get("non_ent", "O") 120 | groups = [] 121 | idx = 0 122 | while idx < len(entities): 123 | ner_tag = entities[idx] 124 | # Check for entity tag 125 | if ner_tag != non_ent: 126 | # Chomp the sequence 127 | start = idx 128 | while idx < len(entities) and entities[idx] == ner_tag: 129 | idx += 1 130 | groups.append((self.slice(start, idx).untokenize(), ner_tag)) 131 | else: 132 | idx += 1 133 | return groups 134 | 135 | 136 | class Tokenizer: 137 | """Base tokenizer class. 138 | Tokenizers implement tokenize, which should return a Tokens class. 139 | """ 140 | 141 | def tokenize(self, text): 142 | raise NotImplementedError 143 | 144 | def shutdown(self): 145 | pass 146 | 147 | def __del__(self): 148 | self.shutdown() 149 | 150 | 151 | class SimpleTokenizer(Tokenizer): 152 | ALPHA_NUM = r"[\p{L}\p{N}\p{M}]+" 153 | NON_WS = r"[^\p{Z}\p{C}]" 154 | 155 | def __init__(self, **kwargs): 156 | """ 157 | Args: 158 | annotators: None or empty set (only tokenizes). 159 | """ 160 | self._regexp = regex.compile( 161 | "(%s)|(%s)" % (self.ALPHA_NUM, self.NON_WS), 162 | flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE, 163 | ) 164 | if len(kwargs.get("annotators", {})) > 0: 165 | logger.warning( 166 | "%s only tokenizes! Skipping annotators: %s", 167 | type(self).__name__, 168 | kwargs.get("annotators"), 169 | ) 170 | self.annotators = set() 171 | 172 | def tokenize(self, text): 173 | data = [] 174 | matches = list(self._regexp.finditer(text)) 175 | for i in range(len(matches)): 176 | # Get text 177 | token = matches[i].group() 178 | 179 | # Get whitespace 180 | span = matches[i].span() 181 | start_ws = span[0] 182 | if i + 1 < len(matches): 183 | end_ws = matches[i + 1].span()[0] 184 | else: 185 | end_ws = span[1] 186 | 187 | # Format data 188 | data.append( 189 | ( 190 | token, 191 | text[start_ws:end_ws], 192 | span, 193 | ) 194 | ) 195 | return Tokens(data, self.annotators) 196 | 197 | 198 | def has_answer(tokenized_answers, text): 199 | text = DPR_normalize(text) 200 | 201 | for single_answer in tokenized_answers: 202 | for i in range(0, len(text) - len(single_answer) + 1): 203 | if single_answer == text[i : i + len(single_answer)]: 204 | return True 205 | 206 | return False 207 | 208 | 209 | def locate_answers(tokenized_answers, text): 210 | """ 211 | Returns each occurrence of an answer as (offset, endpos) in terms of *characters*. 
212 | """ 213 | tokenized_text = DPR_tokenize(text) 214 | occurrences = [] 215 | 216 | text_words, text_word_positions = tokenized_text.words(uncased=True), tokenized_text.offsets() 217 | answers_words = [ans.words(uncased=True) for ans in tokenized_answers] 218 | 219 | for single_answer in answers_words: 220 | for i in range(0, len(text_words) - len(single_answer) + 1): 221 | if single_answer == text_words[i : i + len(single_answer)]: 222 | (offset, _), (_, endpos) = text_word_positions[i], text_word_positions[i + len(single_answer) - 1] 223 | occurrences.append((offset, endpos)) 224 | 225 | return occurrences 226 | 227 | 228 | STokenizer = SimpleTokenizer() 229 | 230 | 231 | def DPR_tokenize(text): # noqa: N802 232 | return STokenizer.tokenize(unicodedata.normalize("NFD", text)) 233 | 234 | 235 | def DPR_normalize(text): # noqa: N802 236 | return DPR_tokenize(text).words(uncased=True) 237 | 238 | 239 | # Source: https://github.com/shmsw25/qa-hard-em/blob/master/prepro_util.py 240 | def strip_accents(text): 241 | """Strips accents from a piece of text.""" 242 | text = unicodedata.normalize("NFD", text) 243 | output = [] 244 | for char in text: 245 | cat = unicodedata.category(char) 246 | if cat == "Mn": 247 | continue 248 | output.append(char) 249 | return "".join(output) 250 | ``` -------------------------------------------------------------------------------- /dspy/propose/utils.py: -------------------------------------------------------------------------------- ```python 1 | import inspect 2 | import json 3 | import re 4 | 5 | import dspy 6 | 7 | try: 8 | from IPython.core.magics.code import extract_symbols 9 | except ImportError: 10 | # Won't be able to read code from jupyter notebooks 11 | extract_symbols = None 12 | 13 | from dspy.predict.parameter import Parameter 14 | from dspy.teleprompt.utils import get_signature, new_getfile 15 | 16 | 17 | def strip_prefix(text): 18 | pattern = r"^[\*\s]*(([\w\'\-]+\s+){0,4}[\w\'\-]+):\s*" 19 | modified_text = re.sub(pattern, "", text) 20 | return modified_text.strip('"') 21 | 22 | def create_instruction_set_history_string(base_program, trial_logs, top_n): 23 | program_history = [] 24 | for trial_num in trial_logs: 25 | trial = trial_logs[trial_num] 26 | if "program_path" in trial: 27 | trial_program = base_program.deepcopy() 28 | trial_program.load(trial["program_path"]) 29 | program_history.append({ 30 | "program": trial_program, 31 | "score": trial["score"], 32 | }) 33 | 34 | # Deduplicate program history based on the program's instruction set 35 | seen_programs = set() 36 | unique_program_history = [] 37 | for entry in program_history: 38 | program = entry["program"] 39 | instruction_set = get_program_instruction_set_string(program) 40 | if instruction_set not in seen_programs: 41 | seen_programs.add(instruction_set) 42 | unique_program_history.append(entry) 43 | 44 | # Get the top n programs from program history 45 | top_n_program_history = sorted(unique_program_history, key=lambda x: x["score"], reverse=True)[:top_n] 46 | top_n_program_history.reverse() 47 | 48 | # Create formatted string 49 | instruction_set_history_string = "" 50 | for entry in top_n_program_history: 51 | program = entry["program"] 52 | score = entry["score"] 53 | instruction_set = get_program_instruction_set_string(program) 54 | instruction_set_history_string += instruction_set + f" | Score: {score}\n\n" 55 | 56 | return instruction_set_history_string 57 | 58 | def parse_list_of_instructions(instruction_string): 59 | # Try to convert the string representation of a list 
to an actual list using JSON 60 | try: 61 | instructions = json.loads(instruction_string) 62 | return instructions 63 | except json.JSONDecodeError: 64 | pass 65 | 66 | # If JSON decoding fails, extract strings within quotes 67 | instructions = re.findall(r'"([^"]*)"', instruction_string) 68 | return instructions 69 | 70 | def get_program_instruction_set_string(program): 71 | instruction_list = [] 72 | for _, pred in enumerate(program.predictors()): 73 | pred_instructions = get_signature(pred).instructions 74 | instruction_list.append(f'"{pred_instructions}"') 75 | # Joining the list into a single string that looks like a list 76 | return f"[{', '.join(instruction_list)}]" 77 | 78 | def create_predictor_level_history_string(base_program, predictor_i, trial_logs, top_n): 79 | instruction_aggregate = {} 80 | instruction_history = [] 81 | 82 | # Load trial programs 83 | for trial_num in trial_logs: 84 | trial = trial_logs[trial_num] 85 | if "program_path" in trial: 86 | trial_program = base_program.deepcopy() 87 | trial_program.load(trial["program_path"]) 88 | instruction_history.append({ 89 | "program": trial_program, 90 | "score": trial["score"], 91 | }) 92 | 93 | # Aggregate scores for each instruction 94 | for history_item in instruction_history: 95 | predictor = history_item["program"].predictors()[predictor_i] 96 | instruction = get_signature(predictor).instructions 97 | score = history_item["score"] 98 | 99 | if instruction in instruction_aggregate: 100 | instruction_aggregate[instruction]["total_score"] += score 101 | instruction_aggregate[instruction]["count"] += 1 102 | else: 103 | instruction_aggregate[instruction] = {"total_score": score, "count": 1} 104 | 105 | # Calculate average score for each instruction and prepare for sorting 106 | predictor_history = [] 107 | for instruction, data in instruction_aggregate.items(): 108 | average_score = data["total_score"] / data["count"] 109 | predictor_history.append((instruction, average_score)) 110 | 111 | # Deduplicate and sort by average score, then select top N 112 | seen_instructions = set() 113 | unique_predictor_history = [] 114 | for instruction, score in predictor_history: 115 | if instruction not in seen_instructions: 116 | seen_instructions.add(instruction) 117 | unique_predictor_history.append((instruction, score)) 118 | 119 | top_instructions = sorted(unique_predictor_history, key=lambda x: x[1], reverse=True)[:top_n] 120 | top_instructions.reverse() 121 | 122 | # Create formatted history string 123 | predictor_history_string = "" 124 | for instruction, score in top_instructions: 125 | predictor_history_string += instruction + f" | Score: {score}\n\n" 126 | 127 | return predictor_history_string 128 | 129 | def create_example_string(fields, example): 130 | 131 | # Building the output string 132 | output = [] 133 | for field_name, field_values in fields.items(): 134 | name = field_values.json_schema_extra["prefix"] 135 | 136 | # Determine the value from input_data or prediction_data 137 | value = example.get(field_name) 138 | 139 | # Construct the string for the current field 140 | field_str = f"{name} {value}" 141 | output.append(field_str) 142 | 143 | # Joining all the field strings 144 | return "\n".join(output) 145 | 146 | def get_dspy_source_code(module): 147 | header = [] 148 | base_code = "" 149 | 150 | # Don't get source code for Predict or ChainOfThought modules (NOTE we will need to extend this list as more DSPy.modules are added) 151 | # TODO: if type(module).__name__ not in ["Predict", "ChainOfThought", "ReAct"]: 
152 | if not type(module).__name__ == "Predict" and not type(module).__name__ == "ChainOfThought": 153 | try: 154 | base_code = inspect.getsource(type(module)) 155 | except TypeError: 156 | obj = type(module) 157 | cell_code = "".join(inspect.linecache.getlines(new_getfile(obj))) 158 | class_code = extract_symbols(cell_code, obj.__name__)[0][0] 159 | base_code = str(class_code) 160 | 161 | completed_set = set() 162 | for attribute in module.__dict__.keys(): 163 | try: 164 | iterable = iter(getattr(module, attribute)) 165 | except TypeError: 166 | iterable = [getattr(module, attribute)] 167 | 168 | for item in iterable: 169 | # Skip items that are unhashable (like module history) 170 | try: 171 | hash(item) 172 | except TypeError: 173 | continue 174 | if isinstance(item, Parameter): 175 | if hasattr(item, "signature") and item.signature is not None and item.signature.__pydantic_parent_namespace__["signature_name"] + "_sig" not in completed_set: 176 | try: 177 | header.append(inspect.getsource(item.signature)) 178 | print(inspect.getsource(item.signature)) 179 | except (TypeError, OSError): 180 | header.append(str(item.signature)) 181 | completed_set.add(item.signature.__pydantic_parent_namespace__["signature_name"] + "_sig") 182 | if isinstance(item, dspy.Module): 183 | code = get_dspy_source_code(item).strip() 184 | if code not in completed_set: 185 | header.append(code) 186 | completed_set.add(code) 187 | completed_set.add(item) 188 | 189 | return "\n\n".join(header) + "\n\n" + base_code 190 | ``` -------------------------------------------------------------------------------- /docs/docs/api/optimizers/GEPA/overview.md: -------------------------------------------------------------------------------- ```markdown 1 | # dspy.GEPA: Reflective Prompt Optimizer 2 | 3 | **GEPA** (Genetic-Pareto) is a reflective optimizer proposed in "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning" (Agrawal et al., 2025, [arxiv:2507.19457](https://arxiv.org/abs/2507.19457)), that adaptively evolves _textual components_ (such as prompts) of arbitrary systems. In addition to scalar scores returned by metrics, users can also provide GEPA with a text feedback to guide the optimization process. Such textual feedback provides GEPA more visibility into why the system got the score that it did, and then GEPA can introspect to identify how to improve the score. This allows GEPA to propose high performing prompts in very few rollouts. 4 | 5 | <!-- START_API_REF --> 6 | ::: dspy.GEPA 7 | handler: python 8 | options: 9 | members: 10 | - auto_budget 11 | - compile 12 | - get_params 13 | show_source: true 14 | show_root_heading: true 15 | heading_level: 2 16 | docstring_style: google 17 | show_root_full_path: true 18 | show_object_full_path: false 19 | separate_signature: false 20 | inherited_members: true 21 | ::: 22 | <!-- END_API_REF --> 23 | 24 | One of the key insights behind GEPA is its ability to leverage domain-specific textual feedback. 
Users should provide a feedback function as the GEPA metric, which has the following call signature: 25 | <!-- START_API_REF --> 26 | ::: dspy.teleprompt.gepa.gepa.GEPAFeedbackMetric 27 | handler: python 28 | options: 29 | members: 30 | - __call__ 31 | show_source: true 32 | show_root_heading: true 33 | heading_level: 2 34 | docstring_style: google 35 | show_root_full_path: true 36 | show_object_full_path: false 37 | separate_signature: false 38 | inherited_members: true 39 | ::: 40 | <!-- END_API_REF --> 41 | 42 | When `track_stats=True`, GEPA returns detailed results about all of the proposed candidates, and metadata about the optimization run. The results are available in the `detailed_results` attribute of the optimized program returned by GEPA, and has the following type: 43 | <!-- START_API_REF --> 44 | ::: dspy.teleprompt.gepa.gepa.DspyGEPAResult 45 | handler: python 46 | options: 47 | show_source: true 48 | show_root_heading: true 49 | heading_level: 2 50 | docstring_style: google 51 | show_root_full_path: true 52 | show_object_full_path: false 53 | separate_signature: false 54 | inherited_members: true 55 | ::: 56 | <!-- END_API_REF --> 57 | 58 | ## Usage Examples 59 | 60 | See GEPA usage tutorials in [GEPA Tutorials](../../../tutorials/gepa_ai_program/index.md). 61 | 62 | ### Inference-Time Search 63 | 64 | GEPA can act as a test-time/inference search mechanism. By setting your `valset` to your _evaluation batch_ and using `track_best_outputs=True`, GEPA produces for each batch element the highest-scoring outputs found during the evolutionary search. 65 | 66 | ```python 67 | gepa = dspy.GEPA(metric=metric, track_stats=True, ...) 68 | new_prog = gepa.compile(student, trainset=my_tasks, valset=my_tasks) 69 | highest_score_achieved_per_task = new_prog.detailed_results.highest_score_achieved_per_val_task 70 | best_outputs = new_prog.detailed_results.best_outputs_valset 71 | ``` 72 | 73 | ## How Does GEPA Work? 74 | 75 | ### 1. **Reflective Prompt Mutation** 76 | 77 | GEPA uses LLMs to _reflect_ on structured execution traces (inputs, outputs, failures, feedback), targeting a chosen module and proposing a new instruction/program text tailored to real observed failures and rich textual/environmental feedback. 78 | 79 | ### 2. **Rich Textual Feedback as Optimization Signal** 80 | 81 | GEPA can leverage _any_ textual feedback available—not just scalar rewards. This includes evaluation logs, code traces, failed parses, constraint violations, error message strings, or even isolated submodule-specific feedback. This allows actionable, domain-aware optimization. 82 | 83 | ### 3. **Pareto-based Candidate Selection** 84 | 85 | Rather than evolving just the _best_ global candidate (which leads to local optima or stagnation), GEPA maintains a Pareto frontier: the set of candidates which achieve the highest score on at least one evaluation instance. In each iteration, the next candidate to mutate is sampled (with probability proportional to coverage) from this frontier, guaranteeing both exploration and robust retention of complementary strategies. 86 | 87 | ### Algorithm Summary 88 | 89 | 1. **Initialize** the candidate pool with the the unoptimized program. 90 | 2. **Iterate**: 91 | - **Sample a candidate** (from Pareto frontier). 92 | - **Sample a minibatch** from the train set. 93 | - **Collect execution traces + feedbacks** for module rollout on minibatch. 94 | - **Select a module** of the candidate for targeted improvement. 
95 | - **LLM Reflection:** Propose a new instruction/prompt for the targeted module using reflective meta-prompting and the gathered feedback. 96 | - **Roll out the new candidate** on the minibatch; **if improved, evaluate on Pareto validation set**. 97 | - **Update the candidate pool/Pareto frontier.** 98 | - **[Optionally] System-aware merge/crossover**: Combine best-performing modules from distinct lineages. 99 | 3. **Continue** until rollout or metric budget is exhausted. 100 | 4. **Return** candidate with best aggregate performance on validation. 101 | 102 | ## Implementing Feedback Metrics 103 | 104 | A well-designed metric is central to GEPA's sample efficiency and learning signal richness. GEPA expects the metric to returns a `dspy.Prediction(score=..., feedback=...)`. GEPA leverages natural language traces from LLM-based workflows for optimization, preserving intermediate trajectories and errors in plain text rather than reducing them to numerical rewards. This mirrors human diagnostic processes, enabling clearer identification of system behaviors and bottlenecks. 105 | 106 | Practical Recipe for GEPA-Friendly Feedback: 107 | 108 | - **Leverage Existing Artifacts**: Use logs, unit tests, evaluation scripts, and profiler outputs; surfacing these often suffices. 109 | - **Decompose Outcomes**: Break scores into per-objective components (e.g., correctness, latency, cost, safety) and attribute errors to steps. 110 | - **Expose Trajectories**: Label pipeline stages, reporting pass/fail with salient errors (e.g., in code generation pipelines). 111 | - **Ground in Checks**: Employ automatic validators (unit tests, schemas, simulators) or LLM-as-a-judge for non-verifiable tasks (as in PUPA). 112 | - **Prioritize Clarity**: Focus on error coverage and decision points over technical complexity. 113 | 114 | ### Examples 115 | 116 | - **Document Retrieval** (e.g., HotpotQA): List correctly retrieved, incorrect, or missed documents, beyond mere Recall/F1 scores. 117 | - **Multi-Objective Tasks** (e.g., PUPA): Decompose aggregate scores to reveal contributions from each objective, highlighting tradeoffs (e.g., quality vs. privacy). 118 | - **Stacked Pipelines** (e.g., code generation: parse → compile → run → profile → evaluate): Expose stage-specific failures; natural-language traces often suffice for LLM self-correction. 119 | 120 | ## Custom Instruction Proposal 121 | 122 | For advanced customization of GEPA's instruction proposal mechanism, including custom instruction proposers and component selectors, see [Advanced Features](GEPA_Advanced.md). 123 | 124 | ## Further Reading 125 | 126 | - [GEPA Paper: arxiv:2507.19457](https://arxiv.org/abs/2507.19457) 127 | - [GEPA Github](https://github.com/gepa-ai/gepa) - This repository provides the core GEPA evolution pipeline used by `dspy.GEPA` optimizer. 128 | - [DSPy Tutorials](../../../tutorials/gepa_ai_program/index.md) 129 | ``` -------------------------------------------------------------------------------- /docs/docs/tutorials/cache/index.md: -------------------------------------------------------------------------------- ```markdown 1 | # Use and Customize DSPy Cache 2 | 3 | In this tutorial, we will explore the design of DSPy's caching mechanism and demonstrate how to effectively use and customize it. 4 | 5 | ## DSPy Cache Structure 6 | 7 | DSPy's caching system is architected in three distinct layers: 8 | 9 | 1. **In-memory cache**: Implemented using `cachetools.LRUCache`, this layer provides fast access to frequently used data. 10 | 2. 
**On-disk cache**: Leveraging `diskcache.FanoutCache`, this layer offers persistent storage for cached items. 11 | 3. **Prompt cache (Server-side cache)**: This layer is managed by the LLM service provider (e.g., OpenAI, Anthropic). 12 | 13 | While DSPy does not directly control the server-side prompt cache, it offers users the flexibility to enable, disable, and customize the in-memory and on-disk caches to suit their specific requirements. 14 | 15 | ## Using DSPy Cache 16 | 17 | By default, both in-memory and on-disk caching are automatically enabled in DSPy. No specific action is required to start using the cache. When a cache hit occurs, you will observe a significant reduction in the module call's execution time. Furthermore, if usage tracking is enabled, the usage metrics for a cached call will be empty, as shown in the sample output below. 18 | 19 | Consider the following example: 20 | 21 | ```python 22 | import dspy 23 | import os 24 | import time 25 | 26 | os.environ["OPENAI_API_KEY"] = "{your_openai_key}" 27 | 28 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini"), track_usage=True) 29 | 30 | predict = dspy.Predict("question->answer") 31 | 32 | start = time.time() 33 | result1 = predict(question="Who is the GOAT of basketball?") 34 | print(f"Time elapse: {time.time() - start: 2f}\n\nTotal usage: {result1.get_lm_usage()}") 35 | 36 | start = time.time() 37 | result2 = predict(question="Who is the GOAT of basketball?") 38 | print(f"Time elapse: {time.time() - start: 2f}\n\nTotal usage: {result2.get_lm_usage()}") 39 | ``` 40 | 41 | A sample output looks like: 42 | 43 | ``` 44 | Time elapse: 4.384113 45 | Total usage: {'openai/gpt-4o-mini': {'completion_tokens': 97, 'prompt_tokens': 144, 'total_tokens': 241, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0, 'text_tokens': None}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0, 'text_tokens': None, 'image_tokens': None}}} 46 | 47 | Time elapse: 0.000529 48 | Total usage: {} 49 | ``` 50 | 51 | ## Disabling/Enabling DSPy Cache 52 | 53 | There are scenarios where you might need to disable caching, either entirely or selectively for in-memory or on-disk caches. For instance: 54 | 55 | - You require different responses for identical LM requests. 56 | - You lack disk write permissions and need to disable the on-disk cache. 57 | - You have limited memory resources and wish to disable the in-memory cache. 58 | 59 | DSPy provides the `dspy.configure_cache()` utility function for this purpose. You can use the corresponding flags to control the enabled/disabled state of each cache type: 60 | 61 | ```python 62 | dspy.configure_cache( 63 | enable_disk_cache=False, 64 | enable_memory_cache=False, 65 | ) 66 | ``` 67 | 68 | In addition, you can manage the capacity of the in-memory and on-disk caches: 69 | 70 | ```python 71 | dspy.configure_cache( 72 | enable_disk_cache=True, 73 | enable_memory_cache=True, 74 | disk_size_limit_bytes=YOUR_DESIRED_VALUE, 75 | memory_max_entries=YOUR_DESIRED_VALUE, 76 | ) 77 | ``` 78 | 79 | Please note that `disk_size_limit_bytes` defines the maximum size in bytes for the on-disk cache, while `memory_max_entries` specifies the maximum number of entries for the in-memory cache. 80 | 81 | ## Understanding and Customizing the Cache 82 | 83 | In specific situations, you might want to implement a custom cache, for example, to gain finer control over how cache keys are generated.
By default, the cache key is derived from a hash of all request arguments sent to `litellm`, excluding credentials like `api_key`. 84 | 85 | To create a custom cache, you need to subclass `dspy.clients.Cache` and override the relevant methods: 86 | 87 | ```python 88 | class CustomCache(dspy.clients.Cache): 89 | def __init__(self, **kwargs): 90 | {write your own constructor} 91 | 92 | def cache_key(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> str: 93 | {write your logic of computing cache key} 94 | 95 | def get(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> Any: 96 | {write your cache read logic} 97 | 98 | def put( 99 | self, 100 | request: dict[str, Any], 101 | value: Any, 102 | ignored_args_for_cache_key: Optional[list[str]] = None, 103 | enable_memory_cache: bool = True, 104 | ) -> None: 105 | {write your cache write logic} 106 | ``` 107 | 108 | To ensure seamless integration with the rest of DSPy, it is recommended to implement your custom cache using the same method signatures as the base class, or at a minimum, include `**kwargs` in your method definitions to prevent runtime errors during cache read/write operations. 109 | 110 | Once your custom cache class is defined, you can instruct DSPy to use it: 111 | 112 | ```python 113 | dspy.cache = CustomCache() 114 | ``` 115 | 116 | Let's illustrate this with a practical example. Suppose we want the cache key computation to depend solely on the request message content, ignoring other parameters like the specific LM being called. We can create a custom cache as follows: 117 | 118 | ```python 119 | class CustomCache(dspy.clients.Cache): 120 | 121 | def cache_key(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> str: 122 | messages = request.get("messages", []) 123 | return sha256(ujson.dumps(messages, sort_keys=True).encode()).hexdigest() 124 | 125 | dspy.cache = CustomCache(enable_disk_cache=True, enable_memory_cache=True, disk_cache_dir=dspy.clients.DISK_CACHE_DIR) 126 | ``` 127 | 128 | For comparison, consider executing the code below without the custom cache: 129 | 130 | ```python 131 | import dspy 132 | import os 133 | import time 134 | 135 | os.environ["OPENAI_API_KEY"] = "{your_openai_key}" 136 | 137 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini")) 138 | 139 | predict = dspy.Predict("question->answer") 140 | 141 | start = time.time() 142 | result1 = predict(question="Who is the GOAT of soccer?") 143 | print(f"Time elapse: {time.time() - start: 2f}") 144 | 145 | start = time.time() 146 | with dspy.context(lm=dspy.LM("openai/gpt-4.1-mini")): 147 | result2 = predict(question="Who is the GOAT of soccer?") 148 | print(f"Time elapse: {time.time() - start: 2f}") 149 | ``` 150 | 151 | The time elapsed will indicate that the cache is not hit on the second call. 
However, when using the custom cache: 152 | 153 | ```python 154 | import dspy 155 | import os 156 | import time 157 | from typing import Dict, Any, Optional 158 | import ujson 159 | from hashlib import sha256 160 | 161 | os.environ["OPENAI_API_KEY"] = "{your_openai_key}" 162 | 163 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini")) 164 | 165 | class CustomCache(dspy.clients.Cache): 166 | 167 | def cache_key(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> str: 168 | messages = request.get("messages", []) 169 | return sha256(ujson.dumps(messages, sort_keys=True).encode()).hexdigest() 170 | 171 | dspy.cache = CustomCache(enable_disk_cache=True, enable_memory_cache=True, disk_cache_dir=dspy.clients.DISK_CACHE_DIR) 172 | 173 | predict = dspy.Predict("question->answer") 174 | 175 | start = time.time() 176 | result1 = predict(question="Who is the GOAT of volleyball?") 177 | print(f"Time elapse: {time.time() - start: 2f}") 178 | 179 | start = time.time() 180 | with dspy.context(lm=dspy.LM("openai/gpt-4.1-mini")): 181 | result2 = predict(question="Who is the GOAT of volleyball?") 182 | print(f"Time elapse: {time.time() - start: 2f}") 183 | ``` 184 | 185 | You will observe that the cache is hit on the second call, demonstrating the effect of the custom cache key logic. ``` -------------------------------------------------------------------------------- /dspy/datasets/dataloader.py: -------------------------------------------------------------------------------- ```python 1 | import random 2 | from collections.abc import Mapping 3 | from typing import TYPE_CHECKING 4 | 5 | import dspy 6 | from dspy.datasets.dataset import Dataset 7 | 8 | if TYPE_CHECKING: 9 | import pandas as pd 10 | 11 | 12 | class DataLoader(Dataset): 13 | def __init__(self): 14 | pass 15 | 16 | def from_huggingface( 17 | self, 18 | dataset_name: str, 19 | *args, 20 | input_keys: tuple[str] = (), 21 | fields: tuple[str] | None = None, 22 | **kwargs, 23 | ) -> Mapping[str, list[dspy.Example]] | list[dspy.Example]: 24 | if fields and not isinstance(fields, tuple): 25 | raise ValueError("Invalid fields provided. Please provide a tuple of fields.") 26 | 27 | if not isinstance(input_keys, tuple): 28 | raise ValueError("Invalid input keys provided. 
Please provide a tuple of input keys.") 29 | 30 | from datasets import load_dataset 31 | 32 | dataset = load_dataset(dataset_name, *args, **kwargs) 33 | 34 | if isinstance(dataset, list) and isinstance(kwargs["split"], list): 35 | dataset = {split_name: dataset[idx] for idx, split_name in enumerate(kwargs["split"])} 36 | 37 | try: 38 | returned_split = {} 39 | for split_name in dataset.keys(): 40 | if fields: 41 | returned_split[split_name] = [ 42 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) 43 | for row in dataset[split_name] 44 | ] 45 | else: 46 | returned_split[split_name] = [ 47 | dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys) 48 | for row in dataset[split_name] 49 | ] 50 | 51 | return returned_split 52 | except AttributeError: 53 | if fields: 54 | return [ 55 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset 56 | ] 57 | else: 58 | return [ 59 | dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys) 60 | for row in dataset 61 | ] 62 | 63 | def from_csv( 64 | self, 65 | file_path: str, 66 | fields: list[str] | None = None, 67 | input_keys: tuple[str] = (), 68 | ) -> list[dspy.Example]: 69 | from datasets import load_dataset 70 | 71 | dataset = load_dataset("csv", data_files=file_path)["train"] 72 | 73 | if not fields: 74 | fields = list(dataset.features) 75 | 76 | return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset] 77 | 78 | def from_pandas( 79 | self, 80 | df: "pd.DataFrame", 81 | fields: list[str] | None = None, 82 | input_keys: tuple[str] = (), 83 | ) -> list[dspy.Example]: 84 | if fields is None: 85 | fields = list(df.columns) 86 | 87 | return [ 88 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for _, row in df.iterrows() 89 | ] 90 | 91 | def from_json( 92 | self, 93 | file_path: str, 94 | fields: list[str] | None = None, 95 | input_keys: tuple[str] = (), 96 | ) -> list[dspy.Example]: 97 | from datasets import load_dataset 98 | 99 | dataset = load_dataset("json", data_files=file_path)["train"] 100 | 101 | if not fields: 102 | fields = list(dataset.features) 103 | 104 | return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset] 105 | 106 | def from_parquet( 107 | self, 108 | file_path: str, 109 | fields: list[str] | None = None, 110 | input_keys: tuple[str] = (), 111 | ) -> list[dspy.Example]: 112 | from datasets import load_dataset 113 | 114 | dataset = load_dataset("parquet", data_files=file_path)["train"] 115 | 116 | if not fields: 117 | fields = list(dataset.features) 118 | 119 | return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset] 120 | 121 | def from_rm(self, num_samples: int, fields: list[str], input_keys: list[str]) -> list[dspy.Example]: 122 | try: 123 | rm = dspy.settings.rm 124 | try: 125 | return [ 126 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) 127 | for row in rm.get_objects(num_samples=num_samples, fields=fields) 128 | ] 129 | except AttributeError: 130 | raise ValueError( 131 | "Retrieval module does not support `get_objects`. Please use a different retrieval module." 132 | ) 133 | except AttributeError: 134 | raise ValueError( 135 | "Retrieval module not found. Please set a retrieval module using `dspy.settings.configure`." 
136 | ) 137 | 138 | def sample( 139 | self, 140 | dataset: list[dspy.Example], 141 | n: int, 142 | *args, 143 | **kwargs, 144 | ) -> list[dspy.Example]: 145 | if not isinstance(dataset, list): 146 | raise ValueError( 147 | f"Invalid dataset provided of type {type(dataset)}. Please provide a list of `dspy.Example`s." 148 | ) 149 | 150 | return random.sample(dataset, n, *args, **kwargs) 151 | 152 | def train_test_split( 153 | self, 154 | dataset: list[dspy.Example], 155 | train_size: int | float = 0.75, 156 | test_size: int | float | None = None, 157 | random_state: int | None = None, 158 | ) -> Mapping[str, list[dspy.Example]]: 159 | if random_state is not None: 160 | random.seed(random_state) 161 | 162 | dataset_shuffled = dataset.copy() 163 | random.shuffle(dataset_shuffled) 164 | 165 | if train_size is not None and isinstance(train_size, float) and (0 < train_size < 1): 166 | train_end = int(len(dataset_shuffled) * train_size) 167 | elif train_size is not None and isinstance(train_size, int): 168 | train_end = train_size 169 | else: 170 | raise ValueError( 171 | "Invalid `train_size`. Please provide a float between 0 and 1 to represent the proportion of the " 172 | "dataset to include in the train split or an int to represent the absolute number of samples to " 173 | f"include in the train split. Received `train_size`: {train_size}." 174 | ) 175 | 176 | if test_size is not None: 177 | if isinstance(test_size, float) and (0 < test_size < 1): 178 | test_end = int(len(dataset_shuffled) * test_size) 179 | elif isinstance(test_size, int): 180 | test_end = test_size 181 | else: 182 | raise ValueError( 183 | "Invalid `test_size`. Please provide a float between 0 and 1 to represent the proportion of the " 184 | "dataset to include in the test split or an int to represent the absolute number of samples to " 185 | f"include in the test split. Received `test_size`: {test_size}." 186 | ) 187 | if train_end + test_end > len(dataset_shuffled): 188 | raise ValueError( 189 | "`train_size` + `test_size` cannot exceed the total number of samples. Received " 190 | f"`train_size`: {train_end}, `test_size`: {test_end}, and `dataset_size`: {len(dataset_shuffled)}." 
191 | ) 192 | else: 193 | test_end = len(dataset_shuffled) - train_end 194 | 195 | train_dataset = dataset_shuffled[:train_end] 196 | test_dataset = dataset_shuffled[train_end : train_end + test_end] 197 | 198 | return {"train": train_dataset, "test": test_dataset} 199 | ``` -------------------------------------------------------------------------------- /dspy/teleprompt/bettertogether.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import random 3 | from typing import Callable 4 | 5 | import dspy 6 | from dspy.primitives.example import Example 7 | from dspy.primitives.module import Module 8 | from dspy.teleprompt.bootstrap_finetune import ( 9 | BootstrapFinetune, 10 | all_predictors_have_lms, 11 | kill_lms, 12 | launch_lms, 13 | prepare_student, 14 | ) 15 | from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch 16 | from dspy.teleprompt.teleprompt import Teleprompter 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class BetterTogether(Teleprompter): 22 | 23 | STRAT_SEP = " -> " 24 | 25 | def __init__(self, 26 | metric: Callable, 27 | prompt_optimizer: Teleprompter | None = None, 28 | weight_optimizer: Teleprompter | None = None, 29 | seed: int | None = None, 30 | ): 31 | if not dspy.settings.experimental: 32 | raise ValueError("This is an experimental optimizer. Set `dspy.settings.experimental` to `True` to use it.") 33 | 34 | # TODO: Note that the BetterTogether optimizer is meaningful when 35 | # BootstrapFinetune uses a metric to filter the training data before 36 | # fine-tuning. However, one can also choose to run this optimizer with 37 | # a BootstrapFinetune without a metric, say, if there aren't labels 38 | # available for the training data. Should this be noted somewhere? 39 | # TODO: We should re-consider if the metric should be required. 40 | self.prompt_optimizer = prompt_optimizer if prompt_optimizer else BootstrapFewShotWithRandomSearch(metric=metric) 41 | self.weight_optimizer = weight_optimizer if weight_optimizer else BootstrapFinetune(metric=metric) 42 | 43 | is_supported_prompt = isinstance(self.prompt_optimizer, BootstrapFewShotWithRandomSearch) 44 | is_supported_weight = isinstance(self.weight_optimizer, BootstrapFinetune) 45 | if not is_supported_prompt or not is_supported_weight: 46 | raise ValueError( 47 | "The BetterTogether optimizer only supports the following optimizers for now: BootstrapFinetune, " 48 | "BootstrapFewShotWithRandomSearch." 49 | ) 50 | 51 | self.rng = random.Random(seed) 52 | 53 | def compile( 54 | self, 55 | student: Module, 56 | trainset: list[Example], 57 | strategy: str = "p -> w -> p", 58 | valset_ratio = 0.1, 59 | ) -> Module: 60 | # TODO: We could record acc on a different valset to pick the best 61 | # strategy within the provided strategy 62 | logger.info("Validating the strategy") 63 | parsed_strategy = strategy.lower().split(self.STRAT_SEP) 64 | 65 | if not all(s in ["p", "w"] for s in parsed_strategy): 66 | raise ValueError( 67 | f"The strategy should be a sequence of 'p' and 'w' separated by '{self.STRAT_SEP}', but " 68 | f"found: {strategy}" 69 | ) 70 | 71 | logger.info("Preparing the student program...") 72 | # TODO: Prepare student returns student.reset_copy(), which is what gets 73 | # optimized. We should make this clear in the doc comments. 
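# Note: `prepare_student` hands back a fresh (reset) copy of the program, which is what actually gets optimized, and `all_predictors_have_lms` verifies that every predictor already has an LM attached before any optimization step runs.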
74 | student = prepare_student(student) 75 | all_predictors_have_lms(student) 76 | 77 | # Make a shallow copy of the trainset, so that we don't change the order 78 | # of the examples in the original trainset 79 | trainset = trainset[:] 80 | logger.info("Compiling the student program...") 81 | student = self._run_strategies(parsed_strategy, student, trainset, valset_ratio) 82 | 83 | logger.info("BetterTogether has finished compiling the student program") 84 | return student 85 | 86 | def _run_strategies(self, parsed_strategy, student, trainset, valset_ratio) -> Module: 87 | # Keep track of all the partial strategies/programs in parsed_strategy 88 | # "" corresponds to the initial student program 89 | candidate_programs = [] 90 | candidate_programs.append(("", student)) 91 | launched_flag = False 92 | 93 | for ind, step_code in enumerate(parsed_strategy): 94 | current_strategy = self.STRAT_SEP.join(parsed_strategy[:ind + 1]) 95 | logger.info( 96 | f"\n########## Step {ind + 1} of {len(parsed_strategy)} - Strategy " 97 | f"'{current_strategy}' ##########" 98 | ) 99 | 100 | logger.info("Shuffling the trainset...") 101 | self.rng.shuffle(trainset) 102 | if not launched_flag: 103 | launch_lms(student) 104 | launched_flag = True 105 | 106 | # TODO: Should we reset or just deepcopy? How does resetting affect 107 | # the predictor LMs? 108 | student = student.deepcopy() 109 | student._compiled = False 110 | if step_code == "p": 111 | student = self._compile_prompt_optimizer(student, trainset, valset_ratio) 112 | elif step_code == "w": 113 | student = self._compile_weight_optimizer(student, trainset) 114 | launched_flag = False 115 | 116 | # Record the program corresponding to the current strategy 117 | candidate_programs.append((current_strategy, student)) 118 | 119 | if launched_flag: 120 | kill_lms(student) 121 | 122 | student.candidate_programs = candidate_programs 123 | return student 124 | 125 | def _compile_prompt_optimizer(self, student, trainset, valset_ratio) -> Module: 126 | logger.info("Preparing for prompt optimization...") 127 | 128 | # Sampling a validation set from the trainset for the prompt optimizer 129 | # We drop the hints for prompt optimization 130 | trainset = [x.with_inputs(*list(set(x.inputs().keys()) - {"hint"})) for x in trainset] 131 | num_val = int(valset_ratio * len(trainset)) 132 | prompt_valset = trainset[:num_val] 133 | prompt_trainset = trainset[num_val:] 134 | 135 | # TODO: To make this optimizer general, we need to ensure that all the 136 | # prompt optimizers are accepting a valset or encode a way to check if 137 | # a valset should be passed to an optimizer's compile method. 138 | # TODO: We should ensure that the prompt optimizers in DSPy respect the 139 | # predictor.lm attributes. In particular, 140 | # BootstrapFewShotWithRandomSearch seems to be resetting these. We are 141 | # manually re-setting the LMs here to circumvent this issue, but we 142 | # should consider addressing it in BFRS. 
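# Snapshot each predictor's LM before compiling, then restore those same LMs onto the compiled program below, so the prompt optimizer cannot silently swap them out.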
143 | logger.info("Compiling the prompt optimizer...") 144 | pred_lms = [pred.lm for pred in student.predictors()] 145 | student = self.prompt_optimizer.compile(student, trainset=prompt_trainset, valset=prompt_valset) 146 | for pred, lm in zip(student.predictors(), pred_lms, strict=False): 147 | pred.lm = lm 148 | 149 | return student 150 | 151 | def _compile_weight_optimizer(self, student, trainset) -> Module: 152 | logger.info("Preparing for weight optimization...") 153 | 154 | # Saving the LMs before compiling the weight optimizer 155 | original_lms = [pred.lm for pred in student.predictors()] 156 | 157 | # TODO: To make this optimizer general, we need to ensure that all the 158 | # prompt optimizers are accepting a valset or encode a way to check if 159 | # a valset should be passed to an optimizer's compile. 160 | logger.info("Compiling the weight optimizer...") 161 | student = self.weight_optimizer.compile(student, trainset=trainset) 162 | 163 | # Updating the train kwargs for the new LMs. This is needed because the 164 | # train_kwargs of the optimizer is configured for the original LMs. 165 | new_lms = [pred.lm for pred in student.predictors()] 166 | for original_lm, new_lm in zip(original_lms, new_lms, strict=False): 167 | original_params = self.weight_optimizer.train_kwargs[original_lm] 168 | self.weight_optimizer.train_kwargs[new_lm] = original_params 169 | 170 | return student 171 | ``` -------------------------------------------------------------------------------- /dspy/clients/embedding.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Any, Callable 2 | 3 | import litellm 4 | import numpy as np 5 | 6 | from dspy.clients.cache import request_cache 7 | 8 | 9 | class Embedder: 10 | """DSPy embedding class. 11 | 12 | The class for computing embeddings for text inputs. This class provides a unified interface for both: 13 | 14 | 1. Hosted embedding models (e.g. OpenAI's text-embedding-3-small) via litellm integration 15 | 2. Custom embedding functions that you provide 16 | 17 | For hosted models, simply pass the model name as a string (e.g., "openai/text-embedding-3-small"). The class will use 18 | litellm to handle the API calls and caching. 19 | 20 | For custom embedding models, pass a callable function that: 21 | - Takes a list of strings as input. 22 | - Returns embeddings as either: 23 | - A 2D numpy array of float32 values 24 | - A 2D list of float32 values 25 | - Each row should represent one embedding vector 26 | 27 | Args: 28 | model: The embedding model to use. This can be either a string (representing the name of the hosted embedding 29 | model, must be an embedding model supported by litellm) or a callable that represents a custom embedding 30 | model. 31 | batch_size (int, optional): The default batch size for processing inputs in batches. Defaults to 200. 32 | caching (bool, optional): Whether to cache the embedding response when using a hosted model. Defaults to True. 33 | **kwargs: Additional default keyword arguments to pass to the embedding model. 34 | 35 | Examples: 36 | Example 1: Using a hosted model. 37 | 38 | ```python 39 | import dspy 40 | 41 | embedder = dspy.Embedder("openai/text-embedding-3-small", batch_size=100) 42 | embeddings = embedder(["hello", "world"]) 43 | 44 | assert embeddings.shape == (2, 1536) 45 | ``` 46 | 47 | Example 2: Using any local embedding model, e.g. from https://huggingface.co/models?library=sentence-transformers. 
48 | 49 | ```python 50 | # pip install sentence_transformers 51 | import dspy 52 | from sentence_transformers import SentenceTransformer 53 | 54 | # Load an extremely efficient local model for retrieval 55 | model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu") 56 | 57 | embedder = dspy.Embedder(model.encode) 58 | embeddings = embedder(["hello", "world"], batch_size=1) 59 | 60 | assert embeddings.shape == (2, 1024) 61 | ``` 62 | 63 | Example 3: Using a custom function. 64 | 65 | ```python 66 | import dspy 67 | import numpy as np 68 | 69 | def my_embedder(texts): 70 | return np.random.rand(len(texts), 10) 71 | 72 | embedder = dspy.Embedder(my_embedder) 73 | embeddings = embedder(["hello", "world"], batch_size=1) 74 | 75 | assert embeddings.shape == (2, 10) 76 | ``` 77 | """ 78 | 79 | def __init__(self, model: str | Callable, batch_size: int = 200, caching: bool = True, **kwargs: dict[str, Any]): 80 | self.model = model 81 | self.batch_size = batch_size 82 | self.caching = caching 83 | self.default_kwargs = kwargs 84 | 85 | def _preprocess(self, inputs, batch_size=None, caching=None, **kwargs): 86 | if isinstance(inputs, str): 87 | is_single_input = True 88 | inputs = [inputs] 89 | else: 90 | is_single_input = False 91 | 92 | if not all(isinstance(inp, str) for inp in inputs): 93 | raise ValueError("All inputs must be strings.") 94 | 95 | batch_size = batch_size or self.batch_size 96 | caching = caching or self.caching 97 | merged_kwargs = self.default_kwargs.copy() 98 | merged_kwargs.update(kwargs) 99 | 100 | input_batches = [] 101 | for i in range(0, len(inputs), batch_size): 102 | input_batches.append(inputs[i : i + batch_size]) 103 | 104 | return input_batches, caching, merged_kwargs, is_single_input 105 | 106 | def _postprocess(self, embeddings_list, is_single_input): 107 | embeddings = np.array(embeddings_list, dtype=np.float32) 108 | if is_single_input: 109 | return embeddings[0] 110 | else: 111 | return np.array(embeddings, dtype=np.float32) 112 | 113 | def __call__(self, inputs: str | list[str], batch_size: int | None = None, caching: bool | None = None, **kwargs: dict[str, Any]) -> np.ndarray: 114 | """Compute embeddings for the given inputs. 115 | 116 | Args: 117 | inputs: The inputs to compute embeddings for, can be a single string or a list of strings. 118 | batch_size (int, optional): The batch size for processing inputs. If None, defaults to the batch_size set 119 | during initialization. 120 | caching (bool, optional): Whether to cache the embedding response when using a hosted model. If None, 121 | defaults to the caching setting from initialization. 122 | kwargs: Additional keyword arguments to pass to the embedding model. These will override the default 123 | kwargs provided during initialization. 124 | 125 | Returns: 126 | numpy.ndarray: If the input is a single string, returns a 1D numpy array representing the embedding. 127 | If the input is a list of strings, returns a 2D numpy array of embeddings, one embedding per row. 
128 | """ 129 | input_batches, caching, kwargs, is_single_input = self._preprocess(inputs, batch_size, caching, **kwargs) 130 | 131 | compute_embeddings = _cached_compute_embeddings if caching else _compute_embeddings 132 | 133 | embeddings_list = [] 134 | 135 | for batch in input_batches: 136 | embeddings_list.extend(compute_embeddings(self.model, batch, caching=caching, **kwargs)) 137 | return self._postprocess(embeddings_list, is_single_input) 138 | 139 | async def acall(self, inputs, batch_size=None, caching=None, **kwargs): 140 | input_batches, caching, kwargs, is_single_input = self._preprocess(inputs, batch_size, caching, **kwargs) 141 | 142 | embeddings_list = [] 143 | acompute_embeddings = _cached_acompute_embeddings if caching else _acompute_embeddings 144 | 145 | for batch in input_batches: 146 | embeddings_list.extend(await acompute_embeddings(self.model, batch, caching=caching, **kwargs)) 147 | return self._postprocess(embeddings_list, is_single_input) 148 | 149 | 150 | def _compute_embeddings(model, batch_inputs, caching=False, **kwargs): 151 | if isinstance(model, str): 152 | caching = caching and litellm.cache is not None 153 | embedding_response = litellm.embedding(model=model, input=batch_inputs, caching=caching, **kwargs) 154 | return [data["embedding"] for data in embedding_response.data] 155 | elif callable(model): 156 | return model(batch_inputs, **kwargs) 157 | else: 158 | raise ValueError(f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(model)}.") 159 | 160 | 161 | @request_cache(ignored_args_for_cache_key=["api_key", "api_base", "base_url"]) 162 | def _cached_compute_embeddings(model, batch_inputs, caching=True, **kwargs): 163 | return _compute_embeddings(model, batch_inputs, caching=caching, **kwargs) 164 | 165 | 166 | async def _acompute_embeddings(model, batch_inputs, caching=False, **kwargs): 167 | if isinstance(model, str): 168 | caching = caching and litellm.cache is not None 169 | embedding_response = await litellm.aembedding(model=model, input=batch_inputs, caching=caching, **kwargs) 170 | return [data["embedding"] for data in embedding_response.data] 171 | elif callable(model): 172 | return model(batch_inputs, **kwargs) 173 | else: 174 | raise ValueError(f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(model)}.") 175 | 176 | 177 | @request_cache(ignored_args_for_cache_key=["api_key", "api_base", "base_url"]) 178 | async def _cached_acompute_embeddings(model, batch_inputs, caching=True, **kwargs): 179 | return await _acompute_embeddings(model, batch_inputs, caching=caching, **kwargs) 180 | ``` -------------------------------------------------------------------------------- /dspy/utils/dummies.py: -------------------------------------------------------------------------------- ```python 1 | import random 2 | from collections import defaultdict 3 | from typing import Any 4 | 5 | import numpy as np 6 | 7 | from dspy.adapters.chat_adapter import FieldInfoWithName, field_header_pattern 8 | from dspy.clients.lm import LM 9 | from dspy.dsp.utils.utils import dotdict 10 | from dspy.signatures.field import OutputField 11 | from dspy.utils.callback import with_callbacks 12 | 13 | 14 | class DummyLM(LM): 15 | """Dummy language model for unit testing purposes. 
16 | 17 | Three modes of operation: 18 | 19 | Mode 1: List of dictionaries 20 | 21 | If a list of dictionaries is provided, the dummy model will return the next dictionary 22 | in the list for each request, formatted according to the `format_field_with_value` function. 23 | 24 | Example: 25 | 26 | ``` 27 | lm = DummyLM([{"answer": "red"}, {"answer": "blue"}]) 28 | dspy.settings.configure(lm=lm) 29 | predictor("What color is the sky?") 30 | # Output: 31 | # [[## answer ##]] 32 | # red 33 | predictor("What color is the sky?") 34 | # Output: 35 | # [[## answer ##]] 36 | # blue 37 | ``` 38 | 39 | Mode 2: Dictionary of dictionaries 40 | 41 | If a dictionary of dictionaries is provided, the dummy model will return the value 42 | corresponding to the key which is contained with the final message of the prompt, 43 | formatted according to the `format_field_with_value` function from the chat adapter. 44 | 45 | ``` 46 | lm = DummyLM({"What color is the sky?": {"answer": "blue"}}) 47 | dspy.settings.configure(lm=lm) 48 | predictor("What color is the sky?") 49 | # Output: 50 | # [[## answer ##]] 51 | # blue 52 | ``` 53 | 54 | Mode 3: Follow examples 55 | 56 | If `follow_examples` is set to True, and the prompt contains an example input exactly equal to the prompt, 57 | the dummy model will return the output from that example. 58 | 59 | ``` 60 | lm = DummyLM([{"answer": "red"}], follow_examples=True) 61 | dspy.settings.configure(lm=lm) 62 | predictor("What color is the sky?, demos=dspy.Example(input="What color is the sky?", output="blue")) 63 | # Output: 64 | # [[## answer ##]] 65 | # blue 66 | ``` 67 | 68 | """ 69 | 70 | def __init__(self, answers: list[dict[str, Any]] | dict[str, dict[str, Any]], follow_examples: bool = False, adapter=None): 71 | super().__init__("dummy", "chat", 0.0, 1000, True) 72 | self.answers = answers 73 | if isinstance(answers, list): 74 | self.answers = iter(answers) 75 | self.follow_examples = follow_examples 76 | 77 | # Set adapter, defaulting to ChatAdapter 78 | if adapter is None: 79 | from dspy.adapters.chat_adapter import ChatAdapter 80 | adapter = ChatAdapter() 81 | self.adapter = adapter 82 | 83 | def _use_example(self, messages): 84 | # find all field names 85 | fields = defaultdict(int) 86 | for message in messages: 87 | if "content" in message: 88 | if ma := field_header_pattern.match(message["content"]): 89 | fields[message["content"][ma.start() : ma.end()]] += 1 90 | # find the fields which are missing from the final turns 91 | max_count = max(fields.values()) 92 | output_fields = [field for field, count in fields.items() if count != max_count] 93 | 94 | # get the output from the last turn that has the output fields as headers 95 | final_input = messages[-1]["content"].split("\n\n")[0] 96 | for input, output in zip(reversed(messages[:-1]), reversed(messages), strict=False): 97 | if any(field in output["content"] for field in output_fields) and final_input in input["content"]: 98 | return output["content"] 99 | 100 | @with_callbacks 101 | def __call__(self, prompt=None, messages=None, **kwargs): 102 | def format_answer_fields(field_names_and_values: dict[str, Any]): 103 | fields_with_values = { 104 | FieldInfoWithName(name=field_name, info=OutputField()): value 105 | for field_name, value in field_names_and_values.items() 106 | } 107 | # The reason why DummyLM needs an adapter is because it needs to know which output format to mimic. 108 | # Normally LMs should not have any knowledge of an adapter, because the output format is defined in the prompt. 
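# For example, a ChatAdapter-style response uses "[[ ## field ## ]]" headers (as in the docstring examples above), whereas a JSON-oriented adapter expects a JSON object, so the dummy completion must match whichever format the configured adapter parses.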
109 | adapter = self.adapter 110 | 111 | # Try to use role="assistant" if the adapter supports it (like JSONAdapter) 112 | try: 113 | return adapter.format_field_with_value(fields_with_values, role="assistant") 114 | except TypeError: 115 | # Fallback for adapters that don't support role parameter (like ChatAdapter) 116 | return adapter.format_field_with_value(fields_with_values) 117 | 118 | # Build the request. 119 | outputs = [] 120 | for _ in range(kwargs.get("n", 1)): 121 | messages = messages or [{"role": "user", "content": prompt}] 122 | kwargs = {**self.kwargs, **kwargs} 123 | 124 | if self.follow_examples: 125 | outputs.append(self._use_example(messages)) 126 | elif isinstance(self.answers, dict): 127 | outputs.append( 128 | next( 129 | (format_answer_fields(v) for k, v in self.answers.items() if k in messages[-1]["content"]), 130 | "No more responses", 131 | ) 132 | ) 133 | else: 134 | outputs.append(format_answer_fields(next(self.answers, {"answer": "No more responses"}))) 135 | 136 | # Logging, with removed api key & where `cost` is None on cache hit. 137 | kwargs = {k: v for k, v in kwargs.items() if not k.startswith("api_")} 138 | entry = {"prompt": prompt, "messages": messages, "kwargs": kwargs} 139 | entry = {**entry, "outputs": outputs, "usage": 0} 140 | entry = {**entry, "cost": 0} 141 | self.update_history(entry) 142 | 143 | return outputs 144 | 145 | async def acall(self, prompt=None, messages=None, **kwargs): 146 | return self.__call__(prompt=prompt, messages=messages, **kwargs) 147 | 148 | def get_convo(self, index): 149 | """Get the prompt + answer from the ith message.""" 150 | return self.history[index]["messages"], self.history[index]["outputs"] 151 | 152 | 153 | def dummy_rm(passages=()) -> callable: 154 | if not passages: 155 | 156 | def inner(query: str, *, k: int, **kwargs): 157 | raise ValueError("No passages defined") 158 | 159 | return inner 160 | max_length = max(map(len, passages)) + 100 161 | vectorizer = DummyVectorizer(max_length) 162 | passage_vecs = vectorizer(passages) 163 | 164 | def inner(query: str, *, k: int, **kwargs): 165 | assert k <= len(passages) 166 | query_vec = vectorizer([query])[0] 167 | scores = passage_vecs @ query_vec 168 | largest_idx = (-scores).argsort()[:k] 169 | 170 | return [dotdict(long_text=passages[i]) for i in largest_idx] 171 | 172 | return inner 173 | 174 | 175 | class DummyVectorizer: 176 | """Simple vectorizer based on n-grams.""" 177 | 178 | def __init__(self, max_length=100, n_gram=2): 179 | self.max_length = max_length 180 | self.n_gram = n_gram 181 | self.P = 10**9 + 7 # A large prime number 182 | random.seed(123) 183 | self.coeffs = [random.randrange(1, self.P) for _ in range(n_gram)] 184 | 185 | def _hash(self, gram): 186 | """Hashes a string using a polynomial hash function.""" 187 | h = 1 188 | for coeff, c in zip(self.coeffs, gram, strict=False): 189 | h = h * coeff + ord(c) 190 | h %= self.P 191 | return h % self.max_length 192 | 193 | def __call__(self, texts: list[str]) -> np.ndarray: 194 | vecs = [] 195 | for text in texts: 196 | grams = [text[i : i + self.n_gram] for i in range(len(text) - self.n_gram + 1)] 197 | vec = [0] * self.max_length 198 | for gram in grams: 199 | vec[self._hash(gram)] += 1 200 | vecs.append(vec) 201 | 202 | vecs = np.array(vecs, dtype=np.float32) 203 | vecs -= np.mean(vecs, axis=1, keepdims=True) 204 | vecs /= np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-10 # Added epsilon to avoid division by zero 205 | return vecs 206 | ``` 
-------------------------------------------------------------------------------- /tests/utils/test_settings.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import time 3 | from concurrent.futures import ThreadPoolExecutor 4 | from unittest import mock 5 | 6 | import pytest 7 | from litellm import Choices, Message, ModelResponse 8 | 9 | import dspy 10 | 11 | 12 | def test_basic_dspy_settings(): 13 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 14 | assert dspy.settings.lm.model == "openai/gpt-4o" 15 | assert isinstance(dspy.settings.adapter, dspy.JSONAdapter) 16 | assert len(dspy.settings.callbacks) == 1 17 | 18 | 19 | def test_forbid_configure_call_in_child_thread(): 20 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 21 | 22 | def worker(): 23 | with pytest.raises(RuntimeError, match="Cannot call dspy.configure"): 24 | dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"), callbacks=[]) 25 | 26 | with ThreadPoolExecutor(max_workers=1) as executor: 27 | executor.submit(worker) 28 | 29 | 30 | def test_dspy_context(): 31 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 32 | with dspy.context(lm=dspy.LM("openai/gpt-4o-mini"), callbacks=[]): 33 | assert dspy.settings.lm.model == "openai/gpt-4o-mini" 34 | assert len(dspy.settings.callbacks) == 0 35 | 36 | assert dspy.settings.lm.model == "openai/gpt-4o" 37 | assert len(dspy.settings.callbacks) == 1 38 | 39 | 40 | def test_dspy_context_parallel(): 41 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 42 | 43 | def worker(i): 44 | with dspy.context(lm=dspy.LM("openai/gpt-4o-mini"), trace=[i], callbacks=[]): 45 | assert dspy.settings.lm.model == "openai/gpt-4o-mini" 46 | assert dspy.settings.trace == [i] 47 | assert len(dspy.settings.callbacks) == 0 48 | 49 | with ThreadPoolExecutor(max_workers=5) as executor: 50 | executor.map(worker, range(3)) 51 | 52 | assert dspy.settings.lm.model == "openai/gpt-4o" 53 | assert len(dspy.settings.callbacks) == 1 54 | 55 | 56 | def test_dspy_context_with_dspy_parallel(): 57 | dspy.configure(lm=dspy.LM("openai/gpt-4o", cache=False), adapter=dspy.ChatAdapter()) 58 | 59 | class MyModule(dspy.Module): 60 | def __init__(self): 61 | self.predict = dspy.Predict("question -> answer") 62 | 63 | def forward(self, question: str) -> str: 64 | lm = dspy.LM("openai/gpt-4o-mini", cache=False) if "France" in question else dspy.settings.lm 65 | with dspy.context(lm=lm): 66 | time.sleep(1) 67 | assert dspy.settings.lm.model == lm.model 68 | return self.predict(question=question) 69 | 70 | with mock.patch("litellm.completion") as mock_completion: 71 | mock_completion.return_value = ModelResponse( 72 | choices=[Choices(message=Message(content="[[ ## answer ## ]]\nParis"))], 73 | model="openai/gpt-4o-mini", 74 | ) 75 | 76 | module = MyModule() 77 | parallelizer = dspy.Parallel() 78 | input_pairs = [ 79 | (module, {"question": "What is the capital of France?"}), 80 | (module, {"question": "What is the capital of Germany?"}), 81 | ] 82 | parallelizer(input_pairs) 83 | 84 | # Verify mock was called correctly 85 | assert mock_completion.call_count == 2 86 | for call_args in mock_completion.call_args_list: 87 | if "France" in call_args.kwargs["messages"][-1]["content"]: 88 | # France question uses gpt-4o-mini 89 | assert call_args.kwargs["model"] == "openai/gpt-4o-mini" 90 | 
else: 91 | # Germany question uses gpt-4o 92 | assert call_args.kwargs["model"] == "openai/gpt-4o" 93 | 94 | # The main thread is not affected by the context 95 | assert dspy.settings.lm.model == "openai/gpt-4o" 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_dspy_context_with_async_task_group(): 100 | class MyModule(dspy.Module): 101 | def __init__(self): 102 | self.predict = dspy.Predict("question -> answer") 103 | 104 | async def aforward(self, question: str) -> str: 105 | lm = ( 106 | dspy.LM("openai/gpt-4o-mini", cache=False) 107 | if "France" in question 108 | else dspy.LM("openai/gpt-4o", cache=False) 109 | ) 110 | with dspy.context(lm=lm, trace=[]): 111 | await asyncio.sleep(1) 112 | assert dspy.settings.lm.model == lm.model 113 | result = await self.predict.acall(question=question) 114 | assert len(dspy.settings.trace) == 1 115 | return result 116 | 117 | module = MyModule() 118 | 119 | with dspy.context(lm=dspy.LM("openai/gpt-4.1", cache=False), adapter=dspy.ChatAdapter()): 120 | with mock.patch("litellm.acompletion") as mock_completion: 121 | mock_completion.return_value = ModelResponse( 122 | choices=[Choices(message=Message(content="[[ ## answer ## ]]\nParis"))], 123 | model="openai/gpt-4o-mini", 124 | ) 125 | 126 | # Define the coroutines to be run 127 | coroutines = [ 128 | module.acall(question="What is the capital of France?"), 129 | module.acall(question="What is the capital of France?"), 130 | module.acall(question="What is the capital of Germany?"), 131 | module.acall(question="What is the capital of Germany?"), 132 | ] 133 | 134 | # Run them concurrently and gather results 135 | results = await asyncio.gather(*coroutines) 136 | 137 | assert results[0].answer == "Paris" 138 | assert results[1].answer == "Paris" 139 | assert results[2].answer == "Paris" 140 | assert results[3].answer == "Paris" 141 | 142 | # Verify mock was called correctly 143 | assert mock_completion.call_count == 4 144 | # France question uses gpt-4o-mini 145 | assert mock_completion.call_args_list[0].kwargs["model"] == "openai/gpt-4o-mini" 146 | assert mock_completion.call_args_list[1].kwargs["model"] == "openai/gpt-4o-mini" 147 | # Germany question uses gpt-4o 148 | assert mock_completion.call_args_list[2].kwargs["model"] == "openai/gpt-4o" 149 | assert mock_completion.call_args_list[3].kwargs["model"] == "openai/gpt-4o" 150 | 151 | # The main thread is not affected by the context 152 | assert dspy.settings.lm.model == "openai/gpt-4.1" 153 | assert dspy.settings.trace == [] 154 | 155 | 156 | @pytest.mark.asyncio 157 | async def test_dspy_configure_allowance_async(): 158 | def bar1(): 159 | # `dspy.configure` is disallowed in different async tasks from the initial one. 160 | # In this case, foo1 (async) calls bar1 (sync), and bar1 uses the async task from foo1. 161 | with pytest.raises(RuntimeError) as e: 162 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 163 | assert "dspy.settings.configure(...) can only be called from the same async" in str(e.value) 164 | 165 | async def foo1(): 166 | bar1() 167 | await asyncio.sleep(0.1) 168 | 169 | async def foo2(): 170 | # `dspy.configure` is disallowed in different async tasks from the initial one. 171 | with pytest.raises(RuntimeError) as e: 172 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 173 | assert "dspy.settings.configure(...) can only be called from the same async" in str(e.value) 174 | await asyncio.sleep(0.1) 175 | 176 | async def foo3(): 177 | # `dspy.context` is allowed in different async tasks from the initial one. 
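# Unlike `dspy.configure`, the `dspy.context` manager only overrides settings for the enclosed block, so it is safe to use from any thread or async task.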
178 | with dspy.context(lm=dspy.LM("openai/gpt-4o")): 179 | await asyncio.sleep(0.1) 180 | 181 | async def foo4(): 182 | # foo4 is directly invoked by the entry task, so it has the same async task as the entry task. 183 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 184 | await asyncio.sleep(0.1) 185 | 186 | # `dspy.configure` is allowed to be called multiple times in the same async task. 187 | dspy.configure(lm=dspy.LM("openai/gpt-4o-mini")) 188 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 189 | dspy.configure(adapter=dspy.JSONAdapter()) 190 | 191 | await asyncio.gather(foo1(), foo2(), foo3()) 192 | 193 | foo4() 194 | ```