This is page 5 of 17. Use http://codebase.md/stanfordnlp/dspy?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── .internal_dspyai │ │ ├── internals │ │ │ ├── build-and-release.md │ │ │ └── release-checklist.md │ │ └── pyproject.toml │ ├── .tmp │ │ └── .generated-actions │ │ └── run-pypi-publish-in-docker-container │ │ └── action.yml │ ├── ISSUE_TEMPLATE │ │ ├── bug_report.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE │ │ └── pull_request_template.md │ ├── workflow_scripts │ │ └── install_testpypi_pkg.sh │ └── workflows │ ├── build_and_release.yml │ ├── build_utils │ │ └── test_version.py │ ├── docs-push.yml │ ├── precommits_check.yml │ └── run_tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── docs │ ├── .gitignore │ ├── docs │ │ ├── api │ │ │ ├── adapters │ │ │ │ ├── Adapter.md │ │ │ │ ├── ChatAdapter.md │ │ │ │ ├── JSONAdapter.md │ │ │ │ └── TwoStepAdapter.md │ │ │ ├── evaluation │ │ │ │ ├── answer_exact_match.md │ │ │ │ ├── answer_passage_match.md │ │ │ │ ├── CompleteAndGrounded.md │ │ │ │ ├── Evaluate.md │ │ │ │ ├── EvaluationResult.md │ │ │ │ └── SemanticF1.md │ │ │ ├── experimental │ │ │ │ ├── Citations.md │ │ │ │ └── Document.md │ │ │ ├── index.md │ │ │ ├── models │ │ │ │ ├── Embedder.md │ │ │ │ └── LM.md │ │ │ ├── modules │ │ │ │ ├── BestOfN.md │ │ │ │ ├── ChainOfThought.md │ │ │ │ ├── CodeAct.md │ │ │ │ ├── Module.md │ │ │ │ ├── MultiChainComparison.md │ │ │ │ ├── Parallel.md │ │ │ │ ├── Predict.md │ │ │ │ ├── ProgramOfThought.md │ │ │ │ ├── ReAct.md │ │ │ │ └── Refine.md │ │ │ ├── optimizers │ │ │ │ ├── BetterTogether.md │ │ │ │ ├── BootstrapFewShot.md │ │ │ │ ├── BootstrapFewShotWithRandomSearch.md │ │ │ │ ├── BootstrapFinetune.md │ │ │ │ ├── BootstrapRS.md │ │ │ │ ├── COPRO.md │ │ │ │ ├── Ensemble.md │ │ │ │ ├── GEPA │ │ │ │ │ ├── GEPA_Advanced.md │ │ │ │ │ └── overview.md │ │ │ │ ├── InferRules.md │ │ │ │ ├── KNN.md │ │ │ │ ├── KNNFewShot.md │ │ │ │ ├── LabeledFewShot.md │ │ │ │ ├── MIPROv2.md │ │ │ │ └── SIMBA.md │ │ │ ├── primitives │ │ │ │ ├── Audio.md │ │ │ │ ├── Code.md │ │ │ │ ├── Example.md │ │ │ │ ├── History.md │ │ │ │ ├── Image.md │ │ │ │ ├── Prediction.md │ │ │ │ ├── Tool.md │ │ │ │ └── ToolCalls.md │ │ │ ├── signatures │ │ │ │ ├── InputField.md │ │ │ │ ├── OutputField.md │ │ │ │ └── Signature.md │ │ │ ├── tools │ │ │ │ ├── ColBERTv2.md │ │ │ │ ├── Embeddings.md │ │ │ │ └── PythonInterpreter.md │ │ │ └── utils │ │ │ ├── asyncify.md │ │ │ ├── configure_cache.md │ │ │ ├── disable_litellm_logging.md │ │ │ ├── disable_logging.md │ │ │ ├── enable_litellm_logging.md │ │ │ ├── enable_logging.md │ │ │ ├── inspect_history.md │ │ │ ├── load.md │ │ │ ├── StatusMessage.md │ │ │ ├── StatusMessageProvider.md │ │ │ ├── streamify.md │ │ │ └── StreamListener.md │ │ ├── cheatsheet.md │ │ ├── community │ │ │ ├── community-resources.md │ │ │ ├── how-to-contribute.md │ │ │ └── use-cases.md │ │ ├── deep-dive │ │ │ └── data-handling │ │ │ ├── built-in-datasets.md │ │ │ ├── examples.md │ │ │ ├── img │ │ │ │ └── data-loading.png │ │ │ └── loading-custom-data.md │ │ ├── faqs.md │ │ ├── index.md │ │ ├── js │ │ │ └── runllm-widget.js │ │ ├── learn │ │ │ ├── evaluation │ │ │ │ ├── data.md │ │ │ │ ├── metrics.md │ │ │ │ └── overview.md │ │ │ ├── figures │ │ │ │ ├── native_tool_call.png │ │ │ │ └── teleprompter-classes.png │ │ │ ├── index.md │ │ │ ├── optimization │ │ │ │ ├── optimizers.md │ │ │ │ └── overview.md │ │ │ └── programming │ │ │ ├── 7-assertions.md │ │ │ ├── adapters.md │ │ │ ├── language_models.md │ │ │ 
├── mcp.md │ │ │ ├── modules.md │ │ │ ├── overview.md │ │ │ ├── signatures.md │ │ │ └── tools.md │ │ ├── production │ │ │ └── index.md │ │ ├── roadmap.md │ │ ├── static │ │ │ ├── .nojekyll │ │ │ └── img │ │ │ ├── dspy_logo.png │ │ │ ├── logo.png │ │ │ ├── mlflow-tracing-rag.png │ │ │ ├── modular.png │ │ │ ├── optimize.png │ │ │ ├── undraw_docusaurus_mountain.svg │ │ │ ├── undraw_docusaurus_react.svg │ │ │ ├── undraw_docusaurus_tree.svg │ │ │ └── universal_compatibility.png │ │ ├── stylesheets │ │ │ └── extra.css │ │ └── tutorials │ │ ├── agents │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-agent.png │ │ ├── ai_text_game │ │ │ └── index.md │ │ ├── async │ │ │ └── index.md │ │ ├── audio │ │ │ └── index.ipynb │ │ ├── build_ai_program │ │ │ └── index.md │ │ ├── cache │ │ │ └── index.md │ │ ├── classification │ │ │ └── index.md │ │ ├── classification_finetuning │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-classification.png │ │ ├── conversation_history │ │ │ └── index.md │ │ ├── core_development │ │ │ └── index.md │ │ ├── custom_module │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-custom-module.png │ │ ├── customer_service_agent │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-customer-service-agent.png │ │ ├── deployment │ │ │ ├── dspy_mlflow_ui.png │ │ │ └── index.md │ │ ├── email_extraction │ │ │ ├── index.md │ │ │ └── mlflow-tracing-email-extraction.png │ │ ├── entity_extraction │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-entity-extraction.png │ │ ├── games │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-agent.png │ │ ├── gepa_ai_program │ │ │ └── index.md │ │ ├── gepa_aime │ │ │ ├── index.ipynb │ │ │ ├── mlflow-tracing-gepa-aime.png │ │ │ └── mlflow-tracking-gepa-aime-optimization.png │ │ ├── gepa_facilitysupportanalyzer │ │ │ ├── index.ipynb │ │ │ ├── mlflow-tracing-gepa-support.png │ │ │ └── mlflow-tracking-gepa-support-optimization.png │ │ ├── gepa_papillon │ │ │ ├── index.ipynb │ │ │ ├── mlflow-tracing-gepa-papilon.png │ │ │ └── mlflow-tracking-gepa-papilon-optimization.png │ │ ├── image_generation_prompting │ │ │ └── index.ipynb │ │ ├── index.md │ │ ├── llms_txt_generation │ │ │ └── index.md │ │ ├── math │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-math.png │ │ ├── mcp │ │ │ └── index.md │ │ ├── mem0_react_agent │ │ │ └── index.md │ │ ├── multihop_search │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-multi-hop.png │ │ ├── observability │ │ │ ├── index.md │ │ │ ├── mlflow_trace_ui_navigation.gif │ │ │ ├── mlflow_trace_ui.png │ │ │ └── mlflow_trace_view.png │ │ ├── optimize_ai_program │ │ │ └── index.md │ │ ├── optimizer_tracking │ │ │ ├── child_run.png │ │ │ ├── experiment.png │ │ │ ├── index.md │ │ │ └── parent_run.png │ │ ├── output_refinement │ │ │ └── best-of-n-and-refine.md │ │ ├── papillon │ │ │ └── index.md │ │ ├── program_of_thought │ │ │ └── index.ipynb │ │ ├── rag │ │ │ ├── index.ipynb │ │ │ └── mlflow-tracing-rag.png │ │ ├── real_world_examples │ │ │ └── index.md │ │ ├── rl_ai_program │ │ │ └── index.md │ │ ├── rl_multihop │ │ │ └── index.ipynb │ │ ├── rl_papillon │ │ │ └── index.ipynb │ │ ├── sample_code_generation │ │ │ └── index.md │ │ ├── saving │ │ │ └── index.md │ │ ├── streaming │ │ │ └── index.md │ │ ├── tool_use │ │ │ └── index.ipynb │ │ └── yahoo_finance_react │ │ └── index.md │ ├── mkdocs.yml │ ├── overrides │ │ ├── home.html │ │ ├── main.html │ │ └── partials │ │ └── tabs.html │ ├── Pipfile │ ├── Pipfile.lock │ ├── README.md │ ├── requirements.txt │ ├── scripts │ │ ├── generate_api_docs.py │ │ └── generate_api_summary.py │ └── vercel.json ├── dspy │ ├── 
__init__.py │ ├── __metadata__.py │ ├── adapters │ │ ├── __init__.py │ │ ├── baml_adapter.py │ │ ├── base.py │ │ ├── chat_adapter.py │ │ ├── json_adapter.py │ │ ├── two_step_adapter.py │ │ ├── types │ │ │ ├── __init__.py │ │ │ ├── audio.py │ │ │ ├── base_type.py │ │ │ ├── citation.py │ │ │ ├── code.py │ │ │ ├── document.py │ │ │ ├── history.py │ │ │ ├── image.py │ │ │ └── tool.py │ │ ├── utils.py │ │ └── xml_adapter.py │ ├── clients │ │ ├── __init__.py │ │ ├── base_lm.py │ │ ├── cache.py │ │ ├── databricks.py │ │ ├── embedding.py │ │ ├── lm_local_arbor.py │ │ ├── lm_local.py │ │ ├── lm.py │ │ ├── openai.py │ │ ├── provider.py │ │ └── utils_finetune.py │ ├── datasets │ │ ├── __init__.py │ │ ├── alfworld │ │ │ ├── __init__.py │ │ │ ├── alfworld.py │ │ │ └── base_config.yml │ │ ├── colors.py │ │ ├── dataloader.py │ │ ├── dataset.py │ │ ├── gsm8k.py │ │ ├── hotpotqa.py │ │ └── math.py │ ├── dsp │ │ ├── __init__.py │ │ ├── colbertv2.py │ │ └── utils │ │ ├── __init__.py │ │ ├── dpr.py │ │ ├── settings.py │ │ └── utils.py │ ├── evaluate │ │ ├── __init__.py │ │ ├── auto_evaluation.py │ │ ├── evaluate.py │ │ └── metrics.py │ ├── experimental │ │ └── __init__.py │ ├── predict │ │ ├── __init__.py │ │ ├── aggregation.py │ │ ├── avatar │ │ │ ├── __init__.py │ │ │ ├── avatar.py │ │ │ ├── models.py │ │ │ └── signatures.py │ │ ├── best_of_n.py │ │ ├── chain_of_thought.py │ │ ├── code_act.py │ │ ├── knn.py │ │ ├── multi_chain_comparison.py │ │ ├── parallel.py │ │ ├── parameter.py │ │ ├── predict.py │ │ ├── program_of_thought.py │ │ ├── react.py │ │ ├── refine.py │ │ └── retry.py │ ├── primitives │ │ ├── __init__.py │ │ ├── base_module.py │ │ ├── example.py │ │ ├── module.py │ │ ├── prediction.py │ │ ├── python_interpreter.py │ │ └── runner.js │ ├── propose │ │ ├── __init__.py │ │ ├── dataset_summary_generator.py │ │ ├── grounded_proposer.py │ │ ├── propose_base.py │ │ └── utils.py │ ├── retrievers │ │ ├── __init__.py │ │ ├── databricks_rm.py │ │ ├── embeddings.py │ │ ├── retrieve.py │ │ └── weaviate_rm.py │ ├── signatures │ │ ├── __init__.py │ │ ├── field.py │ │ ├── signature.py │ │ └── utils.py │ ├── streaming │ │ ├── __init__.py │ │ ├── messages.py │ │ ├── streamify.py │ │ └── streaming_listener.py │ ├── teleprompt │ │ ├── __init__.py │ │ ├── avatar_optimizer.py │ │ ├── bettertogether.py │ │ ├── bootstrap_finetune.py │ │ ├── bootstrap_trace.py │ │ ├── bootstrap.py │ │ ├── copro_optimizer.py │ │ ├── ensemble.py │ │ ├── gepa │ │ │ ├── __init__.py │ │ │ ├── gepa_utils.py │ │ │ ├── gepa.py │ │ │ └── instruction_proposal.py │ │ ├── grpo.py │ │ ├── infer_rules.py │ │ ├── knn_fewshot.py │ │ ├── mipro_optimizer_v2.py │ │ ├── random_search.py │ │ ├── signature_opt.py │ │ ├── simba_utils.py │ │ ├── simba.py │ │ ├── teleprompt_optuna.py │ │ ├── teleprompt.py │ │ ├── utils.py │ │ └── vanilla.py │ └── utils │ ├── __init__.py │ ├── annotation.py │ ├── asyncify.py │ ├── caching.py │ ├── callback.py │ ├── dummies.py │ ├── exceptions.py │ ├── hasher.py │ ├── inspect_history.py │ ├── langchain_tool.py │ ├── logging_utils.py │ ├── mcp.py │ ├── parallelizer.py │ ├── saving.py │ ├── syncify.py │ ├── unbatchify.py │ └── usage_tracker.py ├── LICENSE ├── pyproject.toml ├── README.md ├── tests │ ├── __init__.py │ ├── adapters │ │ ├── test_adapter_utils.py │ │ ├── test_baml_adapter.py │ │ ├── test_base_type.py │ │ ├── test_chat_adapter.py │ │ ├── test_citation.py │ │ ├── test_code.py │ │ ├── test_document.py │ │ ├── test_json_adapter.py │ │ ├── test_tool.py │ │ ├── test_two_step_adapter.py │ │ └── test_xml_adapter.py │ ├── 
callback │ │ └── test_callback.py │ ├── clients │ │ ├── test_cache.py │ │ ├── test_databricks.py │ │ ├── test_embedding.py │ │ ├── test_inspect_global_history.py │ │ └── test_lm.py │ ├── conftest.py │ ├── datasets │ │ └── test_dataset.py │ ├── docs │ │ └── test_mkdocs_links.py │ ├── evaluate │ │ ├── test_evaluate.py │ │ └── test_metrics.py │ ├── examples │ │ └── test_baleen.py │ ├── metadata │ │ └── test_metadata.py │ ├── predict │ │ ├── test_aggregation.py │ │ ├── test_best_of_n.py │ │ ├── test_chain_of_thought.py │ │ ├── test_code_act.py │ │ ├── test_knn.py │ │ ├── test_multi_chain_comparison.py │ │ ├── test_parallel.py │ │ ├── test_predict.py │ │ ├── test_program_of_thought.py │ │ ├── test_react.py │ │ ├── test_refine.py │ │ └── test_retry.py │ ├── primitives │ │ ├── resources │ │ │ └── saved_program.json │ │ ├── test_base_module.py │ │ ├── test_example.py │ │ ├── test_module.py │ │ └── test_python_interpreter.py │ ├── propose │ │ └── test_grounded_proposer.py │ ├── README.md │ ├── reliability │ │ ├── __init__.py │ │ ├── complex_types │ │ │ └── generated │ │ │ ├── test_many_types_1 │ │ │ │ ├── inputs │ │ │ │ │ ├── input1.json │ │ │ │ │ └── input2.json │ │ │ │ ├── program.py │ │ │ │ └── schema.json │ │ │ ├── test_nesting_1 │ │ │ │ ├── inputs │ │ │ │ │ ├── input1.json │ │ │ │ │ └── input2.json │ │ │ │ ├── program.py │ │ │ │ └── schema.json │ │ │ └── test_nesting_2 │ │ │ ├── inputs │ │ │ │ └── input1.json │ │ │ ├── program.py │ │ │ └── schema.json │ │ ├── conftest.py │ │ ├── generate │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ └── utils.py │ │ ├── input_formats │ │ │ └── generated │ │ │ └── test_markdown_1 │ │ │ ├── inputs │ │ │ │ ├── input1.json │ │ │ │ └── input2.json │ │ │ ├── program.py │ │ │ └── schema.json │ │ ├── README.md │ │ ├── reliability_conf.yaml │ │ ├── test_generated.py │ │ ├── test_pydantic_models.py │ │ └── utils.py │ ├── retrievers │ │ └── test_embeddings.py │ ├── signatures │ │ ├── test_adapter_image.py │ │ ├── test_custom_types.py │ │ └── test_signature.py │ ├── streaming │ │ └── test_streaming.py │ ├── teleprompt │ │ ├── gepa_dummy_lm_custom_component_selector_custom_instruction_proposer.json │ │ ├── gepa_dummy_lm.json │ │ ├── test_bootstrap_finetune.py │ │ ├── test_bootstrap_trace.py │ │ ├── test_bootstrap.py │ │ ├── test_copro_optimizer.py │ │ ├── test_ensemble.py │ │ ├── test_finetune.py │ │ ├── test_gepa_instruction_proposer.py │ │ ├── test_gepa.py │ │ ├── test_grpo.py │ │ ├── test_knn_fewshot.py │ │ ├── test_random_search.py │ │ ├── test_teleprompt.py │ │ └── test_utils.py │ ├── test_utils │ │ ├── __init__.py │ │ └── server │ │ ├── __init__.py │ │ ├── litellm_server_config.yaml │ │ └── litellm_server.py │ └── utils │ ├── __init__.py │ ├── resources │ │ └── mcp_server.py │ ├── test_annotation.py │ ├── test_asyncify.py │ ├── test_exceptions.py │ ├── test_langchain_tool.py │ ├── test_mcp.py │ ├── test_parallelizer.py │ ├── test_saving.py │ ├── test_settings.py │ ├── test_syncify.py │ ├── test_unbatchify.py │ └── test_usage_tracker.py └── uv.lock ``` # Files -------------------------------------------------------------------------------- /dspy/primitives/prediction.py: -------------------------------------------------------------------------------- ```python 1 | from dspy.primitives.example import Example 2 | 3 | 4 | class Prediction(Example): 5 | """A prediction object that contains the output of a DSPy module. 6 | 7 | Prediction inherits from Example. 
8 | 9 | To allow feedback-augmented scores, Prediction supports comparison operations 10 | (<, >, <=, >=) for Predictions with a `score` field. The comparison operations 11 | compare the 'score' values as floats. For equality comparison, Predictions are equal 12 | if their underlying data stores are equal (inherited from Example). 13 | 14 | Arithmetic operations (+, /, etc.) are also supported for Predictions with a 'score' 15 | field, operating on the score value. 16 | """ 17 | 18 | def __init__(self, *args, **kwargs): 19 | super().__init__(*args, **kwargs) 20 | 21 | del self._demos 22 | del self._input_keys 23 | 24 | self._completions = None 25 | self._lm_usage = None 26 | 27 | def get_lm_usage(self): 28 | return self._lm_usage 29 | 30 | def set_lm_usage(self, value): 31 | self._lm_usage = value 32 | 33 | @classmethod 34 | def from_completions(cls, list_or_dict, signature=None): 35 | obj = cls() 36 | obj._completions = Completions(list_or_dict, signature=signature) 37 | obj._store = {k: v[0] for k, v in obj._completions.items()} 38 | 39 | return obj 40 | 41 | def __repr__(self): 42 | store_repr = ",\n ".join(f"{k}={v!r}" for k, v in self._store.items()) 43 | 44 | if self._completions is None or len(self._completions) == 1: 45 | return f"Prediction(\n {store_repr}\n)" 46 | 47 | num_completions = len(self._completions) 48 | return f"Prediction(\n {store_repr},\n completions=Completions(...)\n) ({num_completions-1} completions omitted)" 49 | 50 | def __str__(self): 51 | return self.__repr__() 52 | 53 | def __float__(self): 54 | if "score" not in self._store: 55 | raise ValueError("Prediction object does not have a 'score' field to convert to float.") 56 | return float(self._store["score"]) 57 | 58 | def __add__(self, other): 59 | if isinstance(other, (float, int)): 60 | return self.__float__() + other 61 | elif isinstance(other, Prediction): 62 | return self.__float__() + float(other) 63 | raise TypeError(f"Unsupported type for addition: {type(other)}") 64 | 65 | def __radd__(self, other): 66 | if isinstance(other, (float, int)): 67 | return other + self.__float__() 68 | elif isinstance(other, Prediction): 69 | return float(other) + self.__float__() 70 | raise TypeError(f"Unsupported type for addition: {type(other)}") 71 | 72 | def __truediv__(self, other): 73 | if isinstance(other, (float, int)): 74 | return self.__float__() / other 75 | elif isinstance(other, Prediction): 76 | return self.__float__() / float(other) 77 | raise TypeError(f"Unsupported type for division: {type(other)}") 78 | 79 | def __rtruediv__(self, other): 80 | if isinstance(other, (float, int)): 81 | return other / self.__float__() 82 | elif isinstance(other, Prediction): 83 | return float(other) / self.__float__() 84 | raise TypeError(f"Unsupported type for division: {type(other)}") 85 | 86 | def __lt__(self, other): 87 | if isinstance(other, (float, int)): 88 | return self.__float__() < other 89 | elif isinstance(other, Prediction): 90 | return self.__float__() < float(other) 91 | raise TypeError(f"Unsupported type for comparison: {type(other)}") 92 | 93 | def __le__(self, other): 94 | if isinstance(other, (float, int)): 95 | return self.__float__() <= other 96 | elif isinstance(other, Prediction): 97 | return self.__float__() <= float(other) 98 | raise TypeError(f"Unsupported type for comparison: {type(other)}") 99 | 100 | def __gt__(self, other): 101 | if isinstance(other, (float, int)): 102 | return self.__float__() > other 103 | elif isinstance(other, Prediction): 104 | return self.__float__() > float(other) 105 | 
raise TypeError(f"Unsupported type for comparison: {type(other)}") 106 | 107 | def __ge__(self, other): 108 | if isinstance(other, (float, int)): 109 | return self.__float__() >= other 110 | elif isinstance(other, Prediction): 111 | return self.__float__() >= float(other) 112 | raise TypeError(f"Unsupported type for comparison: {type(other)}") 113 | 114 | @property 115 | def completions(self): 116 | return self._completions 117 | 118 | 119 | class Completions: 120 | def __init__(self, list_or_dict, signature=None): 121 | self.signature = signature 122 | 123 | if isinstance(list_or_dict, list): 124 | kwargs = {} 125 | for arg in list_or_dict: 126 | for k, v in arg.items(): 127 | kwargs.setdefault(k, []).append(v) 128 | else: 129 | kwargs = list_or_dict 130 | 131 | assert all(isinstance(v, list) for v in kwargs.values()), "All values must be lists" 132 | 133 | if kwargs: 134 | length = len(next(iter(kwargs.values()))) 135 | assert all(len(v) == length for v in kwargs.values()), "All lists must have the same length" 136 | 137 | self._completions = kwargs 138 | 139 | def items(self): 140 | return self._completions.items() 141 | 142 | def __getitem__(self, key): 143 | if isinstance(key, int): 144 | if key < 0 or key >= len(self): 145 | raise IndexError("Index out of range") 146 | 147 | return Prediction(**{k: v[key] for k, v in self._completions.items()}) 148 | 149 | return self._completions[key] 150 | 151 | def __getattr__(self, name): 152 | if name == "_completions": 153 | raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") 154 | if name in self._completions: 155 | return self._completions[name] 156 | 157 | raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") 158 | 159 | def __len__(self): 160 | # Return the length of the list for one of the keys 161 | # It assumes all lists have the same length 162 | return len(next(iter(self._completions.values()))) 163 | 164 | def __contains__(self, key): 165 | return key in self._completions 166 | 167 | def __repr__(self): 168 | items_repr = ",\n ".join(f"{k}={v!r}" for k, v in self._completions.items()) 169 | return f"Completions(\n {items_repr}\n)" 170 | 171 | def __str__(self): 172 | # return str(self._completions) 173 | return self.__repr__() 174 | ``` -------------------------------------------------------------------------------- /tests/teleprompt/test_copro_optimizer.py: -------------------------------------------------------------------------------- ```python 1 | import dspy 2 | from dspy import Example 3 | from dspy.teleprompt.signature_opt import COPRO 4 | from dspy.utils.dummies import DummyLM 5 | 6 | 7 | # Define a simple metric function for testing 8 | def simple_metric(example, prediction): 9 | # Simplified metric for testing: true if prediction matches expected output 10 | return example.output == prediction.output 11 | 12 | 13 | # Example training and validation sets 14 | trainset = [ 15 | Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), 16 | Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs( 17 | "input" 18 | ), 19 | ] 20 | 21 | 22 | def test_signature_optimizer_initialization(): 23 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 24 | assert optimizer.metric == simple_metric, "Metric not correctly initialized" 25 | assert optimizer.breadth == 2, "Breadth not correctly initialized" 26 | assert optimizer.depth == 1, "Depth not correctly 
initialized" 27 | assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" 28 | 29 | 30 | class SimpleModule(dspy.Module): 31 | def __init__(self, signature): 32 | super().__init__() 33 | # COPRO doesn't work with dspy.Predict 34 | self.predictor = dspy.ChainOfThought(signature) 35 | 36 | def forward(self, **kwargs): 37 | return self.predictor(**kwargs) 38 | 39 | 40 | def test_signature_optimizer_optimization_process(): 41 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 42 | dspy.settings.configure( 43 | lm=DummyLM( 44 | [ 45 | { 46 | "proposed_instruction": "Optimized instruction 1", 47 | "proposed_prefix_for_output_field": "Optimized instruction 2", 48 | }, 49 | ] 50 | ) 51 | ) 52 | 53 | student = SimpleModule("input -> output") 54 | 55 | # Assuming the compile method of COPRO requires a student module, a development set, and evaluation kwargs 56 | optimized_student = optimizer.compile( 57 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 58 | ) 59 | 60 | # Check that the optimized student has been modified from the original 61 | # This check can be more specific based on how the optimization modifies the student 62 | assert optimized_student is not student, "Optimization did not modify the student" 63 | 64 | # Further tests can be added to verify the specifics of the optimization process, 65 | # such as checking the instructions of the optimized student's predictors. 66 | 67 | 68 | def test_signature_optimizer_statistics_tracking(): 69 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 70 | optimizer.track_stats = True # Enable statistics tracking 71 | 72 | dspy.settings.configure( 73 | lm=DummyLM( 74 | [ 75 | { 76 | "proposed_instruction": "Optimized instruction 1", 77 | "proposed_prefix_for_output_field": "Optimized instruction 2", 78 | }, 79 | ] 80 | ) 81 | ) 82 | student = SimpleModule("input -> output") 83 | optimized_student = optimizer.compile( 84 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 85 | ) 86 | 87 | # Verify that statistics have been tracked and attached to the optimized student 88 | assert hasattr(optimized_student, "total_calls"), "Total calls statistic not tracked" 89 | assert hasattr(optimized_student, "results_best"), "Best results statistics not tracked" 90 | 91 | 92 | # Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before 93 | 94 | 95 | def test_optimization_and_output_verification(): 96 | lm = DummyLM( 97 | [ 98 | {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, 99 | {"reasoning": "france", "output": "Paris"}, 100 | {"reasoning": "france", "output": "Paris"}, 101 | {"reasoning": "france", "output": "Paris"}, 102 | {"reasoning": "france", "output": "Paris"}, 103 | {"reasoning": "france", "output": "Paris"}, 104 | {"reasoning": "france", "output": "Paris"}, 105 | {"reasoning": "france", "output": "Paris"}, 106 | ] 107 | ) 108 | dspy.settings.configure(lm=lm) 109 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 110 | 111 | student = SimpleModule("input -> output") 112 | 113 | # Compile the student with the optimizer 114 | optimized_student = optimizer.compile( 115 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 116 | ) 117 | 118 | # Simulate calling the optimized student with a new input 119 | test_input = "What is the 
capital of France?" 120 | prediction = optimized_student(input=test_input) 121 | 122 | print(lm.get_convo(-1)) 123 | 124 | assert prediction.output == "Paris" 125 | 126 | 127 | def test_statistics_tracking_during_optimization(): 128 | dspy.settings.configure( 129 | lm=DummyLM( 130 | [ 131 | {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, 132 | ] 133 | ) 134 | ) 135 | 136 | optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) 137 | optimizer.track_stats = True # Enable statistics tracking 138 | 139 | student = SimpleModule("input -> output") 140 | optimized_student = optimizer.compile( 141 | student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} 142 | ) 143 | 144 | # Verify that statistics have been tracked 145 | assert hasattr(optimized_student, "total_calls"), "Optimizer did not track total metric calls" 146 | assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" 147 | 148 | # Check if the results_best and results_latest contain valid statistics 149 | assert "results_best" in optimized_student.__dict__, "Optimizer did not track the best results" 150 | assert "results_latest" in optimized_student.__dict__, "Optimizer did not track the latest results" 151 | assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" 152 | assert ( 153 | len(optimized_student.results_latest) > 0 154 | ), "Optimizer did not properly populate the latest results statistics" 155 | 156 | # Additional detailed checks can be added here to verify the contents of the tracked statistics 157 | ``` -------------------------------------------------------------------------------- /docs/docs/learn/evaluation/metrics.md: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | sidebar_position: 5 3 | --- 4 | 5 | # Metrics 6 | 7 | DSPy is a machine learning framework, so you must think about your **automatic metrics** for evaluation (to track your progress) and optimization (so DSPy can make your programs more effective). 8 | 9 | 10 | ## What is a metric and how do I define a metric for my task? 11 | 12 | A metric is just a function that will take examples from your data and the output of your system and return a score that quantifies how good the output is. What makes outputs from your system good or bad? 13 | 14 | For simple tasks, this could be just "accuracy" or "exact match" or "F1 score". This may be the case for simple classification or short-form QA tasks. 15 | 16 | However, for most applications, your system will output long-form outputs. There, your metric should probably be a smaller DSPy program that checks multiple properties of the output (quite possibly using AI feedback from LMs). 17 | 18 | Getting this right on the first try is unlikely, but you should start with something simple and iterate. 19 | 20 | 21 | ## Simple metrics 22 | 23 | A DSPy metric is just a function in Python that takes `example` (e.g., from your training or dev set) and the output `pred` from your DSPy program, and outputs a `float` (or `int` or `bool`) score. 24 | 25 | Your metric should also accept an optional third argument called `trace`. You can ignore this for a moment, but it will enable some powerful tricks if you want to use your metric for optimization. 26 | 27 | Here's a simple example of a metric that's comparing `example.answer` and `pred.answer`. 
This particular metric will return a `bool`. 28 | 29 | ```python 30 | def validate_answer(example, pred, trace=None): 31 | return example.answer.lower() == pred.answer.lower() 32 | ``` 33 | 34 | Some people find these utilities (built-in) convenient: 35 | 36 | - `dspy.evaluate.metrics.answer_exact_match` 37 | - `dspy.evaluate.metrics.answer_passage_match` 38 | 39 | Your metrics could be more complex, e.g. check for multiple properties. The metric below will return a `float` if `trace is None` (i.e., if it's used for evaluation or optimization), and will return a `bool` otherwise (i.e., if it's used to bootstrap demonstrations). 40 | 41 | ```python 42 | def validate_context_and_answer(example, pred, trace=None): 43 | # check the gold label and the predicted answer are the same 44 | answer_match = example.answer.lower() == pred.answer.lower() 45 | 46 | # check the predicted answer comes from one of the retrieved contexts 47 | context_match = any((pred.answer.lower() in c) for c in pred.context) 48 | 49 | if trace is None: # if we're doing evaluation or optimization 50 | return (answer_match + context_match) / 2.0 51 | else: # if we're doing bootstrapping, i.e. self-generating good demonstrations of each step 52 | return answer_match and context_match 53 | ``` 54 | 55 | Defining a good metric is an iterative process, so doing some initial evaluations and looking at your data and outputs is key. 56 | 57 | 58 | ## Evaluation 59 | 60 | Once you have a metric, you can run evaluations in a simple Python loop. 61 | 62 | ```python 63 | scores = [] 64 | for x in devset: 65 | pred = program(**x.inputs()) 66 | score = metric(x, pred) 67 | scores.append(score) 68 | ``` 69 | 70 | If you need some utilities, you can also use the built-in `Evaluate` utility. It can help with things like parallel evaluation (multiple threads) or showing you a sample of inputs/outputs and the metric scores. 71 | 72 | ```python 73 | from dspy.evaluate import Evaluate 74 | 75 | # Set up the evaluator, which can be re-used in your code. 76 | evaluator = Evaluate(devset=YOUR_DEVSET, num_threads=1, display_progress=True, display_table=5) 77 | 78 | # Launch evaluation. 79 | evaluator(YOUR_PROGRAM, metric=YOUR_METRIC) 80 | ``` 81 | 82 | 83 | ## Intermediate: Using AI feedback for your metric 84 | 85 | For most applications, your system will output long-form outputs, so your metric should check multiple dimensions of the output using AI feedback from LMs. 86 | 87 | This simple signature could come in handy. 88 | 89 | ```python 90 | # Define the signature for automatic assessments. 91 | class Assess(dspy.Signature): 92 | """Assess the quality of a tweet along the specified dimension.""" 93 | 94 | assessed_text = dspy.InputField() 95 | assessment_question = dspy.InputField() 96 | assessment_answer: bool = dspy.OutputField() 97 | ``` 98 | 99 | For example, below is a simple metric that checks a generated tweet (1) answers a given question correctly and (2) whether it's also engaging. We also check that (3) `len(tweet) <= 280` characters. 100 | 101 | ```python 102 | def metric(gold, pred, trace=None): 103 | question, answer, tweet = gold.question, gold.answer, pred.output 104 | 105 | engaging = "Does the assessed text make for a self-contained, engaging tweet?" 106 | correct = f"The text should answer `{question}` with `{answer}`. Does the assessed text contain this answer?" 
107 | 108 | correct = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=correct) 109 | engaging = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=engaging) 110 | 111 | correct, engaging = [m.assessment_answer for m in [correct, engaging]] 112 | score = (correct + engaging) if correct and (len(tweet) <= 280) else 0 113 | 114 | if trace is not None: return score >= 2 115 | return score / 2.0 116 | ``` 117 | 118 | When compiling, `trace is not None`, and we want to be strict about judging things, so we will only return `True` if `score >= 2`. Otherwise, we return a score out of 1.0 (i.e., `score / 2.0`). 119 | 120 | 121 | ## Advanced: Using a DSPy program as your metric 122 | 123 | If your metric is itself a DSPy program, one of the most powerful ways to iterate is to compile (optimize) your metric itself. That's usually easy because the output of the metric is usually a simple value (e.g., a score out of 5) so the metric's metric is easy to define and optimize by collecting a few examples. 124 | 125 | 126 | 127 | ### Advanced: Accessing the `trace` 128 | 129 | When your metric is used during evaluation runs, DSPy will not try to track the steps of your program. 130 | 131 | But during compiling (optimization), DSPy will trace your LM calls. The trace will contain inputs/outputs to each DSPy predictor and you can leverage that to validate intermediate steps for optimization. 132 | 133 | 134 | ```python 135 | def validate_hops(example, pred, trace=None): 136 | hops = [example.question] + [outputs.query for *_, outputs in trace if 'query' in outputs] 137 | 138 | if max([len(h) for h in hops]) > 100: return False 139 | if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): return False 140 | 141 | return True 142 | ``` 143 | ``` -------------------------------------------------------------------------------- /tests/predict/test_parallel.py: -------------------------------------------------------------------------------- ```python 1 | import dspy 2 | from dspy.utils.dummies import DummyLM 3 | 4 | 5 | def test_parallel_module(): 6 | lm = DummyLM( 7 | [ 8 | {"output": "test output 1"}, 9 | {"output": "test output 2"}, 10 | {"output": "test output 3"}, 11 | {"output": "test output 4"}, 12 | {"output": "test output 5"}, 13 | ] 14 | ) 15 | dspy.settings.configure(lm=lm) 16 | 17 | class MyModule(dspy.Module): 18 | def __init__(self): 19 | super().__init__() 20 | self.predictor = dspy.Predict("input -> output") 21 | self.predictor2 = dspy.Predict("input -> output") 22 | 23 | self.parallel = dspy.Parallel(num_threads=2) 24 | 25 | def forward(self, input): 26 | return self.parallel( 27 | [ 28 | (self.predictor, input), 29 | (self.predictor2, input), 30 | (self.predictor, input), 31 | (self.predictor2, input), 32 | (self.predictor, input), 33 | ] 34 | ) 35 | 36 | output = MyModule()(dspy.Example(input="test input").with_inputs("input")) 37 | 38 | expected_outputs = {f"test output {i}" for i in range(1, 6)} 39 | assert {r.output for r in output} == expected_outputs 40 | 41 | 42 | def test_batch_module(): 43 | lm = DummyLM( 44 | [ 45 | {"output": "test output 1"}, 46 | {"output": "test output 2"}, 47 | {"output": "test output 3"}, 48 | {"output": "test output 4"}, 49 | {"output": "test output 5"}, 50 | ] 51 | ) 52 | res_lm = DummyLM( 53 | [ 54 | {"output": "test output 1", "reasoning": "test reasoning 1"}, 55 | {"output": "test output 2", "reasoning": "test reasoning 2"}, 56 | {"output": "test output 3", "reasoning": "test 
reasoning 3"}, 57 | {"output": "test output 4", "reasoning": "test reasoning 4"}, 58 | {"output": "test output 5", "reasoning": "test reasoning 5"}, 59 | ] 60 | ) 61 | 62 | class MyModule(dspy.Module): 63 | def __init__(self): 64 | super().__init__() 65 | self.predictor = dspy.Predict("input -> output") 66 | self.predictor2 = dspy.Predict("input -> output, reasoning") 67 | 68 | self.parallel = dspy.Parallel(num_threads=2) 69 | 70 | def forward(self, input): 71 | with dspy.context(lm=lm): 72 | res1 = self.predictor.batch([input] * 5) 73 | 74 | with dspy.context(lm=res_lm): 75 | res2 = self.predictor2.batch([input] * 5) 76 | 77 | return (res1, res2) 78 | 79 | result, reason_result = MyModule()(dspy.Example(input="test input").with_inputs("input")) 80 | 81 | # Check that we got all expected outputs without caring about order 82 | expected_outputs = {f"test output {i}" for i in range(1, 6)} 83 | assert {r.output for r in result} == expected_outputs 84 | assert {r.output for r in reason_result} == expected_outputs 85 | 86 | # Check that reasoning matches outputs for reason_result 87 | for r in reason_result: 88 | num = r.output.split()[-1] # get the number from "test output X" 89 | assert r.reasoning == f"test reasoning {num}" 90 | 91 | 92 | def test_nested_parallel_module(): 93 | lm = DummyLM( 94 | [ 95 | {"output": "test output 1"}, 96 | {"output": "test output 2"}, 97 | {"output": "test output 3"}, 98 | {"output": "test output 4"}, 99 | {"output": "test output 5"}, 100 | ] 101 | ) 102 | dspy.settings.configure(lm=lm) 103 | 104 | class MyModule(dspy.Module): 105 | def __init__(self): 106 | super().__init__() 107 | self.predictor = dspy.Predict("input -> output") 108 | self.predictor2 = dspy.Predict("input -> output") 109 | 110 | self.parallel = dspy.Parallel(num_threads=2) 111 | 112 | def forward(self, input): 113 | return self.parallel( 114 | [ 115 | (self.predictor, input), 116 | (self.predictor2, input), 117 | ( 118 | self.parallel, 119 | [ 120 | (self.predictor2, input), 121 | (self.predictor, input), 122 | ], 123 | ), 124 | ] 125 | ) 126 | 127 | output = MyModule()(dspy.Example(input="test input").with_inputs("input")) 128 | 129 | # For nested structure, check first two outputs and nested outputs separately 130 | assert {output[0].output, output[1].output} <= {f"test output {i}" for i in range(1, 5)} 131 | assert {output[2][0].output, output[2][1].output} <= {f"test output {i}" for i in range(1, 5)} 132 | all_outputs = {output[0].output, output[1].output, output[2][0].output, output[2][1].output} 133 | assert len(all_outputs) == 4 134 | 135 | 136 | def test_nested_batch_method(): 137 | lm = DummyLM( 138 | [ 139 | {"output": "test output 1"}, 140 | {"output": "test output 2"}, 141 | {"output": "test output 3"}, 142 | {"output": "test output 4"}, 143 | {"output": "test output 5"}, 144 | ] 145 | ) 146 | dspy.settings.configure(lm=lm) 147 | 148 | class MyModule(dspy.Module): 149 | def __init__(self): 150 | super().__init__() 151 | self.predictor = dspy.Predict("input -> output") 152 | 153 | def forward(self, input): 154 | res = self.predictor.batch([dspy.Example(input=input).with_inputs("input")] * 2) 155 | 156 | return res 157 | 158 | result = MyModule().batch([dspy.Example(input="test input").with_inputs("input")] * 2) 159 | 160 | assert {result[0][0].output, result[0][1].output, result[1][0].output, result[1][1].output} == { 161 | "test output 1", 162 | "test output 2", 163 | "test output 3", 164 | "test output 4", 165 | } 166 | 167 | 168 | def test_batch_with_failed_examples(): 169 | 
class FailingModule(dspy.Module): 170 | def forward(self, value: int) -> str: 171 | if value == 42: 172 | raise ValueError("test error") 173 | return f"success-{value}" 174 | 175 | module = FailingModule() 176 | 177 | examples = [ 178 | dspy.Example(value=1).with_inputs("value"), 179 | dspy.Example(value=42).with_inputs("value"), # This will fail 180 | dspy.Example(value=3).with_inputs("value"), 181 | ] 182 | 183 | results, failed_examples, exceptions = module.batch( 184 | examples, 185 | return_failed_examples=True, 186 | provide_traceback=True, 187 | ) 188 | 189 | assert results == ["success-1", None, "success-3"] 190 | 191 | assert len(failed_examples) == 1 192 | assert failed_examples[0].inputs()["value"] == 42 193 | 194 | assert len(exceptions) == 1 195 | assert isinstance(exceptions[0], ValueError) 196 | assert str(exceptions[0]) == "test error" 197 | ``` -------------------------------------------------------------------------------- /.github/workflows/build_and_release.yml: -------------------------------------------------------------------------------- ```yaml 1 | --- 2 | name: Publish Python 🐍 distributions 📦 to PyPI 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | jobs: 8 | extract-tag: 9 | runs-on: ubuntu-latest 10 | outputs: 11 | version: ${{ steps.extract_tag.outputs.tag }} 12 | steps: 13 | - uses: actions/checkout@v4 14 | - id: extract_tag 15 | name: Extract tag name 16 | run: echo "tag=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT" 17 | 18 | build-and-publish-test-pypi: 19 | needs: extract-tag 20 | runs-on: ubuntu-latest 21 | environment: 22 | name: pypi 23 | permissions: 24 | id-token: write # IMPORTANT: mandatory for trusted publishing 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Python 3.11 29 | uses: actions/setup-python@v3 30 | with: 31 | python-version: "3.11" 32 | - name: Install dependencies 33 | run: python3 -m pip install --upgrade setuptools wheel twine build semver packaging 34 | - name: Get correct version for TestPyPI release 35 | id: check_version 36 | run: | 37 | VERSION=${{ needs.extract-tag.outputs.version }} 38 | PACKAGE_NAME="dspy-ai-test" 39 | echo "Checking if $VERSION for $PACKAGE_NAME exists on TestPyPI" 40 | NEW_VERSION=$(python3 .github/workflows/build_utils/test_version.py $PACKAGE_NAME $VERSION) 41 | echo "Version to be used for TestPyPI release: $NEW_VERSION" 42 | echo "version=$NEW_VERSION" >> "$GITHUB_OUTPUT" 43 | - name: Update version in pyproject.toml 44 | run: sed -i '/#replace_package_version_marker/{n;s/version="[^"]*"/version="${{ steps.check_version.outputs.version }}"/;}' pyproject.toml 45 | - name: Update package name in pyproject.toml 46 | run: sed -i '/#replace_package_name_marker/{n;s/name="[^"]*"/name="dspy-ai-test"/;}' pyproject.toml 47 | - name: Build a binary wheel 48 | run: python3 -m build 49 | # Test the locally built wheel 50 | - name: Create test environment 51 | run: python -m venv test_before_testpypi 52 | - name: Test package installation and functionality 53 | run: | 54 | source test_before_testpypi/bin/activate 55 | # Install the locally built wheel and testing dependencies 56 | pip install dist/*.whl pytest pytest-asyncio 57 | pytest tests/metadata/test_metadata.py tests/predict 58 | deactivate 59 | # Publish to test-PyPI 60 | - name: Publish distribution 📦 to test-PyPI 61 | uses: pypa/gh-action-pypi-publish@release/v1 # This requires a trusted publisher to be setup in pypi/testpypi 62 | with: 63 | repository-url: https://test.pypi.org/legacy/ 64 | 65 | # TODO: Add tests 
using dspy-ai-test 66 | 67 | build-and-publish-pypi: 68 | needs: [extract-tag, build-and-publish-test-pypi] 69 | # Only publish to PyPI if the repository owner is stanfordnlp 70 | if: github.repository_owner == 'stanfordnlp' 71 | runs-on: ubuntu-latest 72 | environment: 73 | name: pypi 74 | permissions: 75 | id-token: write # IMPORTANT: mandatory for trusted publishing 76 | contents: write 77 | steps: 78 | - uses: actions/checkout@v4 79 | - name: Set up Python 3.11 80 | uses: actions/setup-python@v3 81 | with: 82 | python-version: "3.11" 83 | - name: Install dependencies 84 | run: python3 -m pip install --upgrade setuptools wheel twine build 85 | - name: Update version in pyproject.toml 86 | run: sed -i '/#replace_package_version_marker/{n;s/version *= *"[^"]*"/version="${{ needs.extract-tag.outputs.version }}"/;}' pyproject.toml 87 | - name: Update version in __metadata__.py 88 | run: sed -i '/#replace_package_version_marker/{n;s/__version__ *= *"[^"]*"/__version__="${{ needs.extract-tag.outputs.version }}"/;}' ./dspy/__metadata__.py 89 | # Publish to dspy 90 | - name: Update package name in pyproject.toml 91 | run: sed -i '/#replace_package_name_marker/{n;s/name *= *"[^"]*"/name="dspy"/;}' pyproject.toml 92 | - name: Update package name in metadata.py 93 | run: sed -i '/#replace_package_name_marker/{n;s/__name__ *= *"[^"]*"/__name__="dspy"/;}' ./dspy/__metadata__.py 94 | - name: Build a binary wheel 95 | run: python3 -m build 96 | # Test the locally built wheel before publishing to pypi 97 | - name: Create test environment 98 | run: python -m venv test_before_pypi 99 | - name: Test package installation and functionality 100 | run: | 101 | source test_before_pypi/bin/activate 102 | # Install the locally built wheel and testing dependencies 103 | pip install dist/*.whl pytest pytest-asyncio 104 | pytest tests/metadata/test_metadata.py tests/predict 105 | deactivate 106 | rm -r test_before_pypi 107 | - name: Publish distribution 📦 to PyPI (dspy) 108 | uses: pypa/gh-action-pypi-publish@release/v1 109 | with: 110 | attestations: false 111 | # Publish to dspy-ai 112 | - name: Update package name in pyproject.toml 113 | run: sed -i '/#replace_package_name_marker/{n;s/name *= *"[^"]*"/name="dspy-ai"/;}' .github/.internal_dspyai/pyproject.toml 114 | - name: Update version for dspy-ai release 115 | run: sed -i '/#replace_package_version_marker/{n;s/version *= *"[^"]*"/version="${{ needs.extract-tag.outputs.version }}"/;}' .github/.internal_dspyai/pyproject.toml 116 | - name: Update dspy dependency for dspy-ai release 117 | run: | 118 | sed -i '/#replace_dspy_version_marker/{n;s/dspy>=[^"]*/dspy>=${{ needs.extract-tag.outputs.version }}/;}' .github/.internal_dspyai/pyproject.toml 119 | - name: Build a binary wheel (dspy-ai) 120 | run: python3 -m build .github/.internal_dspyai/ 121 | - name: Publish distribution 📦 to PyPI (dspy-ai) 122 | uses: pypa/gh-action-pypi-publish@release/v1 123 | with: 124 | attestations: false 125 | packages-dir: .github/.internal_dspyai/dist/ 126 | - uses: stefanzweifel/git-auto-commit-action@v5 # auto commit changes to main 127 | with: 128 | commit_message: Update versions 129 | create_branch: true 130 | branch: release-${{ needs.extract-tag.outputs.version }} 131 | - name: Checkout main branch 132 | run: | 133 | git fetch origin 134 | git checkout main 135 | - name: Configure git user 136 | run: | 137 | git config --global user.email "[email protected]" 138 | git config --global user.name "Github Actions" 139 | - name: Merge release branch into main 140 | run: | 141 | 
git merge --no-ff release-${{ needs.extract-tag.outputs.version }} 142 | - name: Push changes to main 143 | run: | 144 | git push origin main 145 | ``` -------------------------------------------------------------------------------- /dspy/teleprompt/infer_rules.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import random 3 | 4 | import numpy as np 5 | 6 | import dspy 7 | from dspy.evaluate.evaluate import Evaluate 8 | from dspy.teleprompt import BootstrapFewShot 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class InferRules(BootstrapFewShot): 14 | def __init__(self, num_candidates=10, num_rules=10, num_threads=None, teacher_settings=None, **kwargs): 15 | super().__init__(teacher_settings=teacher_settings, **kwargs) 16 | 17 | self.num_candidates = num_candidates 18 | self.num_rules = num_rules 19 | self.num_threads = num_threads 20 | self.rules_induction_program = RulesInductionProgram(num_rules, teacher_settings=teacher_settings) 21 | self.metric = kwargs.get("metric") 22 | self.max_errors = kwargs.get("max_errors") 23 | 24 | def compile(self, student, *, teacher=None, trainset, valset=None): 25 | if valset is None: 26 | train_size = int(0.5 * len(trainset)) 27 | trainset, valset = trainset[:train_size], trainset[train_size:] 28 | 29 | super().compile(student, teacher=teacher, trainset=trainset) 30 | 31 | original_program = self.student.deepcopy() 32 | all_predictors = [p for p in original_program.predictors() if hasattr(p, "signature")] 33 | instructions_list = [p.signature.instructions for p in all_predictors] 34 | 35 | best_score = -np.inf 36 | best_program = None 37 | 38 | for candidate_idx in range(self.num_candidates): 39 | candidate_program = original_program.deepcopy() 40 | candidate_predictors = [p for p in candidate_program.predictors() if hasattr(p, "signature")] 41 | 42 | for i, predictor in enumerate(candidate_predictors): 43 | predictor.signature.instructions = instructions_list[i] 44 | 45 | for i, predictor in enumerate(candidate_predictors): 46 | rules = self.induce_natural_language_rules(predictor, trainset) 47 | predictor.signature.instructions = instructions_list[i] 48 | self.update_program_instructions(predictor, rules) 49 | 50 | score = self.evaluate_program(candidate_program, valset) 51 | 52 | if score > best_score: 53 | best_score = score 54 | best_program = candidate_program 55 | 56 | logger.info(f"Evaluated Candidate {candidate_idx + 1} with score {score}. Current best score: {best_score}") 57 | 58 | logger.info(f"Final best score: {best_score}") 59 | 60 | return best_program 61 | 62 | def induce_natural_language_rules(self, predictor, trainset): 63 | demos = self.get_predictor_demos(trainset, predictor) 64 | signature = predictor.signature 65 | while True: 66 | examples_text = self.format_examples(demos, signature) 67 | try: 68 | return self.rules_induction_program(examples_text) 69 | except Exception as e: 70 | assert ( 71 | isinstance(e, ValueError) 72 | or e.__class__.__name__ == "BadRequestError" 73 | or "ContextWindowExceededError" in str(e) 74 | ) 75 | if len(demos) > 1: 76 | demos = demos[:-1] 77 | else: 78 | raise RuntimeError( 79 | "Failed to generate natural language rules since a single example couldn't fit in the model's " 80 | "context window." 
81 | ) from e 82 | 83 | def update_program_instructions(self, predictor, natural_language_rules): 84 | predictor.signature.instructions = ( 85 | f"{predictor.signature.instructions}\n\n" 86 | f"Please adhere to the following rules when making your prediction:\n{natural_language_rules}" 87 | ) 88 | 89 | def format_examples(self, demos, signature): 90 | examples_text = "" 91 | for demo in demos: 92 | input_fields = {k: v for k, v in demo.items() if k in signature.input_fields} 93 | output_fields = {k: v for k, v in demo.items() if k in signature.output_fields} 94 | input_text = "\n".join(f"{k}: {v}" for k, v in input_fields.items()) 95 | output_text = "\n".join(f"{k}: {v}" for k, v in output_fields.items()) 96 | examples_text += f"Input Fields:\n{input_text}\n\n=========\nOutput Fields:\n{output_text}\n\n" 97 | return examples_text 98 | 99 | def get_predictor_demos(self, trainset, predictor): 100 | # TODO: Consider how this handled "incomplete" demos. 101 | signature = predictor.signature 102 | return [ 103 | { 104 | key: value 105 | for key, value in example.items() 106 | if key in signature.input_fields or key in signature.output_fields 107 | } 108 | for example in trainset 109 | ] 110 | 111 | def evaluate_program(self, program, dataset): 112 | effective_max_errors = ( 113 | self.max_errors if self.max_errors is not None else dspy.settings.max_errors 114 | ) 115 | evaluate = Evaluate( 116 | devset=dataset, 117 | metric=self.metric, 118 | num_threads=self.num_threads, 119 | max_errors=effective_max_errors, 120 | display_table=False, 121 | display_progress=True, 122 | ) 123 | score = evaluate(program, metric=self.metric).score 124 | return score 125 | 126 | 127 | class RulesInductionProgram(dspy.Module): 128 | def __init__(self, num_rules, teacher_settings=None): 129 | super().__init__() 130 | 131 | class CustomRulesInduction(dspy.Signature): 132 | __doc__ = ( 133 | f"Given a set of examples, extract a list of {num_rules} concise and non-redundant natural language " 134 | "rules that provide clear guidance for performing the task. All rules should be actionable for a " 135 | "well-specified scope of examples of this general kind of task." 136 | ) 137 | examples_text = dspy.InputField(desc="Text containing examples") 138 | natural_language_rules = dspy.OutputField(desc="Induced natural language rules") 139 | 140 | self.rules_induction = dspy.ChainOfThought(CustomRulesInduction) 141 | self.teacher_settings = teacher_settings or {} 142 | self.rng = random.Random(0) 143 | 144 | def forward(self, examples_text): 145 | with dspy.settings.context(**self.teacher_settings): 146 | # Generate rules with a fresh rollout and non-zero temperature. 147 | lm = dspy.settings.lm.copy( 148 | rollout_id=self.rng.randint(0, 10**9), temperature=1.0 149 | ) 150 | with dspy.settings.context(lm=lm): 151 | rules = self.rules_induction(examples_text=examples_text).natural_language_rules 152 | 153 | return rules.strip() 154 | ``` -------------------------------------------------------------------------------- /tests/primitives/test_python_interpreter.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import random 3 | import shutil 4 | 5 | import pytest 6 | 7 | from dspy.primitives.python_interpreter import InterpreterError, PythonInterpreter 8 | 9 | # This test suite requires deno to be installed. 
Please install deno following https://docs.deno.com/runtime/getting_started/installation/ 10 | if shutil.which("deno") is None: 11 | pytest.skip(reason="Deno is not installed or not in PATH", allow_module_level=True) 12 | 13 | 14 | def test_execute_simple_code(): 15 | with PythonInterpreter() as interpreter: 16 | code = "print('Hello, World!')" 17 | result = interpreter.execute(code) 18 | assert result == "Hello, World!\n", "Simple print statement should return 'Hello World!\n'" 19 | 20 | 21 | def test_import(): 22 | with PythonInterpreter() as interpreter: 23 | code = "import math\nresult = math.sqrt(4)\nresult" 24 | result = interpreter.execute(code) 25 | assert result == 2, "Should be able to import and use math.sqrt" 26 | 27 | 28 | def test_user_variable_definitions(): 29 | with PythonInterpreter() as interpreter: 30 | code = "result = number + 1\nresult" 31 | result = interpreter.execute(code, variables={"number": 4}) 32 | assert result == 5, "User variable assignment should work" 33 | 34 | 35 | def test_failure_syntax_error(): 36 | with PythonInterpreter() as interpreter: 37 | code = "+++" 38 | with pytest.raises(SyntaxError, match="Invalid Python syntax"): 39 | interpreter.execute(code) 40 | 41 | 42 | def test_failure_zero_division(): 43 | with PythonInterpreter() as interpreter: 44 | code = "1+0/0" 45 | with pytest.raises(InterpreterError, match="ZeroDivisionError"): 46 | interpreter.execute(code) 47 | 48 | 49 | def test_exception_args(): 50 | with PythonInterpreter() as interpreter: 51 | token = random.randint(1, 10**9) 52 | code = f"raise ValueError({token})" 53 | with pytest.raises(InterpreterError, match=rf"ValueError: \[{token}\]"): 54 | interpreter.execute(code) 55 | 56 | 57 | def test_final_answer_trick(): 58 | with PythonInterpreter() as interpreter: 59 | token = random.randint(1, 10**9) 60 | code = f"final_answer('The result is', {token})" 61 | result = interpreter(code) 62 | 63 | # They should maintain the same order 64 | assert result == ["The result is", token], "The returned results are differ, `final_answer` trick doesn't work" 65 | 66 | def test_enable_env_vars_flag(): 67 | os.environ["FOO_TEST_ENV"] = "test_value" 68 | 69 | with PythonInterpreter(enable_env_vars=None) as interpreter: 70 | code = "import os\nresult = os.getenv('FOO_TEST_ENV')\nresult" 71 | result = interpreter.execute(code) 72 | assert result == "", "Environment variables should be inaccessible without allow-env" 73 | 74 | with PythonInterpreter(enable_env_vars=["FOO_TEST_ENV"]) as interpreter: 75 | code = "import os\nresult = os.getenv('FOO_TEST_ENV')\nresult" 76 | result = interpreter.execute(code) 77 | assert result == "test_value", "Environment variables should be accessible with allow-env" 78 | 79 | 80 | 81 | def test_read_file_access_control(tmp_path): 82 | testfile_path = tmp_path / "test_temp_file.txt" 83 | virtual_path = f"/sandbox/{testfile_path.name}" 84 | with open(testfile_path, "w") as f: 85 | f.write("test content") 86 | 87 | with PythonInterpreter(enable_read_paths=[str(testfile_path)]) as interpreter: 88 | code = ( 89 | f"with open({virtual_path!r}, 'r') as f:\n" 90 | f" data = f.read()\n" 91 | f"data" 92 | ) 93 | result = interpreter.execute(code) 94 | assert result == "test content", "Test file should be accessible with enable_read_paths and specified file" 95 | 96 | with PythonInterpreter(enable_read_paths=None) as interpreter: 97 | code = ( 98 | f"try:\n" 99 | f" with open({virtual_path!r}, 'r') as f:\n" 100 | f" data = f.read()\n" 101 | f"except Exception as e:\n" 102 | f" data 
= str(e)\n" 103 | f"data" 104 | ) 105 | result = interpreter.execute(code) 106 | assert ("PermissionDenied" in result or "denied" in result.lower() or "no such file" in result.lower()), "Test file should not be accessible without enable_read_paths" 107 | 108 | def test_enable_write_flag(tmp_path): 109 | testfile_path = tmp_path / "test_temp_output.txt" 110 | virtual_path = f"/sandbox/{testfile_path.name}" 111 | 112 | with PythonInterpreter(enable_write_paths=None) as interpreter: 113 | code = ( 114 | f"try:\n" 115 | f" with open({virtual_path!r}, 'w') as f:\n" 116 | f" f.write('blocked')\n" 117 | f" result = 'wrote'\n" 118 | f"except Exception as e:\n" 119 | f" result = str(e)\n" 120 | f"result" 121 | ) 122 | result = interpreter.execute(code) 123 | assert ("PermissionDenied" in result or "denied" in result.lower() or "no such file" in result.lower()), "Test file should not be writable without enable_write_paths" 124 | 125 | with PythonInterpreter(enable_write_paths=[str(testfile_path)]) as interpreter: 126 | code = ( 127 | f"with open({virtual_path!r}, 'w') as f:\n" 128 | f" f.write('allowed')\n" 129 | f"'ok'" 130 | ) 131 | result = interpreter.execute(code) 132 | assert result == "ok", "Test file should be writable with enable_write_paths" 133 | assert testfile_path.exists() 134 | with open(testfile_path) as f: 135 | assert f.read() == "allowed", "Test file outputs should match content written during execution" 136 | 137 | with open(testfile_path, "w") as f: 138 | f.write("original_content") 139 | with PythonInterpreter(enable_write_paths=[str(testfile_path)], sync_files=False) as interpreter: 140 | code = ( 141 | f"with open({virtual_path!r}, 'w') as f:\n" 142 | f" f.write('should_not_sync')\n" 143 | f"'done_no_sync'" 144 | ) 145 | result = interpreter.execute(code) 146 | assert result == "done_no_sync" 147 | with open(testfile_path) as f: 148 | assert f.read() == "original_content", "File should not be changed when sync_files is False" 149 | 150 | 151 | 152 | def test_enable_net_flag(): 153 | test_url = "https://example.com" 154 | 155 | with PythonInterpreter(enable_network_access=None) as interpreter: 156 | code = ( 157 | "import js\n" 158 | f"resp = await js.fetch({test_url!r})\n" 159 | "resp.status" 160 | ) 161 | with pytest.raises(InterpreterError, match="PythonError"): 162 | interpreter.execute(code) 163 | 164 | with PythonInterpreter(enable_network_access=["example.com"]) as interpreter: 165 | code = ( 166 | "import js\n" 167 | f"resp = await js.fetch({test_url!r})\n" 168 | "resp.status" 169 | ) 170 | result = interpreter.execute(code) 171 | assert int(result) == 200, "Network access is permitted with enable_network_access" 172 | ``` -------------------------------------------------------------------------------- /dspy/adapters/types/base_type.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import re 3 | from typing import Any, Optional, get_args, get_origin 4 | 5 | import json_repair 6 | import pydantic 7 | from litellm import ModelResponseStream 8 | 9 | CUSTOM_TYPE_START_IDENTIFIER = "<<CUSTOM-TYPE-START-IDENTIFIER>>" 10 | CUSTOM_TYPE_END_IDENTIFIER = "<<CUSTOM-TYPE-END-IDENTIFIER>>" 11 | 12 | 13 | class Type(pydantic.BaseModel): 14 | """Base class to support creating custom types for DSPy signatures. 15 | 16 | This is the parent class of DSPy custom types, e.g, dspy.Image. 
Subclasses must implement the `format` method to 17 | return a list of dictionaries (same as the Array of content parts in the OpenAI API user message's content field). 18 | 19 | Example: 20 | 21 | ```python 22 | class Image(Type): 23 | url: str 24 | 25 | def format(self) -> list[dict[str, Any]]: 26 | return [{"type": "image_url", "image_url": {"url": self.url}}] 27 | ``` 28 | """ 29 | 30 | def format(self) -> list[dict[str, Any]] | str: 31 | raise NotImplementedError 32 | 33 | @classmethod 34 | def description(cls) -> str: 35 | """Description of the custom type""" 36 | return "" 37 | 38 | @classmethod 39 | def extract_custom_type_from_annotation(cls, annotation): 40 | """Extract all custom types from the annotation. 41 | 42 | This is used to extract all custom types from the annotation of a field, while the annotation can 43 | have arbitrary level of nesting. For example, we detect `Tool` is in `list[dict[str, Tool]]`. 44 | """ 45 | # Direct match. Nested type like `list[dict[str, Event]]` passes `isinstance(annotation, type)` in python 3.10 46 | # while fails in python 3.11. To accommodate users using python 3.10, we need to capture the error and ignore it. 47 | try: 48 | if isinstance(annotation, type) and issubclass(annotation, cls): 49 | return [annotation] 50 | except TypeError: 51 | pass 52 | 53 | origin = get_origin(annotation) 54 | if origin is None: 55 | return [] 56 | 57 | result = [] 58 | # Recurse into all type args 59 | for arg in get_args(annotation): 60 | result.extend(cls.extract_custom_type_from_annotation(arg)) 61 | 62 | return result 63 | 64 | @pydantic.model_serializer() 65 | def serialize_model(self): 66 | formatted = self.format() 67 | if isinstance(formatted, list): 68 | return ( 69 | f"{CUSTOM_TYPE_START_IDENTIFIER}{json.dumps(formatted, ensure_ascii=False)}{CUSTOM_TYPE_END_IDENTIFIER}" 70 | ) 71 | return formatted 72 | 73 | @classmethod 74 | def is_streamable(cls) -> bool: 75 | """Whether the custom type is streamable.""" 76 | return False 77 | 78 | @classmethod 79 | def parse_stream_chunk(cls, chunk: ModelResponseStream) -> Optional["Type"]: 80 | """ 81 | Parse a stream chunk into the custom type. 82 | 83 | Args: 84 | chunk: A stream chunk. 85 | 86 | Returns: 87 | A custom type object or None if the chunk is not for this custom type. 88 | """ 89 | return None 90 | 91 | 92 | @classmethod 93 | def parse_lm_response(cls, response: str | dict[str, Any]) -> Optional["Type"]: 94 | """Parse a LM response into the custom type. 95 | 96 | Args: 97 | response: A LM response. 98 | 99 | Returns: 100 | A custom type object. 101 | """ 102 | return None 103 | 104 | def split_message_content_for_custom_types(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: 105 | """Split user message content into a list of content blocks. 106 | 107 | This method splits each user message's content in the `messages` list to be a list of content block, so that 108 | the custom types like `dspy.Image` can be properly formatted for better quality. For example, the split content 109 | may look like below if the user message has a `dspy.Image` object: 110 | 111 | ``` 112 | [ 113 | {"type": "text", "text": "{text_before_image}"}, 114 | {"type": "image_url", "image_url": {"url": "{image_url}"}}, 115 | {"type": "text", "text": "{text_after_image}"}, 116 | ] 117 | ``` 118 | 119 | This is implemented by finding the `<<CUSTOM-TYPE-START-IDENTIFIER>>` and `<<CUSTOM-TYPE-END-IDENTIFIER>>` 120 | in the user message content and splitting the content around them. 
The `<<CUSTOM-TYPE-START-IDENTIFIER>>` 121 | and `<<CUSTOM-TYPE-END-IDENTIFIER>>` are the reserved identifiers for the custom types as in `dspy.Type`. 122 | 123 | Args: 124 | messages: a list of messages sent to the LM. The format is the same as [OpenAI API's messages 125 | format](https://platform.openai.com/docs/guides/chat-completions/response-format). 126 | 127 | Returns: 128 | A list of messages with the content split into a list of content blocks around custom types content. 129 | """ 130 | for message in messages: 131 | if message["role"] != "user": 132 | # Custom type messages are only in user messages 133 | continue 134 | 135 | pattern = rf"{CUSTOM_TYPE_START_IDENTIFIER}(.*?){CUSTOM_TYPE_END_IDENTIFIER}" 136 | result = [] 137 | last_end = 0 138 | # DSPy adapter always formats user input into a string content before custom type splitting 139 | content: str = message["content"] 140 | 141 | for match in re.finditer(pattern, content, re.DOTALL): 142 | start, end = match.span() 143 | 144 | # Add text before the current block 145 | if start > last_end: 146 | result.append({"type": "text", "text": content[last_end:start]}) 147 | 148 | # Parse the JSON inside the block 149 | custom_type_content = match.group(1).strip() 150 | parsed = None 151 | 152 | for parse_fn in [json.loads, _parse_doubly_quoted_json, json_repair.loads]: 153 | try: 154 | parsed = parse_fn(custom_type_content) 155 | break 156 | except json.JSONDecodeError: 157 | continue 158 | 159 | if parsed: 160 | for custom_type_content in parsed: 161 | result.append(custom_type_content) 162 | else: 163 | # fallback to raw string if it's not valid JSON 164 | result.append({"type": "text", "text": custom_type_content}) 165 | 166 | last_end = end 167 | 168 | if last_end == 0: 169 | # No custom type found, return the original message 170 | continue 171 | 172 | # Add any remaining text after the last match 173 | if last_end < len(content): 174 | result.append({"type": "text", "text": content[last_end:]}) 175 | 176 | message["content"] = result 177 | 178 | return messages 179 | 180 | 181 | def _parse_doubly_quoted_json(json_str: str) -> Any: 182 | """ 183 | Parse a doubly quoted JSON string into a Python dict. 184 | `dspy.Type` can be json-encoded twice if included in either list or dict, e.g., `list[dspy.experimental.Document]` 185 | """ 186 | return json.loads(json.loads(f'"{json_str}"')) 187 | ``` -------------------------------------------------------------------------------- /docs/docs/tutorials/saving/index.md: -------------------------------------------------------------------------------- ```markdown 1 | # Tutorial: Saving and Loading your DSPy program 2 | 3 | This guide demonstrates how to save and load your DSPy program. At a high level, there are two ways to save your DSPy program: 4 | 5 | 1. Save the state of the program only, similar to weights-only saving in PyTorch. 6 | 2. Save the whole program, including both the architecture and the state, which is supported by `dspy>=2.6.0`. 7 | 8 | ## State-only Saving 9 | 10 | State represents the DSPy program's internal state, including the signature, demos (few-shot examples), and other information like 11 | the `lm` to use for each `dspy.Predict` in the program. It also includes configurable attributes of other DSPy modules like 12 | `k` for `dspy.retrievers.Retriever`. To save the state of a program, use the `save` method and set `save_program=False`. You can 13 | choose to save the state to a JSON file or a pickle file. 
We recommend saving the state to a JSON file because it is safer and readable. 14 | But sometimes your program contains non-serializable objects like `dspy.Image` or `datetime.datetime`, in which case you should save 15 | the state to a pickle file. 16 | 17 | Let's say we have compiled a program with some data, and we want to save the program for future usage: 18 | 19 | ```python 20 | import dspy 21 | from dspy.datasets.gsm8k import GSM8K, gsm8k_metric 22 | 23 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini")) 24 | 25 | gsm8k = GSM8K() 26 | gsm8k_trainset = gsm8k.train[:10] 27 | dspy_program = dspy.ChainOfThought("question -> answer") 28 | 29 | optimizer = dspy.BootstrapFewShot(metric=gsm8k_metric, max_bootstrapped_demos=4, max_labeled_demos=4, max_rounds=5) 30 | compiled_dspy_program = optimizer.compile(dspy_program, trainset=gsm8k_trainset) 31 | ``` 32 | 33 | To save the state of your program to json file: 34 | 35 | ```python 36 | compiled_dspy_program.save("./dspy_program/program.json", save_program=False) 37 | ``` 38 | 39 | To save the state of your program to a pickle file: 40 | 41 | ```python 42 | compiled_dspy_program.save("./dspy_program/program.pkl", save_program=False) 43 | ``` 44 | 45 | To load your saved state, you need to **recreate the same program**, then load the state using the `load` method. 46 | 47 | ```python 48 | loaded_dspy_program = dspy.ChainOfThought("question -> answer") # Recreate the same program. 49 | loaded_dspy_program.load("./dspy_program/program.json") 50 | 51 | assert len(compiled_dspy_program.demos) == len(loaded_dspy_program.demos) 52 | for original_demo, loaded_demo in zip(compiled_dspy_program.demos, loaded_dspy_program.demos): 53 | # Loaded demo is a dict, while the original demo is a dspy.Example. 54 | assert original_demo.toDict() == loaded_demo 55 | assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature) 56 | ``` 57 | 58 | Or load the state from a pickle file: 59 | 60 | ```python 61 | loaded_dspy_program = dspy.ChainOfThought("question -> answer") # Recreate the same program. 62 | loaded_dspy_program.load("./dspy_program/program.pkl") 63 | 64 | assert len(compiled_dspy_program.demos) == len(loaded_dspy_program.demos) 65 | for original_demo, loaded_demo in zip(compiled_dspy_program.demos, loaded_dspy_program.demos): 66 | # Loaded demo is a dict, while the original demo is a dspy.Example. 67 | assert original_demo.toDict() == loaded_demo 68 | assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature) 69 | ``` 70 | 71 | ## Whole Program Saving 72 | 73 | Starting from `dspy>=2.6.0`, DSPy supports saving the whole program, including the architecture and the state. This feature 74 | is powered by `cloudpickle`, which is a library for serializing and deserializing Python objects. 75 | 76 | To save the whole program, use the `save` method and set `save_program=True`, and specify a directory path to save the program 77 | instead of a file name. We require a directory path because we also save some metadata, e.g., the dependency versions along 78 | with the program itself. 
79 | 80 | ```python 81 | compiled_dspy_program.save("./dspy_program/", save_program=True) 82 | ``` 83 | 84 | To load the saved program, directly use `dspy.load` method: 85 | 86 | ```python 87 | loaded_dspy_program = dspy.load("./dspy_program/") 88 | 89 | assert len(compiled_dspy_program.demos) == len(loaded_dspy_program.demos) 90 | for original_demo, loaded_demo in zip(compiled_dspy_program.demos, loaded_dspy_program.demos): 91 | # Loaded demo is a dict, while the original demo is a dspy.Example. 92 | assert original_demo.toDict() == loaded_demo 93 | assert str(compiled_dspy_program.signature) == str(loaded_dspy_program.signature) 94 | ``` 95 | 96 | With whole program saving, you don't need to recreate the program, but can directly load the architecture along with the state. 97 | You can pick the suitable saving approach based on your needs. 98 | 99 | ### Serializing Imported Modules 100 | 101 | When saving a program with `save_program=True`, you might need to include custom modules that your program depends on. This is 102 | necessary if your program depends on these modules, but at loading time these modules are not imported before calling `dspy.load`. 103 | 104 | You can specify which custom modules should be serialized with your program by passing them to the `modules_to_serialize` 105 | parameter when calling `save`. This ensures that any dependencies your program relies on are included during serialization and 106 | available when loading the program later. 107 | 108 | Under the hood this uses cloudpickle's `cloudpickle.register_pickle_by_value` function to register a module as picklable by value. 109 | When a module is registered this way, cloudpickle will serialize the module by value rather than by reference, ensuring that the 110 | module contents are preserved with the saved program. 111 | 112 | For example, if your program uses custom modules: 113 | 114 | ```python 115 | import dspy 116 | import my_custom_module 117 | 118 | compiled_dspy_program = dspy.ChainOfThought(my_custom_module.custom_signature) 119 | 120 | # Save the program with the custom module 121 | compiled_dspy_program.save( 122 | "./dspy_program/", 123 | save_program=True, 124 | modules_to_serialize=[my_custom_module] 125 | ) 126 | ``` 127 | 128 | This ensures that the required modules are properly serialized and available when loading the program later. Any number of 129 | modules can be passed to `modules_to_serialize`. If you don't specify `modules_to_serialize`, no additional modules will be 130 | registered for serialization. 131 | 132 | ## Backward Compatibility 133 | 134 | As of `dspy<3.0.0`, we don't guarantee the backward compatibility of the saved program. For example, if you save the program with `dspy==2.5.35`, 135 | at loading time please make sure to use the same version of DSPy to load the program, otherwise the program may not work as expected. Chances 136 | are that loading a saved file in a different version of DSPy will not raise an error, but the performance could be different from when 137 | the program was saved. 138 | 139 | Starting from `dspy>=3.0.0`, we will guarantee the backward compatibility of the saved program in major releases, i.e., programs saved in `dspy==3.0.0` 140 | should be loadable in `dspy==3.7.10`. 
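If you need to guard against version drift, one lightweight option is to record the DSPy version next to the saved state and compare it at load time. The sketch below is only an illustration; it assumes `dspy.__version__` is exposed by your install, and the sidecar file name is arbitrary:

```python
import json
import warnings

import dspy

# At save time: store the state plus the DSPy version that produced it.
compiled_dspy_program.save("./dspy_program/program.json", save_program=False)
with open("./dspy_program/version.json", "w") as f:
    json.dump({"dspy_version": dspy.__version__}, f)

# At load time: warn when the installed DSPy version differs from the recorded one.
with open("./dspy_program/version.json") as f:
    saved_version = json.load(f)["dspy_version"]
if saved_version != dspy.__version__:
    warnings.warn(f"State was saved with dspy=={saved_version}, but dspy=={dspy.__version__} is installed.")

loaded_dspy_program = dspy.ChainOfThought("question -> answer")  # Recreate the same program.
loaded_dspy_program.load("./dspy_program/program.json")
```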
141 | ``` -------------------------------------------------------------------------------- /dspy/dsp/utils/settings.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import contextvars 3 | import copy 4 | import threading 5 | from contextlib import contextmanager 6 | 7 | from dspy.dsp.utils.utils import dotdict 8 | 9 | DEFAULT_CONFIG = dotdict( 10 | lm=None, 11 | adapter=None, 12 | rm=None, 13 | branch_idx=0, 14 | trace=[], 15 | callbacks=[], 16 | async_max_workers=8, 17 | send_stream=None, 18 | disable_history=False, 19 | track_usage=False, 20 | usage_tracker=None, 21 | caller_predict=None, 22 | caller_modules=None, 23 | stream_listeners=[], 24 | provide_traceback=False, # Whether to include traceback information in error logs. 25 | num_threads=8, # Number of threads to use for parallel processing. 26 | max_errors=10, # Maximum errors before halting operations. 27 | # If true, async tools can be called in sync mode by getting converted to sync. 28 | allow_tool_async_sync_conversion=False, 29 | max_history_size=10000, 30 | max_trace_size=10000, 31 | ) 32 | 33 | # Global base configuration and owner tracking 34 | main_thread_config = copy.deepcopy(DEFAULT_CONFIG) 35 | config_owner_thread_id = None 36 | config_owner_async_task = None 37 | 38 | # Global lock for settings configuration 39 | global_lock = threading.Lock() 40 | 41 | thread_local_overrides = contextvars.ContextVar("context_overrides", default=dotdict()) 42 | 43 | 44 | class Settings: 45 | """ 46 | A singleton class for DSPy configuration settings. 47 | Thread-safe global configuration. 48 | - 'configure' can be called by only one 'owner' thread (the first thread that calls it). 49 | - Other threads see the configured global values from 'main_thread_config'. 50 | - 'context' sets thread-local overrides. These overrides propagate to threads spawned 51 | inside that context block, when (and only when!) using a ParallelExecutor that copies overrides. 52 | 53 | 1. Only one unique thread (which can be any thread!) can call dspy.configure. 54 | 2. It affects a global state, visible to all. As a result, user threads work, but they shouldn't be 55 | mixed with concurrent changes to dspy.configure from the "main" thread. 56 | (TODO: In the future, add warnings: if there are near-in-time user-thread reads followed by .configure calls.) 57 | 3. Any thread can use dspy.context. It propagates to child threads created with DSPy primitives: Parallel, asyncify, etc. 
58 | """ 59 | 60 | _instance = None 61 | 62 | def __new__(cls): 63 | if cls._instance is None: 64 | cls._instance = super().__new__(cls) 65 | return cls._instance 66 | 67 | @property 68 | def lock(self): 69 | return global_lock 70 | 71 | def __getattr__(self, name): 72 | overrides = thread_local_overrides.get() 73 | if name in overrides: 74 | return overrides[name] 75 | elif name in main_thread_config: 76 | return main_thread_config[name] 77 | else: 78 | raise AttributeError(f"'Settings' object has no attribute '{name}'") 79 | 80 | def __setattr__(self, name, value): 81 | if name in ("_instance",): 82 | super().__setattr__(name, value) 83 | else: 84 | self.configure(**{name: value}) 85 | 86 | def __getitem__(self, key): 87 | return self.__getattr__(key) 88 | 89 | def __setitem__(self, key, value): 90 | self.__setattr__(key, value) 91 | 92 | def __contains__(self, key): 93 | overrides = thread_local_overrides.get() 94 | return key in overrides or key in main_thread_config 95 | 96 | def get(self, key, default=None): 97 | try: 98 | return self[key] 99 | except AttributeError: 100 | return default 101 | 102 | def copy(self): 103 | overrides = thread_local_overrides.get() 104 | return dotdict({**main_thread_config, **overrides}) 105 | 106 | @property 107 | def config(self): 108 | return self.copy() 109 | 110 | def _ensure_configure_allowed(self): 111 | global main_thread_config, config_owner_thread_id, config_owner_async_task 112 | current_thread_id = threading.get_ident() 113 | 114 | if config_owner_thread_id is None: 115 | # First `configure` call assigns the owner thread id. 116 | config_owner_thread_id = current_thread_id 117 | 118 | if config_owner_thread_id != current_thread_id: 119 | # Disallow a second `configure` calls from other threads. 120 | raise RuntimeError("dspy.settings can only be changed by the thread that initially configured it.") 121 | 122 | # Async task doesn't allow a second `configure` call, must use dspy.context(...) instead. 123 | is_async_task = False 124 | try: 125 | if asyncio.current_task() is not None: 126 | is_async_task = True 127 | except RuntimeError: 128 | # This exception (e.g., "no current task") means we are not in an async loop/task, 129 | # or asyncio module itself is not fully functional in this specific sub-thread context. 130 | is_async_task = False 131 | 132 | if not is_async_task: 133 | return 134 | 135 | if config_owner_async_task is None: 136 | # First `configure` call assigns the owner async task. 137 | config_owner_async_task = asyncio.current_task() 138 | return 139 | 140 | # We are in an async task. Now check for IPython and allow calling `configure` from IPython. 141 | in_ipython = False 142 | try: 143 | from IPython import get_ipython 144 | 145 | # get_ipython is a global injected by IPython environments. 146 | # We check its existence and type to be more robust. 147 | in_ipython = get_ipython() is not None 148 | except Exception: 149 | # If `IPython` is not installed or `get_ipython` failed, we are not in an IPython environment. 150 | in_ipython = False 151 | 152 | if not in_ipython and config_owner_async_task != asyncio.current_task(): 153 | raise RuntimeError( 154 | "dspy.settings.configure(...) can only be called from the same async task that called it first. Please " 155 | "use `dspy.context(...)` in other async tasks instead." 156 | ) 157 | 158 | def configure(self, **kwargs): 159 | # If no exception is raised, the `configure` call is allowed. 
160 | self._ensure_configure_allowed() 161 | 162 | # Update global config 163 | for k, v in kwargs.items(): 164 | main_thread_config[k] = v 165 | 166 | @contextmanager 167 | def context(self, **kwargs): 168 | """ 169 | Context manager for temporary configuration changes at the thread level. 170 | Does not affect global configuration. Changes only apply to the current thread. 171 | If threads are spawned inside this block using ParallelExecutor, they will inherit these overrides. 172 | """ 173 | 174 | original_overrides = thread_local_overrides.get().copy() 175 | new_overrides = dotdict({**main_thread_config, **original_overrides, **kwargs}) 176 | token = thread_local_overrides.set(new_overrides) 177 | 178 | try: 179 | yield 180 | finally: 181 | thread_local_overrides.reset(token) 182 | 183 | def __repr__(self): 184 | overrides = thread_local_overrides.get() 185 | combined_config = {**main_thread_config, **overrides} 186 | return repr(combined_config) 187 | 188 | 189 | settings = Settings() 190 | ``` -------------------------------------------------------------------------------- /dspy/teleprompt/random_search.py: -------------------------------------------------------------------------------- ```python 1 | import random 2 | 3 | import dspy 4 | from dspy.evaluate.evaluate import Evaluate 5 | from dspy.teleprompt.teleprompt import Teleprompter 6 | 7 | from .bootstrap import BootstrapFewShot 8 | from .vanilla import LabeledFewShot 9 | 10 | # TODO: Don't forget dealing with the raw demos. 11 | # TODO: Deal with the (pretty common) case of having a metric for filtering and a separate metric for eval. 12 | # The metric itself may tell though by the presence of trace. 13 | 14 | # TODO: This function should take a max_budget and max_teacher_budget. That's in the number of program calls. 15 | # In this case, max_student_budget is max_budget - max_teacher_budget. 16 | # For max_teacher_budget, this will just limit the total number of things we bootstrap. 17 | # This can end up implicitly defining the number of candidate programs (i.e., stop when runs out). Cap at 16. 18 | # For max_student_budget, this will be a more upfront calculation. 19 | # Right now, it can also just induce the number of candidate programs. Later, it could be used more interestingly 20 | # for selective early stopping. 21 | # Progressive elimination sounds about right: after 50 examples, drop bottom third, after 100, another third, etc. 22 | # until only 3--5 are left for the end. Could also be systematic and add (earlier) stopping based on error bounds. 23 | # In general, though, the early filtering is just saying: either there are some really bad ones, or some really really 24 | # good ones, or most things are pretty close. In all of these cases, dropping the bottom third is not going to hurt. 
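# Editor's illustrative sketch (hypothetical, not part of this module): one shape the
# progressive-elimination idea described above could take. It assumes a callable
# `evaluate_on(program, n)` that scores a candidate program on the first `n` devset examples.
def _progressively_eliminate(candidates, evaluate_on, checkpoints=(50, 100, 200), keep_min=4):
    survivors = list(candidates)
    for n in checkpoints:
        if len(survivors) <= keep_min:
            break
        # Rank by score on a growing prefix of the devset, then drop the bottom third.
        ranked = sorted(survivors, key=lambda prog: evaluate_on(prog, n), reverse=True)
        survivors = ranked[: max(keep_min, (2 * len(ranked)) // 3)]
    return survivors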
25 | 26 | 27 | class BootstrapFewShotWithRandomSearch(Teleprompter): 28 | def __init__( 29 | self, 30 | metric, 31 | teacher_settings=None, 32 | max_bootstrapped_demos=4, 33 | max_labeled_demos=16, 34 | max_rounds=1, 35 | num_candidate_programs=16, 36 | num_threads=None, 37 | max_errors=None, 38 | stop_at_score=None, 39 | metric_threshold=None, 40 | ): 41 | self.metric = metric 42 | self.teacher_settings = teacher_settings or {} 43 | self.max_rounds = max_rounds 44 | 45 | self.num_threads = num_threads 46 | self.stop_at_score = stop_at_score 47 | self.metric_threshold = metric_threshold 48 | self.min_num_samples = 1 49 | self.max_num_samples = max_bootstrapped_demos 50 | self.max_errors = max_errors 51 | self.num_candidate_sets = num_candidate_programs 52 | self.max_labeled_demos = max_labeled_demos 53 | 54 | print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") 55 | print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") 56 | 57 | def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): 58 | self.trainset = trainset 59 | self.valset = valset or trainset # TODO: FIXME: Note this choice. 60 | 61 | effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors 62 | 63 | scores = [] 64 | all_subscores = [] 65 | score_data = [] 66 | 67 | for seed in range(-3, self.num_candidate_sets): 68 | if (restrict is not None) and (seed not in restrict): 69 | continue 70 | 71 | trainset_copy = list(self.trainset) 72 | 73 | if seed == -3: 74 | # zero-shot 75 | program = student.reset_copy() 76 | 77 | elif seed == -2: 78 | # labels only 79 | teleprompter = LabeledFewShot(k=self.max_labeled_demos) 80 | program = teleprompter.compile(student, trainset=trainset_copy, sample=labeled_sample) 81 | 82 | elif seed == -1: 83 | # unshuffled few-shot 84 | optimizer = BootstrapFewShot( 85 | metric=self.metric, 86 | metric_threshold=self.metric_threshold, 87 | max_bootstrapped_demos=self.max_num_samples, 88 | max_labeled_demos=self.max_labeled_demos, 89 | teacher_settings=self.teacher_settings, 90 | max_rounds=self.max_rounds, 91 | max_errors=effective_max_errors, 92 | ) 93 | program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) 94 | 95 | else: 96 | assert seed >= 0, seed 97 | 98 | random.Random(seed).shuffle(trainset_copy) 99 | size = random.Random(seed).randint(self.min_num_samples, self.max_num_samples) 100 | 101 | optimizer = BootstrapFewShot( 102 | metric=self.metric, 103 | metric_threshold=self.metric_threshold, 104 | max_bootstrapped_demos=size, 105 | max_labeled_demos=self.max_labeled_demos, 106 | teacher_settings=self.teacher_settings, 107 | max_rounds=self.max_rounds, 108 | max_errors=effective_max_errors, 109 | ) 110 | 111 | program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) 112 | 113 | evaluate = Evaluate( 114 | devset=self.valset, 115 | metric=self.metric, 116 | num_threads=self.num_threads, 117 | max_errors=effective_max_errors, 118 | display_table=False, 119 | display_progress=True, 120 | ) 121 | 122 | result = evaluate(program) 123 | 124 | score, subscores = result.score, [output[2] for output in result.results] 125 | 126 | all_subscores.append(subscores) 127 | 128 | if len(scores) == 0 or score > max(scores): 129 | print("New best score:", score, "for seed", seed) 130 | best_program = program 131 | 132 | scores.append(score) 133 | print(f"Scores so far: {scores}") 134 | print(f"Best score so far: 
{max(scores)}") 135 | 136 | score_data.append({"score": score, "subscores": subscores, "seed": seed, "program": program}) 137 | 138 | if self.stop_at_score is not None and score >= self.stop_at_score: 139 | print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}") 140 | break 141 | 142 | # To best program, attach all program candidates in decreasing average score 143 | best_program.candidate_programs = score_data 144 | best_program.candidate_programs = sorted( 145 | best_program.candidate_programs, key=lambda x: x["score"], reverse=True 146 | ) 147 | 148 | print(f"{len(best_program.candidate_programs)} candidate programs found.") 149 | 150 | return best_program 151 | 152 | 153 | # sample between 4 and 10 examples from traces 154 | # TODO: FIXME: The max number of demos should be determined in part by the LM's tokenizer + max_length. 155 | # This does require executing the program, or at least the predictor. 156 | # # # # # # (Actually we can just combine the token counts of the traces, when formatted via signature/adapter). 157 | # Alternatively, we can keep track of the (zero-shot) number of tokens when we bootstrap. 158 | # As another option, we can just try a wide range and handle failures as penalties on the score. 159 | # The number "24" of traces to collect can also be affected. If we only need 3x10, some overlap is ok. 160 | # We can also consider having short_demos and long_demos. 161 | ``` -------------------------------------------------------------------------------- /tests/primitives/test_module.py: -------------------------------------------------------------------------------- ```python 1 | from pathlib import Path 2 | 3 | import dspy 4 | from dspy.primitives.module import Module, set_attribute_by_name # Adjust the import based on your file structure 5 | from dspy.utils import DummyLM 6 | 7 | 8 | class HopModule(dspy.Module): 9 | def __init__(self): 10 | super().__init__() 11 | self.predict1 = dspy.Predict("question -> query") 12 | self.predict2 = dspy.Predict("query -> answer") 13 | 14 | def forward(self, question): 15 | query = self.predict1(question=question).query 16 | return self.predict2(query=query) 17 | 18 | 19 | def test_module_initialization(): 20 | module = Module() 21 | assert module._compiled is False, "Module _compiled attribute should be False upon initialization" 22 | 23 | 24 | def test_named_predictors(): 25 | module = HopModule() 26 | named_preds = module.named_predictors() 27 | assert len(named_preds) == 2, "Should identify correct number of Predict instances" 28 | names, preds = zip(*named_preds, strict=False) 29 | assert "predict1" in names and "predict2" in names, "Named predictors should include 'predict1' and 'predict2'" 30 | 31 | 32 | def test_predictors(): 33 | module = HopModule() 34 | preds = module.predictors() 35 | assert len(preds) == 2, "Should return correct number of Predict instances" 36 | assert all(isinstance(p, dspy.Predict) for p in preds), "All returned items should be instances of PredictMock" 37 | 38 | 39 | def test_forward(): 40 | program = HopModule() 41 | dspy.settings.configure( 42 | lm=DummyLM( 43 | { 44 | "What is 1+1?": {"query": "let me check"}, 45 | "let me check": {"answer": "2"}, 46 | } 47 | ) 48 | ) 49 | result = program(question="What is 1+1?").answer 50 | assert result == "2" 51 | 52 | 53 | def test_nested_named_predictors(): 54 | class Hop2Module(dspy.Module): 55 | def __init__(self): 56 | super().__init__() 57 | self.hop = HopModule() 58 | 59 | module = Hop2Module() 60 | named_preds = 
module.named_predictors() 61 | assert len(named_preds) == 2 62 | names, _preds = zip(*named_preds, strict=False) 63 | assert "hop.predict1" in names 64 | assert "hop.predict2" in names 65 | 66 | 67 | def test_empty_module(): 68 | module = Module() 69 | assert list(module.named_sub_modules()) == [("self", module)] 70 | 71 | 72 | def test_single_level(): 73 | module = Module() 74 | module.sub = Module() 75 | expected = [("self", module), ("self.sub", module.sub)] 76 | assert list(module.named_sub_modules()) == expected 77 | 78 | 79 | def test_multiple_levels(): 80 | module = Module() 81 | module.sub = Module() 82 | module.sub.subsub = Module() 83 | expected = [("self", module), ("self.sub", module.sub), ("self.sub.subsub", module.sub.subsub)] 84 | assert list(module.named_sub_modules()) == expected 85 | 86 | 87 | def test_multiple_sub_modules(): 88 | module = Module() 89 | module.sub1 = Module() 90 | module.sub2 = Module() 91 | expected = [("self", module), ("self.sub1", module.sub1), ("self.sub2", module.sub2)] 92 | assert sorted(module.named_sub_modules()) == sorted(expected) 93 | 94 | 95 | def test_non_base_module_attributes(): 96 | module = Module() 97 | module.sub = Module() 98 | module.not_a_sub = "Not a self" 99 | expected = [("self", module), ("self.sub", module.sub)] 100 | assert list(module.named_sub_modules()) == expected 101 | 102 | 103 | def test_complex_module_traversal(): 104 | root = Module() 105 | root.sub_module = Module() 106 | root.sub_module.nested_list = [Module(), {"key": Module()}] 107 | root.sub_module.nested_tuple = (Module(), [Module(), Module()]) 108 | expected_names = { 109 | "self", 110 | "self.sub_module", 111 | "self.sub_module.nested_list[0]", 112 | "self.sub_module.nested_list[1][key]", 113 | "self.sub_module.nested_tuple[0]", 114 | "self.sub_module.nested_tuple[1][0]", 115 | "self.sub_module.nested_tuple[1][1]", 116 | } 117 | found_names = {name for name, _ in root.named_sub_modules()} 118 | 119 | assert found_names == expected_names, ( 120 | f"Missing or extra modules found. Missing: {expected_names - found_names}, Extra: {found_names - expected_names}" 121 | ) 122 | 123 | 124 | def test_complex_module_traversal_with_same_module(): 125 | root = Module() 126 | root.sub_module = Module() 127 | root.sub_module.nested_list = [Module(), {"key": Module()}] 128 | same_module = Module() 129 | root.sub_module.nested_tuple = (Module(), [same_module, same_module]) 130 | expected_names = { 131 | "self", 132 | "self.sub_module", 133 | "self.sub_module.nested_list[0]", 134 | "self.sub_module.nested_list[1][key]", # NOTE: named_sub_modules allows recursive structures 135 | "self.sub_module.nested_tuple[0]", 136 | "self.sub_module.nested_tuple[1][0]", # NEW: named_sub_modules allows recursive structures, but named_parameters does not 137 | } 138 | found_names = {name for name, _ in root.named_sub_modules()} 139 | 140 | assert found_names == expected_names, ( 141 | f"Missing or extra modules found. 
Missing: {expected_names - found_names}, Extra: {found_names - expected_names}" 142 | ) 143 | 144 | 145 | def test_complex_module_set_attribute_by_name(): 146 | root = Module() 147 | root.sub_module = Module() 148 | root.sub_module.nested_list = [Module(), {"key": Module()}] 149 | same_module = Module() 150 | root.sub_module.nested_tuple = (Module(), [same_module, same_module]) 151 | 152 | set_attribute_by_name(root, "test_attrib", True) 153 | assert root.test_attrib is True 154 | set_attribute_by_name(root, "sub_module.test_attrib", True) 155 | assert root.sub_module.test_attrib is True 156 | set_attribute_by_name(root, "sub_module.nested_list[0].test_attrib", True) 157 | assert root.sub_module.nested_list[0].test_attrib is True 158 | set_attribute_by_name(root, "sub_module.nested_list[1]['key'].test_attrib", True) 159 | assert root.sub_module.nested_list[1]["key"].test_attrib is True 160 | set_attribute_by_name(root, "sub_module.nested_tuple[0].test_attrib", True) 161 | assert root.sub_module.nested_tuple[0].test_attrib is True 162 | set_attribute_by_name(root, "sub_module.nested_tuple[1][0].test_attrib", True) 163 | assert root.sub_module.nested_tuple[1][0].test_attrib is True 164 | assert root.sub_module.nested_tuple[1][1].test_attrib is True 165 | 166 | 167 | class DuplicateModule(Module): 168 | def __init__(self): 169 | super().__init__() 170 | self.p0 = dspy.Predict("question -> answer") 171 | self.p1 = self.p0 172 | 173 | 174 | def test_named_parameters_duplicate_references(): 175 | module = DuplicateModule() 176 | # Only testing for whether exceptions are thrown or not 177 | # As Module.named_parameters() is recursive, this is mainly for catching infinite recursion 178 | module.named_parameters() 179 | 180 | 181 | def test_load_dspy_program_cross_version(): 182 | """ 183 | Test backward compatibility for loading a saved DSPy program. 184 | 185 | This test verifies that DSPy can load a program saved in version 3.0.1, ensuring compatibility with older versions. 186 | The saved state is located in 'test/primitives/resources/saved_program.json' and represents an optimized 187 | `dspy.ReAct` program. 
188 | """ 189 | path = Path(__file__).parent / "resources" / "saved_program.json" 190 | loaded_react = dspy.ReAct("question->answer", tools=[]) 191 | loaded_react.load(path) 192 | assert ( 193 | "Imagine you are a detective racing against time to solve a high-profile" 194 | in loaded_react.react.signature.instructions 195 | ) 196 | assert "Given the very verbose fields `question`" in loaded_react.extract.predict.signature.instructions 197 | 198 | assert len(loaded_react.react.demos) == 2 199 | assert len(loaded_react.extract.predict.demos) == 2 200 | ``` -------------------------------------------------------------------------------- /dspy/dsp/utils/dpr.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Source: DPR Implementation from Facebook Research 3 | https://github.com/facebookresearch/DPR/tree/master/dpr 4 | Original license: https://github.com/facebookresearch/DPR/blob/main/LICENSE 5 | """ 6 | 7 | import copy 8 | import logging 9 | import unicodedata 10 | 11 | import regex 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class Tokens: 17 | """A class to represent a list of tokenized text.""" 18 | 19 | TEXT = 0 20 | TEXT_WS = 1 21 | SPAN = 2 22 | POS = 3 23 | LEMMA = 4 24 | NER = 5 25 | 26 | def __init__(self, data, annotators, opts=None): 27 | self.data = data 28 | self.annotators = annotators 29 | self.opts = opts or {} 30 | 31 | def __len__(self): 32 | """The number of tokens.""" 33 | return len(self.data) 34 | 35 | def slice(self, i=None, j=None): 36 | """Return a view of the list of tokens from [i, j).""" 37 | new_tokens = copy.copy(self) 38 | new_tokens.data = self.data[i:j] 39 | return new_tokens 40 | 41 | def untokenize(self): 42 | """Returns the original text (with whitespace reinserted).""" 43 | return "".join([t[self.TEXT_WS] for t in self.data]).strip() 44 | 45 | def words(self, uncased=False): 46 | """Returns a list of the text of each token 47 | 48 | Args: 49 | uncased: lower cases text 50 | """ 51 | if uncased: 52 | return [t[self.TEXT].lower() for t in self.data] 53 | else: 54 | return [t[self.TEXT] for t in self.data] 55 | 56 | def offsets(self): 57 | """Returns a list of [start, end) character offsets of each token.""" 58 | return [t[self.SPAN] for t in self.data] 59 | 60 | def pos(self): 61 | """Returns a list of part-of-speech tags of each token. 62 | Returns None if this annotation was not included. 63 | """ 64 | if "pos" not in self.annotators: 65 | return None 66 | return [t[self.POS] for t in self.data] 67 | 68 | def lemmas(self): 69 | """Returns a list of the lemmatized text of each token. 70 | Returns None if this annotation was not included. 71 | """ 72 | if "lemma" not in self.annotators: 73 | return None 74 | return [t[self.LEMMA] for t in self.data] 75 | 76 | def entities(self): 77 | """Returns a list of named-entity-recognition tags of each token. 78 | Returns None if this annotation was not included. 79 | """ 80 | if "ner" not in self.annotators: 81 | return None 82 | return [t[self.NER] for t in self.data] 83 | 84 | def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True): 85 | """Returns a list of all ngrams from length 1 to n. 
86 | 87 | Args: 88 | n: upper limit of ngram length 89 | uncased: lower cases text 90 | filter_fn: user function that takes in an ngram list and returns 91 | True or False to keep or not keep the ngram 92 | as_string: return the ngram as a string vs list 93 | """ 94 | 95 | def _skip(gram): 96 | if not filter_fn: 97 | return False 98 | return filter_fn(gram) 99 | 100 | words = self.words(uncased) 101 | ngrams = [ 102 | (s, e + 1) 103 | for s in range(len(words)) 104 | for e in range(s, min(s + n, len(words))) 105 | if not _skip(words[s : e + 1]) 106 | ] 107 | 108 | # Concatenate into strings 109 | if as_strings: 110 | ngrams = ["{}".format(" ".join(words[s:e])) for (s, e) in ngrams] 111 | 112 | return ngrams 113 | 114 | def entity_groups(self): 115 | """Group consecutive entity tokens with the same NER tag.""" 116 | entities = self.entities() 117 | if not entities: 118 | return None 119 | non_ent = self.opts.get("non_ent", "O") 120 | groups = [] 121 | idx = 0 122 | while idx < len(entities): 123 | ner_tag = entities[idx] 124 | # Check for entity tag 125 | if ner_tag != non_ent: 126 | # Chomp the sequence 127 | start = idx 128 | while idx < len(entities) and entities[idx] == ner_tag: 129 | idx += 1 130 | groups.append((self.slice(start, idx).untokenize(), ner_tag)) 131 | else: 132 | idx += 1 133 | return groups 134 | 135 | 136 | class Tokenizer: 137 | """Base tokenizer class. 138 | Tokenizers implement tokenize, which should return a Tokens class. 139 | """ 140 | 141 | def tokenize(self, text): 142 | raise NotImplementedError 143 | 144 | def shutdown(self): 145 | pass 146 | 147 | def __del__(self): 148 | self.shutdown() 149 | 150 | 151 | class SimpleTokenizer(Tokenizer): 152 | ALPHA_NUM = r"[\p{L}\p{N}\p{M}]+" 153 | NON_WS = r"[^\p{Z}\p{C}]" 154 | 155 | def __init__(self, **kwargs): 156 | """ 157 | Args: 158 | annotators: None or empty set (only tokenizes). 159 | """ 160 | self._regexp = regex.compile( 161 | "(%s)|(%s)" % (self.ALPHA_NUM, self.NON_WS), 162 | flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE, 163 | ) 164 | if len(kwargs.get("annotators", {})) > 0: 165 | logger.warning( 166 | "%s only tokenizes! Skipping annotators: %s", 167 | type(self).__name__, 168 | kwargs.get("annotators"), 169 | ) 170 | self.annotators = set() 171 | 172 | def tokenize(self, text): 173 | data = [] 174 | matches = list(self._regexp.finditer(text)) 175 | for i in range(len(matches)): 176 | # Get text 177 | token = matches[i].group() 178 | 179 | # Get whitespace 180 | span = matches[i].span() 181 | start_ws = span[0] 182 | if i + 1 < len(matches): 183 | end_ws = matches[i + 1].span()[0] 184 | else: 185 | end_ws = span[1] 186 | 187 | # Format data 188 | data.append( 189 | ( 190 | token, 191 | text[start_ws:end_ws], 192 | span, 193 | ) 194 | ) 195 | return Tokens(data, self.annotators) 196 | 197 | 198 | def has_answer(tokenized_answers, text): 199 | text = DPR_normalize(text) 200 | 201 | for single_answer in tokenized_answers: 202 | for i in range(0, len(text) - len(single_answer) + 1): 203 | if single_answer == text[i : i + len(single_answer)]: 204 | return True 205 | 206 | return False 207 | 208 | 209 | def locate_answers(tokenized_answers, text): 210 | """ 211 | Returns each occurrence of an answer as (offset, endpos) in terms of *characters*. 
212 | """ 213 | tokenized_text = DPR_tokenize(text) 214 | occurrences = [] 215 | 216 | text_words, text_word_positions = tokenized_text.words(uncased=True), tokenized_text.offsets() 217 | answers_words = [ans.words(uncased=True) for ans in tokenized_answers] 218 | 219 | for single_answer in answers_words: 220 | for i in range(0, len(text_words) - len(single_answer) + 1): 221 | if single_answer == text_words[i : i + len(single_answer)]: 222 | (offset, _), (_, endpos) = text_word_positions[i], text_word_positions[i + len(single_answer) - 1] 223 | occurrences.append((offset, endpos)) 224 | 225 | return occurrences 226 | 227 | 228 | STokenizer = SimpleTokenizer() 229 | 230 | 231 | def DPR_tokenize(text): # noqa: N802 232 | return STokenizer.tokenize(unicodedata.normalize("NFD", text)) 233 | 234 | 235 | def DPR_normalize(text): # noqa: N802 236 | return DPR_tokenize(text).words(uncased=True) 237 | 238 | 239 | # Source: https://github.com/shmsw25/qa-hard-em/blob/master/prepro_util.py 240 | def strip_accents(text): 241 | """Strips accents from a piece of text.""" 242 | text = unicodedata.normalize("NFD", text) 243 | output = [] 244 | for char in text: 245 | cat = unicodedata.category(char) 246 | if cat == "Mn": 247 | continue 248 | output.append(char) 249 | return "".join(output) 250 | ``` -------------------------------------------------------------------------------- /dspy/propose/utils.py: -------------------------------------------------------------------------------- ```python 1 | import inspect 2 | import json 3 | import re 4 | 5 | import dspy 6 | 7 | try: 8 | from IPython.core.magics.code import extract_symbols 9 | except ImportError: 10 | # Won't be able to read code from jupyter notebooks 11 | extract_symbols = None 12 | 13 | from dspy.predict.parameter import Parameter 14 | from dspy.teleprompt.utils import get_signature, new_getfile 15 | 16 | 17 | def strip_prefix(text): 18 | pattern = r"^[\*\s]*(([\w\'\-]+\s+){0,4}[\w\'\-]+):\s*" 19 | modified_text = re.sub(pattern, "", text) 20 | return modified_text.strip('"') 21 | 22 | def create_instruction_set_history_string(base_program, trial_logs, top_n): 23 | program_history = [] 24 | for trial_num in trial_logs: 25 | trial = trial_logs[trial_num] 26 | if "program_path" in trial: 27 | trial_program = base_program.deepcopy() 28 | trial_program.load(trial["program_path"]) 29 | program_history.append({ 30 | "program": trial_program, 31 | "score": trial["score"], 32 | }) 33 | 34 | # Deduplicate program history based on the program's instruction set 35 | seen_programs = set() 36 | unique_program_history = [] 37 | for entry in program_history: 38 | program = entry["program"] 39 | instruction_set = get_program_instruction_set_string(program) 40 | if instruction_set not in seen_programs: 41 | seen_programs.add(instruction_set) 42 | unique_program_history.append(entry) 43 | 44 | # Get the top n programs from program history 45 | top_n_program_history = sorted(unique_program_history, key=lambda x: x["score"], reverse=True)[:top_n] 46 | top_n_program_history.reverse() 47 | 48 | # Create formatted string 49 | instruction_set_history_string = "" 50 | for entry in top_n_program_history: 51 | program = entry["program"] 52 | score = entry["score"] 53 | instruction_set = get_program_instruction_set_string(program) 54 | instruction_set_history_string += instruction_set + f" | Score: {score}\n\n" 55 | 56 | return instruction_set_history_string 57 | 58 | def parse_list_of_instructions(instruction_string): 59 | # Try to convert the string representation of a list 
to an actual list using JSON 60 | try: 61 | instructions = json.loads(instruction_string) 62 | return instructions 63 | except json.JSONDecodeError: 64 | pass 65 | 66 | # If JSON decoding fails, extract strings within quotes 67 | instructions = re.findall(r'"([^"]*)"', instruction_string) 68 | return instructions 69 | 70 | def get_program_instruction_set_string(program): 71 | instruction_list = [] 72 | for _, pred in enumerate(program.predictors()): 73 | pred_instructions = get_signature(pred).instructions 74 | instruction_list.append(f'"{pred_instructions}"') 75 | # Joining the list into a single string that looks like a list 76 | return f"[{', '.join(instruction_list)}]" 77 | 78 | def create_predictor_level_history_string(base_program, predictor_i, trial_logs, top_n): 79 | instruction_aggregate = {} 80 | instruction_history = [] 81 | 82 | # Load trial programs 83 | for trial_num in trial_logs: 84 | trial = trial_logs[trial_num] 85 | if "program_path" in trial: 86 | trial_program = base_program.deepcopy() 87 | trial_program.load(trial["program_path"]) 88 | instruction_history.append({ 89 | "program": trial_program, 90 | "score": trial["score"], 91 | }) 92 | 93 | # Aggregate scores for each instruction 94 | for history_item in instruction_history: 95 | predictor = history_item["program"].predictors()[predictor_i] 96 | instruction = get_signature(predictor).instructions 97 | score = history_item["score"] 98 | 99 | if instruction in instruction_aggregate: 100 | instruction_aggregate[instruction]["total_score"] += score 101 | instruction_aggregate[instruction]["count"] += 1 102 | else: 103 | instruction_aggregate[instruction] = {"total_score": score, "count": 1} 104 | 105 | # Calculate average score for each instruction and prepare for sorting 106 | predictor_history = [] 107 | for instruction, data in instruction_aggregate.items(): 108 | average_score = data["total_score"] / data["count"] 109 | predictor_history.append((instruction, average_score)) 110 | 111 | # Deduplicate and sort by average score, then select top N 112 | seen_instructions = set() 113 | unique_predictor_history = [] 114 | for instruction, score in predictor_history: 115 | if instruction not in seen_instructions: 116 | seen_instructions.add(instruction) 117 | unique_predictor_history.append((instruction, score)) 118 | 119 | top_instructions = sorted(unique_predictor_history, key=lambda x: x[1], reverse=True)[:top_n] 120 | top_instructions.reverse() 121 | 122 | # Create formatted history string 123 | predictor_history_string = "" 124 | for instruction, score in top_instructions: 125 | predictor_history_string += instruction + f" | Score: {score}\n\n" 126 | 127 | return predictor_history_string 128 | 129 | def create_example_string(fields, example): 130 | 131 | # Building the output string 132 | output = [] 133 | for field_name, field_values in fields.items(): 134 | name = field_values.json_schema_extra["prefix"] 135 | 136 | # Determine the value from input_data or prediction_data 137 | value = example.get(field_name) 138 | 139 | # Construct the string for the current field 140 | field_str = f"{name} {value}" 141 | output.append(field_str) 142 | 143 | # Joining all the field strings 144 | return "\n".join(output) 145 | 146 | def get_dspy_source_code(module): 147 | header = [] 148 | base_code = "" 149 | 150 | # Don't get source code for Predict or ChainOfThought modules (NOTE we will need to extend this list as more DSPy.modules are added) 151 | # TODO: if type(module).__name__ not in ["Predict", "ChainOfThought", "ReAct"]: 
152 | if not type(module).__name__ == "Predict" and not type(module).__name__ == "ChainOfThought": 153 | try: 154 | base_code = inspect.getsource(type(module)) 155 | except TypeError: 156 | obj = type(module) 157 | cell_code = "".join(inspect.linecache.getlines(new_getfile(obj))) 158 | class_code = extract_symbols(cell_code, obj.__name__)[0][0] 159 | base_code = str(class_code) 160 | 161 | completed_set = set() 162 | for attribute in module.__dict__.keys(): 163 | try: 164 | iterable = iter(getattr(module, attribute)) 165 | except TypeError: 166 | iterable = [getattr(module, attribute)] 167 | 168 | for item in iterable: 169 | # Skip items that are unhashable (like module history) 170 | try: 171 | hash(item) 172 | except TypeError: 173 | continue 174 | if isinstance(item, Parameter): 175 | if hasattr(item, "signature") and item.signature is not None and item.signature.__pydantic_parent_namespace__["signature_name"] + "_sig" not in completed_set: 176 | try: 177 | header.append(inspect.getsource(item.signature)) 178 | print(inspect.getsource(item.signature)) 179 | except (TypeError, OSError): 180 | header.append(str(item.signature)) 181 | completed_set.add(item.signature.__pydantic_parent_namespace__["signature_name"] + "_sig") 182 | if isinstance(item, dspy.Module): 183 | code = get_dspy_source_code(item).strip() 184 | if code not in completed_set: 185 | header.append(code) 186 | completed_set.add(code) 187 | completed_set.add(item) 188 | 189 | return "\n\n".join(header) + "\n\n" + base_code 190 | ``` -------------------------------------------------------------------------------- /docs/docs/api/optimizers/GEPA/overview.md: -------------------------------------------------------------------------------- ```markdown 1 | # dspy.GEPA: Reflective Prompt Optimizer 2 | 3 | **GEPA** (Genetic-Pareto) is a reflective optimizer proposed in "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning" (Agrawal et al., 2025, [arxiv:2507.19457](https://arxiv.org/abs/2507.19457)), that adaptively evolves _textual components_ (such as prompts) of arbitrary systems. In addition to scalar scores returned by metrics, users can also provide GEPA with a text feedback to guide the optimization process. Such textual feedback provides GEPA more visibility into why the system got the score that it did, and then GEPA can introspect to identify how to improve the score. This allows GEPA to propose high performing prompts in very few rollouts. 4 | 5 | <!-- START_API_REF --> 6 | ::: dspy.GEPA 7 | handler: python 8 | options: 9 | members: 10 | - auto_budget 11 | - compile 12 | - get_params 13 | show_source: true 14 | show_root_heading: true 15 | heading_level: 2 16 | docstring_style: google 17 | show_root_full_path: true 18 | show_object_full_path: false 19 | separate_signature: false 20 | inherited_members: true 21 | ::: 22 | <!-- END_API_REF --> 23 | 24 | One of the key insights behind GEPA is its ability to leverage domain-specific textual feedback. 
Users should provide a feedback function as the GEPA metric, which has the following call signature: 25 | <!-- START_API_REF --> 26 | ::: dspy.teleprompt.gepa.gepa.GEPAFeedbackMetric 27 | handler: python 28 | options: 29 | members: 30 | - __call__ 31 | show_source: true 32 | show_root_heading: true 33 | heading_level: 2 34 | docstring_style: google 35 | show_root_full_path: true 36 | show_object_full_path: false 37 | separate_signature: false 38 | inherited_members: true 39 | ::: 40 | <!-- END_API_REF --> 41 | 42 | When `track_stats=True`, GEPA returns detailed results about all of the proposed candidates, and metadata about the optimization run. The results are available in the `detailed_results` attribute of the optimized program returned by GEPA, and has the following type: 43 | <!-- START_API_REF --> 44 | ::: dspy.teleprompt.gepa.gepa.DspyGEPAResult 45 | handler: python 46 | options: 47 | show_source: true 48 | show_root_heading: true 49 | heading_level: 2 50 | docstring_style: google 51 | show_root_full_path: true 52 | show_object_full_path: false 53 | separate_signature: false 54 | inherited_members: true 55 | ::: 56 | <!-- END_API_REF --> 57 | 58 | ## Usage Examples 59 | 60 | See GEPA usage tutorials in [GEPA Tutorials](../../../tutorials/gepa_ai_program/index.md). 61 | 62 | ### Inference-Time Search 63 | 64 | GEPA can act as a test-time/inference search mechanism. By setting your `valset` to your _evaluation batch_ and using `track_best_outputs=True`, GEPA produces for each batch element the highest-scoring outputs found during the evolutionary search. 65 | 66 | ```python 67 | gepa = dspy.GEPA(metric=metric, track_stats=True, ...) 68 | new_prog = gepa.compile(student, trainset=my_tasks, valset=my_tasks) 69 | highest_score_achieved_per_task = new_prog.detailed_results.highest_score_achieved_per_val_task 70 | best_outputs = new_prog.detailed_results.best_outputs_valset 71 | ``` 72 | 73 | ## How Does GEPA Work? 74 | 75 | ### 1. **Reflective Prompt Mutation** 76 | 77 | GEPA uses LLMs to _reflect_ on structured execution traces (inputs, outputs, failures, feedback), targeting a chosen module and proposing a new instruction/program text tailored to real observed failures and rich textual/environmental feedback. 78 | 79 | ### 2. **Rich Textual Feedback as Optimization Signal** 80 | 81 | GEPA can leverage _any_ textual feedback available—not just scalar rewards. This includes evaluation logs, code traces, failed parses, constraint violations, error message strings, or even isolated submodule-specific feedback. This allows actionable, domain-aware optimization. 82 | 83 | ### 3. **Pareto-based Candidate Selection** 84 | 85 | Rather than evolving just the _best_ global candidate (which leads to local optima or stagnation), GEPA maintains a Pareto frontier: the set of candidates which achieve the highest score on at least one evaluation instance. In each iteration, the next candidate to mutate is sampled (with probability proportional to coverage) from this frontier, guaranteeing both exploration and robust retention of complementary strategies. 86 | 87 | ### Algorithm Summary 88 | 89 | 1. **Initialize** the candidate pool with the the unoptimized program. 90 | 2. **Iterate**: 91 | - **Sample a candidate** (from Pareto frontier). 92 | - **Sample a minibatch** from the train set. 93 | - **Collect execution traces + feedbacks** for module rollout on minibatch. 94 | - **Select a module** of the candidate for targeted improvement. 
95 | - **LLM Reflection:** Propose a new instruction/prompt for the targeted module using reflective meta-prompting and the gathered feedback. 96 | - **Roll out the new candidate** on the minibatch; **if improved, evaluate on Pareto validation set**. 97 | - **Update the candidate pool/Pareto frontier.** 98 | - **[Optionally] System-aware merge/crossover**: Combine best-performing modules from distinct lineages. 99 | 3. **Continue** until rollout or metric budget is exhausted. 100 | 4. **Return** candidate with best aggregate performance on validation. 101 | 102 | ## Implementing Feedback Metrics 103 | 104 | A well-designed metric is central to GEPA's sample efficiency and learning signal richness. GEPA expects the metric to returns a `dspy.Prediction(score=..., feedback=...)`. GEPA leverages natural language traces from LLM-based workflows for optimization, preserving intermediate trajectories and errors in plain text rather than reducing them to numerical rewards. This mirrors human diagnostic processes, enabling clearer identification of system behaviors and bottlenecks. 105 | 106 | Practical Recipe for GEPA-Friendly Feedback: 107 | 108 | - **Leverage Existing Artifacts**: Use logs, unit tests, evaluation scripts, and profiler outputs; surfacing these often suffices. 109 | - **Decompose Outcomes**: Break scores into per-objective components (e.g., correctness, latency, cost, safety) and attribute errors to steps. 110 | - **Expose Trajectories**: Label pipeline stages, reporting pass/fail with salient errors (e.g., in code generation pipelines). 111 | - **Ground in Checks**: Employ automatic validators (unit tests, schemas, simulators) or LLM-as-a-judge for non-verifiable tasks (as in PUPA). 112 | - **Prioritize Clarity**: Focus on error coverage and decision points over technical complexity. 113 | 114 | ### Examples 115 | 116 | - **Document Retrieval** (e.g., HotpotQA): List correctly retrieved, incorrect, or missed documents, beyond mere Recall/F1 scores. 117 | - **Multi-Objective Tasks** (e.g., PUPA): Decompose aggregate scores to reveal contributions from each objective, highlighting tradeoffs (e.g., quality vs. privacy). 118 | - **Stacked Pipelines** (e.g., code generation: parse → compile → run → profile → evaluate): Expose stage-specific failures; natural-language traces often suffice for LLM self-correction. 119 | 120 | ## Custom Instruction Proposal 121 | 122 | For advanced customization of GEPA's instruction proposal mechanism, including custom instruction proposers and component selectors, see [Advanced Features](GEPA_Advanced.md). 123 | 124 | ## Further Reading 125 | 126 | - [GEPA Paper: arxiv:2507.19457](https://arxiv.org/abs/2507.19457) 127 | - [GEPA Github](https://github.com/gepa-ai/gepa) - This repository provides the core GEPA evolution pipeline used by `dspy.GEPA` optimizer. 128 | - [DSPy Tutorials](../../../tutorials/gepa_ai_program/index.md) 129 | ``` -------------------------------------------------------------------------------- /docs/docs/tutorials/cache/index.md: -------------------------------------------------------------------------------- ```markdown 1 | # Use and Customize DSPy Cache 2 | 3 | In this tutorial, we will explore the design of DSPy's caching mechanism and demonstrate how to effectively use and customize it. 4 | 5 | ## DSPy Cache Structure 6 | 7 | DSPy's caching system is architected in three distinct layers: 8 | 9 | 1. **In-memory cache**: Implemented using `cachetools.LRUCache`, this layer provides fast access to frequently used data. 10 | 2. 
**On-disk cache**: Leveraging `diskcache.FanoutCache`, this layer offers persistent storage for cached items. 11 | 3. **Prompt cache (Server-side cache)**: This layer is managed by the LLM service provider (e.g., OpenAI, Anthropic). 12 | 13 | While DSPy does not directly control the server-side prompt cache, it offers users the flexibility to enable, disable, and customize the in-memory and on-disk caches to suit their specific requirements. 14 | 15 | ## Using DSPy Cache 16 | 17 | By default, both in-memory and on-disk caching are automatically enabled in DSPy. No specific action is required to start using the cache. When a cache hit occurs, you will observe a significant reduction in the module call's execution time. Furthermore, if usage tracking is enabled, the usage metrics for a cached call will be empty, as shown in the sample output below. 18 | 19 | Consider the following example: 20 | 21 | ```python 22 | import dspy 23 | import os 24 | import time 25 | 26 | os.environ["OPENAI_API_KEY"] = "{your_openai_key}" 27 | 28 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini"), track_usage=True) 29 | 30 | predict = dspy.Predict("question->answer") 31 | 32 | start = time.time() 33 | result1 = predict(question="Who is the GOAT of basketball?") 34 | print(f"Time elapse: {time.time() - start: 2f}\n\nTotal usage: {result1.get_lm_usage()}") 35 | 36 | start = time.time() 37 | result2 = predict(question="Who is the GOAT of basketball?") 38 | print(f"Time elapse: {time.time() - start: 2f}\n\nTotal usage: {result2.get_lm_usage()}") 39 | ``` 40 | 41 | A sample output looks like: 42 | 43 | ``` 44 | Time elapse: 4.384113 45 | Total usage: {'openai/gpt-4o-mini': {'completion_tokens': 97, 'prompt_tokens': 144, 'total_tokens': 241, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0, 'text_tokens': None}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0, 'text_tokens': None, 'image_tokens': None}}} 46 | 47 | Time elapse: 0.000529 48 | Total usage: {} 49 | ``` 50 | 51 | ## Disabling/Enabling DSPy Cache 52 | 53 | There are scenarios where you might need to disable caching, either entirely or selectively for in-memory or on-disk caches. For instance: 54 | 55 | - You require different responses for identical LM requests. 56 | - You lack disk write permissions and need to disable the on-disk cache. 57 | - You have limited memory resources and wish to disable the in-memory cache. 58 | 59 | DSPy provides the `dspy.configure_cache()` utility function for this purpose. You can use the corresponding flags to control the enabled/disabled state of each cache type: 60 | 61 | ```python 62 | dspy.configure_cache( 63 | enable_disk_cache=False, 64 | enable_memory_cache=False, 65 | ) 66 | ``` 67 | 68 | In addition, you can manage the capacity of the in-memory and on-disk caches: 69 | 70 | ```python 71 | dspy.configure_cache( 72 | enable_disk_cache=True, 73 | enable_memory_cache=True, 74 | disk_size_limit_bytes=YOUR_DESIRED_VALUE, 75 | memory_max_entries=YOUR_DESIRED_VALUE, 76 | ) 77 | ``` 78 | 79 | Please note that `disk_size_limit_bytes` defines the maximum size in bytes for the on-disk cache, while `memory_max_entries` specifies the maximum number of entries for the in-memory cache. 80 | 81 | ## Understanding and Customizing the Cache 82 | 83 | In specific situations, you might want to implement a custom cache, for example, to gain finer control over how cache keys are generated.
By default, the cache key is derived from a hash of all request arguments sent to `litellm`, excluding credentials like `api_key`. 84 | 85 | To create a custom cache, you need to subclass `dspy.clients.Cache` and override the relevant methods: 86 | 87 | ```python 88 | class CustomCache(dspy.clients.Cache): 89 | def __init__(self, **kwargs): 90 | {write your own constructor} 91 | 92 | def cache_key(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> str: 93 | {write your logic of computing cache key} 94 | 95 | def get(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> Any: 96 | {write your cache read logic} 97 | 98 | def put( 99 | self, 100 | request: dict[str, Any], 101 | value: Any, 102 | ignored_args_for_cache_key: Optional[list[str]] = None, 103 | enable_memory_cache: bool = True, 104 | ) -> None: 105 | {write your cache write logic} 106 | ``` 107 | 108 | To ensure seamless integration with the rest of DSPy, it is recommended to implement your custom cache using the same method signatures as the base class, or at a minimum, include `**kwargs` in your method definitions to prevent runtime errors during cache read/write operations. 109 | 110 | Once your custom cache class is defined, you can instruct DSPy to use it: 111 | 112 | ```python 113 | dspy.cache = CustomCache() 114 | ``` 115 | 116 | Let's illustrate this with a practical example. Suppose we want the cache key computation to depend solely on the request message content, ignoring other parameters like the specific LM being called. We can create a custom cache as follows: 117 | 118 | ```python 119 | class CustomCache(dspy.clients.Cache): 120 | 121 | def cache_key(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> str: 122 | messages = request.get("messages", []) 123 | return sha256(ujson.dumps(messages, sort_keys=True).encode()).hexdigest() 124 | 125 | dspy.cache = CustomCache(enable_disk_cache=True, enable_memory_cache=True, disk_cache_dir=dspy.clients.DISK_CACHE_DIR) 126 | ``` 127 | 128 | For comparison, consider executing the code below without the custom cache: 129 | 130 | ```python 131 | import dspy 132 | import os 133 | import time 134 | 135 | os.environ["OPENAI_API_KEY"] = "{your_openai_key}" 136 | 137 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini")) 138 | 139 | predict = dspy.Predict("question->answer") 140 | 141 | start = time.time() 142 | result1 = predict(question="Who is the GOAT of soccer?") 143 | print(f"Time elapse: {time.time() - start: 2f}") 144 | 145 | start = time.time() 146 | with dspy.context(lm=dspy.LM("openai/gpt-4.1-mini")): 147 | result2 = predict(question="Who is the GOAT of soccer?") 148 | print(f"Time elapse: {time.time() - start: 2f}") 149 | ``` 150 | 151 | The time elapsed will indicate that the cache is not hit on the second call. 
However, when using the custom cache: 152 | 153 | ```python 154 | import dspy 155 | import os 156 | import time 157 | from typing import Dict, Any, Optional 158 | import ujson 159 | from hashlib import sha256 160 | 161 | os.environ["OPENAI_API_KEY"] = "{your_openai_key}" 162 | 163 | dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini")) 164 | 165 | class CustomCache(dspy.clients.Cache): 166 | 167 | def cache_key(self, request: dict[str, Any], ignored_args_for_cache_key: Optional[list[str]] = None) -> str: 168 | messages = request.get("messages", []) 169 | return sha256(ujson.dumps(messages, sort_keys=True).encode()).hexdigest() 170 | 171 | dspy.cache = CustomCache(enable_disk_cache=True, enable_memory_cache=True, disk_cache_dir=dspy.clients.DISK_CACHE_DIR) 172 | 173 | predict = dspy.Predict("question->answer") 174 | 175 | start = time.time() 176 | result1 = predict(question="Who is the GOAT of volleyball?") 177 | print(f"Time elapse: {time.time() - start: 2f}") 178 | 179 | start = time.time() 180 | with dspy.context(lm=dspy.LM("openai/gpt-4.1-mini")): 181 | result2 = predict(question="Who is the GOAT of volleyball?") 182 | print(f"Time elapse: {time.time() - start: 2f}") 183 | ``` 184 | 185 | You will observe that the cache is hit on the second call, demonstrating the effect of the custom cache key logic. ``` -------------------------------------------------------------------------------- /dspy/datasets/dataloader.py: -------------------------------------------------------------------------------- ```python 1 | import random 2 | from collections.abc import Mapping 3 | from typing import TYPE_CHECKING 4 | 5 | import dspy 6 | from dspy.datasets.dataset import Dataset 7 | 8 | if TYPE_CHECKING: 9 | import pandas as pd 10 | 11 | 12 | class DataLoader(Dataset): 13 | def __init__(self): 14 | pass 15 | 16 | def from_huggingface( 17 | self, 18 | dataset_name: str, 19 | *args, 20 | input_keys: tuple[str] = (), 21 | fields: tuple[str] | None = None, 22 | **kwargs, 23 | ) -> Mapping[str, list[dspy.Example]] | list[dspy.Example]: 24 | if fields and not isinstance(fields, tuple): 25 | raise ValueError("Invalid fields provided. Please provide a tuple of fields.") 26 | 27 | if not isinstance(input_keys, tuple): 28 | raise ValueError("Invalid input keys provided. 
Please provide a tuple of input keys.") 29 | 30 | from datasets import load_dataset 31 | 32 | dataset = load_dataset(dataset_name, *args, **kwargs) 33 | 34 | if isinstance(dataset, list) and isinstance(kwargs["split"], list): 35 | dataset = {split_name: dataset[idx] for idx, split_name in enumerate(kwargs["split"])} 36 | 37 | try: 38 | returned_split = {} 39 | for split_name in dataset.keys(): 40 | if fields: 41 | returned_split[split_name] = [ 42 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) 43 | for row in dataset[split_name] 44 | ] 45 | else: 46 | returned_split[split_name] = [ 47 | dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys) 48 | for row in dataset[split_name] 49 | ] 50 | 51 | return returned_split 52 | except AttributeError: 53 | if fields: 54 | return [ 55 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset 56 | ] 57 | else: 58 | return [ 59 | dspy.Example({field: row[field] for field in row.keys()}).with_inputs(*input_keys) 60 | for row in dataset 61 | ] 62 | 63 | def from_csv( 64 | self, 65 | file_path: str, 66 | fields: list[str] | None = None, 67 | input_keys: tuple[str] = (), 68 | ) -> list[dspy.Example]: 69 | from datasets import load_dataset 70 | 71 | dataset = load_dataset("csv", data_files=file_path)["train"] 72 | 73 | if not fields: 74 | fields = list(dataset.features) 75 | 76 | return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset] 77 | 78 | def from_pandas( 79 | self, 80 | df: "pd.DataFrame", 81 | fields: list[str] | None = None, 82 | input_keys: tuple[str] = (), 83 | ) -> list[dspy.Example]: 84 | if fields is None: 85 | fields = list(df.columns) 86 | 87 | return [ 88 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for _, row in df.iterrows() 89 | ] 90 | 91 | def from_json( 92 | self, 93 | file_path: str, 94 | fields: list[str] | None = None, 95 | input_keys: tuple[str] = (), 96 | ) -> list[dspy.Example]: 97 | from datasets import load_dataset 98 | 99 | dataset = load_dataset("json", data_files=file_path)["train"] 100 | 101 | if not fields: 102 | fields = list(dataset.features) 103 | 104 | return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset] 105 | 106 | def from_parquet( 107 | self, 108 | file_path: str, 109 | fields: list[str] | None = None, 110 | input_keys: tuple[str] = (), 111 | ) -> list[dspy.Example]: 112 | from datasets import load_dataset 113 | 114 | dataset = load_dataset("parquet", data_files=file_path)["train"] 115 | 116 | if not fields: 117 | fields = list(dataset.features) 118 | 119 | return [dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) for row in dataset] 120 | 121 | def from_rm(self, num_samples: int, fields: list[str], input_keys: list[str]) -> list[dspy.Example]: 122 | try: 123 | rm = dspy.settings.rm 124 | try: 125 | return [ 126 | dspy.Example({field: row[field] for field in fields}).with_inputs(*input_keys) 127 | for row in rm.get_objects(num_samples=num_samples, fields=fields) 128 | ] 129 | except AttributeError: 130 | raise ValueError( 131 | "Retrieval module does not support `get_objects`. Please use a different retrieval module." 132 | ) 133 | except AttributeError: 134 | raise ValueError( 135 | "Retrieval module not found. Please set a retrieval module using `dspy.settings.configure`." 
136 | ) 137 | 138 | def sample( 139 | self, 140 | dataset: list[dspy.Example], 141 | n: int, 142 | *args, 143 | **kwargs, 144 | ) -> list[dspy.Example]: 145 | if not isinstance(dataset, list): 146 | raise ValueError( 147 | f"Invalid dataset provided of type {type(dataset)}. Please provide a list of `dspy.Example`s." 148 | ) 149 | 150 | return random.sample(dataset, n, *args, **kwargs) 151 | 152 | def train_test_split( 153 | self, 154 | dataset: list[dspy.Example], 155 | train_size: int | float = 0.75, 156 | test_size: int | float | None = None, 157 | random_state: int | None = None, 158 | ) -> Mapping[str, list[dspy.Example]]: 159 | if random_state is not None: 160 | random.seed(random_state) 161 | 162 | dataset_shuffled = dataset.copy() 163 | random.shuffle(dataset_shuffled) 164 | 165 | if train_size is not None and isinstance(train_size, float) and (0 < train_size < 1): 166 | train_end = int(len(dataset_shuffled) * train_size) 167 | elif train_size is not None and isinstance(train_size, int): 168 | train_end = train_size 169 | else: 170 | raise ValueError( 171 | "Invalid `train_size`. Please provide a float between 0 and 1 to represent the proportion of the " 172 | "dataset to include in the train split or an int to represent the absolute number of samples to " 173 | f"include in the train split. Received `train_size`: {train_size}." 174 | ) 175 | 176 | if test_size is not None: 177 | if isinstance(test_size, float) and (0 < test_size < 1): 178 | test_end = int(len(dataset_shuffled) * test_size) 179 | elif isinstance(test_size, int): 180 | test_end = test_size 181 | else: 182 | raise ValueError( 183 | "Invalid `test_size`. Please provide a float between 0 and 1 to represent the proportion of the " 184 | "dataset to include in the test split or an int to represent the absolute number of samples to " 185 | f"include in the test split. Received `test_size`: {test_size}." 186 | ) 187 | if train_end + test_end > len(dataset_shuffled): 188 | raise ValueError( 189 | "`train_size` + `test_size` cannot exceed the total number of samples. Received " 190 | f"`train_size`: {train_end}, `test_size`: {test_end}, and `dataset_size`: {len(dataset_shuffled)}." 
191 | ) 192 | else: 193 | test_end = len(dataset_shuffled) - train_end 194 | 195 | train_dataset = dataset_shuffled[:train_end] 196 | test_dataset = dataset_shuffled[train_end : train_end + test_end] 197 | 198 | return {"train": train_dataset, "test": test_dataset} 199 | ``` -------------------------------------------------------------------------------- /dspy/teleprompt/bettertogether.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import random 3 | from typing import Callable 4 | 5 | import dspy 6 | from dspy.primitives.example import Example 7 | from dspy.primitives.module import Module 8 | from dspy.teleprompt.bootstrap_finetune import ( 9 | BootstrapFinetune, 10 | all_predictors_have_lms, 11 | kill_lms, 12 | launch_lms, 13 | prepare_student, 14 | ) 15 | from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch 16 | from dspy.teleprompt.teleprompt import Teleprompter 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class BetterTogether(Teleprompter): 22 | 23 | STRAT_SEP = " -> " 24 | 25 | def __init__(self, 26 | metric: Callable, 27 | prompt_optimizer: Teleprompter | None = None, 28 | weight_optimizer: Teleprompter | None = None, 29 | seed: int | None = None, 30 | ): 31 | if not dspy.settings.experimental: 32 | raise ValueError("This is an experimental optimizer. Set `dspy.settings.experimental` to `True` to use it.") 33 | 34 | # TODO: Note that the BetterTogether optimizer is meaningful when 35 | # BootstrapFinetune uses a metric to filter the training data before 36 | # fine-tuning. However, one can also choose to run this optimizer with 37 | # a BootstrapFinetune without a metric, say, if there aren't labels 38 | # available for the training data. Should this be noted somewhere? 39 | # TODO: We should re-consider if the metric should be required. 40 | self.prompt_optimizer = prompt_optimizer if prompt_optimizer else BootstrapFewShotWithRandomSearch(metric=metric) 41 | self.weight_optimizer = weight_optimizer if weight_optimizer else BootstrapFinetune(metric=metric) 42 | 43 | is_supported_prompt = isinstance(self.prompt_optimizer, BootstrapFewShotWithRandomSearch) 44 | is_supported_weight = isinstance(self.weight_optimizer, BootstrapFinetune) 45 | if not is_supported_prompt or not is_supported_weight: 46 | raise ValueError( 47 | "The BetterTogether optimizer only supports the following optimizers for now: BootstrapFinetune, " 48 | "BootstrapFewShotWithRandomSearch." 49 | ) 50 | 51 | self.rng = random.Random(seed) 52 | 53 | def compile( 54 | self, 55 | student: Module, 56 | trainset: list[Example], 57 | strategy: str = "p -> w -> p", 58 | valset_ratio = 0.1, 59 | ) -> Module: 60 | # TODO: We could record acc on a different valset to pick the best 61 | # strategy within the provided strategy 62 | logger.info("Validating the strategy") 63 | parsed_strategy = strategy.lower().split(self.STRAT_SEP) 64 | 65 | if not all(s in ["p", "w"] for s in parsed_strategy): 66 | raise ValueError( 67 | f"The strategy should be a sequence of 'p' and 'w' separated by '{self.STRAT_SEP}', but " 68 | f"found: {strategy}" 69 | ) 70 | 71 | logger.info("Preparing the student program...") 72 | # TODO: Prepare student returns student.reset_copy(), which is what gets 73 | # optimized. We should make this clear in the doc comments. 
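# Note: `prepare_student` hands back a fresh (reset) copy of the program, which is what actually gets optimized, and `all_predictors_have_lms` verifies that every predictor already has an LM attached before any optimization step runs.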
74 | student = prepare_student(student) 75 | all_predictors_have_lms(student) 76 | 77 | # Make a shallow copy of the trainset, so that we don't change the order 78 | # of the examples in the original trainset 79 | trainset = trainset[:] 80 | logger.info("Compiling the student program...") 81 | student = self._run_strategies(parsed_strategy, student, trainset, valset_ratio) 82 | 83 | logger.info("BetterTogether has finished compiling the student program") 84 | return student 85 | 86 | def _run_strategies(self, parsed_strategy, student, trainset, valset_ratio) -> Module: 87 | # Keep track of all the partial strategies/programs in parsed_strategy 88 | # "" corresponds to the initial student program 89 | candidate_programs = [] 90 | candidate_programs.append(("", student)) 91 | launched_flag = False 92 | 93 | for ind, step_code in enumerate(parsed_strategy): 94 | current_strategy = self.STRAT_SEP.join(parsed_strategy[:ind + 1]) 95 | logger.info( 96 | f"\n########## Step {ind + 1} of {len(parsed_strategy)} - Strategy " 97 | f"'{current_strategy}' ##########" 98 | ) 99 | 100 | logger.info("Shuffling the trainset...") 101 | self.rng.shuffle(trainset) 102 | if not launched_flag: 103 | launch_lms(student) 104 | launched_flag = True 105 | 106 | # TODO: Should we reset or just deepcopy? How does resetting affect 107 | # the predictor LMs? 108 | student = student.deepcopy() 109 | student._compiled = False 110 | if step_code == "p": 111 | student = self._compile_prompt_optimizer(student, trainset, valset_ratio) 112 | elif step_code == "w": 113 | student = self._compile_weight_optimizer(student, trainset) 114 | launched_flag = False 115 | 116 | # Record the program corresponding to the current strategy 117 | candidate_programs.append((current_strategy, student)) 118 | 119 | if launched_flag: 120 | kill_lms(student) 121 | 122 | student.candidate_programs = candidate_programs 123 | return student 124 | 125 | def _compile_prompt_optimizer(self, student, trainset, valset_ratio) -> Module: 126 | logger.info("Preparing for prompt optimization...") 127 | 128 | # Sampling a validation set from the trainset for the prompt optimizer 129 | # We drop the hints for prompt optimization 130 | trainset = [x.with_inputs(*list(set(x.inputs().keys()) - {"hint"})) for x in trainset] 131 | num_val = int(valset_ratio * len(trainset)) 132 | prompt_valset = trainset[:num_val] 133 | prompt_trainset = trainset[num_val:] 134 | 135 | # TODO: To make this optimizer general, we need to ensure that all the 136 | # prompt optimizers are accepting a valset or encode a way to check if 137 | # a valset should be passed to an optimizer's compile method. 138 | # TODO: We should ensure that the prompt optimizers in DSPy respect the 139 | # predictor.lm attributes. In particular, 140 | # BootstrapFewShotWithRandomSearch seems to be resetting these. We are 141 | # manually re-setting the LMs here to circumvent this issue, but we 142 | # should consider addressing it in BFRS. 
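# Snapshot each predictor's LM before compiling, then restore those same LMs onto the compiled program below, so the prompt optimizer cannot silently swap them out.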
143 | logger.info("Compiling the prompt optimizer...") 144 | pred_lms = [pred.lm for pred in student.predictors()] 145 | student = self.prompt_optimizer.compile(student, trainset=prompt_trainset, valset=prompt_valset) 146 | for pred, lm in zip(student.predictors(), pred_lms, strict=False): 147 | pred.lm = lm 148 | 149 | return student 150 | 151 | def _compile_weight_optimizer(self, student, trainset) -> Module: 152 | logger.info("Preparing for weight optimization...") 153 | 154 | # Saving the LMs before compiling the weight optimizer 155 | original_lms = [pred.lm for pred in student.predictors()] 156 | 157 | # TODO: To make this optimizer general, we need to ensure that all the 158 | # prompt optimizers are accepting a valset or encode a way to check if 159 | # a valset should be passed to an optimizer's compile. 160 | logger.info("Compiling the weight optimizer...") 161 | student = self.weight_optimizer.compile(student, trainset=trainset) 162 | 163 | # Updating the train kwargs for the new LMs. This is needed because the 164 | # train_kwargs of the optimizer is configured for the original LMs. 165 | new_lms = [pred.lm for pred in student.predictors()] 166 | for original_lm, new_lm in zip(original_lms, new_lms, strict=False): 167 | original_params = self.weight_optimizer.train_kwargs[original_lm] 168 | self.weight_optimizer.train_kwargs[new_lm] = original_params 169 | 170 | return student 171 | ``` -------------------------------------------------------------------------------- /dspy/clients/embedding.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Any, Callable 2 | 3 | import litellm 4 | import numpy as np 5 | 6 | from dspy.clients.cache import request_cache 7 | 8 | 9 | class Embedder: 10 | """DSPy embedding class. 11 | 12 | The class for computing embeddings for text inputs. This class provides a unified interface for both: 13 | 14 | 1. Hosted embedding models (e.g. OpenAI's text-embedding-3-small) via litellm integration 15 | 2. Custom embedding functions that you provide 16 | 17 | For hosted models, simply pass the model name as a string (e.g., "openai/text-embedding-3-small"). The class will use 18 | litellm to handle the API calls and caching. 19 | 20 | For custom embedding models, pass a callable function that: 21 | - Takes a list of strings as input. 22 | - Returns embeddings as either: 23 | - A 2D numpy array of float32 values 24 | - A 2D list of float32 values 25 | - Each row should represent one embedding vector 26 | 27 | Args: 28 | model: The embedding model to use. This can be either a string (representing the name of the hosted embedding 29 | model, must be an embedding model supported by litellm) or a callable that represents a custom embedding 30 | model. 31 | batch_size (int, optional): The default batch size for processing inputs in batches. Defaults to 200. 32 | caching (bool, optional): Whether to cache the embedding response when using a hosted model. Defaults to True. 33 | **kwargs: Additional default keyword arguments to pass to the embedding model. 34 | 35 | Examples: 36 | Example 1: Using a hosted model. 37 | 38 | ```python 39 | import dspy 40 | 41 | embedder = dspy.Embedder("openai/text-embedding-3-small", batch_size=100) 42 | embeddings = embedder(["hello", "world"]) 43 | 44 | assert embeddings.shape == (2, 1536) 45 | ``` 46 | 47 | Example 2: Using any local embedding model, e.g. from https://huggingface.co/models?library=sentence-transformers. 
48 | 49 | ```python 50 | # pip install sentence_transformers 51 | import dspy 52 | from sentence_transformers import SentenceTransformer 53 | 54 | # Load an extremely efficient local model for retrieval 55 | model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu") 56 | 57 | embedder = dspy.Embedder(model.encode) 58 | embeddings = embedder(["hello", "world"], batch_size=1) 59 | 60 | assert embeddings.shape == (2, 1024) 61 | ``` 62 | 63 | Example 3: Using a custom function. 64 | 65 | ```python 66 | import dspy 67 | import numpy as np 68 | 69 | def my_embedder(texts): 70 | return np.random.rand(len(texts), 10) 71 | 72 | embedder = dspy.Embedder(my_embedder) 73 | embeddings = embedder(["hello", "world"], batch_size=1) 74 | 75 | assert embeddings.shape == (2, 10) 76 | ``` 77 | """ 78 | 79 | def __init__(self, model: str | Callable, batch_size: int = 200, caching: bool = True, **kwargs: dict[str, Any]): 80 | self.model = model 81 | self.batch_size = batch_size 82 | self.caching = caching 83 | self.default_kwargs = kwargs 84 | 85 | def _preprocess(self, inputs, batch_size=None, caching=None, **kwargs): 86 | if isinstance(inputs, str): 87 | is_single_input = True 88 | inputs = [inputs] 89 | else: 90 | is_single_input = False 91 | 92 | if not all(isinstance(inp, str) for inp in inputs): 93 | raise ValueError("All inputs must be strings.") 94 | 95 | batch_size = batch_size or self.batch_size 96 | caching = caching or self.caching 97 | merged_kwargs = self.default_kwargs.copy() 98 | merged_kwargs.update(kwargs) 99 | 100 | input_batches = [] 101 | for i in range(0, len(inputs), batch_size): 102 | input_batches.append(inputs[i : i + batch_size]) 103 | 104 | return input_batches, caching, merged_kwargs, is_single_input 105 | 106 | def _postprocess(self, embeddings_list, is_single_input): 107 | embeddings = np.array(embeddings_list, dtype=np.float32) 108 | if is_single_input: 109 | return embeddings[0] 110 | else: 111 | return np.array(embeddings, dtype=np.float32) 112 | 113 | def __call__(self, inputs: str | list[str], batch_size: int | None = None, caching: bool | None = None, **kwargs: dict[str, Any]) -> np.ndarray: 114 | """Compute embeddings for the given inputs. 115 | 116 | Args: 117 | inputs: The inputs to compute embeddings for, can be a single string or a list of strings. 118 | batch_size (int, optional): The batch size for processing inputs. If None, defaults to the batch_size set 119 | during initialization. 120 | caching (bool, optional): Whether to cache the embedding response when using a hosted model. If None, 121 | defaults to the caching setting from initialization. 122 | kwargs: Additional keyword arguments to pass to the embedding model. These will override the default 123 | kwargs provided during initialization. 124 | 125 | Returns: 126 | numpy.ndarray: If the input is a single string, returns a 1D numpy array representing the embedding. 127 | If the input is a list of strings, returns a 2D numpy array of embeddings, one embedding per row. 
128 | """ 129 | input_batches, caching, kwargs, is_single_input = self._preprocess(inputs, batch_size, caching, **kwargs) 130 | 131 | compute_embeddings = _cached_compute_embeddings if caching else _compute_embeddings 132 | 133 | embeddings_list = [] 134 | 135 | for batch in input_batches: 136 | embeddings_list.extend(compute_embeddings(self.model, batch, caching=caching, **kwargs)) 137 | return self._postprocess(embeddings_list, is_single_input) 138 | 139 | async def acall(self, inputs, batch_size=None, caching=None, **kwargs): 140 | input_batches, caching, kwargs, is_single_input = self._preprocess(inputs, batch_size, caching, **kwargs) 141 | 142 | embeddings_list = [] 143 | acompute_embeddings = _cached_acompute_embeddings if caching else _acompute_embeddings 144 | 145 | for batch in input_batches: 146 | embeddings_list.extend(await acompute_embeddings(self.model, batch, caching=caching, **kwargs)) 147 | return self._postprocess(embeddings_list, is_single_input) 148 | 149 | 150 | def _compute_embeddings(model, batch_inputs, caching=False, **kwargs): 151 | if isinstance(model, str): 152 | caching = caching and litellm.cache is not None 153 | embedding_response = litellm.embedding(model=model, input=batch_inputs, caching=caching, **kwargs) 154 | return [data["embedding"] for data in embedding_response.data] 155 | elif callable(model): 156 | return model(batch_inputs, **kwargs) 157 | else: 158 | raise ValueError(f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(model)}.") 159 | 160 | 161 | @request_cache(ignored_args_for_cache_key=["api_key", "api_base", "base_url"]) 162 | def _cached_compute_embeddings(model, batch_inputs, caching=True, **kwargs): 163 | return _compute_embeddings(model, batch_inputs, caching=caching, **kwargs) 164 | 165 | 166 | async def _acompute_embeddings(model, batch_inputs, caching=False, **kwargs): 167 | if isinstance(model, str): 168 | caching = caching and litellm.cache is not None 169 | embedding_response = await litellm.aembedding(model=model, input=batch_inputs, caching=caching, **kwargs) 170 | return [data["embedding"] for data in embedding_response.data] 171 | elif callable(model): 172 | return model(batch_inputs, **kwargs) 173 | else: 174 | raise ValueError(f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(model)}.") 175 | 176 | 177 | @request_cache(ignored_args_for_cache_key=["api_key", "api_base", "base_url"]) 178 | async def _cached_acompute_embeddings(model, batch_inputs, caching=True, **kwargs): 179 | return await _acompute_embeddings(model, batch_inputs, caching=caching, **kwargs) 180 | ``` -------------------------------------------------------------------------------- /dspy/utils/dummies.py: -------------------------------------------------------------------------------- ```python 1 | import random 2 | from collections import defaultdict 3 | from typing import Any 4 | 5 | import numpy as np 6 | 7 | from dspy.adapters.chat_adapter import FieldInfoWithName, field_header_pattern 8 | from dspy.clients.lm import LM 9 | from dspy.dsp.utils.utils import dotdict 10 | from dspy.signatures.field import OutputField 11 | from dspy.utils.callback import with_callbacks 12 | 13 | 14 | class DummyLM(LM): 15 | """Dummy language model for unit testing purposes. 
16 | 17 | Three modes of operation: 18 | 19 | Mode 1: List of dictionaries 20 | 21 | If a list of dictionaries is provided, the dummy model will return the next dictionary 22 | in the list for each request, formatted according to the `format_field_with_value` function. 23 | 24 | Example: 25 | 26 | ``` 27 | lm = DummyLM([{"answer": "red"}, {"answer": "blue"}]) 28 | dspy.settings.configure(lm=lm) 29 | predictor("What color is the sky?") 30 | # Output: 31 | # [[## answer ##]] 32 | # red 33 | predictor("What color is the sky?") 34 | # Output: 35 | # [[## answer ##]] 36 | # blue 37 | ``` 38 | 39 | Mode 2: Dictionary of dictionaries 40 | 41 | If a dictionary of dictionaries is provided, the dummy model will return the value 42 | corresponding to the key which is contained with the final message of the prompt, 43 | formatted according to the `format_field_with_value` function from the chat adapter. 44 | 45 | ``` 46 | lm = DummyLM({"What color is the sky?": {"answer": "blue"}}) 47 | dspy.settings.configure(lm=lm) 48 | predictor("What color is the sky?") 49 | # Output: 50 | # [[## answer ##]] 51 | # blue 52 | ``` 53 | 54 | Mode 3: Follow examples 55 | 56 | If `follow_examples` is set to True, and the prompt contains an example input exactly equal to the prompt, 57 | the dummy model will return the output from that example. 58 | 59 | ``` 60 | lm = DummyLM([{"answer": "red"}], follow_examples=True) 61 | dspy.settings.configure(lm=lm) 62 | predictor("What color is the sky?, demos=dspy.Example(input="What color is the sky?", output="blue")) 63 | # Output: 64 | # [[## answer ##]] 65 | # blue 66 | ``` 67 | 68 | """ 69 | 70 | def __init__(self, answers: list[dict[str, Any]] | dict[str, dict[str, Any]], follow_examples: bool = False, adapter=None): 71 | super().__init__("dummy", "chat", 0.0, 1000, True) 72 | self.answers = answers 73 | if isinstance(answers, list): 74 | self.answers = iter(answers) 75 | self.follow_examples = follow_examples 76 | 77 | # Set adapter, defaulting to ChatAdapter 78 | if adapter is None: 79 | from dspy.adapters.chat_adapter import ChatAdapter 80 | adapter = ChatAdapter() 81 | self.adapter = adapter 82 | 83 | def _use_example(self, messages): 84 | # find all field names 85 | fields = defaultdict(int) 86 | for message in messages: 87 | if "content" in message: 88 | if ma := field_header_pattern.match(message["content"]): 89 | fields[message["content"][ma.start() : ma.end()]] += 1 90 | # find the fields which are missing from the final turns 91 | max_count = max(fields.values()) 92 | output_fields = [field for field, count in fields.items() if count != max_count] 93 | 94 | # get the output from the last turn that has the output fields as headers 95 | final_input = messages[-1]["content"].split("\n\n")[0] 96 | for input, output in zip(reversed(messages[:-1]), reversed(messages), strict=False): 97 | if any(field in output["content"] for field in output_fields) and final_input in input["content"]: 98 | return output["content"] 99 | 100 | @with_callbacks 101 | def __call__(self, prompt=None, messages=None, **kwargs): 102 | def format_answer_fields(field_names_and_values: dict[str, Any]): 103 | fields_with_values = { 104 | FieldInfoWithName(name=field_name, info=OutputField()): value 105 | for field_name, value in field_names_and_values.items() 106 | } 107 | # The reason why DummyLM needs an adapter is because it needs to know which output format to mimic. 108 | # Normally LMs should not have any knowledge of an adapter, because the output format is defined in the prompt. 
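# For example, a ChatAdapter-style response uses "[[ ## field ## ]]" headers (as in the docstring examples above), whereas a JSON-oriented adapter expects a JSON object, so the dummy completion must match whichever format the configured adapter parses.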
109 | adapter = self.adapter 110 | 111 | # Try to use role="assistant" if the adapter supports it (like JSONAdapter) 112 | try: 113 | return adapter.format_field_with_value(fields_with_values, role="assistant") 114 | except TypeError: 115 | # Fallback for adapters that don't support role parameter (like ChatAdapter) 116 | return adapter.format_field_with_value(fields_with_values) 117 | 118 | # Build the request. 119 | outputs = [] 120 | for _ in range(kwargs.get("n", 1)): 121 | messages = messages or [{"role": "user", "content": prompt}] 122 | kwargs = {**self.kwargs, **kwargs} 123 | 124 | if self.follow_examples: 125 | outputs.append(self._use_example(messages)) 126 | elif isinstance(self.answers, dict): 127 | outputs.append( 128 | next( 129 | (format_answer_fields(v) for k, v in self.answers.items() if k in messages[-1]["content"]), 130 | "No more responses", 131 | ) 132 | ) 133 | else: 134 | outputs.append(format_answer_fields(next(self.answers, {"answer": "No more responses"}))) 135 | 136 | # Logging, with removed api key & where `cost` is None on cache hit. 137 | kwargs = {k: v for k, v in kwargs.items() if not k.startswith("api_")} 138 | entry = {"prompt": prompt, "messages": messages, "kwargs": kwargs} 139 | entry = {**entry, "outputs": outputs, "usage": 0} 140 | entry = {**entry, "cost": 0} 141 | self.update_history(entry) 142 | 143 | return outputs 144 | 145 | async def acall(self, prompt=None, messages=None, **kwargs): 146 | return self.__call__(prompt=prompt, messages=messages, **kwargs) 147 | 148 | def get_convo(self, index): 149 | """Get the prompt + answer from the ith message.""" 150 | return self.history[index]["messages"], self.history[index]["outputs"] 151 | 152 | 153 | def dummy_rm(passages=()) -> callable: 154 | if not passages: 155 | 156 | def inner(query: str, *, k: int, **kwargs): 157 | raise ValueError("No passages defined") 158 | 159 | return inner 160 | max_length = max(map(len, passages)) + 100 161 | vectorizer = DummyVectorizer(max_length) 162 | passage_vecs = vectorizer(passages) 163 | 164 | def inner(query: str, *, k: int, **kwargs): 165 | assert k <= len(passages) 166 | query_vec = vectorizer([query])[0] 167 | scores = passage_vecs @ query_vec 168 | largest_idx = (-scores).argsort()[:k] 169 | 170 | return [dotdict(long_text=passages[i]) for i in largest_idx] 171 | 172 | return inner 173 | 174 | 175 | class DummyVectorizer: 176 | """Simple vectorizer based on n-grams.""" 177 | 178 | def __init__(self, max_length=100, n_gram=2): 179 | self.max_length = max_length 180 | self.n_gram = n_gram 181 | self.P = 10**9 + 7 # A large prime number 182 | random.seed(123) 183 | self.coeffs = [random.randrange(1, self.P) for _ in range(n_gram)] 184 | 185 | def _hash(self, gram): 186 | """Hashes a string using a polynomial hash function.""" 187 | h = 1 188 | for coeff, c in zip(self.coeffs, gram, strict=False): 189 | h = h * coeff + ord(c) 190 | h %= self.P 191 | return h % self.max_length 192 | 193 | def __call__(self, texts: list[str]) -> np.ndarray: 194 | vecs = [] 195 | for text in texts: 196 | grams = [text[i : i + self.n_gram] for i in range(len(text) - self.n_gram + 1)] 197 | vec = [0] * self.max_length 198 | for gram in grams: 199 | vec[self._hash(gram)] += 1 200 | vecs.append(vec) 201 | 202 | vecs = np.array(vecs, dtype=np.float32) 203 | vecs -= np.mean(vecs, axis=1, keepdims=True) 204 | vecs /= np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-10 # Added epsilon to avoid division by zero 205 | return vecs 206 | ``` 
-------------------------------------------------------------------------------- /tests/utils/test_settings.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import time 3 | from concurrent.futures import ThreadPoolExecutor 4 | from unittest import mock 5 | 6 | import pytest 7 | from litellm import Choices, Message, ModelResponse 8 | 9 | import dspy 10 | 11 | 12 | def test_basic_dspy_settings(): 13 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 14 | assert dspy.settings.lm.model == "openai/gpt-4o" 15 | assert isinstance(dspy.settings.adapter, dspy.JSONAdapter) 16 | assert len(dspy.settings.callbacks) == 1 17 | 18 | 19 | def test_forbid_configure_call_in_child_thread(): 20 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 21 | 22 | def worker(): 23 | with pytest.raises(RuntimeError, match="Cannot call dspy.configure"): 24 | dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"), callbacks=[]) 25 | 26 | with ThreadPoolExecutor(max_workers=1) as executor: 27 | executor.submit(worker) 28 | 29 | 30 | def test_dspy_context(): 31 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 32 | with dspy.context(lm=dspy.LM("openai/gpt-4o-mini"), callbacks=[]): 33 | assert dspy.settings.lm.model == "openai/gpt-4o-mini" 34 | assert len(dspy.settings.callbacks) == 0 35 | 36 | assert dspy.settings.lm.model == "openai/gpt-4o" 37 | assert len(dspy.settings.callbacks) == 1 38 | 39 | 40 | def test_dspy_context_parallel(): 41 | dspy.configure(lm=dspy.LM("openai/gpt-4o"), adapter=dspy.JSONAdapter(), callbacks=[lambda x: x]) 42 | 43 | def worker(i): 44 | with dspy.context(lm=dspy.LM("openai/gpt-4o-mini"), trace=[i], callbacks=[]): 45 | assert dspy.settings.lm.model == "openai/gpt-4o-mini" 46 | assert dspy.settings.trace == [i] 47 | assert len(dspy.settings.callbacks) == 0 48 | 49 | with ThreadPoolExecutor(max_workers=5) as executor: 50 | executor.map(worker, range(3)) 51 | 52 | assert dspy.settings.lm.model == "openai/gpt-4o" 53 | assert len(dspy.settings.callbacks) == 1 54 | 55 | 56 | def test_dspy_context_with_dspy_parallel(): 57 | dspy.configure(lm=dspy.LM("openai/gpt-4o", cache=False), adapter=dspy.ChatAdapter()) 58 | 59 | class MyModule(dspy.Module): 60 | def __init__(self): 61 | self.predict = dspy.Predict("question -> answer") 62 | 63 | def forward(self, question: str) -> str: 64 | lm = dspy.LM("openai/gpt-4o-mini", cache=False) if "France" in question else dspy.settings.lm 65 | with dspy.context(lm=lm): 66 | time.sleep(1) 67 | assert dspy.settings.lm.model == lm.model 68 | return self.predict(question=question) 69 | 70 | with mock.patch("litellm.completion") as mock_completion: 71 | mock_completion.return_value = ModelResponse( 72 | choices=[Choices(message=Message(content="[[ ## answer ## ]]\nParis"))], 73 | model="openai/gpt-4o-mini", 74 | ) 75 | 76 | module = MyModule() 77 | parallelizer = dspy.Parallel() 78 | input_pairs = [ 79 | (module, {"question": "What is the capital of France?"}), 80 | (module, {"question": "What is the capital of Germany?"}), 81 | ] 82 | parallelizer(input_pairs) 83 | 84 | # Verify mock was called correctly 85 | assert mock_completion.call_count == 2 86 | for call_args in mock_completion.call_args_list: 87 | if "France" in call_args.kwargs["messages"][-1]["content"]: 88 | # France question uses gpt-4o-mini 89 | assert call_args.kwargs["model"] == "openai/gpt-4o-mini" 90 | 
else: 91 | # Germany question uses gpt-4o 92 | assert call_args.kwargs["model"] == "openai/gpt-4o" 93 | 94 | # The main thread is not affected by the context 95 | assert dspy.settings.lm.model == "openai/gpt-4o" 96 | 97 | 98 | @pytest.mark.asyncio 99 | async def test_dspy_context_with_async_task_group(): 100 | class MyModule(dspy.Module): 101 | def __init__(self): 102 | self.predict = dspy.Predict("question -> answer") 103 | 104 | async def aforward(self, question: str) -> str: 105 | lm = ( 106 | dspy.LM("openai/gpt-4o-mini", cache=False) 107 | if "France" in question 108 | else dspy.LM("openai/gpt-4o", cache=False) 109 | ) 110 | with dspy.context(lm=lm, trace=[]): 111 | await asyncio.sleep(1) 112 | assert dspy.settings.lm.model == lm.model 113 | result = await self.predict.acall(question=question) 114 | assert len(dspy.settings.trace) == 1 115 | return result 116 | 117 | module = MyModule() 118 | 119 | with dspy.context(lm=dspy.LM("openai/gpt-4.1", cache=False), adapter=dspy.ChatAdapter()): 120 | with mock.patch("litellm.acompletion") as mock_completion: 121 | mock_completion.return_value = ModelResponse( 122 | choices=[Choices(message=Message(content="[[ ## answer ## ]]\nParis"))], 123 | model="openai/gpt-4o-mini", 124 | ) 125 | 126 | # Define the coroutines to be run 127 | coroutines = [ 128 | module.acall(question="What is the capital of France?"), 129 | module.acall(question="What is the capital of France?"), 130 | module.acall(question="What is the capital of Germany?"), 131 | module.acall(question="What is the capital of Germany?"), 132 | ] 133 | 134 | # Run them concurrently and gather results 135 | results = await asyncio.gather(*coroutines) 136 | 137 | assert results[0].answer == "Paris" 138 | assert results[1].answer == "Paris" 139 | assert results[2].answer == "Paris" 140 | assert results[3].answer == "Paris" 141 | 142 | # Verify mock was called correctly 143 | assert mock_completion.call_count == 4 144 | # France question uses gpt-4o-mini 145 | assert mock_completion.call_args_list[0].kwargs["model"] == "openai/gpt-4o-mini" 146 | assert mock_completion.call_args_list[1].kwargs["model"] == "openai/gpt-4o-mini" 147 | # Germany question uses gpt-4o 148 | assert mock_completion.call_args_list[2].kwargs["model"] == "openai/gpt-4o" 149 | assert mock_completion.call_args_list[3].kwargs["model"] == "openai/gpt-4o" 150 | 151 | # The main thread is not affected by the context 152 | assert dspy.settings.lm.model == "openai/gpt-4.1" 153 | assert dspy.settings.trace == [] 154 | 155 | 156 | @pytest.mark.asyncio 157 | async def test_dspy_configure_allowance_async(): 158 | def bar1(): 159 | # `dspy.configure` is disallowed in different async tasks from the initial one. 160 | # In this case, foo1 (async) calls bar1 (sync), and bar1 uses the async task from foo1. 161 | with pytest.raises(RuntimeError) as e: 162 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 163 | assert "dspy.settings.configure(...) can only be called from the same async" in str(e.value) 164 | 165 | async def foo1(): 166 | bar1() 167 | await asyncio.sleep(0.1) 168 | 169 | async def foo2(): 170 | # `dspy.configure` is disallowed in different async tasks from the initial one. 171 | with pytest.raises(RuntimeError) as e: 172 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 173 | assert "dspy.settings.configure(...) can only be called from the same async" in str(e.value) 174 | await asyncio.sleep(0.1) 175 | 176 | async def foo3(): 177 | # `dspy.context` is allowed in different async tasks from the initial one. 
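# Unlike `dspy.configure`, the `dspy.context` manager only overrides settings for the enclosed block, so it is safe to use from any thread or async task.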
178 | with dspy.context(lm=dspy.LM("openai/gpt-4o")): 179 | await asyncio.sleep(0.1) 180 | 181 | async def foo4(): 182 | # foo4 is directly invoked by the entry task, so it has the same async task as the entry task. 183 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 184 | await asyncio.sleep(0.1) 185 | 186 | # `dspy.configure` is allowed to be called multiple times in the same async task. 187 | dspy.configure(lm=dspy.LM("openai/gpt-4o-mini")) 188 | dspy.configure(lm=dspy.LM("openai/gpt-4o")) 189 | dspy.configure(adapter=dspy.JSONAdapter()) 190 | 191 | await asyncio.gather(foo1(), foo2(), foo3()) 192 | 193 | foo4() 194 | ```