langfuse


Langfuse Python SDK


Installation

Important

The SDK was rewritten in v3 and released in June 2025. Refer to the v3 migration guide for instructions on updating your code.

pip install langfuse

Docs

Please see our docs for detailed information on this SDK.
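
For a quick start, here is a minimal sketch (assuming your credentials are set via the LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, and LANGFUSE_BASE_URL environment variables; observe, get_client, and start_as_current_observation are part of the public API documented below):

```python
from langfuse import get_client, observe


@observe()  # traces this function call as a span
def answer(question: str) -> str:
    langfuse = get_client()  # singleton client configured from environment variables

    # Record the LLM call as a nested generation-type observation
    with langfuse.start_as_current_observation(
        name="answer-generation",
        as_type="generation",
        model="gpt-4",
        input={"question": question},
    ) as generation:
        response = "..."  # call your LLM here
        generation.update(output=response)

    return response


answer("Tell me about AI")
```

Spans are batched before being sent, so short-lived scripts may need to flush the client before exiting; see the batching and flush settings in the client documentation below.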

 1""".. include:: ../README.md"""
 2
 3from langfuse.batch_evaluation import (
 4    BatchEvaluationResult,
 5    BatchEvaluationResumeToken,
 6    CompositeEvaluatorFunction,
 7    EvaluatorInputs,
 8    EvaluatorStats,
 9    MapperFunction,
10)
11from langfuse.experiment import Evaluation
12
13from ._client import client as _client_module
14from ._client.attributes import LangfuseOtelSpanAttributes
15from ._client.constants import ObservationTypeLiteral
16from ._client.get_client import get_client
17from ._client.observe import observe
18from ._client.propagation import propagate_attributes
19from ._client.span import (
20    LangfuseAgent,
21    LangfuseChain,
22    LangfuseEmbedding,
23    LangfuseEvaluator,
24    LangfuseEvent,
25    LangfuseGeneration,
26    LangfuseGuardrail,
27    LangfuseRetriever,
28    LangfuseSpan,
29    LangfuseTool,
30)
31
32Langfuse = _client_module.Langfuse
33
34__all__ = [
35    "Langfuse",
36    "get_client",
37    "observe",
38    "propagate_attributes",
39    "ObservationTypeLiteral",
40    "LangfuseSpan",
41    "LangfuseGeneration",
42    "LangfuseEvent",
43    "LangfuseOtelSpanAttributes",
44    "LangfuseAgent",
45    "LangfuseTool",
46    "LangfuseChain",
47    "LangfuseEmbedding",
48    "LangfuseEvaluator",
49    "LangfuseRetriever",
50    "LangfuseGuardrail",
51    "Evaluation",
52    "EvaluatorInputs",
53    "MapperFunction",
54    "CompositeEvaluatorFunction",
55    "EvaluatorStats",
56    "BatchEvaluationResumeToken",
57    "BatchEvaluationResult",
58    "experiment",
59    "api",
60]
class Langfuse:
 134class Langfuse:
 135    """Main client for Langfuse tracing and platform features.
 136
 137    This class provides an interface for creating and managing traces, spans,
 138    and generations in Langfuse as well as interacting with the Langfuse API.
 139
 140    The client features a thread-safe singleton pattern for each unique public API key,
 141    ensuring consistent trace context propagation across your application. It implements
 142    efficient batching of spans with configurable flush settings and includes background
 143    thread management for media uploads and score ingestion.
 144
 145    Configuration is flexible through either direct parameters or environment variables,
 146    with graceful fallbacks and runtime configuration updates.
 147
 148    Attributes:
 149        api: Synchronous API client for Langfuse backend communication
 150        async_api: Asynchronous API client for Langfuse backend communication
 151        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
 152
 153    Parameters:
 154        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
 155        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
 156        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
 157        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
 158        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds. Can also be set via LANGFUSE_TIMEOUT environment variable.
 159        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
 160        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
 161        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
 162        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
 163        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
 164        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
 165        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
 166        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
 167        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
 168        mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
 169        blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (`metadata.scope.name`)
 170        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
 171        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. Useful for keeping Langfuse tracing disconnected from other OpenTelemetry-span-emitting libraries. Note: To track active spans, the context is still shared between TracerProviders, which may lead to broken trace trees.
 172
 173    Example:
 174        ```python
 175        from langfuse import Langfuse
 176
 177        # Initialize the client (reads from env vars if not provided)
 178        langfuse = Langfuse(
 179            public_key="your-public-key",
 180            secret_key="your-secret-key",
 181            base_url="https://cloud.langfuse.com",  # Optional, default shown
 182        )
 183
 184        # Create a trace span
 185        with langfuse.start_as_current_span(name="process-query") as span:
 186            # Your application code here
 187
 188            # Create a nested generation span for an LLM call
 189            with span.start_as_current_generation(
 190                name="generate-response",
 191                model="gpt-4",
 192                input={"query": "Tell me about AI"},
 193                model_parameters={"temperature": 0.7, "max_tokens": 500}
 194            ) as generation:
 195                # Generate response here
 196                response = "AI is a field of computer science..."
 197
 198                generation.update(
 199                    output=response,
 200                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
 201                    cost_details={"total_cost": 0.0023}
 202                )
 203
 204                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
 205                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
 206        ```
 207    """
 208
 209    _resources: Optional[LangfuseResourceManager] = None
 210    _mask: Optional[MaskFunction] = None
 211    _otel_tracer: otel_trace_api.Tracer
 212
 213    def __init__(
 214        self,
 215        *,
 216        public_key: Optional[str] = None,
 217        secret_key: Optional[str] = None,
 218        base_url: Optional[str] = None,
 219        host: Optional[str] = None,
 220        timeout: Optional[int] = None,
 221        httpx_client: Optional[httpx.Client] = None,
 222        debug: bool = False,
 223        tracing_enabled: Optional[bool] = True,
 224        flush_at: Optional[int] = None,
 225        flush_interval: Optional[float] = None,
 226        environment: Optional[str] = None,
 227        release: Optional[str] = None,
 228        media_upload_thread_count: Optional[int] = None,
 229        sample_rate: Optional[float] = None,
 230        mask: Optional[MaskFunction] = None,
 231        blocked_instrumentation_scopes: Optional[List[str]] = None,
 232        additional_headers: Optional[Dict[str, str]] = None,
 233        tracer_provider: Optional[TracerProvider] = None,
 234    ):
 235        self._base_url = (
 236            base_url
 237            or os.environ.get(LANGFUSE_BASE_URL)
 238            or host
 239            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
 240        )
 241        self._environment = environment or cast(
 242            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
 243        )
 244        self._project_id: Optional[str] = None
 245        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
 246        if not 0.0 <= sample_rate <= 1.0:
 247            raise ValueError(
 248                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
 249            )
 250
 251        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
 252
 253        self._tracing_enabled = (
 254            tracing_enabled
 255            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
 256        )
 257        if not self._tracing_enabled:
 258            langfuse_logger.info(
 259                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
 260            )
 261
 262        debug = (
 263            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
 264        )
 265        if debug:
 266            logging.basicConfig(
 267                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 268            )
 269            langfuse_logger.setLevel(logging.DEBUG)
 270
 271        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
 272        if public_key is None:
 273            langfuse_logger.warning(
 274                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
 275                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
 276            )
 277            self._otel_tracer = otel_trace_api.NoOpTracer()
 278            return
 279
 280        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
 281        if secret_key is None:
 282            langfuse_logger.warning(
 283                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
 284                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
 285            )
 286            self._otel_tracer = otel_trace_api.NoOpTracer()
 287            return
 288
 289        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
 290            langfuse_logger.warning(
 291                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
 292            )
 293
 294        # Initialize api and tracer if requirements are met
 295        self._resources = LangfuseResourceManager(
 296            public_key=public_key,
 297            secret_key=secret_key,
 298            base_url=self._base_url,
 299            timeout=timeout,
 300            environment=self._environment,
 301            release=release,
 302            flush_at=flush_at,
 303            flush_interval=flush_interval,
 304            httpx_client=httpx_client,
 305            media_upload_thread_count=media_upload_thread_count,
 306            sample_rate=sample_rate,
 307            mask=mask,
 308            tracing_enabled=self._tracing_enabled,
 309            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
 310            additional_headers=additional_headers,
 311            tracer_provider=tracer_provider,
 312        )
 313        self._mask = self._resources.mask
 314
 315        self._otel_tracer = (
 316            self._resources.tracer
 317            if self._tracing_enabled and self._resources.tracer is not None
 318            else otel_trace_api.NoOpTracer()
 319        )
 320        self.api = self._resources.api
 321        self.async_api = self._resources.async_api
 322
 323    def start_span(
 324        self,
 325        *,
 326        trace_context: Optional[TraceContext] = None,
 327        name: str,
 328        input: Optional[Any] = None,
 329        output: Optional[Any] = None,
 330        metadata: Optional[Any] = None,
 331        version: Optional[str] = None,
 332        level: Optional[SpanLevel] = None,
 333        status_message: Optional[str] = None,
 334    ) -> LangfuseSpan:
 335        """Create a new span for tracing a unit of work.
 336
 337        This method creates a new span but does not set it as the current span in the
 338        context. To create and use a span within a context, use start_as_current_span().
 339
 340        The created span will be the child of the current span in the context.
 341
 342        Args:
 343            trace_context: Optional context for connecting to an existing trace
 344            name: Name of the span (e.g., function or operation name)
 345            input: Input data for the operation (can be any JSON-serializable object)
 346            output: Output data from the operation (can be any JSON-serializable object)
 347            metadata: Additional metadata to associate with the span
 348            version: Version identifier for the code or component
 349            level: Importance level of the span (info, warning, error)
 350            status_message: Optional status message for the span
 351
 352        Returns:
 353            A LangfuseSpan object that must be ended with .end() when the operation completes
 354
 355        Example:
 356            ```python
 357            span = langfuse.start_span(name="process-data")
 358            try:
 359                # Do work
 360                span.update(output="result")
 361            finally:
 362                span.end()
 363            ```
 364        """
 365        return self.start_observation(
 366            trace_context=trace_context,
 367            name=name,
 368            as_type="span",
 369            input=input,
 370            output=output,
 371            metadata=metadata,
 372            version=version,
 373            level=level,
 374            status_message=status_message,
 375        )
 376
 377    def start_as_current_span(
 378        self,
 379        *,
 380        trace_context: Optional[TraceContext] = None,
 381        name: str,
 382        input: Optional[Any] = None,
 383        output: Optional[Any] = None,
 384        metadata: Optional[Any] = None,
 385        version: Optional[str] = None,
 386        level: Optional[SpanLevel] = None,
 387        status_message: Optional[str] = None,
 388        end_on_exit: Optional[bool] = None,
 389    ) -> _AgnosticContextManager[LangfuseSpan]:
 390        """Create a new span and set it as the current span in a context manager.
 391
 392        This method creates a new span and sets it as the current span within a context
 393        manager. Use this method with a 'with' statement to automatically handle span
 394        lifecycle within a code block.
 395
 396        The created span will be the child of the current span in the context.
 397
 398        Args:
 399            trace_context: Optional context for connecting to an existing trace
 400            name: Name of the span (e.g., function or operation name)
 401            input: Input data for the operation (can be any JSON-serializable object)
 402            output: Output data from the operation (can be any JSON-serializable object)
 403            metadata: Additional metadata to associate with the span
 404            version: Version identifier for the code or component
 405            level: Importance level of the span (info, warning, error)
 406            status_message: Optional status message for the span
 407            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 408
 409        Returns:
 410            A context manager that yields a LangfuseSpan
 411
 412        Example:
 413            ```python
 414            with langfuse.start_as_current_span(name="process-query") as span:
 415                # Do work
 416                result = process_data()
 417                span.update(output=result)
 418
 419                # Create a child span automatically
 420                with span.start_as_current_span(name="sub-operation") as child_span:
 421                    # Do sub-operation work
 422                    child_span.update(output="sub-result")
 423            ```
 424        """
 425        return self.start_as_current_observation(
 426            trace_context=trace_context,
 427            name=name,
 428            as_type="span",
 429            input=input,
 430            output=output,
 431            metadata=metadata,
 432            version=version,
 433            level=level,
 434            status_message=status_message,
 435            end_on_exit=end_on_exit,
 436        )
 437
 438    @overload
 439    def start_observation(
 440        self,
 441        *,
 442        trace_context: Optional[TraceContext] = None,
 443        name: str,
 444        as_type: Literal["generation"],
 445        input: Optional[Any] = None,
 446        output: Optional[Any] = None,
 447        metadata: Optional[Any] = None,
 448        version: Optional[str] = None,
 449        level: Optional[SpanLevel] = None,
 450        status_message: Optional[str] = None,
 451        completion_start_time: Optional[datetime] = None,
 452        model: Optional[str] = None,
 453        model_parameters: Optional[Dict[str, MapValue]] = None,
 454        usage_details: Optional[Dict[str, int]] = None,
 455        cost_details: Optional[Dict[str, float]] = None,
 456        prompt: Optional[PromptClient] = None,
 457    ) -> LangfuseGeneration: ...
 458
 459    @overload
 460    def start_observation(
 461        self,
 462        *,
 463        trace_context: Optional[TraceContext] = None,
 464        name: str,
 465        as_type: Literal["span"] = "span",
 466        input: Optional[Any] = None,
 467        output: Optional[Any] = None,
 468        metadata: Optional[Any] = None,
 469        version: Optional[str] = None,
 470        level: Optional[SpanLevel] = None,
 471        status_message: Optional[str] = None,
 472    ) -> LangfuseSpan: ...
 473
 474    @overload
 475    def start_observation(
 476        self,
 477        *,
 478        trace_context: Optional[TraceContext] = None,
 479        name: str,
 480        as_type: Literal["agent"],
 481        input: Optional[Any] = None,
 482        output: Optional[Any] = None,
 483        metadata: Optional[Any] = None,
 484        version: Optional[str] = None,
 485        level: Optional[SpanLevel] = None,
 486        status_message: Optional[str] = None,
 487    ) -> LangfuseAgent: ...
 488
 489    @overload
 490    def start_observation(
 491        self,
 492        *,
 493        trace_context: Optional[TraceContext] = None,
 494        name: str,
 495        as_type: Literal["tool"],
 496        input: Optional[Any] = None,
 497        output: Optional[Any] = None,
 498        metadata: Optional[Any] = None,
 499        version: Optional[str] = None,
 500        level: Optional[SpanLevel] = None,
 501        status_message: Optional[str] = None,
 502    ) -> LangfuseTool: ...
 503
 504    @overload
 505    def start_observation(
 506        self,
 507        *,
 508        trace_context: Optional[TraceContext] = None,
 509        name: str,
 510        as_type: Literal["chain"],
 511        input: Optional[Any] = None,
 512        output: Optional[Any] = None,
 513        metadata: Optional[Any] = None,
 514        version: Optional[str] = None,
 515        level: Optional[SpanLevel] = None,
 516        status_message: Optional[str] = None,
 517    ) -> LangfuseChain: ...
 518
 519    @overload
 520    def start_observation(
 521        self,
 522        *,
 523        trace_context: Optional[TraceContext] = None,
 524        name: str,
 525        as_type: Literal["retriever"],
 526        input: Optional[Any] = None,
 527        output: Optional[Any] = None,
 528        metadata: Optional[Any] = None,
 529        version: Optional[str] = None,
 530        level: Optional[SpanLevel] = None,
 531        status_message: Optional[str] = None,
 532    ) -> LangfuseRetriever: ...
 533
 534    @overload
 535    def start_observation(
 536        self,
 537        *,
 538        trace_context: Optional[TraceContext] = None,
 539        name: str,
 540        as_type: Literal["evaluator"],
 541        input: Optional[Any] = None,
 542        output: Optional[Any] = None,
 543        metadata: Optional[Any] = None,
 544        version: Optional[str] = None,
 545        level: Optional[SpanLevel] = None,
 546        status_message: Optional[str] = None,
 547    ) -> LangfuseEvaluator: ...
 548
 549    @overload
 550    def start_observation(
 551        self,
 552        *,
 553        trace_context: Optional[TraceContext] = None,
 554        name: str,
 555        as_type: Literal["embedding"],
 556        input: Optional[Any] = None,
 557        output: Optional[Any] = None,
 558        metadata: Optional[Any] = None,
 559        version: Optional[str] = None,
 560        level: Optional[SpanLevel] = None,
 561        status_message: Optional[str] = None,
 562        completion_start_time: Optional[datetime] = None,
 563        model: Optional[str] = None,
 564        model_parameters: Optional[Dict[str, MapValue]] = None,
 565        usage_details: Optional[Dict[str, int]] = None,
 566        cost_details: Optional[Dict[str, float]] = None,
 567        prompt: Optional[PromptClient] = None,
 568    ) -> LangfuseEmbedding: ...
 569
 570    @overload
 571    def start_observation(
 572        self,
 573        *,
 574        trace_context: Optional[TraceContext] = None,
 575        name: str,
 576        as_type: Literal["guardrail"],
 577        input: Optional[Any] = None,
 578        output: Optional[Any] = None,
 579        metadata: Optional[Any] = None,
 580        version: Optional[str] = None,
 581        level: Optional[SpanLevel] = None,
 582        status_message: Optional[str] = None,
 583    ) -> LangfuseGuardrail: ...
 584
 585    def start_observation(
 586        self,
 587        *,
 588        trace_context: Optional[TraceContext] = None,
 589        name: str,
 590        as_type: ObservationTypeLiteralNoEvent = "span",
 591        input: Optional[Any] = None,
 592        output: Optional[Any] = None,
 593        metadata: Optional[Any] = None,
 594        version: Optional[str] = None,
 595        level: Optional[SpanLevel] = None,
 596        status_message: Optional[str] = None,
 597        completion_start_time: Optional[datetime] = None,
 598        model: Optional[str] = None,
 599        model_parameters: Optional[Dict[str, MapValue]] = None,
 600        usage_details: Optional[Dict[str, int]] = None,
 601        cost_details: Optional[Dict[str, float]] = None,
 602        prompt: Optional[PromptClient] = None,
 603    ) -> Union[
 604        LangfuseSpan,
 605        LangfuseGeneration,
 606        LangfuseAgent,
 607        LangfuseTool,
 608        LangfuseChain,
 609        LangfuseRetriever,
 610        LangfuseEvaluator,
 611        LangfuseEmbedding,
 612        LangfuseGuardrail,
 613    ]:
 614        """Create a new observation of the specified type.
 615
 616        This method creates a new observation but does not set it as the current span in the
 617        context. To create and use an observation within a context, use start_as_current_observation().
 618
 619        Args:
 620            trace_context: Optional context for connecting to an existing trace
 621            name: Name of the observation
 622            as_type: Type of observation to create (defaults to "span")
 623            input: Input data for the operation
 624            output: Output data from the operation
 625            metadata: Additional metadata to associate with the observation
 626            version: Version identifier for the code or component
 627            level: Importance level of the observation
 628            status_message: Optional status message for the observation
 629            completion_start_time: When the model started generating (for generation types)
 630            model: Name/identifier of the AI model used (for generation types)
 631            model_parameters: Parameters used for the model (for generation types)
 632            usage_details: Token usage information (for generation types)
 633            cost_details: Cost information (for generation types)
 634            prompt: Associated prompt template (for generation types)
 635
 636        Returns:
 637            An observation object of the appropriate type that must be ended with .end()
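
        Example:
            A minimal sketch (`search_index` and `query` are placeholders for your own retrieval logic):

            ```python
            retriever = langfuse.start_observation(name="vector-search", as_type="retriever")
            try:
                docs = search_index(query)
                retriever.update(output=docs)
            finally:
                retriever.end()
            ```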
 638        """
 639        if trace_context:
 640            trace_id = trace_context.get("trace_id", None)
 641            parent_span_id = trace_context.get("parent_span_id", None)
 642
 643            if trace_id:
 644                remote_parent_span = self._create_remote_parent_span(
 645                    trace_id=trace_id, parent_span_id=parent_span_id
 646                )
 647
 648                with otel_trace_api.use_span(
 649                    cast(otel_trace_api.Span, remote_parent_span)
 650                ):
 651                    otel_span = self._otel_tracer.start_span(name=name)
 652                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
 653
 654                    return self._create_observation_from_otel_span(
 655                        otel_span=otel_span,
 656                        as_type=as_type,
 657                        input=input,
 658                        output=output,
 659                        metadata=metadata,
 660                        version=version,
 661                        level=level,
 662                        status_message=status_message,
 663                        completion_start_time=completion_start_time,
 664                        model=model,
 665                        model_parameters=model_parameters,
 666                        usage_details=usage_details,
 667                        cost_details=cost_details,
 668                        prompt=prompt,
 669                    )
 670
 671        otel_span = self._otel_tracer.start_span(name=name)
 672
 673        return self._create_observation_from_otel_span(
 674            otel_span=otel_span,
 675            as_type=as_type,
 676            input=input,
 677            output=output,
 678            metadata=metadata,
 679            version=version,
 680            level=level,
 681            status_message=status_message,
 682            completion_start_time=completion_start_time,
 683            model=model,
 684            model_parameters=model_parameters,
 685            usage_details=usage_details,
 686            cost_details=cost_details,
 687            prompt=prompt,
 688        )
 689
 690    def _create_observation_from_otel_span(
 691        self,
 692        *,
 693        otel_span: otel_trace_api.Span,
 694        as_type: ObservationTypeLiteralNoEvent,
 695        input: Optional[Any] = None,
 696        output: Optional[Any] = None,
 697        metadata: Optional[Any] = None,
 698        version: Optional[str] = None,
 699        level: Optional[SpanLevel] = None,
 700        status_message: Optional[str] = None,
 701        completion_start_time: Optional[datetime] = None,
 702        model: Optional[str] = None,
 703        model_parameters: Optional[Dict[str, MapValue]] = None,
 704        usage_details: Optional[Dict[str, int]] = None,
 705        cost_details: Optional[Dict[str, float]] = None,
 706        prompt: Optional[PromptClient] = None,
 707    ) -> Union[
 708        LangfuseSpan,
 709        LangfuseGeneration,
 710        LangfuseAgent,
 711        LangfuseTool,
 712        LangfuseChain,
 713        LangfuseRetriever,
 714        LangfuseEvaluator,
 715        LangfuseEmbedding,
 716        LangfuseGuardrail,
 717    ]:
 718        """Create the appropriate observation type from an OTEL span."""
 719        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
 720            observation_class = self._get_span_class(as_type)
 721            # Type ignore needed to avoid adding overloads to the internal _get_span_class function:
 722            # it may return LangfuseEvent, and the observation classes take different constructor args
 723            return observation_class(  # type: ignore[return-value,call-arg]
 724                otel_span=otel_span,
 725                langfuse_client=self,
 726                environment=self._environment,
 727                input=input,
 728                output=output,
 729                metadata=metadata,
 730                version=version,
 731                level=level,
 732                status_message=status_message,
 733                completion_start_time=completion_start_time,
 734                model=model,
 735                model_parameters=model_parameters,
 736                usage_details=usage_details,
 737                cost_details=cost_details,
 738                prompt=prompt,
 739            )
 740        else:
 741            # For other types (e.g. span, guardrail), create appropriate class without generation properties
 742            observation_class = self._get_span_class(as_type)
 743            # Type ignore needed to avoid adding overloads to the internal _get_span_class function:
 744            # it may return LangfuseEvent, and the observation classes take different constructor args
 745            return observation_class(  # type: ignore[return-value,call-arg]
 746                otel_span=otel_span,
 747                langfuse_client=self,
 748                environment=self._environment,
 749                input=input,
 750                output=output,
 751                metadata=metadata,
 752                version=version,
 753                level=level,
 754                status_message=status_message,
 755            )
 756            # span._observation_type = as_type
 757            # span._otel_span.set_attribute("langfuse.observation.type", as_type)
 758            # return span
 759
 760    def start_generation(
 761        self,
 762        *,
 763        trace_context: Optional[TraceContext] = None,
 764        name: str,
 765        input: Optional[Any] = None,
 766        output: Optional[Any] = None,
 767        metadata: Optional[Any] = None,
 768        version: Optional[str] = None,
 769        level: Optional[SpanLevel] = None,
 770        status_message: Optional[str] = None,
 771        completion_start_time: Optional[datetime] = None,
 772        model: Optional[str] = None,
 773        model_parameters: Optional[Dict[str, MapValue]] = None,
 774        usage_details: Optional[Dict[str, int]] = None,
 775        cost_details: Optional[Dict[str, float]] = None,
 776        prompt: Optional[PromptClient] = None,
 777    ) -> LangfuseGeneration:
 778        """Create a new generation span for model generations.
 779
 780        DEPRECATED: This method is deprecated and will be removed in a future version.
 781        Use start_observation(as_type='generation') instead.
 782
 783        This method creates a specialized span for tracking model generations.
 784        It includes additional fields specific to model generations such as model name,
 785        token usage, and cost details.
 786
 787        The created generation span will be the child of the current span in the context.
 788
 789        Args:
 790            trace_context: Optional context for connecting to an existing trace
 791            name: Name of the generation operation
 792            input: Input data for the model (e.g., prompts)
 793            output: Output from the model (e.g., completions)
 794            metadata: Additional metadata to associate with the generation
 795            version: Version identifier for the model or component
 796            level: Importance level of the generation (info, warning, error)
 797            status_message: Optional status message for the generation
 798            completion_start_time: When the model started generating the response
 799            model: Name/identifier of the AI model used (e.g., "gpt-4")
 800            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 801            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 802            cost_details: Cost information for the model call
 803            prompt: Associated prompt template from Langfuse prompt management
 804
 805        Returns:
 806            A LangfuseGeneration object that must be ended with .end() when complete
 807
 808        Example:
 809            ```python
 810            generation = langfuse.start_generation(
 811                name="answer-generation",
 812                model="gpt-4",
 813                input={"prompt": "Explain quantum computing"},
 814                model_parameters={"temperature": 0.7}
 815            )
 816            try:
 817                # Call model API
 818                response = llm.generate(...)
 819
 820                generation.update(
 821                    output=response.text,
 822                    usage_details={
 823                        "prompt_tokens": response.usage.prompt_tokens,
 824                        "completion_tokens": response.usage.completion_tokens
 825                    }
 826                )
 827            finally:
 828                generation.end()
 829            ```
 830        """
 831        warnings.warn(
 832            "start_generation is deprecated and will be removed in a future version. "
 833            "Use start_observation(as_type='generation') instead.",
 834            DeprecationWarning,
 835            stacklevel=2,
 836        )
 837        return self.start_observation(
 838            trace_context=trace_context,
 839            name=name,
 840            as_type="generation",
 841            input=input,
 842            output=output,
 843            metadata=metadata,
 844            version=version,
 845            level=level,
 846            status_message=status_message,
 847            completion_start_time=completion_start_time,
 848            model=model,
 849            model_parameters=model_parameters,
 850            usage_details=usage_details,
 851            cost_details=cost_details,
 852            prompt=prompt,
 853        )
 854
 855    def start_as_current_generation(
 856        self,
 857        *,
 858        trace_context: Optional[TraceContext] = None,
 859        name: str,
 860        input: Optional[Any] = None,
 861        output: Optional[Any] = None,
 862        metadata: Optional[Any] = None,
 863        version: Optional[str] = None,
 864        level: Optional[SpanLevel] = None,
 865        status_message: Optional[str] = None,
 866        completion_start_time: Optional[datetime] = None,
 867        model: Optional[str] = None,
 868        model_parameters: Optional[Dict[str, MapValue]] = None,
 869        usage_details: Optional[Dict[str, int]] = None,
 870        cost_details: Optional[Dict[str, float]] = None,
 871        prompt: Optional[PromptClient] = None,
 872        end_on_exit: Optional[bool] = None,
 873    ) -> _AgnosticContextManager[LangfuseGeneration]:
 874        """Create a new generation span and set it as the current span in a context manager.
 875
 876        DEPRECATED: This method is deprecated and will be removed in a future version.
 877        Use start_as_current_observation(as_type='generation') instead.
 878
 879        This method creates a specialized span for model generations and sets it as the
 880        current span within a context manager. Use this method with a 'with' statement to
 881        automatically handle the generation span lifecycle within a code block.
 882
 883        The created generation span will be the child of the current span in the context.
 884
 885        Args:
 886            trace_context: Optional context for connecting to an existing trace
 887            name: Name of the generation operation
 888            input: Input data for the model (e.g., prompts)
 889            output: Output from the model (e.g., completions)
 890            metadata: Additional metadata to associate with the generation
 891            version: Version identifier for the model or component
 892            level: Importance level of the generation (info, warning, error)
 893            status_message: Optional status message for the generation
 894            completion_start_time: When the model started generating the response
 895            model: Name/identifier of the AI model used (e.g., "gpt-4")
 896            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
 897            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
 898            cost_details: Cost information for the model call
 899            prompt: Associated prompt template from Langfuse prompt management
 900            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
 901
 902        Returns:
 903            A context manager that yields a LangfuseGeneration
 904
 905        Example:
 906            ```python
 907            with langfuse.start_as_current_generation(
 908                name="answer-generation",
 909                model="gpt-4",
 910                input={"prompt": "Explain quantum computing"}
 911            ) as generation:
 912                # Call model API
 913                response = llm.generate(...)
 914
 915                # Update with results
 916                generation.update(
 917                    output=response.text,
 918                    usage_details={
 919                        "prompt_tokens": response.usage.prompt_tokens,
 920                        "completion_tokens": response.usage.completion_tokens
 921                    }
 922                )
 923            ```
 924        """
 925        warnings.warn(
 926            "start_as_current_generation is deprecated and will be removed in a future version. "
 927            "Use start_as_current_observation(as_type='generation') instead.",
 928            DeprecationWarning,
 929            stacklevel=2,
 930        )
 931        return self.start_as_current_observation(
 932            trace_context=trace_context,
 933            name=name,
 934            as_type="generation",
 935            input=input,
 936            output=output,
 937            metadata=metadata,
 938            version=version,
 939            level=level,
 940            status_message=status_message,
 941            completion_start_time=completion_start_time,
 942            model=model,
 943            model_parameters=model_parameters,
 944            usage_details=usage_details,
 945            cost_details=cost_details,
 946            prompt=prompt,
 947            end_on_exit=end_on_exit,
 948        )
 949
 950    @overload
 951    def start_as_current_observation(
 952        self,
 953        *,
 954        trace_context: Optional[TraceContext] = None,
 955        name: str,
 956        as_type: Literal["generation"],
 957        input: Optional[Any] = None,
 958        output: Optional[Any] = None,
 959        metadata: Optional[Any] = None,
 960        version: Optional[str] = None,
 961        level: Optional[SpanLevel] = None,
 962        status_message: Optional[str] = None,
 963        completion_start_time: Optional[datetime] = None,
 964        model: Optional[str] = None,
 965        model_parameters: Optional[Dict[str, MapValue]] = None,
 966        usage_details: Optional[Dict[str, int]] = None,
 967        cost_details: Optional[Dict[str, float]] = None,
 968        prompt: Optional[PromptClient] = None,
 969        end_on_exit: Optional[bool] = None,
 970    ) -> _AgnosticContextManager[LangfuseGeneration]: ...
 971
 972    @overload
 973    def start_as_current_observation(
 974        self,
 975        *,
 976        trace_context: Optional[TraceContext] = None,
 977        name: str,
 978        as_type: Literal["span"] = "span",
 979        input: Optional[Any] = None,
 980        output: Optional[Any] = None,
 981        metadata: Optional[Any] = None,
 982        version: Optional[str] = None,
 983        level: Optional[SpanLevel] = None,
 984        status_message: Optional[str] = None,
 985        end_on_exit: Optional[bool] = None,
 986    ) -> _AgnosticContextManager[LangfuseSpan]: ...
 987
 988    @overload
 989    def start_as_current_observation(
 990        self,
 991        *,
 992        trace_context: Optional[TraceContext] = None,
 993        name: str,
 994        as_type: Literal["agent"],
 995        input: Optional[Any] = None,
 996        output: Optional[Any] = None,
 997        metadata: Optional[Any] = None,
 998        version: Optional[str] = None,
 999        level: Optional[SpanLevel] = None,
1000        status_message: Optional[str] = None,
1001        end_on_exit: Optional[bool] = None,
1002    ) -> _AgnosticContextManager[LangfuseAgent]: ...
1003
1004    @overload
1005    def start_as_current_observation(
1006        self,
1007        *,
1008        trace_context: Optional[TraceContext] = None,
1009        name: str,
1010        as_type: Literal["tool"],
1011        input: Optional[Any] = None,
1012        output: Optional[Any] = None,
1013        metadata: Optional[Any] = None,
1014        version: Optional[str] = None,
1015        level: Optional[SpanLevel] = None,
1016        status_message: Optional[str] = None,
1017        end_on_exit: Optional[bool] = None,
1018    ) -> _AgnosticContextManager[LangfuseTool]: ...
1019
1020    @overload
1021    def start_as_current_observation(
1022        self,
1023        *,
1024        trace_context: Optional[TraceContext] = None,
1025        name: str,
1026        as_type: Literal["chain"],
1027        input: Optional[Any] = None,
1028        output: Optional[Any] = None,
1029        metadata: Optional[Any] = None,
1030        version: Optional[str] = None,
1031        level: Optional[SpanLevel] = None,
1032        status_message: Optional[str] = None,
1033        end_on_exit: Optional[bool] = None,
1034    ) -> _AgnosticContextManager[LangfuseChain]: ...
1035
1036    @overload
1037    def start_as_current_observation(
1038        self,
1039        *,
1040        trace_context: Optional[TraceContext] = None,
1041        name: str,
1042        as_type: Literal["retriever"],
1043        input: Optional[Any] = None,
1044        output: Optional[Any] = None,
1045        metadata: Optional[Any] = None,
1046        version: Optional[str] = None,
1047        level: Optional[SpanLevel] = None,
1048        status_message: Optional[str] = None,
1049        end_on_exit: Optional[bool] = None,
1050    ) -> _AgnosticContextManager[LangfuseRetriever]: ...
1051
1052    @overload
1053    def start_as_current_observation(
1054        self,
1055        *,
1056        trace_context: Optional[TraceContext] = None,
1057        name: str,
1058        as_type: Literal["evaluator"],
1059        input: Optional[Any] = None,
1060        output: Optional[Any] = None,
1061        metadata: Optional[Any] = None,
1062        version: Optional[str] = None,
1063        level: Optional[SpanLevel] = None,
1064        status_message: Optional[str] = None,
1065        end_on_exit: Optional[bool] = None,
1066    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...
1067
1068    @overload
1069    def start_as_current_observation(
1070        self,
1071        *,
1072        trace_context: Optional[TraceContext] = None,
1073        name: str,
1074        as_type: Literal["embedding"],
1075        input: Optional[Any] = None,
1076        output: Optional[Any] = None,
1077        metadata: Optional[Any] = None,
1078        version: Optional[str] = None,
1079        level: Optional[SpanLevel] = None,
1080        status_message: Optional[str] = None,
1081        completion_start_time: Optional[datetime] = None,
1082        model: Optional[str] = None,
1083        model_parameters: Optional[Dict[str, MapValue]] = None,
1084        usage_details: Optional[Dict[str, int]] = None,
1085        cost_details: Optional[Dict[str, float]] = None,
1086        prompt: Optional[PromptClient] = None,
1087        end_on_exit: Optional[bool] = None,
1088    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...
1089
1090    @overload
1091    def start_as_current_observation(
1092        self,
1093        *,
1094        trace_context: Optional[TraceContext] = None,
1095        name: str,
1096        as_type: Literal["guardrail"],
1097        input: Optional[Any] = None,
1098        output: Optional[Any] = None,
1099        metadata: Optional[Any] = None,
1100        version: Optional[str] = None,
1101        level: Optional[SpanLevel] = None,
1102        status_message: Optional[str] = None,
1103        end_on_exit: Optional[bool] = None,
1104    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
1105
1106    def start_as_current_observation(
1107        self,
1108        *,
1109        trace_context: Optional[TraceContext] = None,
1110        name: str,
1111        as_type: ObservationTypeLiteralNoEvent = "span",
1112        input: Optional[Any] = None,
1113        output: Optional[Any] = None,
1114        metadata: Optional[Any] = None,
1115        version: Optional[str] = None,
1116        level: Optional[SpanLevel] = None,
1117        status_message: Optional[str] = None,
1118        completion_start_time: Optional[datetime] = None,
1119        model: Optional[str] = None,
1120        model_parameters: Optional[Dict[str, MapValue]] = None,
1121        usage_details: Optional[Dict[str, int]] = None,
1122        cost_details: Optional[Dict[str, float]] = None,
1123        prompt: Optional[PromptClient] = None,
1124        end_on_exit: Optional[bool] = None,
1125    ) -> Union[
1126        _AgnosticContextManager[LangfuseGeneration],
1127        _AgnosticContextManager[LangfuseSpan],
1128        _AgnosticContextManager[LangfuseAgent],
1129        _AgnosticContextManager[LangfuseTool],
1130        _AgnosticContextManager[LangfuseChain],
1131        _AgnosticContextManager[LangfuseRetriever],
1132        _AgnosticContextManager[LangfuseEvaluator],
1133        _AgnosticContextManager[LangfuseEmbedding],
1134        _AgnosticContextManager[LangfuseGuardrail],
1135    ]:
1136        """Create a new observation and set it as the current span in a context manager.
1137
1138        This method creates a new observation of the specified type and sets it as the
1139        current span within a context manager. Use this method with a 'with' statement to
1140        automatically handle the observation lifecycle within a code block.
1141
1142        The created observation will be the child of the current span in the context.
1143
1144        Args:
1145            trace_context: Optional context for connecting to an existing trace
1146            name: Name of the observation (e.g., function or operation name)
1147            as_type: Type of observation to create (defaults to "span")
1148            input: Input data for the operation (can be any JSON-serializable object)
1149            output: Output data from the operation (can be any JSON-serializable object)
1150            metadata: Additional metadata to associate with the observation
1151            version: Version identifier for the code or component
1152            level: Importance level of the observation (info, warning, error)
1153            status_message: Optional status message for the observation
1154            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
1155
1156            The following parameters are available when as_type is "generation" or "embedding":
1157            completion_start_time: When the model started generating the response
1158            model: Name/identifier of the AI model used (e.g., "gpt-4")
1159            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1160            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1161            cost_details: Cost information for the model call
1162            prompt: Associated prompt template from Langfuse prompt management
1163
1164        Returns:
1165            A context manager that yields the appropriate observation type based on as_type
1166
1167        Example:
1168            ```python
1169            # Create a span
1170            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
1171                # Do work
1172                result = process_data()
1173                span.update(output=result)
1174
1175                # Create a child span automatically
1176                with span.start_as_current_span(name="sub-operation") as child_span:
1177                    # Do sub-operation work
1178                    child_span.update(output="sub-result")
1179
1180            # Create a tool observation
1181            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1182                # Do tool work
1183                results = search_web(query)
1184                tool.update(output=results)
1185
1186            # Create a generation observation
1187            with langfuse.start_as_current_observation(
1188                name="answer-generation",
1189                as_type="generation",
1190                model="gpt-4"
1191            ) as generation:
1192                # Generate answer
1193                response = llm.generate(...)
1194                generation.update(output=response)
1195            ```
1196        """
1197        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1198            if trace_context:
1199                trace_id = trace_context.get("trace_id", None)
1200                parent_span_id = trace_context.get("parent_span_id", None)
1201
1202                if trace_id:
1203                    remote_parent_span = self._create_remote_parent_span(
1204                        trace_id=trace_id, parent_span_id=parent_span_id
1205                    )
1206
1207                    return cast(
1208                        Union[
1209                            _AgnosticContextManager[LangfuseGeneration],
1210                            _AgnosticContextManager[LangfuseEmbedding],
1211                        ],
1212                        self._create_span_with_parent_context(
1213                            as_type=as_type,
1214                            name=name,
1215                            remote_parent_span=remote_parent_span,
1216                            parent=None,
1217                            end_on_exit=end_on_exit,
1218                            input=input,
1219                            output=output,
1220                            metadata=metadata,
1221                            version=version,
1222                            level=level,
1223                            status_message=status_message,
1224                            completion_start_time=completion_start_time,
1225                            model=model,
1226                            model_parameters=model_parameters,
1227                            usage_details=usage_details,
1228                            cost_details=cost_details,
1229                            prompt=prompt,
1230                        ),
1231                    )
1232
1233            return cast(
1234                Union[
1235                    _AgnosticContextManager[LangfuseGeneration],
1236                    _AgnosticContextManager[LangfuseEmbedding],
1237                ],
1238                self._start_as_current_otel_span_with_processed_media(
1239                    as_type=as_type,
1240                    name=name,
1241                    end_on_exit=end_on_exit,
1242                    input=input,
1243                    output=output,
1244                    metadata=metadata,
1245                    version=version,
1246                    level=level,
1247                    status_message=status_message,
1248                    completion_start_time=completion_start_time,
1249                    model=model,
1250                    model_parameters=model_parameters,
1251                    usage_details=usage_details,
1252                    cost_details=cost_details,
1253                    prompt=prompt,
1254                ),
1255            )
1256
1257        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1258            if trace_context:
1259                trace_id = trace_context.get("trace_id", None)
1260                parent_span_id = trace_context.get("parent_span_id", None)
1261
1262                if trace_id:
1263                    remote_parent_span = self._create_remote_parent_span(
1264                        trace_id=trace_id, parent_span_id=parent_span_id
1265                    )
1266
1267                    return cast(
1268                        Union[
1269                            _AgnosticContextManager[LangfuseSpan],
1270                            _AgnosticContextManager[LangfuseAgent],
1271                            _AgnosticContextManager[LangfuseTool],
1272                            _AgnosticContextManager[LangfuseChain],
1273                            _AgnosticContextManager[LangfuseRetriever],
1274                            _AgnosticContextManager[LangfuseEvaluator],
1275                            _AgnosticContextManager[LangfuseGuardrail],
1276                        ],
1277                        self._create_span_with_parent_context(
1278                            as_type=as_type,
1279                            name=name,
1280                            remote_parent_span=remote_parent_span,
1281                            parent=None,
1282                            end_on_exit=end_on_exit,
1283                            input=input,
1284                            output=output,
1285                            metadata=metadata,
1286                            version=version,
1287                            level=level,
1288                            status_message=status_message,
1289                        ),
1290                    )
1291
1292            return cast(
1293                Union[
1294                    _AgnosticContextManager[LangfuseSpan],
1295                    _AgnosticContextManager[LangfuseAgent],
1296                    _AgnosticContextManager[LangfuseTool],
1297                    _AgnosticContextManager[LangfuseChain],
1298                    _AgnosticContextManager[LangfuseRetriever],
1299                    _AgnosticContextManager[LangfuseEvaluator],
1300                    _AgnosticContextManager[LangfuseGuardrail],
1301                ],
1302                self._start_as_current_otel_span_with_processed_media(
1303                    as_type=as_type,
1304                    name=name,
1305                    end_on_exit=end_on_exit,
1306                    input=input,
1307                    output=output,
1308                    metadata=metadata,
1309                    version=version,
1310                    level=level,
1311                    status_message=status_message,
1312                ),
1313            )
1314
1315        # This should never be reached since all valid types are handled above
1316        langfuse_logger.warning(
1317            f"Unknown observation type: {as_type}, falling back to span"
1318        )
1319        return self._start_as_current_otel_span_with_processed_media(
1320            as_type="span",
1321            name=name,
1322            end_on_exit=end_on_exit,
1323            input=input,
1324            output=output,
1325            metadata=metadata,
1326            version=version,
1327            level=level,
1328            status_message=status_message,
1329        )
1330
1331    def _get_span_class(
1332        self,
1333        as_type: ObservationTypeLiteral,
1334    ) -> Union[
1335        Type[LangfuseAgent],
1336        Type[LangfuseTool],
1337        Type[LangfuseChain],
1338        Type[LangfuseRetriever],
1339        Type[LangfuseEvaluator],
1340        Type[LangfuseEmbedding],
1341        Type[LangfuseGuardrail],
1342        Type[LangfuseGeneration],
1343        Type[LangfuseEvent],
1344        Type[LangfuseSpan],
1345    ]:
1346        """Get the appropriate span class based on as_type."""
1347        normalized_type = as_type.lower()
1348
1349        if normalized_type == "agent":
1350            return LangfuseAgent
1351        elif normalized_type == "tool":
1352            return LangfuseTool
1353        elif normalized_type == "chain":
1354            return LangfuseChain
1355        elif normalized_type == "retriever":
1356            return LangfuseRetriever
1357        elif normalized_type == "evaluator":
1358            return LangfuseEvaluator
1359        elif normalized_type == "embedding":
1360            return LangfuseEmbedding
1361        elif normalized_type == "guardrail":
1362            return LangfuseGuardrail
1363        elif normalized_type == "generation":
1364            return LangfuseGeneration
1365        elif normalized_type == "event":
1366            return LangfuseEvent
1367        elif normalized_type == "span":
1368            return LangfuseSpan
1369        else:
1370            return LangfuseSpan
1371
1372    @_agnosticcontextmanager
1373    def _create_span_with_parent_context(
1374        self,
1375        *,
1376        name: str,
1377        parent: Optional[otel_trace_api.Span] = None,
1378        remote_parent_span: Optional[otel_trace_api.Span] = None,
1379        as_type: ObservationTypeLiteralNoEvent,
1380        end_on_exit: Optional[bool] = None,
1381        input: Optional[Any] = None,
1382        output: Optional[Any] = None,
1383        metadata: Optional[Any] = None,
1384        version: Optional[str] = None,
1385        level: Optional[SpanLevel] = None,
1386        status_message: Optional[str] = None,
1387        completion_start_time: Optional[datetime] = None,
1388        model: Optional[str] = None,
1389        model_parameters: Optional[Dict[str, MapValue]] = None,
1390        usage_details: Optional[Dict[str, int]] = None,
1391        cost_details: Optional[Dict[str, float]] = None,
1392        prompt: Optional[PromptClient] = None,
1393    ) -> Any:
1394        parent_span = parent or cast(otel_trace_api.Span, remote_parent_span)
1395
1396        with otel_trace_api.use_span(parent_span):
1397            with self._start_as_current_otel_span_with_processed_media(
1398                name=name,
1399                as_type=as_type,
1400                end_on_exit=end_on_exit,
1401                input=input,
1402                output=output,
1403                metadata=metadata,
1404                version=version,
1405                level=level,
1406                status_message=status_message,
1407                completion_start_time=completion_start_time,
1408                model=model,
1409                model_parameters=model_parameters,
1410                usage_details=usage_details,
1411                cost_details=cost_details,
1412                prompt=prompt,
1413            ) as langfuse_span:
1414                if remote_parent_span is not None:
1415                    langfuse_span._otel_span.set_attribute(
1416                        LangfuseOtelSpanAttributes.AS_ROOT, True
1417                    )
1418
1419                yield langfuse_span
1420
1421    @_agnosticcontextmanager
1422    def _start_as_current_otel_span_with_processed_media(
1423        self,
1424        *,
1425        name: str,
1426        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
1427        end_on_exit: Optional[bool] = None,
1428        input: Optional[Any] = None,
1429        output: Optional[Any] = None,
1430        metadata: Optional[Any] = None,
1431        version: Optional[str] = None,
1432        level: Optional[SpanLevel] = None,
1433        status_message: Optional[str] = None,
1434        completion_start_time: Optional[datetime] = None,
1435        model: Optional[str] = None,
1436        model_parameters: Optional[Dict[str, MapValue]] = None,
1437        usage_details: Optional[Dict[str, int]] = None,
1438        cost_details: Optional[Dict[str, float]] = None,
1439        prompt: Optional[PromptClient] = None,
1440    ) -> Any:
1441        with self._otel_tracer.start_as_current_span(
1442            name=name,
1443            end_on_exit=end_on_exit if end_on_exit is not None else True,
1444        ) as otel_span:
1445            span_class = self._get_span_class(
1446                as_type or "generation"
1447            )  # fall back to the legacy default of "generation" when as_type is None
1448            common_args = {
1449                "otel_span": otel_span,
1450                "langfuse_client": self,
1451                "environment": self._environment,
1452                "input": input,
1453                "output": output,
1454                "metadata": metadata,
1455                "version": version,
1456                "level": level,
1457                "status_message": status_message,
1458            }
1459
1460            if span_class in [
1461                LangfuseGeneration,
1462                LangfuseEmbedding,
1463            ]:
1464                common_args.update(
1465                    {
1466                        "completion_start_time": completion_start_time,
1467                        "model": model,
1468                        "model_parameters": model_parameters,
1469                        "usage_details": usage_details,
1470                        "cost_details": cost_details,
1471                        "prompt": prompt,
1472                    }
1473                )
1474            # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed
1475
1476            yield span_class(**common_args)  # type: ignore[arg-type]
1477
1478    def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]:
1479        current_span = otel_trace_api.get_current_span()
1480
1481        if current_span is otel_trace_api.INVALID_SPAN:
1482            langfuse_logger.warning(
1483                "Context error: No active span in current context. Operations that depend on an active span will be skipped. "
1484                "Ensure spans are created with start_as_current_span() or that you're operating within an active span context."
1485            )
1486            return None
1487
1488        return current_span
1489
1490    def update_current_generation(
1491        self,
1492        *,
1493        name: Optional[str] = None,
1494        input: Optional[Any] = None,
1495        output: Optional[Any] = None,
1496        metadata: Optional[Any] = None,
1497        version: Optional[str] = None,
1498        level: Optional[SpanLevel] = None,
1499        status_message: Optional[str] = None,
1500        completion_start_time: Optional[datetime] = None,
1501        model: Optional[str] = None,
1502        model_parameters: Optional[Dict[str, MapValue]] = None,
1503        usage_details: Optional[Dict[str, int]] = None,
1504        cost_details: Optional[Dict[str, float]] = None,
1505        prompt: Optional[PromptClient] = None,
1506    ) -> None:
1507        """Update the current active generation span with new information.
1508
1509        This method updates the current generation span in the active context with
1510        additional information. It's useful for adding output, usage stats, or other
1511        details that become available during or after model generation.
1512
1513        Args:
1514            name: The generation name
1515            input: Updated input data for the model
1516            output: Output from the model (e.g., completions)
1517            metadata: Additional metadata to associate with the generation
1518            version: Version identifier for the model or component
1519            level: Importance level of the generation (info, warning, error)
1520            status_message: Optional status message for the generation
1521            completion_start_time: When the model started generating the response
1522            model: Name/identifier of the AI model used (e.g., "gpt-4")
1523            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1524            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1525            cost_details: Cost information for the model call
1526            prompt: Associated prompt template from Langfuse prompt management
1527
1528        Example:
1529            ```python
1530            with langfuse.start_as_current_generation(name="answer-query") as generation:
1531                # Initial setup and API call
1532                response = llm.generate(...)
1533
1534                # Update with results that weren't available at creation time
1535                langfuse.update_current_generation(
1536                    output=response.text,
1537                    usage_details={
1538                        "prompt_tokens": response.usage.prompt_tokens,
1539                        "completion_tokens": response.usage.completion_tokens
1540                    }
1541                )
1542            ```
1543        """
1544        if not self._tracing_enabled:
1545            langfuse_logger.debug(
1546                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1547            )
1548            return
1549
1550        current_otel_span = self._get_current_otel_span()
1551
1552        if current_otel_span is not None:
1553            generation = LangfuseGeneration(
1554                otel_span=current_otel_span, langfuse_client=self
1555            )
1556
1557            if name:
1558                current_otel_span.update_name(name)
1559
1560            generation.update(
1561                input=input,
1562                output=output,
1563                metadata=metadata,
1564                version=version,
1565                level=level,
1566                status_message=status_message,
1567                completion_start_time=completion_start_time,
1568                model=model,
1569                model_parameters=model_parameters,
1570                usage_details=usage_details,
1571                cost_details=cost_details,
1572                prompt=prompt,
1573            )
1574
1575    def update_current_span(
1576        self,
1577        *,
1578        name: Optional[str] = None,
1579        input: Optional[Any] = None,
1580        output: Optional[Any] = None,
1581        metadata: Optional[Any] = None,
1582        version: Optional[str] = None,
1583        level: Optional[SpanLevel] = None,
1584        status_message: Optional[str] = None,
1585    ) -> None:
1586        """Update the current active span with new information.
1587
1588        This method updates the current span in the active context with
1589        additional information. It's useful for adding outputs or metadata
1590        that become available during execution.
1591
1592        Args:
1593            name: The span name
1594            input: Updated input data for the operation
1595            output: Output data from the operation
1596            metadata: Additional metadata to associate with the span
1597            version: Version identifier for the code or component
1598            level: Importance level of the span (info, warning, error)
1599            status_message: Optional status message for the span
1600
1601        Example:
1602            ```python
1603            with langfuse.start_as_current_span(name="process-data") as span:
1604                # Initial processing
1605                result = process_first_part()
1606
1607                # Update with intermediate results
1608                langfuse.update_current_span(metadata={"intermediate_result": result})
1609
1610                # Continue processing
1611                final_result = process_second_part(result)
1612
1613                # Final update
1614                langfuse.update_current_span(output=final_result)
1615            ```
1616        """
1617        if not self._tracing_enabled:
1618            langfuse_logger.debug(
1619                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1620            )
1621            return
1622
1623        current_otel_span = self._get_current_otel_span()
1624
1625        if current_otel_span is not None:
1626            span = LangfuseSpan(
1627                otel_span=current_otel_span,
1628                langfuse_client=self,
1629                environment=self._environment,
1630            )
1631
1632            if name:
1633                current_otel_span.update_name(name)
1634
1635            span.update(
1636                input=input,
1637                output=output,
1638                metadata=metadata,
1639                version=version,
1640                level=level,
1641                status_message=status_message,
1642            )
1643
1644    def update_current_trace(
1645        self,
1646        *,
1647        name: Optional[str] = None,
1648        user_id: Optional[str] = None,
1649        session_id: Optional[str] = None,
1650        version: Optional[str] = None,
1651        input: Optional[Any] = None,
1652        output: Optional[Any] = None,
1653        metadata: Optional[Any] = None,
1654        tags: Optional[List[str]] = None,
1655        public: Optional[bool] = None,
1656    ) -> None:
1657        """Update the current trace with additional information.
1658
1659        Args:
1660            name: Updated name for the Langfuse trace
1661            user_id: ID of the user who initiated the Langfuse trace
1662            session_id: Session identifier for grouping related Langfuse traces
1663            version: Version identifier for the application or service
1664            input: Input data for the overall Langfuse trace
1665            output: Output data from the overall Langfuse trace
1666            metadata: Additional metadata to associate with the Langfuse trace
1667            tags: List of tags to categorize the Langfuse trace
1668            public: Whether the Langfuse trace should be publicly accessible
1669
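        Example:
            Illustrative sketch (assumes an initialized `langfuse` client and an active span in the current context; all values are placeholders):

            ```python
            with langfuse.start_as_current_span(name="handle-request") as span:
                # ... perform the traced work ...

                # Attach trace-level attributes from inside the active span
                langfuse.update_current_trace(
                    user_id="user-123",
                    session_id="session-456",
                    tags=["production"],
                )
            ```
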
1670        See Also:
1671            :func:`langfuse.propagate_attributes`: Recommended replacement
1672        """
1673        if not self._tracing_enabled:
1674            langfuse_logger.debug(
1675                "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode."
1676            )
1677            return
1678
1679        current_otel_span = self._get_current_otel_span()
1680
1681        if current_otel_span is not None and current_otel_span.is_recording():
1682            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1683                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1684            )
1685            # We need to preserve the class to keep the correct observation type
1686            span_class = self._get_span_class(existing_observation_type)
1687            span = span_class(
1688                otel_span=current_otel_span,
1689                langfuse_client=self,
1690                environment=self._environment,
1691            )
1692
1693            span.update_trace(
1694                name=name,
1695                user_id=user_id,
1696                session_id=session_id,
1697                version=version,
1698                input=input,
1699                output=output,
1700                metadata=metadata,
1701                tags=tags,
1702                public=public,
1703            )
1704
1705    def create_event(
1706        self,
1707        *,
1708        trace_context: Optional[TraceContext] = None,
1709        name: str,
1710        input: Optional[Any] = None,
1711        output: Optional[Any] = None,
1712        metadata: Optional[Any] = None,
1713        version: Optional[str] = None,
1714        level: Optional[SpanLevel] = None,
1715        status_message: Optional[str] = None,
1716    ) -> LangfuseEvent:
1717        """Create a new Langfuse observation of type 'EVENT'.
1718
1719        The created Langfuse Event observation will be the child of the current span in the context.
1720
1721        Args:
1722            trace_context: Optional context for connecting to an existing trace
1723            name: Name of the event (e.g., function or operation name)
1724            input: Input data for the event (can be any JSON-serializable object)
1725            output: Output data from the event (can be any JSON-serializable object)
1726            metadata: Additional metadata to associate with the event
1727            version: Version identifier for the code or component
1728            level: Importance level of the event (info, warning, error)
1729            status_message: Optional status message for the event
1730
1731        Returns:
1732            The Langfuse Event object
1733
1734        Example:
1735            ```python
1736            event = langfuse.create_event(name="process-event")
1737            ```
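
            A slightly fuller sketch (all values are illustrative placeholders):

            ```python
            event = langfuse.create_event(
                name="cache-lookup",
                input={"key": "user-123"},
                output={"hit": True},
                metadata={"store": "redis"},
            )
            ```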
1738        """
1739        timestamp = time_ns()
1740
1741        if trace_context:
1742            trace_id = trace_context.get("trace_id", None)
1743            parent_span_id = trace_context.get("parent_span_id", None)
1744
1745            if trace_id:
1746                remote_parent_span = self._create_remote_parent_span(
1747                    trace_id=trace_id, parent_span_id=parent_span_id
1748                )
1749
1750                with otel_trace_api.use_span(
1751                    cast(otel_trace_api.Span, remote_parent_span)
1752                ):
1753                    otel_span = self._otel_tracer.start_span(
1754                        name=name, start_time=timestamp
1755                    )
1756                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1757
1758                    return cast(
1759                        LangfuseEvent,
1760                        LangfuseEvent(
1761                            otel_span=otel_span,
1762                            langfuse_client=self,
1763                            environment=self._environment,
1764                            input=input,
1765                            output=output,
1766                            metadata=metadata,
1767                            version=version,
1768                            level=level,
1769                            status_message=status_message,
1770                        ).end(end_time=timestamp),
1771                    )
1772
1773        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1774
1775        return cast(
1776            LangfuseEvent,
1777            LangfuseEvent(
1778                otel_span=otel_span,
1779                langfuse_client=self,
1780                environment=self._environment,
1781                input=input,
1782                output=output,
1783                metadata=metadata,
1784                version=version,
1785                level=level,
1786                status_message=status_message,
1787            ).end(end_time=timestamp),
1788        )
1789
1790    def _create_remote_parent_span(
1791        self, *, trace_id: str, parent_span_id: Optional[str]
1792    ) -> Any:
1793        if not self._is_valid_trace_id(trace_id):
1794            langfuse_logger.warning(
1795                f"Passed trace ID '{trace_id}' is not a valid 32-character lowercase hex Langfuse trace ID. Ignoring trace ID."
1796            )
1797
1798        if parent_span_id and not self._is_valid_span_id(parent_span_id):
1799            langfuse_logger.warning(
1800                f"Passed span ID '{parent_span_id}' is not a valid 16-character lowercase hex Langfuse span ID. Ignoring parent span ID."
1801            )
1802
1803        int_trace_id = int(trace_id, 16)
1804        int_parent_span_id = (
1805            int(parent_span_id, 16)
1806            if parent_span_id
1807            else RandomIdGenerator().generate_span_id()
1808        )
1809
1810        span_context = otel_trace_api.SpanContext(
1811            trace_id=int_trace_id,
1812            span_id=int_parent_span_id,
1813            trace_flags=otel_trace_api.TraceFlags(0x01),  # mark span as sampled
1814            is_remote=False,
1815        )
1816
1817        return otel_trace_api.NonRecordingSpan(span_context)
1818
1819    def _is_valid_trace_id(self, trace_id: str) -> bool:
1820        pattern = r"^[0-9a-f]{32}$"
1821
1822        return bool(re.match(pattern, trace_id))
1823
1824    def _is_valid_span_id(self, span_id: str) -> bool:
1825        pattern = r"^[0-9a-f]{16}$"
1826
1827        return bool(re.match(pattern, span_id))
1828
1829    def _create_observation_id(self, *, seed: Optional[str] = None) -> str:
1830        """Create a unique observation ID for use with Langfuse.
1831
1832        This method generates a unique observation ID (span ID in OpenTelemetry terms)
1833        for use with various Langfuse APIs. It can either generate a random ID or
1834        create a deterministic ID based on a seed string.
1835
1836        Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes.
1837        This method ensures the generated ID meets this requirement. If you need to
1838        correlate an external ID with a Langfuse observation ID, use the external ID as
1839        the seed to get a valid, deterministic observation ID.
1840
1841        Args:
1842            seed: Optional string to use as a seed for deterministic ID generation.
1843                 If provided, the same seed will always produce the same ID.
1844                 If not provided, a random ID will be generated.
1845
1846        Returns:
1847            A 16-character lowercase hexadecimal string representing the observation ID.
1848
1849        Example:
1850            ```python
1851            # Generate a random observation ID
1852            obs_id = langfuse.create_observation_id()
1853
1854            # Generate a deterministic ID based on a seed
1855            user_obs_id = langfuse.create_observation_id(seed="user-123-feedback")
1856
1857            # Correlate an external item ID with a Langfuse observation ID
1858            item_id = "item-789012"
1859            correlated_obs_id = langfuse.create_observation_id(seed=item_id)
1860
1861            # Use the ID with Langfuse APIs
1862            langfuse.create_score(
1863                name="relevance",
1864                value=0.95,
1865                trace_id=trace_id,
1866                observation_id=obs_id
1867            )
1868            ```
1869        """
1870        if not seed:
1871            span_id_int = RandomIdGenerator().generate_span_id()
1872
1873            return self._format_otel_span_id(span_id_int)
1874
1875        return sha256(seed.encode("utf-8")).digest()[:8].hex()
1876
1877    @staticmethod
1878    def create_trace_id(*, seed: Optional[str] = None) -> str:
1879        """Create a unique trace ID for use with Langfuse.
1880
1881        This method generates a unique trace ID for use with various Langfuse APIs.
1882        It can either generate a random ID or create a deterministic ID based on
1883        a seed string.
1884
1885        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1886        This method ensures the generated ID meets this requirement. If you need to
1887        correlate an external ID with a Langfuse trace ID, use the external ID as the
1888        seed to get a valid, deterministic Langfuse trace ID.
1889
1890        Args:
1891            seed: Optional string to use as a seed for deterministic ID generation.
1892                 If provided, the same seed will always produce the same ID.
1893                 If not provided, a random ID will be generated.
1894
1895        Returns:
1896            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1897
1898        Example:
1899            ```python
1900            # Generate a random trace ID
1901            trace_id = langfuse.create_trace_id()
1902
1903            # Generate a deterministic ID based on a seed
1904            session_trace_id = langfuse.create_trace_id(seed="session-456")
1905
1906            # Correlate an external ID with a Langfuse trace ID
1907            external_id = "external-system-123456"
1908            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1909
1910            # Use the ID with trace context
1911            with langfuse.start_as_current_span(
1912                name="process-request",
1913                trace_context={"trace_id": trace_id}
1914            ) as span:
1915                # Operation will be part of the specific trace
1916                pass
1917            ```
1918        """
1919        if not seed:
1920            trace_id_int = RandomIdGenerator().generate_trace_id()
1921
1922            return Langfuse._format_otel_trace_id(trace_id_int)
1923
1924        return sha256(seed.encode("utf-8")).digest()[:16].hex()
1925
1926    def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str:
1927        span_context = otel_span.get_span_context()
1928
1929        return self._format_otel_trace_id(span_context.trace_id)
1930
1931    def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str:
1932        span_context = otel_span.get_span_context()
1933
1934        return self._format_otel_span_id(span_context.span_id)
1935
1936    @staticmethod
1937    def _format_otel_span_id(span_id_int: int) -> str:
1938        """Format an integer span ID to a 16-character lowercase hex string.
1939
1940        Internal method to convert an OpenTelemetry integer span ID to the standard
1941        W3C Trace Context format (16-character lowercase hex string).
1942
1943        Args:
1944            span_id_int: 64-bit integer representing a span ID
1945
1946        Returns:
1947            A 16-character lowercase hexadecimal string
1948        """
1949        return format(span_id_int, "016x")
1950
1951    @staticmethod
1952    def _format_otel_trace_id(trace_id_int: int) -> str:
1953        """Format an integer trace ID to a 32-character lowercase hex string.
1954
1955        Internal method to convert an OpenTelemetry integer trace ID to the standard
1956        W3C Trace Context format (32-character lowercase hex string).
1957
1958        Args:
1959            trace_id_int: 128-bit integer representing a trace ID
1960
1961        Returns:
1962            A 32-character lowercase hexadecimal string
1963        """
1964        return format(trace_id_int, "032x")
1965
1966    @overload
1967    def create_score(
1968        self,
1969        *,
1970        name: str,
1971        value: float,
1972        session_id: Optional[str] = None,
1973        dataset_run_id: Optional[str] = None,
1974        trace_id: Optional[str] = None,
1975        observation_id: Optional[str] = None,
1976        score_id: Optional[str] = None,
1977        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
1978        comment: Optional[str] = None,
1979        config_id: Optional[str] = None,
1980        metadata: Optional[Any] = None,
1981        timestamp: Optional[datetime] = None,
1982    ) -> None: ...
1983
1984    @overload
1985    def create_score(
1986        self,
1987        *,
1988        name: str,
1989        value: str,
1990        session_id: Optional[str] = None,
1991        dataset_run_id: Optional[str] = None,
1992        trace_id: Optional[str] = None,
1993        score_id: Optional[str] = None,
1994        observation_id: Optional[str] = None,
1995        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
1996        comment: Optional[str] = None,
1997        config_id: Optional[str] = None,
1998        metadata: Optional[Any] = None,
1999        timestamp: Optional[datetime] = None,
2000    ) -> None: ...
2001
2002    def create_score(
2003        self,
2004        *,
2005        name: str,
2006        value: Union[float, str],
2007        session_id: Optional[str] = None,
2008        dataset_run_id: Optional[str] = None,
2009        trace_id: Optional[str] = None,
2010        observation_id: Optional[str] = None,
2011        score_id: Optional[str] = None,
2012        data_type: Optional[ScoreDataType] = None,
2013        comment: Optional[str] = None,
2014        config_id: Optional[str] = None,
2015        metadata: Optional[Any] = None,
2016        timestamp: Optional[datetime] = None,
2017    ) -> None:
2018        """Create a score for a specific trace or observation.
2019
2020        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
2021        used to track quality metrics, user feedback, or automated evaluations.
2022
2023        Args:
2024            name: Name of the score (e.g., "relevance", "accuracy")
2025            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2026            session_id: ID of the Langfuse session to associate the score with
2027            dataset_run_id: ID of the Langfuse dataset run to associate the score with
2028            trace_id: ID of the Langfuse trace to associate the score with
2029            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
2030            score_id: Optional custom ID for the score (auto-generated if not provided)
2031            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2032            comment: Optional comment or explanation for the score
2033            config_id: Optional ID of a score config defined in Langfuse
2034            metadata: Optional metadata to be attached to the score
2035            timestamp: Optional timestamp for the score (defaults to current UTC time)
2036
2037        Example:
2038            ```python
2039            # Create a numeric score for accuracy
2040            langfuse.create_score(
2041                name="accuracy",
2042                value=0.92,
2043                trace_id="abcdef1234567890abcdef1234567890",
2044                data_type="NUMERIC",
2045                comment="High accuracy with minor irrelevant details"
2046            )
2047
2048            # Create a categorical score for sentiment
2049            langfuse.create_score(
2050                name="sentiment",
2051                value="positive",
2052                trace_id="abcdef1234567890abcdef1234567890",
2053                observation_id="abcdef1234567890",
2054                data_type="CATEGORICAL"
2055            )
2056            ```
2057        """
2058        if not self._tracing_enabled:
2059            return
2060
2061        score_id = score_id or self._create_observation_id()
2062
2063        try:
2064            new_body = ScoreBody(
2065                id=score_id,
2066                sessionId=session_id,
2067                datasetRunId=dataset_run_id,
2068                traceId=trace_id,
2069                observationId=observation_id,
2070                name=name,
2071                value=value,
2072                dataType=data_type,  # type: ignore
2073                comment=comment,
2074                configId=config_id,
2075                environment=self._environment,
2076                metadata=metadata,
2077            )
2078
2079            event = {
2080                "id": self.create_trace_id(),
2081                "type": "score-create",
2082                "timestamp": timestamp or _get_timestamp(),
2083                "body": new_body,
2084            }
2085
2086            if self._resources is not None:
2087                # Force the score into the sample if it targets a legacy trace ID (i.e. not 32 lowercase hex chars)
2088                force_sample = (
2089                    not self._is_valid_trace_id(trace_id) if trace_id else True
2090                )
2091
2092                self._resources.add_score_task(
2093                    event,
2094                    force_sample=force_sample,
2095                )
2096
2097        except Exception as e:
2098            langfuse_logger.exception(
2099                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
2100            )
2101
2102    @overload
2103    def score_current_span(
2104        self,
2105        *,
2106        name: str,
2107        value: float,
2108        score_id: Optional[str] = None,
2109        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2110        comment: Optional[str] = None,
2111        config_id: Optional[str] = None,
2112        metadata: Optional[Any] = None,
2113    ) -> None: ...
2114
2115    @overload
2116    def score_current_span(
2117        self,
2118        *,
2119        name: str,
2120        value: str,
2121        score_id: Optional[str] = None,
2122        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2123        comment: Optional[str] = None,
2124        config_id: Optional[str] = None,
2125        metadata: Optional[Any] = None,
2126    ) -> None: ...
2127
2128    def score_current_span(
2129        self,
2130        *,
2131        name: str,
2132        value: Union[float, str],
2133        score_id: Optional[str] = None,
2134        data_type: Optional[ScoreDataType] = None,
2135        comment: Optional[str] = None,
2136        config_id: Optional[str] = None,
2137        metadata: Optional[Any] = None,
2138    ) -> None:
2139        """Create a score for the current active span.
2140
2141        This method scores the currently active span in the context. It's a convenient
2142        way to score the current operation without needing to know its trace and span IDs.
2143
2144        Args:
2145            name: Name of the score (e.g., "relevance", "accuracy")
2146            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2147            score_id: Optional custom ID for the score (auto-generated if not provided)
2148            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2149            comment: Optional comment or explanation for the score
2150            config_id: Optional ID of a score config defined in Langfuse
2151            metadata: Optional metadata to be attached to the score
2152
2153        Example:
2154            ```python
2155            with langfuse.start_as_current_generation(name="answer-query") as generation:
2156                # Generate answer
2157                response = generate_answer(...)
2158                generation.update(output=response)
2159
2160                # Score the generation
2161                langfuse.score_current_span(
2162                    name="relevance",
2163                    value=0.85,
2164                    data_type="NUMERIC",
2165                    comment="Mostly relevant but contains some tangential information",
2166                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2167                )
2168            ```
2169        """
2170        current_span = self._get_current_otel_span()
2171
2172        if current_span is not None:
2173            trace_id = self._get_otel_trace_id(current_span)
2174            observation_id = self._get_otel_span_id(current_span)
2175
2176            langfuse_logger.info(
2177                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2178            )
2179
2180            self.create_score(
2181                trace_id=trace_id,
2182                observation_id=observation_id,
2183                name=name,
2184                value=cast(str, value),
2185                score_id=score_id,
2186                data_type=cast(Literal["CATEGORICAL"], data_type),
2187                comment=comment,
2188                config_id=config_id,
2189                metadata=metadata,
2190            )
2191
2192    @overload
2193    def score_current_trace(
2194        self,
2195        *,
2196        name: str,
2197        value: float,
2198        score_id: Optional[str] = None,
2199        data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
2200        comment: Optional[str] = None,
2201        config_id: Optional[str] = None,
2202        metadata: Optional[Any] = None,
2203    ) -> None: ...
2204
2205    @overload
2206    def score_current_trace(
2207        self,
2208        *,
2209        name: str,
2210        value: str,
2211        score_id: Optional[str] = None,
2212        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
2213        comment: Optional[str] = None,
2214        config_id: Optional[str] = None,
2215        metadata: Optional[Any] = None,
2216    ) -> None: ...
2217
2218    def score_current_trace(
2219        self,
2220        *,
2221        name: str,
2222        value: Union[float, str],
2223        score_id: Optional[str] = None,
2224        data_type: Optional[ScoreDataType] = None,
2225        comment: Optional[str] = None,
2226        config_id: Optional[str] = None,
2227        metadata: Optional[Any] = None,
2228    ) -> None:
2229        """Create a score for the current trace.
2230
2231        This method scores the trace of the currently active span. Unlike score_current_span,
2232        this method associates the score with the entire trace rather than a specific span.
2233        It's useful for scoring overall performance or quality of the entire operation.
2234
2235        Args:
2236            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2237            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2238            score_id: Optional custom ID for the score (auto-generated if not provided)
2239            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2240            comment: Optional comment or explanation for the score
2241            config_id: Optional ID of a score config defined in Langfuse
2242            metadata: Optional metadata to be attached to the score
2243
2244        Example:
2245            ```python
2246            with langfuse.start_as_current_span(name="process-user-request") as span:
2247                # Process request
2248                result = process_complete_request()
2249                span.update(output=result)
2250
2251                # Score the overall trace
2252                langfuse.score_current_trace(
2253                    name="overall_quality",
2254                    value=0.95,
2255                    data_type="NUMERIC",
2256                    comment="High quality end-to-end response",
2257                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2258                )
2259            ```
2260        """
2261        current_span = self._get_current_otel_span()
2262
2263        if current_span is not None:
2264            trace_id = self._get_otel_trace_id(current_span)
2265
2266            langfuse_logger.info(
2267                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2268            )
2269
2270            self.create_score(
2271                trace_id=trace_id,
2272                name=name,
2273                value=cast(str, value),
2274                score_id=score_id,
2275                data_type=cast(Literal["CATEGORICAL"], data_type),
2276                comment=comment,
2277                config_id=config_id,
2278                metadata=metadata,
2279            )
2280
2281    def flush(self) -> None:
2282        """Force flush all pending spans and events to the Langfuse API.
2283
2284        This method manually flushes any pending spans, scores, and other events to the
2285        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2286        before proceeding, without waiting for the automatic flush interval.
2287
2288        Example:
2289            ```python
2290            # Record some spans and scores
2291            with langfuse.start_as_current_span(name="operation") as span:
2292                # Do work...
2293                pass
2294
2295            # Ensure all data is sent to Langfuse before proceeding
2296            langfuse.flush()
2297
2298            # Continue with other work
2299            ```
2300        """
2301        if self._resources is not None:
2302            self._resources.flush()
2303
2304    def shutdown(self) -> None:
2305        """Shut down the Langfuse client and flush all pending data.
2306
2307        This method cleanly shuts down the Langfuse client, ensuring all pending data
2308        is flushed to the API and all background threads are properly terminated.
2309
2310        It's important to call this method when your application is shutting down to
2311        prevent data loss and resource leaks. For most applications, using the client
2312        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2313
2314        Example:
2315            ```python
2316            # Initialize Langfuse
2317            langfuse = Langfuse(public_key="...", secret_key="...")
2318
2319            # Use Langfuse throughout your application
2320            # ...
2321
2322            # When application is shutting down
2323            langfuse.shutdown()
2324            ```
2325        """
2326        if self._resources is not None:
2327            self._resources.shutdown()
2328
2329    def get_current_trace_id(self) -> Optional[str]:
2330        """Get the trace ID of the current active span.
2331
2332        This method retrieves the trace ID from the currently active span in the context.
2333        It can be used to get the trace ID for referencing in logs, external systems,
2334        or for creating related operations.
2335
2336        Returns:
2337            The current trace ID as a 32-character lowercase hexadecimal string,
2338            or None if there is no active span.
2339
2340        Example:
2341            ```python
2342            with langfuse.start_as_current_span(name="process-request") as span:
2343                # Get the current trace ID for reference
2344                trace_id = langfuse.get_current_trace_id()
2345
2346                # Use it for external correlation
2347                log.info(f"Processing request with trace_id: {trace_id}")
2348
2349                # Or pass to another system
2350                external_system.process(data, trace_id=trace_id)
2351            ```
2352        """
2353        if not self._tracing_enabled:
2354            langfuse_logger.debug(
2355                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2356            )
2357            return None
2358
2359        current_otel_span = self._get_current_otel_span()
2360
2361        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None
2362
2363    def get_current_observation_id(self) -> Optional[str]:
2364        """Get the observation ID (span ID) of the current active span.
2365
2366        This method retrieves the observation ID from the currently active span in the context.
2367        It can be used to get the observation ID for referencing in logs, external systems,
2368        or for creating scores or other related operations.
2369
2370        Returns:
2371            The current observation ID as a 16-character lowercase hexadecimal string,
2372            or None if there is no active span.
2373
2374        Example:
2375            ```python
2376            with langfuse.start_as_current_span(name="process-user-query") as span:
2377                # Get the current observation ID
2378                observation_id = langfuse.get_current_observation_id()
2379
2380                # Store it for later reference
2381                cache.set(f"query_{query_id}_observation", observation_id)
2382
2383                # Process the query...
2384            ```
2385        """
2386        if not self._tracing_enabled:
2387            langfuse_logger.debug(
2388                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2389            )
2390            return None
2391
2392        current_otel_span = self._get_current_otel_span()
2393
2394        return self._get_otel_span_id(current_otel_span) if current_otel_span else None
2395
2396    def _get_project_id(self) -> Optional[str]:
2397        """Fetch and return the current project id. Persisted across requests. Returns None if no project id is found for the configured API keys."""
2398        if not self._project_id:
2399            proj = self.api.projects.get()
2400            if not proj.data or not proj.data[0].id:
2401                return None
2402
2403            self._project_id = proj.data[0].id
2404
2405        return self._project_id
2406
2407    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2408        """Get the URL to view a trace in the Langfuse UI.
2409
2410        This method generates a URL that links directly to a trace in the Langfuse UI.
2411        It's useful for providing links in logs, notifications, or debugging tools.
2412
2413        Args:
2414            trace_id: Optional trace ID to generate a URL for. If not provided,
2415                     the trace ID of the current active span will be used.
2416
2417        Returns:
2418            A URL string pointing to the trace in the Langfuse UI,
2419            or None if the project ID couldn't be retrieved or no trace ID is available.
2420
2421        Example:
2422            ```python
2423            # Get URL for the current trace
2424            with langfuse.start_as_current_span(name="process-request") as span:
2425                trace_url = langfuse.get_trace_url()
2426                log.info(f"Processing trace: {trace_url}")
2427
2428            # Get URL for a specific trace
2429            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2430            send_notification(f"Review needed for trace: {specific_trace_url}")
2431            ```
2432        """
2433        project_id = self._get_project_id()
2434        final_trace_id = trace_id or self.get_current_trace_id()
2435
2436        return (
2437            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2438            if project_id and final_trace_id
2439            else None
2440        )
2441
2442    def get_dataset(
2443        self, name: str, *, fetch_items_page_size: Optional[int] = 50
2444    ) -> "DatasetClient":
2445        """Fetch a dataset by its name.
2446
2447        Args:
2448            name (str): The name of the dataset to fetch.
2449            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2450
2451        Returns:
2452            DatasetClient: The dataset with the given name.
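
        Example:
            Illustrative sketch (assumes a dataset named "my-eval-dataset" exists in your Langfuse project):

            ```python
            dataset = langfuse.get_dataset("my-eval-dataset")

            print(f"Fetched {len(dataset.items)} items")
            ```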
2453        """
2454        try:
2455            langfuse_logger.debug(f"Getting dataset {name}")
2456            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2457
2458            dataset_items = []
2459            page = 1
2460
2461            while True:
2462                new_items = self.api.dataset_items.list(
2463                    dataset_name=self._url_encode(name, is_url_param=True),
2464                    page=page,
2465                    limit=fetch_items_page_size,
2466                )
2467                dataset_items.extend(new_items.data)
2468
2469                if new_items.meta.total_pages <= page:
2470                    break
2471
2472                page += 1
2473
2474            items = [DatasetItemClient(i, langfuse=self) for i in dataset_items]
2475
2476            return DatasetClient(dataset, items=items)
2477
2478        except Error as e:
2479            handle_fern_exception(e)
2480            raise e
2481
2482    def get_dataset_run(
2483        self, *, dataset_name: str, run_name: str
2484    ) -> DatasetRunWithItems:
2485        """Fetch a dataset run by dataset name and run name.
2486
2487        Args:
2488            dataset_name (str): The name of the dataset.
2489            run_name (str): The name of the run.
2490
2491        Returns:
2492            DatasetRunWithItems: The dataset run with its items.
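
        Example:
            Illustrative sketch (dataset and run names are placeholders):

            ```python
            run = langfuse.get_dataset_run(
                dataset_name="my-eval-dataset",
                run_name="experiment-run-1",
            )
            ```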
2493        """
2494        try:
2495            return self.api.datasets.get_run(
2496                dataset_name=self._url_encode(dataset_name),
2497                run_name=self._url_encode(run_name),
2498                request_options=None,
2499            )
2500        except Error as e:
2501            handle_fern_exception(e)
2502            raise e
2503
2504    def get_dataset_runs(
2505        self,
2506        *,
2507        dataset_name: str,
2508        page: Optional[int] = None,
2509        limit: Optional[int] = None,
2510    ) -> PaginatedDatasetRuns:
2511        """Fetch all runs for a dataset.
2512
2513        Args:
2514            dataset_name (str): The name of the dataset.
2515            page (Optional[int]): Page number, starts at 1.
2516            limit (Optional[int]): Limit of items per page.
2517
2518        Returns:
2519            PaginatedDatasetRuns: Paginated list of dataset runs.
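
        Example:
            Illustrative sketch (the dataset name is a placeholder):

            ```python
            runs = langfuse.get_dataset_runs(
                dataset_name="my-eval-dataset",
                page=1,
                limit=10,
            )
            ```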
2520        """
2521        try:
2522            return self.api.datasets.get_runs(
2523                dataset_name=self._url_encode(dataset_name),
2524                page=page,
2525                limit=limit,
2526                request_options=None,
2527            )
2528        except Error as e:
2529            handle_fern_exception(e)
2530            raise e
2531
2532    def delete_dataset_run(
2533        self, *, dataset_name: str, run_name: str
2534    ) -> DeleteDatasetRunResponse:
2535        """Delete a dataset run and all its run items. This action is irreversible.
2536
2537        Args:
2538            dataset_name (str): The name of the dataset.
2539            run_name (str): The name of the run.
2540
2541        Returns:
2542            DeleteDatasetRunResponse: Confirmation of deletion.
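
        Example:
            Illustrative sketch (irreversible; dataset and run names are placeholders):

            ```python
            langfuse.delete_dataset_run(
                dataset_name="my-eval-dataset",
                run_name="experiment-run-1",
            )
            ```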
2543        """
2544        try:
2545            return self.api.datasets.delete_run(
2546                dataset_name=self._url_encode(dataset_name),
2547                run_name=self._url_encode(run_name),
2548                request_options=None,
2549            )
2550        except Error as e:
2551            handle_fern_exception(e)
2552            raise e
2553
2554    def run_experiment(
2555        self,
2556        *,
2557        name: str,
2558        run_name: Optional[str] = None,
2559        description: Optional[str] = None,
2560        data: ExperimentData,
2561        task: TaskFunction,
2562        evaluators: List[EvaluatorFunction] = [],
2563        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2564        run_evaluators: List[RunEvaluatorFunction] = [],
2565        max_concurrency: int = 50,
2566        metadata: Optional[Dict[str, str]] = None,
2567    ) -> ExperimentResult:
2568        """Run an experiment on a dataset with automatic tracing and evaluation.
2569
2570        This method executes a task function on each item in the provided dataset,
2571        automatically traces all executions with Langfuse for observability, runs
2572        item-level and run-level evaluators on the outputs, and returns comprehensive
2573        results with evaluation metrics.
2574
2575        The experiment system provides:
2576        - Automatic tracing of all task executions
2577        - Concurrent processing with configurable limits
2578        - Comprehensive error handling that isolates failures
2579        - Integration with Langfuse datasets for experiment tracking
2580        - Flexible evaluation framework supporting both sync and async evaluators
2581
2582        Args:
2583            name: Human-readable name for the experiment. Used for identification
2584                in the Langfuse UI.
2585            run_name: Optional exact name for the experiment run. If provided, it is
2586                used verbatim as the dataset run name when `data` contains Langfuse dataset items.
2587                If not provided, it defaults to the experiment name with an ISO timestamp appended.
2588            description: Optional description explaining the experiment's purpose,
2589                methodology, or expected outcomes.
2590            data: Array of data items to process. Can be either:
2591                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2592                - List of Langfuse DatasetItem objects from dataset.items
2593            task: Function that processes each data item and returns output.
2594                Must accept 'item' as keyword argument and can return sync or async results.
2595                The task function signature should be: task(*, item, **kwargs) -> Any
2596            evaluators: List of functions to evaluate each item's output individually.
2597                Each evaluator receives input, output, expected_output, and metadata.
2598                Can return single Evaluation dict or list of Evaluation dicts.
2599            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2600                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2601                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2602                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2603            run_evaluators: List of functions to evaluate the entire experiment run.
2604                Each run evaluator receives all item_results and can compute aggregate metrics.
2605                Useful for calculating averages, distributions, or cross-item comparisons.
2606            max_concurrency: Maximum number of concurrent task executions (default: 50).
2607                Controls the number of items processed simultaneously. Adjust based on
2608                API rate limits and system resources.
2609            metadata: Optional metadata dictionary to attach to all experiment traces.
2610                This metadata will be included in every trace created during the experiment.
2611                If `data` consists of Langfuse dataset items, the metadata is also attached to the dataset run.
2612
2613        Returns:
2614            ExperimentResult containing:
2615            - run_name: The experiment run name. This equals the dataset run name if the experiment was run on a Langfuse dataset.
2616            - item_results: List of results for each processed item with outputs and evaluations
2617            - run_evaluations: List of aggregate evaluation results for the entire run
2618            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2619            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2620
2621        Raises:
2622            ValueError: If required parameters are missing or invalid
2623            Exception: If experiment setup fails (individual item failures are handled gracefully)
2624
2625        Examples:
2626            Basic experiment with local data:
2627            ```python
2628            def summarize_text(*, item, **kwargs):
2629                return f"Summary: {item['input'][:50]}..."
2630
2631            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2632                return {
2633                    "name": "output_length",
2634                    "value": len(output),
2635                    "comment": f"Output contains {len(output)} characters"
2636                }
2637
2638            result = langfuse.run_experiment(
2639                name="Text Summarization Test",
2640                description="Evaluate summarization quality and length",
2641                data=[
2642                    {"input": "Long article text...", "expected_output": "Expected summary"},
2643                    {"input": "Another article...", "expected_output": "Another summary"}
2644                ],
2645                task=summarize_text,
2646                evaluators=[length_evaluator]
2647            )
2648
2649            print(f"Processed {len(result.item_results)} items")
2650            for item_result in result.item_results:
2651                print(f"Input: {item_result.item['input']}")
2652                print(f"Output: {item_result.output}")
2653                print(f"Evaluations: {item_result.evaluations}")
2654            ```
2655
2656            Advanced experiment with async task and multiple evaluators:
2657            ```python
2658            async def llm_task(*, item, **kwargs):
2659                # Simulate async LLM call
2660                response = await openai_client.chat.completions.create(
2661                    model="gpt-4",
2662                    messages=[{"role": "user", "content": item["input"]}]
2663                )
2664                return response.choices[0].message.content
2665
2666            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2667                if expected_output and expected_output.lower() in output.lower():
2668                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2669                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2670
2671            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2672                # Simulate toxicity check
2673                toxicity_score = check_toxicity(output)  # Your toxicity checker
2674                return {
2675                    "name": "toxicity",
2676                    "value": toxicity_score,
2677                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2678                }
2679
2680            def average_accuracy(*, item_results, **kwargs):
2681                accuracies = [
2682                    eval.value for result in item_results
2683                    for eval in result.evaluations
2684                    if eval.name == "accuracy"
2685                ]
2686                return {
2687                    "name": "average_accuracy",
2688                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2689                    "comment": f"Average accuracy across {len(accuracies)} items"
2690                }
2691
2692            result = langfuse.run_experiment(
2693                name="LLM Safety and Accuracy Test",
2694                description="Evaluate model accuracy and safety across diverse prompts",
2695                data=test_dataset,  # Your dataset items
2696                task=llm_task,
2697                evaluators=[accuracy_evaluator, toxicity_evaluator],
2698                run_evaluators=[average_accuracy],
2699                max_concurrency=5,  # Limit concurrent API calls
2700                metadata={"model": "gpt-4", "temperature": 0.7}
2701            )
2702            ```
2703
2704            Using with Langfuse datasets:
2705            ```python
2706            # Get dataset from Langfuse
2707            dataset = langfuse.get_dataset("my-eval-dataset")
2708
2709            result = dataset.run_experiment(
2710                name="Production Model Evaluation",
2711                description="Monthly evaluation of production model performance",
2712                task=my_production_task,
2713                evaluators=[accuracy_evaluator, latency_evaluator]
2714            )
2715
2716            # Results automatically linked to dataset in Langfuse UI
2717            print(f"View results: {result['dataset_run_url']}")
2718            ```
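
            Composite evaluator combining item-level scores (a minimal sketch; it reuses the accuracy and toxicity evaluators and the `test_dataset`/`llm_task` names from the example above, and the pass/fail thresholds are illustrative, not library defaults):
            ```python
            from langfuse import Evaluation

            def safety_gate(*, input, output, expected_output=None, metadata=None, evaluations, **kwargs):
                # Combine the item-level evaluations into a single pass/fail score:
                # the item passes only if it was judged accurate and non-toxic.
                scores = {e.name: e.value for e in evaluations}
                passed = scores.get("accuracy", 0) >= 1.0 and scores.get("toxicity", 1.0) <= 0.7
                return Evaluation(
                    name="safety_gate",
                    value=1.0 if passed else 0.0,
                    comment="accurate and non-toxic" if passed else "failed accuracy or toxicity check",
                )

            result = langfuse.run_experiment(
                name="LLM Safety and Accuracy Test",
                data=test_dataset,  # same dataset as in the example above
                task=llm_task,
                evaluators=[accuracy_evaluator, toxicity_evaluator],
                composite_evaluator=safety_gate,
            )
            ```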
2719
2720        Note:
2721            - Task and evaluator functions can be either synchronous or asynchronous
2722            - Individual item failures are logged but don't stop the experiment
2723            - All executions are automatically traced and visible in Langfuse UI
2724            - When using Langfuse datasets, results are automatically linked for easy comparison
2725            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2726            - Async execution is handled automatically with smart event loop detection
2727        """
2728        return cast(
2729            ExperimentResult,
2730            run_async_safely(
2731                self._run_experiment_async(
2732                    name=name,
2733                    run_name=self._create_experiment_run_name(
2734                        name=name, run_name=run_name
2735                    ),
2736                    description=description,
2737                    data=data,
2738                    task=task,
2739                    evaluators=evaluators or [],
2740                    composite_evaluator=composite_evaluator,
2741                    run_evaluators=run_evaluators or [],
2742                    max_concurrency=max_concurrency,
2743                    metadata=metadata,
2744                ),
2745            ),
2746        )
2747
2748    async def _run_experiment_async(
2749        self,
2750        *,
2751        name: str,
2752        run_name: str,
2753        description: Optional[str],
2754        data: ExperimentData,
2755        task: TaskFunction,
2756        evaluators: List[EvaluatorFunction],
2757        composite_evaluator: Optional[CompositeEvaluatorFunction],
2758        run_evaluators: List[RunEvaluatorFunction],
2759        max_concurrency: int,
2760        metadata: Optional[Dict[str, Any]] = None,
2761    ) -> ExperimentResult:
2762        langfuse_logger.debug(
2763            f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
2764        )
2765
2766        # Set up concurrency control
2767        semaphore = asyncio.Semaphore(max_concurrency)
2768
2769        # Process all items
2770        async def process_item(item: ExperimentItem) -> ExperimentItemResult:
2771            async with semaphore:
2772                return await self._process_experiment_item(
2773                    item,
2774                    task,
2775                    evaluators,
2776                    composite_evaluator,
2777                    name,
2778                    run_name,
2779                    description,
2780                    metadata,
2781                )
2782
2783        # Run all items concurrently
2784        tasks = [process_item(item) for item in data]
2785        item_results = await asyncio.gather(*tasks, return_exceptions=True)
2786
2787        # Filter out any exceptions and log errors
2788        valid_results: List[ExperimentItemResult] = []
2789        for i, result in enumerate(item_results):
2790            if isinstance(result, Exception):
2791                langfuse_logger.error(f"Item {i} failed: {result}")
2792            elif isinstance(result, ExperimentItemResult):
2793                valid_results.append(result)  # type: ignore
2794
2795        # Run experiment-level evaluators
2796        run_evaluations: List[Evaluation] = []
2797        for run_evaluator in run_evaluators:
2798            try:
2799                evaluations = await _run_evaluator(
2800                    run_evaluator, item_results=valid_results
2801                )
2802                run_evaluations.extend(evaluations)
2803            except Exception as e:
2804                langfuse_logger.error(f"Run evaluator failed: {e}")
2805
2806        # Generate dataset run URL if applicable
2807        dataset_run_id = valid_results[0].dataset_run_id if valid_results else None
2808        dataset_run_url = None
2809        if dataset_run_id and data:
2810            try:
2811                # Check if the first item has dataset_id (for DatasetItem objects)
2812                first_item = data[0]
2813                dataset_id = None
2814
2815                if hasattr(first_item, "dataset_id"):
2816                    dataset_id = getattr(first_item, "dataset_id", None)
2817
2818                if dataset_id:
2819                    project_id = self._get_project_id()
2820
2821                    if project_id:
2822                        dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"
2823
2824            except Exception:
2825                pass  # URL generation is optional
2826
2827        # Store run-level evaluations as scores
2828        for evaluation in run_evaluations:
2829            try:
2830                if dataset_run_id:
2831                    self.create_score(
2832                        dataset_run_id=dataset_run_id,
2833                        name=evaluation.name or "<unknown>",
2834                        value=evaluation.value,  # type: ignore
2835                        comment=evaluation.comment,
2836                        metadata=evaluation.metadata,
2837                        data_type=evaluation.data_type,  # type: ignore
2838                        config_id=evaluation.config_id,
2839                    )
2840
2841            except Exception as e:
2842                langfuse_logger.error(f"Failed to store run evaluation: {e}")
2843
2844        # Flush scores and traces
2845        self.flush()
2846
2847        return ExperimentResult(
2848            name=name,
2849            run_name=run_name,
2850            description=description,
2851            item_results=valid_results,
2852            run_evaluations=run_evaluations,
2853            dataset_run_id=dataset_run_id,
2854            dataset_run_url=dataset_run_url,
2855        )
2856
2857    async def _process_experiment_item(
2858        self,
2859        item: ExperimentItem,
2860        task: Callable,
2861        evaluators: List[Callable],
2862        composite_evaluator: Optional[CompositeEvaluatorFunction],
2863        experiment_name: str,
2864        experiment_run_name: str,
2865        experiment_description: Optional[str],
2866        experiment_metadata: Optional[Dict[str, Any]] = None,
2867    ) -> ExperimentItemResult:
2868        span_name = "experiment-item-run"
2869
2870        with self.start_as_current_span(name=span_name) as span:
2871            try:
2872                input_data = (
2873                    item.get("input")
2874                    if isinstance(item, dict)
2875                    else getattr(item, "input", None)
2876                )
2877
2878                if input_data is None:
2879                    raise ValueError("Experiment Item is missing input. Skipping item.")
2880
2881                expected_output = (
2882                    item.get("expected_output")
2883                    if isinstance(item, dict)
2884                    else getattr(item, "expected_output", None)
2885                )
2886
2887                item_metadata = (
2888                    item.get("metadata")
2889                    if isinstance(item, dict)
2890                    else getattr(item, "metadata", None)
2891                )
2892
2893                final_observation_metadata = {
2894                    "experiment_name": experiment_name,
2895                    "experiment_run_name": experiment_run_name,
2896                    **(experiment_metadata or {}),
2897                }
2898
2899                trace_id = span.trace_id
2900                dataset_id = None
2901                dataset_item_id = None
2902                dataset_run_id = None
2903
2904                # Link to dataset run if this is a dataset item
2905                if hasattr(item, "id") and hasattr(item, "dataset_id"):
2906                    try:
2907                        # Use sync API to avoid event loop issues when run_async_safely
2908                        # creates multiple event loops across different threads
2909                        dataset_run_item = await asyncio.to_thread(
2910                            self.api.dataset_run_items.create,
2911                            request=CreateDatasetRunItemRequest(
2912                                runName=experiment_run_name,
2913                                runDescription=experiment_description,
2914                                metadata=experiment_metadata,
2915                                datasetItemId=item.id,  # type: ignore
2916                                traceId=trace_id,
2917                                observationId=span.id,
2918                            ),
2919                        )
2920
2921                        dataset_run_id = dataset_run_item.dataset_run_id
2922
2923                    except Exception as e:
2924                        langfuse_logger.error(f"Failed to create dataset run item: {e}")
2925
2926                if (
2927                    not isinstance(item, dict)
2928                    and hasattr(item, "dataset_id")
2929                    and hasattr(item, "id")
2930                ):
2931                    dataset_id = item.dataset_id
2932                    dataset_item_id = item.id
2933
2934                    final_observation_metadata.update(
2935                        {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id}
2936                    )
2937
2938                if isinstance(item_metadata, dict):
2939                    final_observation_metadata.update(item_metadata)
2940
2941                experiment_id = dataset_run_id or self._create_observation_id()
2942                experiment_item_id = (
2943                    dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16]
2944                )
2945                span._otel_span.set_attributes(
2946                    {
2947                        k: v
2948                        for k, v in {
2949                            LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT,
2950                            LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description,
2951                            LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize(
2952                                expected_output
2953                            ),
2954                        }.items()
2955                        if v is not None
2956                    }
2957                )
2958
2959                propagated_experiment_attributes = PropagatedExperimentAttributes(
2960                    experiment_id=experiment_id,
2961                    experiment_name=experiment_run_name,
2962                    experiment_metadata=_serialize(experiment_metadata),
2963                    experiment_dataset_id=dataset_id,
2964                    experiment_item_id=experiment_item_id,
2965                    experiment_item_metadata=_serialize(item_metadata),
2966                    experiment_item_root_observation_id=span.id,
2967                )
2968
2969                with _propagate_attributes(experiment=propagated_experiment_attributes):
2970                    output = await _run_task(task, item)
2971
2972                span.update(
2973                    input=input_data,
2974                    output=output,
2975                    metadata=final_observation_metadata,
2976                )
2977
2978            except Exception as e:
2979                span.update(
2980                    output=f"Error: {str(e)}", level="ERROR", status_message=str(e)
2981                )
2982                raise e
2983
2984            # Run evaluators
2985            evaluations = []
2986
2987            for evaluator in evaluators:
2988                try:
2989                    eval_metadata: Optional[Dict[str, Any]] = None
2990
2991                    if isinstance(item, dict):
2992                        eval_metadata = item.get("metadata")
2993                    elif hasattr(item, "metadata"):
2994                        eval_metadata = item.metadata
2995
2996                    with _propagate_attributes(
2997                        experiment=propagated_experiment_attributes
2998                    ):
2999                        eval_results = await _run_evaluator(
3000                            evaluator,
3001                            input=input_data,
3002                            output=output,
3003                            expected_output=expected_output,
3004                            metadata=eval_metadata,
3005                        )
3006                        evaluations.extend(eval_results)
3007
3008                        # Store evaluations as scores
3009                        for evaluation in eval_results:
3010                            self.create_score(
3011                                trace_id=trace_id,
3012                                observation_id=span.id,
3013                                name=evaluation.name,
3014                                value=evaluation.value,  # type: ignore
3015                                comment=evaluation.comment,
3016                                metadata=evaluation.metadata,
3017                                config_id=evaluation.config_id,
3018                                data_type=evaluation.data_type,  # type: ignore
3019                            )
3020
3021                except Exception as e:
3022                    langfuse_logger.error(f"Evaluator failed: {e}")
3023
3024            # Run composite evaluator if provided and we have evaluations
3025            if composite_evaluator and evaluations:
3026                try:
3027                    composite_eval_metadata: Optional[Dict[str, Any]] = None
3028                    if isinstance(item, dict):
3029                        composite_eval_metadata = item.get("metadata")
3030                    elif hasattr(item, "metadata"):
3031                        composite_eval_metadata = item.metadata
3032
3033                    with _propagate_attributes(
3034                        experiment=propagated_experiment_attributes
3035                    ):
3036                        result = composite_evaluator(
3037                            input=input_data,
3038                            output=output,
3039                            expected_output=expected_output,
3040                            metadata=composite_eval_metadata,
3041                            evaluations=evaluations,
3042                        )
3043
3044                        # Handle async composite evaluators
3045                        if asyncio.iscoroutine(result):
3046                            result = await result
3047
3048                        # Normalize to list
3049                        composite_evals: List[Evaluation] = []
3050                        if isinstance(result, (dict, Evaluation)):
3051                            composite_evals = [result]  # type: ignore
3052                        elif isinstance(result, list):
3053                            composite_evals = result  # type: ignore
3054
3055                        # Store composite evaluations as scores and add to evaluations list
3056                        for composite_evaluation in composite_evals:
3057                            self.create_score(
3058                                trace_id=trace_id,
3059                                observation_id=span.id,
3060                                name=composite_evaluation.name,
3061                                value=composite_evaluation.value,  # type: ignore
3062                                comment=composite_evaluation.comment,
3063                                metadata=composite_evaluation.metadata,
3064                                config_id=composite_evaluation.config_id,
3065                                data_type=composite_evaluation.data_type,  # type: ignore
3066                            )
3067                            evaluations.append(composite_evaluation)
3068
3069                except Exception as e:
3070                    langfuse_logger.error(f"Composite evaluator failed: {e}")
3071
3072            return ExperimentItemResult(
3073                item=item,
3074                output=output,
3075                evaluations=evaluations,
3076                trace_id=trace_id,
3077                dataset_run_id=dataset_run_id,
3078            )
3079
3080    def _create_experiment_run_name(
3081        self, *, name: Optional[str] = None, run_name: Optional[str] = None
3082    ) -> str:
3083        if run_name:
3084            return run_name
3085
3086        iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z")
3087
3088        return f"{name} - {iso_timestamp}"
3089
3090    def run_batched_evaluation(
3091        self,
3092        *,
3093        scope: Literal["traces", "observations"],
3094        mapper: MapperFunction,
3095        filter: Optional[str] = None,
3096        fetch_batch_size: int = 50,
3097        max_items: Optional[int] = None,
3098        max_retries: int = 3,
3099        evaluators: List[EvaluatorFunction],
3100        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3101        max_concurrency: int = 50,
3102        metadata: Optional[Dict[str, Any]] = None,
3103        resume_from: Optional[BatchEvaluationResumeToken] = None,
3104        verbose: bool = False,
3105    ) -> BatchEvaluationResult:
3106        """Fetch traces or observations and run evaluations on each item.
3107
3108        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3109        It fetches items based on filters, transforms them using a mapper function, runs
3110        evaluators on each item, and creates scores that are linked back to the original
3111        entities. This is ideal for:
3112
3113        - Running evaluations on production traces after deployment
3114        - Backtesting new evaluation metrics on historical data
3115        - Batch scoring of observations for quality monitoring
3116        - Periodic evaluation runs on recent data
3117
3118        The method uses a streaming/pipeline approach to process items in batches, making
3119        it memory-efficient for large datasets. It includes comprehensive error handling,
3120        retry logic, and resume capability for long-running evaluations.
3121
3122        Args:
3123            scope: The type of items to evaluate. Must be one of:
3124                - "traces": Evaluate complete traces with all their observations
3125                - "observations": Evaluate individual observations (spans, generations, events)
3126            mapper: Function that transforms API response objects into evaluator inputs.
3127                Receives a trace/observation object and returns an EvaluatorInputs
3128                instance with input, output, expected_output, and metadata fields.
3129                Can be sync or async.
3130            evaluators: List of evaluation functions to run on each item. Each evaluator
3131                receives the mapped inputs and returns Evaluation object(s). Evaluator
3132                failures are logged but don't stop the batch evaluation.
3133            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3134                - '{"tags": ["production"]}'
3135                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3136                Default: None (fetches all items).
3137            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3138                Larger values may be faster but use more memory. Default: 50.
3139            max_items: Maximum total number of items to process. If None, processes all
3140                items matching the filter. Useful for testing or limiting evaluation runs.
3141                Default: None (process all).
3142            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3143                parallelism and resource usage. Default: 50.
3144            composite_evaluator: Optional function that creates a composite score from
3145                item-level evaluations. Receives the original item and its evaluations,
3146                returns a single Evaluation. Useful for weighted averages or combined metrics.
3147                Default: None.
3148            metadata: Optional metadata dict to add to all created scores. Useful for
3149                tracking evaluation runs, versions, or other context. Default: None.
3150            max_retries: Maximum number of retry attempts for failed batch fetches.
3151                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3152            verbose: If True, logs progress information to console. Useful for monitoring
3153                long-running evaluations. Default: False.
3154            resume_from: Optional resume token from a previous incomplete run. Allows
3155                continuing evaluation after interruption or failure. Default: None.
3156
3157
3158        Returns:
3159            BatchEvaluationResult containing:
3160                - total_items_fetched: Number of items fetched from API
3161                - total_items_processed: Number of items successfully evaluated
3162                - total_items_failed: Number of items that failed evaluation
3163                - total_scores_created: Scores created by item-level evaluators
3164                - total_composite_scores_created: Scores created by composite evaluator
3165                - total_evaluations_failed: Individual evaluator failures
3166                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3167                - resume_token: Token for resuming if incomplete (None if completed)
3168                - completed: True if all items processed
3169                - duration_seconds: Total execution time
3170                - failed_item_ids: IDs of items that failed
3171                - error_summary: Error types and counts
3172                - has_more_items: True if max_items reached but more exist
3173
3174        Raises:
3175            ValueError: If invalid scope is provided.
3176
3177        Examples:
3178            Basic trace evaluation:
3179            ```python
3180            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3181
3182            client = Langfuse()
3183
3184            # Define mapper to extract fields from traces
3185            def trace_mapper(trace):
3186                return EvaluatorInputs(
3187                    input=trace.input,
3188                    output=trace.output,
3189                    expected_output=None,
3190                    metadata={"trace_id": trace.id}
3191                )
3192
3193            # Define evaluator
3194            def length_evaluator(*, input, output, expected_output, metadata):
3195                return Evaluation(
3196                    name="output_length",
3197                    value=len(output) if output else 0
3198                )
3199
3200            # Run batch evaluation
3201            result = client.run_batched_evaluation(
3202                scope="traces",
3203                mapper=trace_mapper,
3204                evaluators=[length_evaluator],
3205                filter='{"tags": ["production"]}',
3206                max_items=1000,
3207                verbose=True
3208            )
3209
3210            print(f"Processed {result.total_items_processed} traces")
3211            print(f"Created {result.total_scores_created} scores")
3212            ```
3213
3214            Evaluation with composite scorer:
3215            ```python
3216            def accuracy_evaluator(*, input, output, expected_output, metadata):
3217                # ... evaluation logic
3218                return Evaluation(name="accuracy", value=0.85)
3219
3220            def relevance_evaluator(*, input, output, expected_output, metadata):
3221                # ... evaluation logic
3222                return Evaluation(name="relevance", value=0.92)
3223
3224            def composite_evaluator(*, item, evaluations):
3225                # Weighted average of evaluations
3226                weights = {"accuracy": 0.6, "relevance": 0.4}
3227                total = sum(
3228                    e.value * weights.get(e.name, 0)
3229                    for e in evaluations
3230                    if isinstance(e.value, (int, float))
3231                )
3232                return Evaluation(
3233                    name="composite_score",
3234                    value=total,
3235                    comment=f"Weighted average of {len(evaluations)} metrics"
3236                )
3237
3238            result = client.run_batched_evaluation(
3239                scope="traces",
3240                mapper=trace_mapper,
3241                evaluators=[accuracy_evaluator, relevance_evaluator],
3242                composite_evaluator=composite_evaluator,
3243                filter='{"user_id": "important_user"}',
3244                verbose=True
3245            )
3246            ```
3247
3248            Handling incomplete runs with resume:
3249            ```python
3250            # Initial run that may fail or timeout
3251            result = client.run_batched_evaluation(
3252                scope="observations",
3253                mapper=obs_mapper,
3254                evaluators=[my_evaluator],
3255                max_items=10000,
3256                verbose=True
3257            )
3258
3259            # Check if incomplete
3260            if not result.completed and result.resume_token:
3261                print(f"Processed {result.resume_token.items_processed} items before interruption")
3262
3263                # Resume from where it left off
3264                result = client.run_batched_evaluation(
3265                    scope="observations",
3266                    mapper=obs_mapper,
3267                    evaluators=[my_evaluator],
3268                    resume_from=result.resume_token,
3269                    verbose=True
3270                )
3271
3272            print(f"Total items processed: {result.total_items_processed}")
3273            ```
3274
3275            Monitoring evaluator performance:
3276            ```python
3277            result = client.run_batched_evaluation(...)
3278
3279            for stats in result.evaluator_stats:
3280                success_rate = stats.successful_runs / stats.total_runs
3281                print(f"{stats.name}:")
3282                print(f"  Success rate: {success_rate:.1%}")
3283                print(f"  Scores created: {stats.total_scores_created}")
3284
3285                if stats.failed_runs > 0:
3286                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3287            ```
3288
3289        Note:
3290            - Evaluator failures are logged but don't stop the batch evaluation
3291            - Individual item failures are tracked but don't stop processing
3292            - Fetch failures are retried with exponential backoff
3293            - All scores are automatically flushed to Langfuse at the end
3294            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3295        """
3296        runner = BatchEvaluationRunner(self)
3297
3298        return cast(
3299            BatchEvaluationResult,
3300            run_async_safely(
3301                runner.run_async(
3302                    scope=scope,
3303                    mapper=mapper,
3304                    evaluators=evaluators,
3305                    filter=filter,
3306                    fetch_batch_size=fetch_batch_size,
3307                    max_items=max_items,
3308                    max_concurrency=max_concurrency,
3309                    composite_evaluator=composite_evaluator,
3310                    metadata=metadata,
3311                    max_retries=max_retries,
3312                    verbose=verbose,
3313                    resume_from=resume_from,
3314                )
3315            ),
3316        )
3317
3318    def auth_check(self) -> bool:
3319        """Check if the provided credentials (public and secret key) are valid.
3320
3321        Raises:
3322            Exception: If no projects were found for the provided credentials.
3323
3324        Note:
3325            This method is blocking; using it in production code is discouraged.
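
        Example:
            A quick startup credential check (a minimal sketch; assumes keys are configured via environment variables):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            if not langfuse.auth_check():
                raise RuntimeError("Langfuse credentials are invalid or the client is not initialized")
            ```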
3326        """
3327        try:
3328            projects = self.api.projects.get()
3329            langfuse_logger.debug(
3330                f"Auth check successful, found {len(projects.data)} projects"
3331            )
3332            if len(projects.data) == 0:
3333                raise Exception(
3334                    "Auth check failed, no project found for the keys provided."
3335                )
3336            return True
3337
3338        except AttributeError as e:
3339            langfuse_logger.warning(
3340                f"Auth check failed: Client not properly initialized. Error: {e}"
3341            )
3342            return False
3343
3344        except Error as e:
3345            handle_fern_exception(e)
3346            raise e
3347
3348    def create_dataset(
3349        self,
3350        *,
3351        name: str,
3352        description: Optional[str] = None,
3353        metadata: Optional[Any] = None,
3354        input_schema: Optional[Any] = None,
3355        expected_output_schema: Optional[Any] = None,
3356    ) -> Dataset:
3357        """Create a dataset with the given name on Langfuse.
3358
3359        Args:
3360            name: Name of the dataset to create.
3361            description: Description of the dataset. Defaults to None.
3362            metadata: Additional metadata. Defaults to None.
3363            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3364            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3365
3366        Returns:
3367            Dataset: The created dataset as returned by the Langfuse API.
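
        Example:
            Creating a dataset (a minimal sketch; the dataset name and metadata are hypothetical):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            dataset = langfuse.create_dataset(
                name="capital_cities",
                description="Country to capital mapping for QA evals",
                metadata={"owner": "eval-team"},
            )
            ```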
3368        """
3369        try:
3370            body = CreateDatasetRequest(
3371                name=name,
3372                description=description,
3373                metadata=metadata,
3374                inputSchema=input_schema,
3375                expectedOutputSchema=expected_output_schema,
3376            )
3377            langfuse_logger.debug(f"Creating dataset {body}")
3378
3379            return self.api.datasets.create(request=body)
3380
3381        except Error as e:
3382            handle_fern_exception(e)
3383            raise e
3384
3385    def create_dataset_item(
3386        self,
3387        *,
3388        dataset_name: str,
3389        input: Optional[Any] = None,
3390        expected_output: Optional[Any] = None,
3391        metadata: Optional[Any] = None,
3392        source_trace_id: Optional[str] = None,
3393        source_observation_id: Optional[str] = None,
3394        status: Optional[DatasetStatus] = None,
3395        id: Optional[str] = None,
3396    ) -> DatasetItem:
3397        """Create a dataset item.
3398
3399        Upserts if an item with id already exists.
3400
3401        Args:
3402            dataset_name: Name of the dataset in which the dataset item should be created.
3403            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3404            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3405            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3406            source_trace_id: Id of the source trace. Defaults to None.
3407            source_observation_id: Id of the source observation. Defaults to None.
3408            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3409            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3410
3411        Returns:
3412            DatasetItem: The created dataset item as returned by the Langfuse API.
3413
3414        Example:
3415            ```python
3416            from langfuse import Langfuse
3417
3418            langfuse = Langfuse()
3419
3420            # Uploading items to the Langfuse dataset named "capital_cities"
3421            langfuse.create_dataset_item(
3422                dataset_name="capital_cities",
3423                input={"input": {"country": "Italy"}},
3424                expected_output={"expected_output": "Rome"},
3425                metadata={"foo": "bar"}
3426            )
3427            ```
3428        """
3429        try:
3430            body = CreateDatasetItemRequest(
3431                datasetName=dataset_name,
3432                input=input,
3433                expectedOutput=expected_output,
3434                metadata=metadata,
3435                sourceTraceId=source_trace_id,
3436                sourceObservationId=source_observation_id,
3437                status=status,
3438                id=id,
3439            )
3440            langfuse_logger.debug(f"Creating dataset item {body}")
3441            return self.api.dataset_items.create(request=body)
3442        except Error as e:
3443            handle_fern_exception(e)
3444            raise e
3445
3446    def resolve_media_references(
3447        self,
3448        *,
3449        obj: Any,
3450        resolve_with: Literal["base64_data_uri"],
3451        max_depth: int = 10,
3452        content_fetch_timeout_seconds: int = 5,
3453    ) -> Any:
3454        """Replace media reference strings in an object with base64 data URIs.
3455
3456        This method recursively traverses an object (up to max_depth) looking for media reference strings
3457        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3458        the provided Langfuse client and replaces the reference string with a base64 data URI.
3459
3460        If fetching media content fails for a reference string, a warning is logged and the reference
3461        string is left unchanged.
3462
3463        Args:
3464            obj: The object to process. Can be a primitive value, array, or nested object.
3465                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3466            resolve_with: The representation of the media content to replace the media reference string with.
3467                Currently only "base64_data_uri" is supported.
3468            max_depth: int: The maximum depth to traverse the object. Default is 10.
3469            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3470
3471        Returns:
3472            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3473            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3474
3475        Example:
3476            obj = {
3477                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3478                "nested": {
3479                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3480                }
3481            }
3482
3483            result = langfuse_client.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
3484
3485            # Result:
3486            # {
3487            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3488            #     "nested": {
3489            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3490            #     }
3491            # }
3492        """
3493        return LangfuseMedia.resolve_media_references(
3494            langfuse_client=self,
3495            obj=obj,
3496            resolve_with=resolve_with,
3497            max_depth=max_depth,
3498            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3499        )
3500
3501    @overload
3502    def get_prompt(
3503        self,
3504        name: str,
3505        *,
3506        version: Optional[int] = None,
3507        label: Optional[str] = None,
3508        type: Literal["chat"],
3509        cache_ttl_seconds: Optional[int] = None,
3510        fallback: Optional[List[ChatMessageDict]] = None,
3511        max_retries: Optional[int] = None,
3512        fetch_timeout_seconds: Optional[int] = None,
3513    ) -> ChatPromptClient: ...
3514
3515    @overload
3516    def get_prompt(
3517        self,
3518        name: str,
3519        *,
3520        version: Optional[int] = None,
3521        label: Optional[str] = None,
3522        type: Literal["text"] = "text",
3523        cache_ttl_seconds: Optional[int] = None,
3524        fallback: Optional[str] = None,
3525        max_retries: Optional[int] = None,
3526        fetch_timeout_seconds: Optional[int] = None,
3527    ) -> TextPromptClient: ...
3528
3529    def get_prompt(
3530        self,
3531        name: str,
3532        *,
3533        version: Optional[int] = None,
3534        label: Optional[str] = None,
3535        type: Literal["chat", "text"] = "text",
3536        cache_ttl_seconds: Optional[int] = None,
3537        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3538        max_retries: Optional[int] = None,
3539        fetch_timeout_seconds: Optional[int] = None,
3540    ) -> PromptClient:
3541        """Get a prompt.
3542
3543        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3544        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3545        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3546        return the expired prompt as a fallback.
3547
3548        Args:
3549            name (str): The name of the prompt to retrieve.
3550
3551        Keyword Args:
3552            version (Optional[int]): The version of the prompt to retrieve. If neither version nor label is specified, the prompt with the `production` label is returned. Specify either version or label, not both.
3553            label: Optional[str]: The label of the prompt to retrieve. If neither version nor label is specified, the prompt with the `production` label is returned. Specify either version or label, not both.
3554            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3555            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3556            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3557            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt content (a string for text prompts or a list of chat messages for chat prompts) to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3558            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3559            fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the SDK-wide timeout, which is 5 seconds by default.
3560
3561        Returns:
3562            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3563            - TextPromptClient, if type argument is 'text'.
3564            - ChatPromptClient, if type argument is 'chat'.
3565
3566        Raises:
3567            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3568            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
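
        Example:
            Fetching and compiling a text prompt (a minimal sketch; the prompt name, variables, and fallback are hypothetical):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            # Fetches the version with the `production` label by default
            prompt = langfuse.get_prompt(
                "movie-critic",
                cache_ttl_seconds=300,
                fallback="As a {{criticlevel}} critic, review the movie {{movie}}.",
            )

            # Substitute the double-curly-brace variables
            compiled = prompt.compile(criticlevel="expert", movie="Dune")
            ```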
3569        """
3570        if self._resources is None:
3571            raise Error(
3572                "SDK is not correctly initialized. Check the init logs for more details."
3573            )
3574        if version is not None and label is not None:
3575            raise ValueError("Cannot specify both version and label at the same time.")
3576
3577        if not name:
3578            raise ValueError("Prompt name cannot be empty.")
3579
3580        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3581        bounded_max_retries = self._get_bounded_max_retries(
3582            max_retries, default_max_retries=2, max_retries_upper_bound=4
3583        )
3584
3585        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3586        cached_prompt = self._resources.prompt_cache.get(cache_key)
3587
3588        if cached_prompt is None or cache_ttl_seconds == 0:
3589            langfuse_logger.debug(
3590                f"Prompt '{cache_key}' not found in cache or caching disabled."
3591            )
3592            try:
3593                return self._fetch_prompt_and_update_cache(
3594                    name,
3595                    version=version,
3596                    label=label,
3597                    ttl_seconds=cache_ttl_seconds,
3598                    max_retries=bounded_max_retries,
3599                    fetch_timeout_seconds=fetch_timeout_seconds,
3600                )
3601            except Exception as e:
3602                if fallback:
3603                    langfuse_logger.warning(
3604                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3605                    )
3606
3607                    fallback_client_args: Dict[str, Any] = {
3608                        "name": name,
3609                        "prompt": fallback,
3610                        "type": type,
3611                        "version": version or 0,
3612                        "config": {},
3613                        "labels": [label] if label else [],
3614                        "tags": [],
3615                    }
3616
3617                    if type == "text":
3618                        return TextPromptClient(
3619                            prompt=Prompt_Text(**fallback_client_args),
3620                            is_fallback=True,
3621                        )
3622
3623                    if type == "chat":
3624                        return ChatPromptClient(
3625                            prompt=Prompt_Chat(**fallback_client_args),
3626                            is_fallback=True,
3627                        )
3628
3629                raise e
3630
3631        if cached_prompt.is_expired():
3632            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3633            try:
3634                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3635                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3636
3637                def refresh_task() -> None:
3638                    self._fetch_prompt_and_update_cache(
3639                        name,
3640                        version=version,
3641                        label=label,
3642                        ttl_seconds=cache_ttl_seconds,
3643                        max_retries=bounded_max_retries,
3644                        fetch_timeout_seconds=fetch_timeout_seconds,
3645                    )
3646
3647                self._resources.prompt_cache.add_refresh_prompt_task(
3648                    cache_key,
3649                    refresh_task,
3650                )
3651                langfuse_logger.debug(
3652                    f"Returning stale prompt '{cache_key}' from cache."
3653                )
3654                # return stale prompt
3655                return cached_prompt.value
3656
3657            except Exception as e:
3658                langfuse_logger.warning(
3659                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3660                )
3661                # creation of refresh prompt task failed, return stale prompt
3662                return cached_prompt.value
3663
3664        return cached_prompt.value
3665
3666    def _fetch_prompt_and_update_cache(
3667        self,
3668        name: str,
3669        *,
3670        version: Optional[int] = None,
3671        label: Optional[str] = None,
3672        ttl_seconds: Optional[int] = None,
3673        max_retries: int,
3674        fetch_timeout_seconds: Optional[int],
3675    ) -> PromptClient:
3676        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3677        langfuse_logger.debug(f"Fetching prompt '{cache_key}' from server...")
3678
3679        try:
3680
3681            @backoff.on_exception(
3682                backoff.constant, Exception, max_tries=max_retries + 1, logger=None
3683            )
3684            def fetch_prompts() -> Any:
3685                return self.api.prompts.get(
3686                    self._url_encode(name),
3687                    version=version,
3688                    label=label,
3689                    request_options={
3690                        "timeout_in_seconds": fetch_timeout_seconds,
3691                    }
3692                    if fetch_timeout_seconds is not None
3693                    else None,
3694                )
3695
3696            prompt_response = fetch_prompts()
3697
3698            prompt: PromptClient
3699            if prompt_response.type == "chat":
3700                prompt = ChatPromptClient(prompt_response)
3701            else:
3702                prompt = TextPromptClient(prompt_response)
3703
3704            if self._resources is not None:
3705                self._resources.prompt_cache.set(cache_key, prompt, ttl_seconds)
3706
3707            return prompt
3708
3709        except NotFoundError as not_found_error:
3710            langfuse_logger.warning(
3711                f"Prompt '{cache_key}' not found during refresh, evicting from cache."
3712            )
3713            if self._resources is not None:
3714                self._resources.prompt_cache.delete(cache_key)
3715            raise not_found_error
3716
3717        except Exception as e:
3718            langfuse_logger.error(
3719                f"Error while fetching prompt '{cache_key}': {str(e)}"
3720            )
3721            raise e
3722
3723    def _get_bounded_max_retries(
3724        self,
3725        max_retries: Optional[int],
3726        *,
3727        default_max_retries: int = 2,
3728        max_retries_upper_bound: int = 4,
3729    ) -> int:
3730        if max_retries is None:
3731            return default_max_retries
3732
3733        bounded_max_retries = min(
3734            max(max_retries, 0),
3735            max_retries_upper_bound,
3736        )
3737
3738        return bounded_max_retries
3739
3740    @overload
3741    def create_prompt(
3742        self,
3743        *,
3744        name: str,
3745        prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]],
3746        labels: List[str] = [],
3747        tags: Optional[List[str]] = None,
3748        type: Optional[Literal["chat"]],
3749        config: Optional[Any] = None,
3750        commit_message: Optional[str] = None,
3751    ) -> ChatPromptClient: ...
3752
3753    @overload
3754    def create_prompt(
3755        self,
3756        *,
3757        name: str,
3758        prompt: str,
3759        labels: List[str] = [],
3760        tags: Optional[List[str]] = None,
3761        type: Optional[Literal["text"]] = "text",
3762        config: Optional[Any] = None,
3763        commit_message: Optional[str] = None,
3764    ) -> TextPromptClient: ...
3765
3766    def create_prompt(
3767        self,
3768        *,
3769        name: str,
3770        prompt: Union[
3771            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3772        ],
3773        labels: List[str] = [],
3774        tags: Optional[List[str]] = None,
3775        type: Optional[Literal["chat", "text"]] = "text",
3776        config: Optional[Any] = None,
3777        commit_message: Optional[str] = None,
3778    ) -> PromptClient:
3779        """Create a new prompt in Langfuse.
3780
3781        Keyword Args:
3782            name : The name of the prompt to be created.
3783            prompt : The content of the prompt to be created.
3784            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3785            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3786            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3787            config: Additional structured data to be saved with the prompt. Defaults to None.
3788            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3789            commit_message: Optional string describing the change.
3790
3791        Returns:
3792            TextPromptClient: The prompt if type argument is 'text'.
3793            ChatPromptClient: The prompt if type argument is 'chat'.
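
        Example:
            Creating a chat prompt served under the 'production' label (a minimal sketch; the prompt name, messages, and config are hypothetical):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            prompt = langfuse.create_prompt(
                name="qa-assistant",
                type="chat",
                prompt=[
                    {"role": "system", "content": "You are a concise Q&A assistant."},
                    {"role": "user", "content": "{{question}}"},
                ],
                labels=["production"],
                config={"model": "gpt-4o", "temperature": 0.2},
            )
            ```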
3794        """
3795        try:
3796            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3797
3798            if type == "chat":
3799                if not isinstance(prompt, list):
3800                    raise ValueError(
3801                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3802                    )
3803                request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = (
3804                    CreatePromptRequest_Chat(
3805                        name=name,
3806                        prompt=cast(Any, prompt),
3807                        labels=labels,
3808                        tags=tags,
3809                        config=config or {},
3810                        commitMessage=commit_message,
3811                        type="chat",
3812                    )
3813                )
3814                server_prompt = self.api.prompts.create(request=request)
3815
3816                if self._resources is not None:
3817                    self._resources.prompt_cache.invalidate(name)
3818
3819                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3820
3821            if not isinstance(prompt, str):
3822                raise ValueError("For 'text' type, 'prompt' must be a string.")
3823
3824            request = CreatePromptRequest_Text(
3825                name=name,
3826                prompt=prompt,
3827                labels=labels,
3828                tags=tags,
3829                config=config or {},
3830                commitMessage=commit_message,
3831                type="text",
3832            )
3833
3834            server_prompt = self.api.prompts.create(request=request)
3835
3836            if self._resources is not None:
3837                self._resources.prompt_cache.invalidate(name)
3838
3839            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3840
3841        except Error as e:
3842            handle_fern_exception(e)
3843            raise e
3844
3845    def update_prompt(
3846        self,
3847        *,
3848        name: str,
3849        version: int,
3850        new_labels: List[str] = [],
3851    ) -> Any:
3852        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3853
3854        Args:
3855            name (str): The name of the prompt to update.
3856            version (int): The version number of the prompt to update.
3857            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3858
3859        Returns:
3860            Prompt: The updated prompt from the Langfuse API.
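
        Example:
            Promoting a specific prompt version (a minimal sketch; the prompt name and version are hypothetical):
            ```python
            from langfuse import Langfuse

            langfuse = Langfuse()

            langfuse.update_prompt(
                name="qa-assistant",
                version=3,
                new_labels=["production"],
            )
            ```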
3861
3862        """
3863        updated_prompt = self.api.prompt_version.update(
3864            name=self._url_encode(name),
3865            version=version,
3866            new_labels=new_labels,
3867        )
3868
3869        if self._resources is not None:
3870            self._resources.prompt_cache.invalidate(name)
3871
3872        return updated_prompt
3873
3874    def _url_encode(self, url: str, *, is_url_param: Optional[bool] = False) -> str:
3875        # httpx ≥ 0.28 does its own WHATWG-compliant quoting (e.g. encodes bare
3876        # “%”, “?”, “#”, “|”, … in query/path parts). Re-quoting here would
3877        # double-encode, so we skip when the value is about to be sent straight
3878        # to httpx (`is_url_param=True`) and the installed version is ≥ 0.28.
3879        if is_url_param and Version(httpx.__version__) >= Version("0.28.0"):
3880            return url
3881
3882        # urllib.parse.quote does not escape slashes "/" by default,
3883        # so we pass safe="" to force escaping of slashes.
3884        # This is necessary for prompts in prompt folders.
3885        return urllib.parse.quote(url, safe="")
3886
3887    def clear_prompt_cache(self) -> None:
3888        """Clear the entire prompt cache, removing all cached prompts.
3889
3890        This method is useful when you want to force a complete refresh of all
3891        cached prompts, for example after major updates or when you need to
3892        ensure the latest versions are fetched from the server.
3893        """
3894        if self._resources is not None:
3895            self._resources.prompt_cache.clear()
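
Taken together, the prompt-management helpers above can be combined as follows — a minimal sketch assuming an already initialized `langfuse = Langfuse(...)` client; the prompt name, labels, and template text are illustrative only:

# Create a new text prompt version (see create_prompt above)
prompt = langfuse.create_prompt(
    name="movie-critic",                      # hypothetical prompt name
    prompt="Review the movie {{movie}} as a critic.",
    type="text",
    labels=["production"],
    commit_message="initial version",
)

# Assign additional labels to an existing version (see update_prompt above)
langfuse.update_prompt(
    name="movie-critic",
    version=1,
    new_labels=["staging"],
)

# Drop all locally cached prompts so the next fetch hits the server
langfuse.clear_prompt_cache()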

Main client for Langfuse tracing and platform features.

This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.

The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.

Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.

Attributes:
  • api: Synchronous API client for Langfuse backend communication
  • async_api: Asynchronous API client for Langfuse backend communication
  • _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
  • public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
  • secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
  • base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
  • host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
  • timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
  • httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
  • debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
  • tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
  • flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
  • flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
  • environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
  • release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
  • media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
  • sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
  • mask (Optional[MaskFunction]): Function to mask sensitive data in traces before sending to the API.
  • blocked_instrumentation_scopes (Optional[List[str]]): List of instrumentation scope names to block from being exported to Langfuse. Spans from these scopes will be filtered out before being sent to the API. Useful for filtering out spans from specific libraries or frameworks. For exported spans, you can see the instrumentation scope name in the span metadata in Langfuse (metadata.scope.name)
  • additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well.
  • tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
Example:
from langfuse import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_span(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
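
The mask parameter listed above accepts a callable that redacts sensitive values from trace payloads before they are sent to the Langfuse API. A minimal sketch, assuming the mask callable receives the payload via a data keyword argument and returns the redacted value; the helper name and regex are illustrative only:

import re

def redact_emails(*, data):
    # Illustrative helper: replace anything that looks like an email address
    if isinstance(data, str):
        return re.sub(r"[\w.+-]+@[\w-]+\.[\w.]+", "[REDACTED EMAIL]", data)
    return data

langfuse = Langfuse(mask=redact_emails)
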
Langfuse( *, public_key: Optional[str] = None, secret_key: Optional[str] = None, base_url: Optional[str] = None, host: Optional[str] = None, timeout: Optional[int] = None, httpx_client: Optional[httpx.Client] = None, debug: bool = False, tracing_enabled: Optional[bool] = True, flush_at: Optional[int] = None, flush_interval: Optional[float] = None, environment: Optional[str] = None, release: Optional[str] = None, media_upload_thread_count: Optional[int] = None, sample_rate: Optional[float] = None, mask: Optional[langfuse.types.MaskFunction] = None, blocked_instrumentation_scopes: Optional[List[str]] = None, additional_headers: Optional[Dict[str, str]] = None, tracer_provider: Optional[opentelemetry.sdk.trace.TracerProvider] = None)
213    def __init__(
214        self,
215        *,
216        public_key: Optional[str] = None,
217        secret_key: Optional[str] = None,
218        base_url: Optional[str] = None,
219        host: Optional[str] = None,
220        timeout: Optional[int] = None,
221        httpx_client: Optional[httpx.Client] = None,
222        debug: bool = False,
223        tracing_enabled: Optional[bool] = True,
224        flush_at: Optional[int] = None,
225        flush_interval: Optional[float] = None,
226        environment: Optional[str] = None,
227        release: Optional[str] = None,
228        media_upload_thread_count: Optional[int] = None,
229        sample_rate: Optional[float] = None,
230        mask: Optional[MaskFunction] = None,
231        blocked_instrumentation_scopes: Optional[List[str]] = None,
232        additional_headers: Optional[Dict[str, str]] = None,
233        tracer_provider: Optional[TracerProvider] = None,
234    ):
235        self._base_url = (
236            base_url
237            or os.environ.get(LANGFUSE_BASE_URL)
238            or host
239            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
240        )
241        self._environment = environment or cast(
242            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
243        )
244        self._project_id: Optional[str] = None
245        sample_rate = sample_rate or float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
246        if not 0.0 <= sample_rate <= 1.0:
247            raise ValueError(
248                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
249            )
250
251        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))
252
253        self._tracing_enabled = (
254            tracing_enabled
255            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
256        )
257        if not self._tracing_enabled:
258            langfuse_logger.info(
259                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
260            )
261
262        debug = (
263            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
264        )
265        if debug:
266            logging.basicConfig(
267                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
268            )
269            langfuse_logger.setLevel(logging.DEBUG)
270
271        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
272        if public_key is None:
273            langfuse_logger.warning(
274                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
275                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
276            )
277            self._otel_tracer = otel_trace_api.NoOpTracer()
278            return
279
280        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
281        if secret_key is None:
282            langfuse_logger.warning(
283                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
284                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
285            )
286            self._otel_tracer = otel_trace_api.NoOpTracer()
287            return
288
289        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
290            langfuse_logger.warning(
291                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
292            )
293
294        # Initialize api and tracer if requirements are met
295        self._resources = LangfuseResourceManager(
296            public_key=public_key,
297            secret_key=secret_key,
298            base_url=self._base_url,
299            timeout=timeout,
300            environment=self._environment,
301            release=release,
302            flush_at=flush_at,
303            flush_interval=flush_interval,
304            httpx_client=httpx_client,
305            media_upload_thread_count=media_upload_thread_count,
306            sample_rate=sample_rate,
307            mask=mask,
308            tracing_enabled=self._tracing_enabled,
309            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
310            additional_headers=additional_headers,
311            tracer_provider=tracer_provider,
312        )
313        self._mask = self._resources.mask
314
315        self._otel_tracer = (
316            self._resources.tracer
317            if self._tracing_enabled and self._resources.tracer is not None
318            else otel_trace_api.NoOpTracer()
319        )
320        self.api = self._resources.api
321        self.async_api = self._resources.async_api
api
async_api
def start_span( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseSpan:
323    def start_span(
324        self,
325        *,
326        trace_context: Optional[TraceContext] = None,
327        name: str,
328        input: Optional[Any] = None,
329        output: Optional[Any] = None,
330        metadata: Optional[Any] = None,
331        version: Optional[str] = None,
332        level: Optional[SpanLevel] = None,
333        status_message: Optional[str] = None,
334    ) -> LangfuseSpan:
335        """Create a new span for tracing a unit of work.
336
337        This method creates a new span but does not set it as the current span in the
338        context. To create and use a span within a context, use start_as_current_span().
339
340        The created span will be the child of the current span in the context.
341
342        Args:
343            trace_context: Optional context for connecting to an existing trace
344            name: Name of the span (e.g., function or operation name)
345            input: Input data for the operation (can be any JSON-serializable object)
346            output: Output data from the operation (can be any JSON-serializable object)
347            metadata: Additional metadata to associate with the span
348            version: Version identifier for the code or component
349            level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
350            status_message: Optional status message for the span
351
352        Returns:
353            A LangfuseSpan object that must be ended with .end() when the operation completes
354
355        Example:
356            ```python
357            span = langfuse.start_span(name="process-data")
358            try:
359                # Do work
360                span.update(output="result")
361            finally:
362                span.end()
363            ```
364        """
365        return self.start_observation(
366            trace_context=trace_context,
367            name=name,
368            as_type="span",
369            input=input,
370            output=output,
371            metadata=metadata,
372            version=version,
373            level=level,
374            status_message=status_message,
375        )

Create a new span for tracing a unit of work.

This method creates a new span but does not set it as the current span in the context. To create and use a span within a context, use start_as_current_span().

The created span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
Returns:

A LangfuseSpan object that must be ended with .end() when the operation completes

Example:
span = langfuse.start_span(name="process-data")
try:
    # Do work
    span.update(output="result")
finally:
    span.end()
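
The trace_context argument shown above lets a span join a trace that was started elsewhere (for example, in another service). A minimal sketch, assuming you already hold the trace's hex-encoded identifiers; the concrete id values below are placeholders:

span = langfuse.start_span(
    name="process-data",
    trace_context={
        "trace_id": "0123456789abcdef0123456789abcdef",  # 32 hex chars, placeholder
        "parent_span_id": "0123456789abcdef",            # 16 hex chars, placeholder
    },
)
try:
    span.update(output="result")
finally:
    span.end()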
def start_as_current_span( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, end_on_exit: Optional[bool] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan]:
377    def start_as_current_span(
378        self,
379        *,
380        trace_context: Optional[TraceContext] = None,
381        name: str,
382        input: Optional[Any] = None,
383        output: Optional[Any] = None,
384        metadata: Optional[Any] = None,
385        version: Optional[str] = None,
386        level: Optional[SpanLevel] = None,
387        status_message: Optional[str] = None,
388        end_on_exit: Optional[bool] = None,
389    ) -> _AgnosticContextManager[LangfuseSpan]:
390        """Create a new span and set it as the current span in a context manager.
391
392        This method creates a new span and sets it as the current span within a context
393        manager. Use this method with a 'with' statement to automatically handle span
394        lifecycle within a code block.
395
396        The created span will be the child of the current span in the context.
397
398        Args:
399            trace_context: Optional context for connecting to an existing trace
400            name: Name of the span (e.g., function or operation name)
401            input: Input data for the operation (can be any JSON-serializable object)
402            output: Output data from the operation (can be any JSON-serializable object)
403            metadata: Additional metadata to associate with the span
404            version: Version identifier for the code or component
405            level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
406            status_message: Optional status message for the span
407            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
408
409        Returns:
410            A context manager that yields a LangfuseSpan
411
412        Example:
413            ```python
414            with langfuse.start_as_current_span(name="process-query") as span:
415                # Do work
416                result = process_data()
417                span.update(output=result)
418
419                # Create a child span automatically
420                with span.start_as_current_span(name="sub-operation") as child_span:
421                    # Do sub-operation work
422                    child_span.update(output="sub-result")
423            ```
424        """
425        return self.start_as_current_observation(
426            trace_context=trace_context,
427            name=name,
428            as_type="span",
429            input=input,
430            output=output,
431            metadata=metadata,
432            version=version,
433            level=level,
434            status_message=status_message,
435            end_on_exit=end_on_exit,
436        )

Create a new span and set it as the current span in a context manager.

This method creates a new span and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle span lifecycle within a code block.

The created span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the span
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:

A context manager that yields a LangfuseSpan

Example:
with langfuse.start_as_current_span(name="process-query") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")
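
When end_on_exit=False is passed (see the argument list above), the span outlives the with block and must be ended manually. A minimal sketch; submit_background_job is a placeholder for your own function:

with langfuse.start_as_current_span(
    name="long-running-job",
    end_on_exit=False,
) as span:
    handle = submit_background_job()  # placeholder: kick off work elsewhere

# ... later, once the background work has completed:
span.update(output="job finished")
span.end()  # must be called manually, otherwise the span is never closed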
def start_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> Union[LangfuseSpan, LangfuseGeneration, LangfuseAgent, LangfuseTool, LangfuseChain, LangfuseRetriever, LangfuseEvaluator, LangfuseEmbedding, LangfuseGuardrail]:
585    def start_observation(
586        self,
587        *,
588        trace_context: Optional[TraceContext] = None,
589        name: str,
590        as_type: ObservationTypeLiteralNoEvent = "span",
591        input: Optional[Any] = None,
592        output: Optional[Any] = None,
593        metadata: Optional[Any] = None,
594        version: Optional[str] = None,
595        level: Optional[SpanLevel] = None,
596        status_message: Optional[str] = None,
597        completion_start_time: Optional[datetime] = None,
598        model: Optional[str] = None,
599        model_parameters: Optional[Dict[str, MapValue]] = None,
600        usage_details: Optional[Dict[str, int]] = None,
601        cost_details: Optional[Dict[str, float]] = None,
602        prompt: Optional[PromptClient] = None,
603    ) -> Union[
604        LangfuseSpan,
605        LangfuseGeneration,
606        LangfuseAgent,
607        LangfuseTool,
608        LangfuseChain,
609        LangfuseRetriever,
610        LangfuseEvaluator,
611        LangfuseEmbedding,
612        LangfuseGuardrail,
613    ]:
614        """Create a new observation of the specified type.
615
616        This method creates a new observation but does not set it as the current span in the
617        context. To create and use an observation within a context, use start_as_current_observation().
618
619        Args:
620            trace_context: Optional context for connecting to an existing trace
621            name: Name of the observation
622            as_type: Type of observation to create (defaults to "span")
623            input: Input data for the operation
624            output: Output data from the operation
625            metadata: Additional metadata to associate with the observation
626            version: Version identifier for the code or component
627            level: Importance level of the observation
628            status_message: Optional status message for the observation
629            completion_start_time: When the model started generating (for generation types)
630            model: Name/identifier of the AI model used (for generation types)
631            model_parameters: Parameters used for the model (for generation types)
632            usage_details: Token usage information (for generation types)
633            cost_details: Cost information (for generation types)
634            prompt: Associated prompt template (for generation types)
635
636        Returns:
637            An observation object of the appropriate type that must be ended with .end()
638        """
639        if trace_context:
640            trace_id = trace_context.get("trace_id", None)
641            parent_span_id = trace_context.get("parent_span_id", None)
642
643            if trace_id:
644                remote_parent_span = self._create_remote_parent_span(
645                    trace_id=trace_id, parent_span_id=parent_span_id
646                )
647
648                with otel_trace_api.use_span(
649                    cast(otel_trace_api.Span, remote_parent_span)
650                ):
651                    otel_span = self._otel_tracer.start_span(name=name)
652                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
653
654                    return self._create_observation_from_otel_span(
655                        otel_span=otel_span,
656                        as_type=as_type,
657                        input=input,
658                        output=output,
659                        metadata=metadata,
660                        version=version,
661                        level=level,
662                        status_message=status_message,
663                        completion_start_time=completion_start_time,
664                        model=model,
665                        model_parameters=model_parameters,
666                        usage_details=usage_details,
667                        cost_details=cost_details,
668                        prompt=prompt,
669                    )
670
671        otel_span = self._otel_tracer.start_span(name=name)
672
673        return self._create_observation_from_otel_span(
674            otel_span=otel_span,
675            as_type=as_type,
676            input=input,
677            output=output,
678            metadata=metadata,
679            version=version,
680            level=level,
681            status_message=status_message,
682            completion_start_time=completion_start_time,
683            model=model,
684            model_parameters=model_parameters,
685            usage_details=usage_details,
686            cost_details=cost_details,
687            prompt=prompt,
688        )

Create a new observation of the specified type.

This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation
  • status_message: Optional status message for the observation
  • completion_start_time: When the model started generating (for generation types)
  • model: Name/identifier of the AI model used (for generation types)
  • model_parameters: Parameters used for the model (for generation types)
  • usage_details: Token usage information (for generation types)
  • cost_details: Cost information (for generation types)
  • prompt: Associated prompt template (for generation types)
Returns:

An observation object of the appropriate type that must be ended with .end()
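
A minimal sketch of start_observation, creating a tool observation and a generation observation without making either the current span; the names and payloads are illustrative only:

tool = langfuse.start_observation(
    name="web-search",
    as_type="tool",
    input={"query": "Langfuse"},
)
try:
    tool.update(output={"hits": 3})  # illustrative result
finally:
    tool.end()

generation = langfuse.start_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
)
try:
    generation.update(
        output="Quantum computing is ...",
        usage_details={"prompt_tokens": 10, "completion_tokens": 50},
    )
finally:
    generation.end()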

def start_generation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> LangfuseGeneration:
760    def start_generation(
761        self,
762        *,
763        trace_context: Optional[TraceContext] = None,
764        name: str,
765        input: Optional[Any] = None,
766        output: Optional[Any] = None,
767        metadata: Optional[Any] = None,
768        version: Optional[str] = None,
769        level: Optional[SpanLevel] = None,
770        status_message: Optional[str] = None,
771        completion_start_time: Optional[datetime] = None,
772        model: Optional[str] = None,
773        model_parameters: Optional[Dict[str, MapValue]] = None,
774        usage_details: Optional[Dict[str, int]] = None,
775        cost_details: Optional[Dict[str, float]] = None,
776        prompt: Optional[PromptClient] = None,
777    ) -> LangfuseGeneration:
778        """Create a new generation span for model generations.
779
780        DEPRECATED: This method is deprecated and will be removed in a future version.
781        Use start_observation(as_type='generation') instead.
782
783        This method creates a specialized span for tracking model generations.
784        It includes additional fields specific to model generations such as model name,
785        token usage, and cost details.
786
787        The created generation span will be the child of the current span in the context.
788
789        Args:
790            trace_context: Optional context for connecting to an existing trace
791            name: Name of the generation operation
792            input: Input data for the model (e.g., prompts)
793            output: Output from the model (e.g., completions)
794            metadata: Additional metadata to associate with the generation
795            version: Version identifier for the model or component
796            level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
797            status_message: Optional status message for the generation
798            completion_start_time: When the model started generating the response
799            model: Name/identifier of the AI model used (e.g., "gpt-4")
800            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
801            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
802            cost_details: Cost information for the model call
803            prompt: Associated prompt template from Langfuse prompt management
804
805        Returns:
806            A LangfuseGeneration object that must be ended with .end() when complete
807
808        Example:
809            ```python
810            generation = langfuse.start_generation(
811                name="answer-generation",
812                model="gpt-4",
813                input={"prompt": "Explain quantum computing"},
814                model_parameters={"temperature": 0.7}
815            )
816            try:
817                # Call model API
818                response = llm.generate(...)
819
820                generation.update(
821                    output=response.text,
822                    usage_details={
823                        "prompt_tokens": response.usage.prompt_tokens,
824                        "completion_tokens": response.usage.completion_tokens
825                    }
826                )
827            finally:
828                generation.end()
829            ```
830        """
831        warnings.warn(
832            "start_generation is deprecated and will be removed in a future version. "
833            "Use start_observation(as_type='generation') instead.",
834            DeprecationWarning,
835            stacklevel=2,
836        )
837        return self.start_observation(
838            trace_context=trace_context,
839            name=name,
840            as_type="generation",
841            input=input,
842            output=output,
843            metadata=metadata,
844            version=version,
845            level=level,
846            status_message=status_message,
847            completion_start_time=completion_start_time,
848            model=model,
849            model_parameters=model_parameters,
850            usage_details=usage_details,
851            cost_details=cost_details,
852            prompt=prompt,
853        )

Create a new generation span for model generations.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.

This method creates a specialized span for tracking model generations. It includes additional fields specific to model generations such as model name, token usage, and cost details.

The created generation span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A LangfuseGeneration object that must be ended with .end() when complete

Example:
generation = langfuse.start_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"},
    model_parameters={"temperature": 0.7}
)
try:
    # Call model API
    response = llm.generate(...)

    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
finally:
    generation.end()
def start_as_current_generation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration]:
855    def start_as_current_generation(
856        self,
857        *,
858        trace_context: Optional[TraceContext] = None,
859        name: str,
860        input: Optional[Any] = None,
861        output: Optional[Any] = None,
862        metadata: Optional[Any] = None,
863        version: Optional[str] = None,
864        level: Optional[SpanLevel] = None,
865        status_message: Optional[str] = None,
866        completion_start_time: Optional[datetime] = None,
867        model: Optional[str] = None,
868        model_parameters: Optional[Dict[str, MapValue]] = None,
869        usage_details: Optional[Dict[str, int]] = None,
870        cost_details: Optional[Dict[str, float]] = None,
871        prompt: Optional[PromptClient] = None,
872        end_on_exit: Optional[bool] = None,
873    ) -> _AgnosticContextManager[LangfuseGeneration]:
874        """Create a new generation span and set it as the current span in a context manager.
875
876        DEPRECATED: This method is deprecated and will be removed in a future version.
877        Use start_as_current_observation(as_type='generation') instead.
878
879        This method creates a specialized span for model generations and sets it as the
880        current span within a context manager. Use this method with a 'with' statement to
881        automatically handle the generation span lifecycle within a code block.
882
883        The created generation span will be the child of the current span in the context.
884
885        Args:
886            trace_context: Optional context for connecting to an existing trace
887            name: Name of the generation operation
888            input: Input data for the model (e.g., prompts)
889            output: Output from the model (e.g., completions)
890            metadata: Additional metadata to associate with the generation
891            version: Version identifier for the model or component
892            level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
893            status_message: Optional status message for the generation
894            completion_start_time: When the model started generating the response
895            model: Name/identifier of the AI model used (e.g., "gpt-4")
896            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
897            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
898            cost_details: Cost information for the model call
899            prompt: Associated prompt template from Langfuse prompt management
900            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
901
902        Returns:
903            A context manager that yields a LangfuseGeneration
904
905        Example:
906            ```python
907            with langfuse.start_as_current_generation(
908                name="answer-generation",
909                model="gpt-4",
910                input={"prompt": "Explain quantum computing"}
911            ) as generation:
912                # Call model API
913                response = llm.generate(...)
914
915                # Update with results
916                generation.update(
917                    output=response.text,
918                    usage_details={
919                        "prompt_tokens": response.usage.prompt_tokens,
920                        "completion_tokens": response.usage.completion_tokens
921                    }
922                )
923            ```
924        """
925        warnings.warn(
926            "start_as_current_generation is deprecated and will be removed in a future version. "
927            "Use start_as_current_observation(as_type='generation') instead.",
928            DeprecationWarning,
929            stacklevel=2,
930        )
931        return self.start_as_current_observation(
932            trace_context=trace_context,
933            name=name,
934            as_type="generation",
935            input=input,
936            output=output,
937            metadata=metadata,
938            version=version,
939            level=level,
940            status_message=status_message,
941            completion_start_time=completion_start_time,
942            model=model,
943            model_parameters=model_parameters,
944            usage_details=usage_details,
945            cost_details=cost_details,
946            prompt=prompt,
947            end_on_exit=end_on_exit,
948        )

Create a new generation span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.

This method creates a specialized span for model generations and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the generation span lifecycle within a code block.

The created generation span will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
Returns:

A context manager that yields a LangfuseGeneration

Example:
with langfuse.start_as_current_generation(
    name="answer-generation",
    model="gpt-4",
    input={"prompt": "Explain quantum computing"}
) as generation:
    # Call model API
    response = llm.generate(...)

    # Update with results
    generation.update(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def start_as_current_observation( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail']] = 'span', input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, end_on_exit: Optional[bool] = None) -> Union[opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration], opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan], opentelemetry.util._decorator._AgnosticContextManager[LangfuseAgent], opentelemetry.util._decorator._AgnosticContextManager[LangfuseTool], opentelemetry.util._decorator._AgnosticContextManager[LangfuseChain], opentelemetry.util._decorator._AgnosticContextManager[LangfuseRetriever], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEvaluator], opentelemetry.util._decorator._AgnosticContextManager[LangfuseEmbedding], opentelemetry.util._decorator._AgnosticContextManager[LangfuseGuardrail]]:
1106    def start_as_current_observation(
1107        self,
1108        *,
1109        trace_context: Optional[TraceContext] = None,
1110        name: str,
1111        as_type: ObservationTypeLiteralNoEvent = "span",
1112        input: Optional[Any] = None,
1113        output: Optional[Any] = None,
1114        metadata: Optional[Any] = None,
1115        version: Optional[str] = None,
1116        level: Optional[SpanLevel] = None,
1117        status_message: Optional[str] = None,
1118        completion_start_time: Optional[datetime] = None,
1119        model: Optional[str] = None,
1120        model_parameters: Optional[Dict[str, MapValue]] = None,
1121        usage_details: Optional[Dict[str, int]] = None,
1122        cost_details: Optional[Dict[str, float]] = None,
1123        prompt: Optional[PromptClient] = None,
1124        end_on_exit: Optional[bool] = None,
1125    ) -> Union[
1126        _AgnosticContextManager[LangfuseGeneration],
1127        _AgnosticContextManager[LangfuseSpan],
1128        _AgnosticContextManager[LangfuseAgent],
1129        _AgnosticContextManager[LangfuseTool],
1130        _AgnosticContextManager[LangfuseChain],
1131        _AgnosticContextManager[LangfuseRetriever],
1132        _AgnosticContextManager[LangfuseEvaluator],
1133        _AgnosticContextManager[LangfuseEmbedding],
1134        _AgnosticContextManager[LangfuseGuardrail],
1135    ]:
1136        """Create a new observation and set it as the current span in a context manager.
1137
1138        This method creates a new observation of the specified type and sets it as the
1139        current span within a context manager. Use this method with a 'with' statement to
1140        automatically handle the observation lifecycle within a code block.
1141
1142        The created observation will be the child of the current span in the context.
1143
1144        Args:
1145            trace_context: Optional context for connecting to an existing trace
1146            name: Name of the observation (e.g., function or operation name)
1147            as_type: Type of observation to create (defaults to "span")
1148            input: Input data for the operation (can be any JSON-serializable object)
1149            output: Output data from the operation (can be any JSON-serializable object)
1150            metadata: Additional metadata to associate with the observation
1151            version: Version identifier for the code or component
1152            level: Importance level of the observation (DEBUG, DEFAULT, WARNING, ERROR)
1153            status_message: Optional status message for the observation
1154            end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
1155
1156            The following parameters are available when as_type is "generation" or "embedding":
1157            completion_start_time: When the model started generating the response
1158            model: Name/identifier of the AI model used (e.g., "gpt-4")
1159            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1160            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1161            cost_details: Cost information for the model call
1162            prompt: Associated prompt template from Langfuse prompt management
1163
1164        Returns:
1165            A context manager that yields the appropriate observation type based on as_type
1166
1167        Example:
1168            ```python
1169            # Create a span
1170            with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
1171                # Do work
1172                result = process_data()
1173                span.update(output=result)
1174
1175                # Create a child span automatically
1176                with span.start_as_current_span(name="sub-operation") as child_span:
1177                    # Do sub-operation work
1178                    child_span.update(output="sub-result")
1179
1180            # Create a tool observation
1181            with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
1182                # Do tool work
1183                results = search_web(query)
1184                tool.update(output=results)
1185
1186            # Create a generation observation
1187            with langfuse.start_as_current_observation(
1188                name="answer-generation",
1189                as_type="generation",
1190                model="gpt-4"
1191            ) as generation:
1192                # Generate answer
1193                response = llm.generate(...)
1194                generation.update(output=response)
1195            ```
1196        """
1197        if as_type in get_observation_types_list(ObservationTypeGenerationLike):
1198            if trace_context:
1199                trace_id = trace_context.get("trace_id", None)
1200                parent_span_id = trace_context.get("parent_span_id", None)
1201
1202                if trace_id:
1203                    remote_parent_span = self._create_remote_parent_span(
1204                        trace_id=trace_id, parent_span_id=parent_span_id
1205                    )
1206
1207                    return cast(
1208                        Union[
1209                            _AgnosticContextManager[LangfuseGeneration],
1210                            _AgnosticContextManager[LangfuseEmbedding],
1211                        ],
1212                        self._create_span_with_parent_context(
1213                            as_type=as_type,
1214                            name=name,
1215                            remote_parent_span=remote_parent_span,
1216                            parent=None,
1217                            end_on_exit=end_on_exit,
1218                            input=input,
1219                            output=output,
1220                            metadata=metadata,
1221                            version=version,
1222                            level=level,
1223                            status_message=status_message,
1224                            completion_start_time=completion_start_time,
1225                            model=model,
1226                            model_parameters=model_parameters,
1227                            usage_details=usage_details,
1228                            cost_details=cost_details,
1229                            prompt=prompt,
1230                        ),
1231                    )
1232
1233            return cast(
1234                Union[
1235                    _AgnosticContextManager[LangfuseGeneration],
1236                    _AgnosticContextManager[LangfuseEmbedding],
1237                ],
1238                self._start_as_current_otel_span_with_processed_media(
1239                    as_type=as_type,
1240                    name=name,
1241                    end_on_exit=end_on_exit,
1242                    input=input,
1243                    output=output,
1244                    metadata=metadata,
1245                    version=version,
1246                    level=level,
1247                    status_message=status_message,
1248                    completion_start_time=completion_start_time,
1249                    model=model,
1250                    model_parameters=model_parameters,
1251                    usage_details=usage_details,
1252                    cost_details=cost_details,
1253                    prompt=prompt,
1254                ),
1255            )
1256
1257        if as_type in get_observation_types_list(ObservationTypeSpanLike):
1258            if trace_context:
1259                trace_id = trace_context.get("trace_id", None)
1260                parent_span_id = trace_context.get("parent_span_id", None)
1261
1262                if trace_id:
1263                    remote_parent_span = self._create_remote_parent_span(
1264                        trace_id=trace_id, parent_span_id=parent_span_id
1265                    )
1266
1267                    return cast(
1268                        Union[
1269                            _AgnosticContextManager[LangfuseSpan],
1270                            _AgnosticContextManager[LangfuseAgent],
1271                            _AgnosticContextManager[LangfuseTool],
1272                            _AgnosticContextManager[LangfuseChain],
1273                            _AgnosticContextManager[LangfuseRetriever],
1274                            _AgnosticContextManager[LangfuseEvaluator],
1275                            _AgnosticContextManager[LangfuseGuardrail],
1276                        ],
1277                        self._create_span_with_parent_context(
1278                            as_type=as_type,
1279                            name=name,
1280                            remote_parent_span=remote_parent_span,
1281                            parent=None,
1282                            end_on_exit=end_on_exit,
1283                            input=input,
1284                            output=output,
1285                            metadata=metadata,
1286                            version=version,
1287                            level=level,
1288                            status_message=status_message,
1289                        ),
1290                    )
1291
1292            return cast(
1293                Union[
1294                    _AgnosticContextManager[LangfuseSpan],
1295                    _AgnosticContextManager[LangfuseAgent],
1296                    _AgnosticContextManager[LangfuseTool],
1297                    _AgnosticContextManager[LangfuseChain],
1298                    _AgnosticContextManager[LangfuseRetriever],
1299                    _AgnosticContextManager[LangfuseEvaluator],
1300                    _AgnosticContextManager[LangfuseGuardrail],
1301                ],
1302                self._start_as_current_otel_span_with_processed_media(
1303                    as_type=as_type,
1304                    name=name,
1305                    end_on_exit=end_on_exit,
1306                    input=input,
1307                    output=output,
1308                    metadata=metadata,
1309                    version=version,
1310                    level=level,
1311                    status_message=status_message,
1312                ),
1313            )
1314
1315        # This should never be reached since all valid types are handled above
1316        langfuse_logger.warning(
1317            f"Unknown observation type: {as_type}, falling back to span"
1318        )
1319        return self._start_as_current_otel_span_with_processed_media(
1320            as_type="span",
1321            name=name,
1322            end_on_exit=end_on_exit,
1323            input=input,
1324            output=output,
1325            metadata=metadata,
1326            version=version,
1327            level=level,
1328            status_message=status_message,
1329        )

Create a new observation and set it as the current span in a context manager.

This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.

The created observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the observation (e.g., function or operation name)
  • as_type: Type of observation to create (defaults to "span")
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the observation
  • version: Version identifier for the code or component
  • level: Importance level of the observation (DEBUG, DEFAULT, WARNING, ERROR)
  • status_message: Optional status message for the observation
  • end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
  • The following parameters are available when as_type is "generation" or "embedding":
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields the appropriate observation type based on as_type

Example:
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_span(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
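The model-related parameters above also apply when as_type is "embedding". A minimal sketch, assuming a hypothetical embed() helper and placeholder model name and token counts:

```python
# Sketch: an "embedding" observation; the model name and token counts are
# placeholders for your own embedding call.
query = "What is Langfuse?"

with langfuse.start_as_current_observation(
    name="query-embedding",
    as_type="embedding",
    model="text-embedding-3-small",
    input={"text": query},
) as embedding:
    vector = embed(query)  # hypothetical embedding helper
    embedding.update(
        output={"dimensions": len(vector)},
        usage_details={"input_tokens": 12},
    )
```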
def update_current_generation( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> None:
1490    def update_current_generation(
1491        self,
1492        *,
1493        name: Optional[str] = None,
1494        input: Optional[Any] = None,
1495        output: Optional[Any] = None,
1496        metadata: Optional[Any] = None,
1497        version: Optional[str] = None,
1498        level: Optional[SpanLevel] = None,
1499        status_message: Optional[str] = None,
1500        completion_start_time: Optional[datetime] = None,
1501        model: Optional[str] = None,
1502        model_parameters: Optional[Dict[str, MapValue]] = None,
1503        usage_details: Optional[Dict[str, int]] = None,
1504        cost_details: Optional[Dict[str, float]] = None,
1505        prompt: Optional[PromptClient] = None,
1506    ) -> None:
1507        """Update the current active generation span with new information.
1508
1509        This method updates the current generation span in the active context with
1510        additional information. It's useful for adding output, usage stats, or other
1511        details that become available during or after model generation.
1512
1513        Args:
1514            name: The generation name
1515            input: Updated input data for the model
1516            output: Output from the model (e.g., completions)
1517            metadata: Additional metadata to associate with the generation
1518            version: Version identifier for the model or component
1519            level: Importance level of the generation (info, warning, error)
1520            status_message: Optional status message for the generation
1521            completion_start_time: When the model started generating the response
1522            model: Name/identifier of the AI model used (e.g., "gpt-4")
1523            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1524            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1525            cost_details: Cost information for the model call
1526            prompt: Associated prompt template from Langfuse prompt management
1527
1528        Example:
1529            ```python
1530            with langfuse.start_as_current_generation(name="answer-query") as generation:
1531                # Initial setup and API call
1532                response = llm.generate(...)
1533
1534                # Update with results that weren't available at creation time
1535                langfuse.update_current_generation(
1536                    output=response.text,
1537                    usage_details={
1538                        "prompt_tokens": response.usage.prompt_tokens,
1539                        "completion_tokens": response.usage.completion_tokens
1540                    }
1541                )
1542            ```
1543        """
1544        if not self._tracing_enabled:
1545            langfuse_logger.debug(
1546                "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
1547            )
1548            return
1549
1550        current_otel_span = self._get_current_otel_span()
1551
1552        if current_otel_span is not None:
1553            generation = LangfuseGeneration(
1554                otel_span=current_otel_span, langfuse_client=self
1555            )
1556
1557            if name:
1558                current_otel_span.update_name(name)
1559
1560            generation.update(
1561                input=input,
1562                output=output,
1563                metadata=metadata,
1564                version=version,
1565                level=level,
1566                status_message=status_message,
1567                completion_start_time=completion_start_time,
1568                model=model,
1569                model_parameters=model_parameters,
1570                usage_details=usage_details,
1571                cost_details=cost_details,
1572                prompt=prompt,
1573            )

Update the current active generation span with new information.

This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.

Arguments:
  • name: The generation name
  • input: Updated input data for the model
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
def update_current_span( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> None:
1575    def update_current_span(
1576        self,
1577        *,
1578        name: Optional[str] = None,
1579        input: Optional[Any] = None,
1580        output: Optional[Any] = None,
1581        metadata: Optional[Any] = None,
1582        version: Optional[str] = None,
1583        level: Optional[SpanLevel] = None,
1584        status_message: Optional[str] = None,
1585    ) -> None:
1586        """Update the current active span with new information.
1587
1588        This method updates the current span in the active context with
1589        additional information. It's useful for adding outputs or metadata
1590        that become available during execution.
1591
1592        Args:
1593            name: The span name
1594            input: Updated input data for the operation
1595            output: Output data from the operation
1596            metadata: Additional metadata to associate with the span
1597            version: Version identifier for the code or component
1598            level: Importance level of the span (info, warning, error)
1599            status_message: Optional status message for the span
1600
1601        Example:
1602            ```python
1603            with langfuse.start_as_current_span(name="process-data") as span:
1604                # Initial processing
1605                result = process_first_part()
1606
1607                # Update with intermediate results
1608                langfuse.update_current_span(metadata={"intermediate_result": result})
1609
1610                # Continue processing
1611                final_result = process_second_part(result)
1612
1613                # Final update
1614                langfuse.update_current_span(output=final_result)
1615            ```
1616        """
1617        if not self._tracing_enabled:
1618            langfuse_logger.debug(
1619                "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
1620            )
1621            return
1622
1623        current_otel_span = self._get_current_otel_span()
1624
1625        if current_otel_span is not None:
1626            span = LangfuseSpan(
1627                otel_span=current_otel_span,
1628                langfuse_client=self,
1629                environment=self._environment,
1630            )
1631
1632            if name:
1633                current_otel_span.update_name(name)
1634
1635            span.update(
1636                input=input,
1637                output=output,
1638                metadata=metadata,
1639                version=version,
1640                level=level,
1641                status_message=status_message,
1642            )

Update the current active span with new information.

This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.

Arguments:
  • name: The span name
  • input: Updated input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Example:
with langfuse.start_as_current_span(name="process-data") as span:
    # Initial processing
    result = process_first_part()

    # Update with intermediate results
    langfuse.update_current_span(metadata={"intermediate_result": result})

    # Continue processing
    final_result = process_second_part(result)

    # Final update
    langfuse.update_current_span(output=final_result)
def update_current_trace( self, *, name: Optional[str] = None, user_id: Optional[str] = None, session_id: Optional[str] = None, version: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, tags: Optional[List[str]] = None, public: Optional[bool] = None) -> None:
1644    def update_current_trace(
1645        self,
1646        *,
1647        name: Optional[str] = None,
1648        user_id: Optional[str] = None,
1649        session_id: Optional[str] = None,
1650        version: Optional[str] = None,
1651        input: Optional[Any] = None,
1652        output: Optional[Any] = None,
1653        metadata: Optional[Any] = None,
1654        tags: Optional[List[str]] = None,
1655        public: Optional[bool] = None,
1656    ) -> None:
1657        """Update the current trace with additional information.
1658
1659        Args:
1660            name: Updated name for the Langfuse trace
1661            user_id: ID of the user who initiated the Langfuse trace
1662            session_id: Session identifier for grouping related Langfuse traces
1663            version: Version identifier for the application or service
1664            input: Input data for the overall Langfuse trace
1665            output: Output data from the overall Langfuse trace
1666            metadata: Additional metadata to associate with the Langfuse trace
1667            tags: List of tags to categorize the Langfuse trace
1668            public: Whether the Langfuse trace should be publicly accessible
1669
1670        See Also:
1671            :func:`langfuse.propagate_attributes`: Recommended replacement
1672        """
1673        if not self._tracing_enabled:
1674            langfuse_logger.debug(
1675                "Operation skipped: update_current_trace - Tracing is disabled or client is in no-op mode."
1676            )
1677            return
1678
1679        current_otel_span = self._get_current_otel_span()
1680
1681        if current_otel_span is not None and current_otel_span.is_recording():
1682            existing_observation_type = current_otel_span.attributes.get(  # type: ignore[attr-defined]
1683                LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span"
1684            )
1685            # We need to preserve the class to keep the correct observation type
1686            span_class = self._get_span_class(existing_observation_type)
1687            span = span_class(
1688                otel_span=current_otel_span,
1689                langfuse_client=self,
1690                environment=self._environment,
1691            )
1692
1693            span.update_trace(
1694                name=name,
1695                user_id=user_id,
1696                session_id=session_id,
1697                version=version,
1698                input=input,
1699                output=output,
1700                metadata=metadata,
1701                tags=tags,
1702                public=public,
1703            )

Update the current trace with additional information.

Arguments:
  • name: Updated name for the Langfuse trace
  • user_id: ID of the user who initiated the Langfuse trace
  • session_id: Session identifier for grouping related Langfuse traces
  • version: Version identifier for the application or service
  • input: Input data for the overall Langfuse trace
  • output: Output data from the overall Langfuse trace
  • metadata: Additional metadata to associate with the Langfuse trace
  • tags: List of tags to categorize the Langfuse trace
  • public: Whether the Langfuse trace should be publicly accessible
See Also:

langfuse.propagate_attributes(): Recommended replacement
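A minimal usage sketch, setting trace-level attributes from inside an active span; request and its fields are placeholders for your own context object:

```python
# Sketch: attach user and session context to the current trace from inside a span.
# `request` is a placeholder for your own request/context object.
with langfuse.start_as_current_span(name="handle-request") as span:
    langfuse.update_current_trace(
        user_id=request.user_id,
        session_id=request.session_id,
        tags=["production", "chat"],
        metadata={"client_version": "1.4.2"},
    )
    # ... handle the request ...
```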

def create_event( self, *, trace_context: Optional[langfuse.types.TraceContext] = None, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1705    def create_event(
1706        self,
1707        *,
1708        trace_context: Optional[TraceContext] = None,
1709        name: str,
1710        input: Optional[Any] = None,
1711        output: Optional[Any] = None,
1712        metadata: Optional[Any] = None,
1713        version: Optional[str] = None,
1714        level: Optional[SpanLevel] = None,
1715        status_message: Optional[str] = None,
1716    ) -> LangfuseEvent:
1717        """Create a new Langfuse observation of type 'EVENT'.
1718
1719        The created Langfuse Event observation will be the child of the current span in the context.
1720
1721        Args:
1722            trace_context: Optional context for connecting to an existing trace
1723            name: Name of the span (e.g., function or operation name)
1724            input: Input data for the operation (can be any JSON-serializable object)
1725            output: Output data from the operation (can be any JSON-serializable object)
1726            metadata: Additional metadata to associate with the span
1727            version: Version identifier for the code or component
1728            level: Importance level of the span (info, warning, error)
1729            status_message: Optional status message for the span
1730
1731        Returns:
1732            The Langfuse Event object
1733
1734        Example:
1735            ```python
1736            event = langfuse.create_event(name="process-event")
1737            ```
1738        """
1739        timestamp = time_ns()
1740
1741        if trace_context:
1742            trace_id = trace_context.get("trace_id", None)
1743            parent_span_id = trace_context.get("parent_span_id", None)
1744
1745            if trace_id:
1746                remote_parent_span = self._create_remote_parent_span(
1747                    trace_id=trace_id, parent_span_id=parent_span_id
1748                )
1749
1750                with otel_trace_api.use_span(
1751                    cast(otel_trace_api.Span, remote_parent_span)
1752                ):
1753                    otel_span = self._otel_tracer.start_span(
1754                        name=name, start_time=timestamp
1755                    )
1756                    otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)
1757
1758                    return cast(
1759                        LangfuseEvent,
1760                        LangfuseEvent(
1761                            otel_span=otel_span,
1762                            langfuse_client=self,
1763                            environment=self._environment,
1764                            input=input,
1765                            output=output,
1766                            metadata=metadata,
1767                            version=version,
1768                            level=level,
1769                            status_message=status_message,
1770                        ).end(end_time=timestamp),
1771                    )
1772
1773        otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp)
1774
1775        return cast(
1776            LangfuseEvent,
1777            LangfuseEvent(
1778                otel_span=otel_span,
1779                langfuse_client=self,
1780                environment=self._environment,
1781                input=input,
1782                output=output,
1783                metadata=metadata,
1784                version=version,
1785                level=level,
1786                status_message=status_message,
1787            ).end(end_time=timestamp),
1788        )

Create a new Langfuse observation of type 'EVENT'.

The created Langfuse Event observation will be the child of the current span in the context.

Arguments:
  • trace_context: Optional context for connecting to an existing trace
  • name: Name of the event (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the event
  • version: Version identifier for the code or component
  • level: Importance level of the event (info, warning, error)
  • status_message: Optional status message for the event
Returns:

The Langfuse Event object

Example:
event = langfuse.create_event(name="process-event")
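Events also accept the payload and level parameters listed above; a short sketch with placeholder values:

```python
# Sketch: an event with a payload and an elevated level (values are placeholders).
langfuse.create_event(
    name="cache-miss",
    input={"key": "user:42:profile"},
    level="WARNING",
    status_message="Falling back to database lookup",
)
```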
@staticmethod
def create_trace_id(*, seed: Optional[str] = None) -> str:
1877    @staticmethod
1878    def create_trace_id(*, seed: Optional[str] = None) -> str:
1879        """Create a unique trace ID for use with Langfuse.
1880
1881        This method generates a unique trace ID for use with various Langfuse APIs.
1882        It can either generate a random ID or create a deterministic ID based on
1883        a seed string.
1884
1885        Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes.
1886        This method ensures the generated ID meets this requirement. If you need to
1887        correlate an external ID with a Langfuse trace ID, use the external ID as the
1888        seed to get a valid, deterministic Langfuse trace ID.
1889
1890        Args:
1891            seed: Optional string to use as a seed for deterministic ID generation.
1892                 If provided, the same seed will always produce the same ID.
1893                 If not provided, a random ID will be generated.
1894
1895        Returns:
1896            A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
1897
1898        Example:
1899            ```python
1900            # Generate a random trace ID
1901            trace_id = langfuse.create_trace_id()
1902
1903            # Generate a deterministic ID based on a seed
1904            session_trace_id = langfuse.create_trace_id(seed="session-456")
1905
1906            # Correlate an external ID with a Langfuse trace ID
1907            external_id = "external-system-123456"
1908            correlated_trace_id = langfuse.create_trace_id(seed=external_id)
1909
1910            # Use the ID with trace context
1911            with langfuse.start_as_current_span(
1912                name="process-request",
1913                trace_context={"trace_id": trace_id}
1914            ) as span:
1915                # Operation will be part of the specific trace
1916                pass
1917            ```
1918        """
1919        if not seed:
1920            trace_id_int = RandomIdGenerator().generate_trace_id()
1921
1922            return Langfuse._format_otel_trace_id(trace_id_int)
1923
1924        return sha256(seed.encode("utf-8")).digest()[:16].hex()

Create a unique trace ID for use with Langfuse.

This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.

Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.

Arguments:
  • seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:

A 32-character lowercase hexadecimal string representing the Langfuse trace ID.

Example:
# Generate a random trace ID
trace_id = langfuse.create_trace_id()

# Generate a deterministic ID based on a seed
session_trace_id = langfuse.create_trace_id(seed="session-456")

# Correlate an external ID with a Langfuse trace ID
external_id = "external-system-123456"
correlated_trace_id = langfuse.create_trace_id(seed=external_id)

# Use the ID with trace context
with langfuse.start_as_current_span(
    name="process-request",
    trace_context={"trace_id": trace_id}
) as span:
    # Operation will be part of the specific trace
    pass
def create_score( self, *, name: str, value: Union[float, str], session_id: Optional[str] = None, dataset_run_id: Optional[str] = None, trace_id: Optional[str] = None, observation_id: Optional[str] = None, score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None, timestamp: Optional[datetime.datetime] = None) -> None:
2002    def create_score(
2003        self,
2004        *,
2005        name: str,
2006        value: Union[float, str],
2007        session_id: Optional[str] = None,
2008        dataset_run_id: Optional[str] = None,
2009        trace_id: Optional[str] = None,
2010        observation_id: Optional[str] = None,
2011        score_id: Optional[str] = None,
2012        data_type: Optional[ScoreDataType] = None,
2013        comment: Optional[str] = None,
2014        config_id: Optional[str] = None,
2015        metadata: Optional[Any] = None,
2016        timestamp: Optional[datetime] = None,
2017    ) -> None:
2018        """Create a score for a specific trace or observation.
2019
2020        This method creates a score for evaluating a Langfuse trace or observation. Scores can be
2021        used to track quality metrics, user feedback, or automated evaluations.
2022
2023        Args:
2024            name: Name of the score (e.g., "relevance", "accuracy")
2025            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2026            session_id: ID of the Langfuse session to associate the score with
2027            dataset_run_id: ID of the Langfuse dataset run to associate the score with
2028            trace_id: ID of the Langfuse trace to associate the score with
2029            observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
2030            score_id: Optional custom ID for the score (auto-generated if not provided)
2031            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2032            comment: Optional comment or explanation for the score
2033            config_id: Optional ID of a score config defined in Langfuse
2034            metadata: Optional metadata to be attached to the score
2035            timestamp: Optional timestamp for the score (defaults to current UTC time)
2036
2037        Example:
2038            ```python
2039            # Create a numeric score for accuracy
2040            langfuse.create_score(
2041                name="accuracy",
2042                value=0.92,
2043                trace_id="abcdef1234567890abcdef1234567890",
2044                data_type="NUMERIC",
2045                comment="High accuracy with minor irrelevant details"
2046            )
2047
2048            # Create a categorical score for sentiment
2049            langfuse.create_score(
2050                name="sentiment",
2051                value="positive",
2052                trace_id="abcdef1234567890abcdef1234567890",
2053                observation_id="abcdef1234567890",
2054                data_type="CATEGORICAL"
2055            )
2056            ```
2057        """
2058        if not self._tracing_enabled:
2059            return
2060
2061        score_id = score_id or self._create_observation_id()
2062
2063        try:
2064            new_body = ScoreBody(
2065                id=score_id,
2066                sessionId=session_id,
2067                datasetRunId=dataset_run_id,
2068                traceId=trace_id,
2069                observationId=observation_id,
2070                name=name,
2071                value=value,
2072                dataType=data_type,  # type: ignore
2073                comment=comment,
2074                configId=config_id,
2075                environment=self._environment,
2076                metadata=metadata,
2077            )
2078
2079            event = {
2080                "id": self.create_trace_id(),
2081                "type": "score-create",
2082                "timestamp": timestamp or _get_timestamp(),
2083                "body": new_body,
2084            }
2085
2086            if self._resources is not None:
2087                # Force the score to be in sample if it was for a legacy trace ID, i.e. non-32 hexchar
2088                force_sample = (
2089                    not self._is_valid_trace_id(trace_id) if trace_id else True
2090                )
2091
2092                self._resources.add_score_task(
2093                    event,
2094                    force_sample=force_sample,
2095                )
2096
2097        except Exception as e:
2098            langfuse_logger.exception(
2099                f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
2100            )

Create a score for a specific trace or observation.

This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • session_id: ID of the Langfuse session to associate the score with
  • dataset_run_id: ID of the Langfuse dataset run to associate the score with
  • trace_id: ID of the Langfuse trace to associate the score with
  • observation_id: Optional ID of the specific observation to score; the trace_id must also be provided.
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
  • timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy
langfuse.create_score(
    name="accuracy",
    value=0.92,
    trace_id="abcdef1234567890abcdef1234567890",
    data_type="NUMERIC",
    comment="High accuracy with minor irrelevant details"
)

# Create a categorical score for sentiment
langfuse.create_score(
    name="sentiment",
    value="positive",
    trace_id="abcdef1234567890abcdef1234567890",
    observation_id="abcdef1234567890",
    data_type="CATEGORICAL"
)
def score_current_span( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2128    def score_current_span(
2129        self,
2130        *,
2131        name: str,
2132        value: Union[float, str],
2133        score_id: Optional[str] = None,
2134        data_type: Optional[ScoreDataType] = None,
2135        comment: Optional[str] = None,
2136        config_id: Optional[str] = None,
2137        metadata: Optional[Any] = None,
2138    ) -> None:
2139        """Create a score for the current active span.
2140
2141        This method scores the currently active span in the context. It's a convenient
2142        way to score the current operation without needing to know its trace and span IDs.
2143
2144        Args:
2145            name: Name of the score (e.g., "relevance", "accuracy")
2146            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2147            score_id: Optional custom ID for the score (auto-generated if not provided)
2148            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2149            comment: Optional comment or explanation for the score
2150            config_id: Optional ID of a score config defined in Langfuse
2151            metadata: Optional metadata to be attached to the score
2152
2153        Example:
2154            ```python
2155            with langfuse.start_as_current_generation(name="answer-query") as generation:
2156                # Generate answer
2157                response = generate_answer(...)
2158                generation.update(output=response)
2159
2160                # Score the generation
2161                langfuse.score_current_span(
2162                    name="relevance",
2163                    value=0.85,
2164                    data_type="NUMERIC",
2165                    comment="Mostly relevant but contains some tangential information",
2166                    metadata={"model": "gpt-4", "prompt_version": "v2"}
2167                )
2168            ```
2169        """
2170        current_span = self._get_current_otel_span()
2171
2172        if current_span is not None:
2173            trace_id = self._get_otel_trace_id(current_span)
2174            observation_id = self._get_otel_span_id(current_span)
2175
2176            langfuse_logger.info(
2177                f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
2178            )
2179
2180            self.create_score(
2181                trace_id=trace_id,
2182                observation_id=observation_id,
2183                name=name,
2184                value=cast(str, value),
2185                score_id=score_id,
2186                data_type=cast(Literal["CATEGORICAL"], data_type),
2187                comment=comment,
2188                config_id=config_id,
2189                metadata=metadata,
2190            )

Create a score for the current active span.

This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.

Arguments:
  • name: Name of the score (e.g., "relevance", "accuracy")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Generate answer
    response = generate_answer(...)
    generation.update(output=response)

    # Score the generation
    langfuse.score_current_span(
        name="relevance",
        value=0.85,
        data_type="NUMERIC",
        comment="Mostly relevant but contains some tangential information",
        metadata={"model": "gpt-4", "prompt_version": "v2"}
    )
def score_current_trace( self, *, name: str, value: Union[float, str], score_id: Optional[str] = None, data_type: Optional[Literal['NUMERIC', 'CATEGORICAL', 'BOOLEAN']] = None, comment: Optional[str] = None, config_id: Optional[str] = None, metadata: Optional[Any] = None) -> None:
2218    def score_current_trace(
2219        self,
2220        *,
2221        name: str,
2222        value: Union[float, str],
2223        score_id: Optional[str] = None,
2224        data_type: Optional[ScoreDataType] = None,
2225        comment: Optional[str] = None,
2226        config_id: Optional[str] = None,
2227        metadata: Optional[Any] = None,
2228    ) -> None:
2229        """Create a score for the current trace.
2230
2231        This method scores the trace of the currently active span. Unlike score_current_span,
2232        this method associates the score with the entire trace rather than a specific span.
2233        It's useful for scoring overall performance or quality of the entire operation.
2234
2235        Args:
2236            name: Name of the score (e.g., "user_satisfaction", "overall_quality")
2237            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
2238            score_id: Optional custom ID for the score (auto-generated if not provided)
2239            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
2240            comment: Optional comment or explanation for the score
2241            config_id: Optional ID of a score config defined in Langfuse
2242            metadata: Optional metadata to be attached to the score
2243
2244        Example:
2245            ```python
2246            with langfuse.start_as_current_span(name="process-user-request") as span:
2247                # Process request
2248                result = process_complete_request()
2249                span.update(output=result)
2250
2251                # Score the overall trace
2252                langfuse.score_current_trace(
2253                    name="overall_quality",
2254                    value=0.95,
2255                    data_type="NUMERIC",
2256                    comment="High quality end-to-end response",
2257                    metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
2258                )
2259            ```
2260        """
2261        current_span = self._get_current_otel_span()
2262
2263        if current_span is not None:
2264            trace_id = self._get_otel_trace_id(current_span)
2265
2266            langfuse_logger.info(
2267                f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
2268            )
2269
2270            self.create_score(
2271                trace_id=trace_id,
2272                name=name,
2273                value=cast(str, value),
2274                score_id=score_id,
2275                data_type=cast(Literal["CATEGORICAL"], data_type),
2276                comment=comment,
2277                config_id=config_id,
2278                metadata=metadata,
2279            )

Create a score for the current trace.

This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.

Arguments:
  • name: Name of the score (e.g., "user_satisfaction", "overall_quality")
  • value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
  • score_id: Optional custom ID for the score (auto-generated if not provided)
  • data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
  • comment: Optional comment or explanation for the score
  • config_id: Optional ID of a score config defined in Langfuse
  • metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_span(name="process-user-request") as span:
    # Process request
    result = process_complete_request()
    span.update(output=result)

    # Score the overall trace
    langfuse.score_current_trace(
        name="overall_quality",
        value=0.95,
        data_type="NUMERIC",
        comment="High quality end-to-end response",
        metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
    )
def flush(self) -> None:
2281    def flush(self) -> None:
2282        """Force flush all pending spans and events to the Langfuse API.
2283
2284        This method manually flushes any pending spans, scores, and other events to the
2285        Langfuse API. It's useful in scenarios where you want to ensure all data is sent
2286        before proceeding, without waiting for the automatic flush interval.
2287
2288        Example:
2289            ```python
2290            # Record some spans and scores
2291            with langfuse.start_as_current_span(name="operation") as span:
2292                # Do work...
2293                pass
2294
2295            # Ensure all data is sent to Langfuse before proceeding
2296            langfuse.flush()
2297
2298            # Continue with other work
2299            ```
2300        """
2301        if self._resources is not None:
2302            self._resources.flush()

Force flush all pending spans and events to the Langfuse API.

This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.

Example:
# Record some spans and scores
with langfuse.start_as_current_span(name="operation") as span:
    # Do work...
    pass

# Ensure all data is sent to Langfuse before proceeding
langfuse.flush()

# Continue with other work
def shutdown(self) -> None:
2304    def shutdown(self) -> None:
2305        """Shut down the Langfuse client and flush all pending data.
2306
2307        This method cleanly shuts down the Langfuse client, ensuring all pending data
2308        is flushed to the API and all background threads are properly terminated.
2309
2310        It's important to call this method when your application is shutting down to
2311        prevent data loss and resource leaks. For most applications, using the client
2312        as a context manager or relying on the automatic shutdown via atexit is sufficient.
2313
2314        Example:
2315            ```python
2316            # Initialize Langfuse
2317            langfuse = Langfuse(public_key="...", secret_key="...")
2318
2319            # Use Langfuse throughout your application
2320            # ...
2321
2322            # When application is shutting down
2323            langfuse.shutdown()
2324            ```
2325        """
2326        if self._resources is not None:
2327            self._resources.shutdown()

Shut down the Langfuse client and flush all pending data.

This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.

It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.

Example:
# Initialize Langfuse
langfuse = Langfuse(public_key="...", secret_key="...")

# Use Langfuse throughout your application
# ...

# When application is shutting down
langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
2329    def get_current_trace_id(self) -> Optional[str]:
2330        """Get the trace ID of the current active span.
2331
2332        This method retrieves the trace ID from the currently active span in the context.
2333        It can be used to get the trace ID for referencing in logs, external systems,
2334        or for creating related operations.
2335
2336        Returns:
2337            The current trace ID as a 32-character lowercase hexadecimal string,
2338            or None if there is no active span.
2339
2340        Example:
2341            ```python
2342            with langfuse.start_as_current_span(name="process-request") as span:
2343                # Get the current trace ID for reference
2344                trace_id = langfuse.get_current_trace_id()
2345
2346                # Use it for external correlation
2347                log.info(f"Processing request with trace_id: {trace_id}")
2348
2349                # Or pass to another system
2350                external_system.process(data, trace_id=trace_id)
2351            ```
2352        """
2353        if not self._tracing_enabled:
2354            langfuse_logger.debug(
2355                "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
2356            )
2357            return None
2358
2359        current_otel_span = self._get_current_otel_span()
2360
2361        return self._get_otel_trace_id(current_otel_span) if current_otel_span else None

Get the trace ID of the current active span.

This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.

Returns:

The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_span(name="process-request") as span:
    # Get the current trace ID for reference
    trace_id = langfuse.get_current_trace_id()

    # Use it for external correlation
    log.info(f"Processing request with trace_id: {trace_id}")

    # Or pass to another system
    external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
2363    def get_current_observation_id(self) -> Optional[str]:
2364        """Get the observation ID (span ID) of the current active span.
2365
2366        This method retrieves the observation ID from the currently active span in the context.
2367        It can be used to get the observation ID for referencing in logs, external systems,
2368        or for creating scores or other related operations.
2369
2370        Returns:
2371            The current observation ID as a 16-character lowercase hexadecimal string,
2372            or None if there is no active span.
2373
2374        Example:
2375            ```python
2376            with langfuse.start_as_current_span(name="process-user-query") as span:
2377                # Get the current observation ID
2378                observation_id = langfuse.get_current_observation_id()
2379
2380                # Store it for later reference
2381                cache.set(f"query_{query_id}_observation", observation_id)
2382
2383                # Process the query...
2384            ```
2385        """
2386        if not self._tracing_enabled:
2387            langfuse_logger.debug(
2388                "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
2389            )
2390            return None
2391
2392        current_otel_span = self._get_current_otel_span()
2393
2394        return self._get_otel_span_id(current_otel_span) if current_otel_span else None

Get the observation ID (span ID) of the current active span.

This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.

Returns:

The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.

Example:
with langfuse.start_as_current_span(name="process-user-query") as span:
    # Get the current observation ID
    observation_id = langfuse.get_current_observation_id()

    # Store it for later reference
    cache.set(f"query_{query_id}_observation", observation_id)

    # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2407    def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
2408        """Get the URL to view a trace in the Langfuse UI.
2409
2410        This method generates a URL that links directly to a trace in the Langfuse UI.
2411        It's useful for providing links in logs, notifications, or debugging tools.
2412
2413        Args:
2414            trace_id: Optional trace ID to generate a URL for. If not provided,
2415                     the trace ID of the current active span will be used.
2416
2417        Returns:
2418            A URL string pointing to the trace in the Langfuse UI,
2419            or None if the project ID couldn't be retrieved or no trace ID is available.
2420
2421        Example:
2422            ```python
2423            # Get URL for the current trace
2424            with langfuse.start_as_current_span(name="process-request") as span:
2425                trace_url = langfuse.get_trace_url()
2426                log.info(f"Processing trace: {trace_url}")
2427
2428            # Get URL for a specific trace
2429            specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
2430            send_notification(f"Review needed for trace: {specific_trace_url}")
2431            ```
2432        """
2433        project_id = self._get_project_id()
2434        final_trace_id = trace_id or self.get_current_trace_id()
2435
2436        return (
2437            f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
2438            if project_id and final_trace_id
2439            else None
2440        )

Get the URL to view a trace in the Langfuse UI.

This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.

Arguments:
  • trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:

A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.

Example:
# Get URL for the current trace
with langfuse.start_as_current_span(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
def get_dataset( self, name: str, *, fetch_items_page_size: Optional[int] = 50) -> langfuse._client.datasets.DatasetClient:
2442    def get_dataset(
2443        self, name: str, *, fetch_items_page_size: Optional[int] = 50
2444    ) -> "DatasetClient":
2445        """Fetch a dataset by its name.
2446
2447        Args:
2448            name (str): The name of the dataset to fetch.
2449            fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
2450
2451        Returns:
2452            DatasetClient: The dataset with the given name.
2453        """
2454        try:
2455            langfuse_logger.debug(f"Getting datasets {name}")
2456            dataset = self.api.datasets.get(dataset_name=self._url_encode(name))
2457
2458            dataset_items = []
2459            page = 1
2460
2461            while True:
2462                new_items = self.api.dataset_items.list(
2463                    dataset_name=self._url_encode(name, is_url_param=True),
2464                    page=page,
2465                    limit=fetch_items_page_size,
2466                )
2467                dataset_items.extend(new_items.data)
2468
2469                if new_items.meta.total_pages <= page:
2470                    break
2471
2472                page += 1
2473
2474            items = [DatasetItemClient(i, langfuse=self) for i in dataset_items]
2475
2476            return DatasetClient(dataset, items=items)
2477
2478        except Error as e:
2479            handle_fern_exception(e)
2480            raise e

Fetch a dataset by its name.

Arguments:
  • name (str): The name of the dataset to fetch.
  • fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
Returns:

DatasetClient: The dataset with the given name.
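A short usage sketch, assuming a dataset named "qa-eval-set" exists and that its items expose input and expected_output:

```python
# Sketch: fetch a dataset and iterate its items ("qa-eval-set" is a placeholder).
dataset = langfuse.get_dataset("qa-eval-set")

for item in dataset.items:
    print(item.input, item.expected_output)
```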

def get_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DatasetRunWithItems:
2482    def get_dataset_run(
2483        self, *, dataset_name: str, run_name: str
2484    ) -> DatasetRunWithItems:
2485        """Fetch a dataset run by dataset name and run name.
2486
2487        Args:
2488            dataset_name (str): The name of the dataset.
2489            run_name (str): The name of the run.
2490
2491        Returns:
2492            DatasetRunWithItems: The dataset run with its items.
2493        """
2494        try:
2495            return self.api.datasets.get_run(
2496                dataset_name=self._url_encode(dataset_name),
2497                run_name=self._url_encode(run_name),
2498                request_options=None,
2499            )
2500        except Error as e:
2501            handle_fern_exception(e)
2502            raise e

Fetch a dataset run by dataset name and run name.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DatasetRunWithItems: The dataset run with its items.
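A short usage sketch with placeholder dataset and run names, assuming the returned run exposes name and description:

```python
# Sketch: fetch a specific run (names are placeholders).
run = langfuse.get_dataset_run(
    dataset_name="qa-eval-set",
    run_name="baseline-2024-06-01",
)
print(run.name, run.description)
```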

def get_dataset_runs( self, *, dataset_name: str, page: Optional[int] = None, limit: Optional[int] = None) -> langfuse.api.PaginatedDatasetRuns:
2504    def get_dataset_runs(
2505        self,
2506        *,
2507        dataset_name: str,
2508        page: Optional[int] = None,
2509        limit: Optional[int] = None,
2510    ) -> PaginatedDatasetRuns:
2511        """Fetch all runs for a dataset.
2512
2513        Args:
2514            dataset_name (str): The name of the dataset.
2515            page (Optional[int]): Page number, starts at 1.
2516            limit (Optional[int]): Limit of items per page.
2517
2518        Returns:
2519            PaginatedDatasetRuns: Paginated list of dataset runs.
2520        """
2521        try:
2522            return self.api.datasets.get_runs(
2523                dataset_name=self._url_encode(dataset_name),
2524                page=page,
2525                limit=limit,
2526                request_options=None,
2527            )
2528        except Error as e:
2529            handle_fern_exception(e)
2530            raise e

Fetch all runs for a dataset.

Arguments:
  • dataset_name (str): The name of the dataset.
  • page (Optional[int]): Page number, starts at 1.
  • limit (Optional[int]): Limit of items per page.
Returns:

PaginatedDatasetRuns: Paginated list of dataset runs.
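A short usage sketch with a placeholder dataset name, assuming runs expose name and created_at:

```python
# Sketch: page through the runs of a dataset, 20 per page ("qa-eval-set" is a placeholder).
runs = langfuse.get_dataset_runs(dataset_name="qa-eval-set", page=1, limit=20)

for run in runs.data:
    print(run.name, run.created_at)
```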

def delete_dataset_run( self, *, dataset_name: str, run_name: str) -> langfuse.api.DeleteDatasetRunResponse:
2532    def delete_dataset_run(
2533        self, *, dataset_name: str, run_name: str
2534    ) -> DeleteDatasetRunResponse:
2535        """Delete a dataset run and all its run items. This action is irreversible.
2536
2537        Args:
2538            dataset_name (str): The name of the dataset.
2539            run_name (str): The name of the run.
2540
2541        Returns:
2542            DeleteDatasetRunResponse: Confirmation of deletion.
2543        """
2544        try:
2545            return self.api.datasets.delete_run(
2546                dataset_name=self._url_encode(dataset_name),
2547                run_name=self._url_encode(run_name),
2548                request_options=None,
2549            )
2550        except Error as e:
2551            handle_fern_exception(e)
2552            raise e

Delete a dataset run and all its run items. This action is irreversible.

Arguments:
  • dataset_name (str): The name of the dataset.
  • run_name (str): The name of the run.
Returns:

DeleteDatasetRunResponse: Confirmation of deletion.
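A short usage sketch with placeholder names; note that deletion is irreversible:

```python
# Sketch: delete a run and all its items (irreversible; names are placeholders).
response = langfuse.delete_dataset_run(
    dataset_name="qa-eval-set",
    run_name="baseline-2024-06-01",
)
print(response)
```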

def run_experiment( self, *, name: str, run_name: Optional[str] = None, description: Optional[str] = None, data: Union[List[langfuse.experiment.LocalExperimentItem], List[langfuse._client.datasets.DatasetItemClient]], task: langfuse.experiment.TaskFunction, evaluators: List[langfuse.experiment.EvaluatorFunction] = [], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, run_evaluators: List[langfuse.experiment.RunEvaluatorFunction] = [], max_concurrency: int = 50, metadata: Optional[Dict[str, str]] = None) -> langfuse.experiment.ExperimentResult:
2554    def run_experiment(
2555        self,
2556        *,
2557        name: str,
2558        run_name: Optional[str] = None,
2559        description: Optional[str] = None,
2560        data: ExperimentData,
2561        task: TaskFunction,
2562        evaluators: List[EvaluatorFunction] = [],
2563        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
2564        run_evaluators: List[RunEvaluatorFunction] = [],
2565        max_concurrency: int = 50,
2566        metadata: Optional[Dict[str, str]] = None,
2567    ) -> ExperimentResult:
2568        """Run an experiment on a dataset with automatic tracing and evaluation.
2569
2570        This method executes a task function on each item in the provided dataset,
2571        automatically traces all executions with Langfuse for observability, runs
2572        item-level and run-level evaluators on the outputs, and returns comprehensive
2573        results with evaluation metrics.
2574
2575        The experiment system provides:
2576        - Automatic tracing of all task executions
2577        - Concurrent processing with configurable limits
2578        - Comprehensive error handling that isolates failures
2579        - Integration with Langfuse datasets for experiment tracking
2580        - Flexible evaluation framework supporting both sync and async evaluators
2581
2582        Args:
2583            name: Human-readable name for the experiment. Used for identification
2584                in the Langfuse UI.
2585            run_name: Optional exact name for the experiment run. If provided, this will be
2586                used as the exact dataset run name if the `data` contains Langfuse dataset items.
2587                If not provided, this will default to the experiment name appended with an ISO timestamp.
2588            description: Optional description explaining the experiment's purpose,
2589                methodology, or expected outcomes.
2590            data: Array of data items to process. Can be either:
2591                - List of dict-like items with 'input', 'expected_output', 'metadata' keys
2592                - List of Langfuse DatasetItem objects from dataset.items
2593            task: Function that processes each data item and returns output.
2594                Must accept 'item' as keyword argument and can return sync or async results.
2595                The task function signature should be: task(*, item, **kwargs) -> Any
2596            evaluators: List of functions to evaluate each item's output individually.
2597                Each evaluator receives input, output, expected_output, and metadata.
2598                Can return single Evaluation dict or list of Evaluation dicts.
2599            composite_evaluator: Optional function that creates composite scores from item-level evaluations.
2600                Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
2601                plus the list of evaluations from item-level evaluators. Useful for weighted averages,
2602                pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
2603            run_evaluators: List of functions to evaluate the entire experiment run.
2604                Each run evaluator receives all item_results and can compute aggregate metrics.
2605                Useful for calculating averages, distributions, or cross-item comparisons.
2606            max_concurrency: Maximum number of concurrent task executions (default: 50).
2607                Controls the number of items processed simultaneously. Adjust based on
2608                API rate limits and system resources.
2609            metadata: Optional metadata dictionary to attach to all experiment traces.
2610                This metadata will be included in every trace created during the experiment.
2611                If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
2612
2613        Returns:
2614            ExperimentResult containing:
2615            - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
2616            - item_results: List of results for each processed item with outputs and evaluations
2617            - run_evaluations: List of aggregate evaluation results for the entire run
2618            - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
2619            - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
2620
2621        Raises:
2622            ValueError: If required parameters are missing or invalid
2623            Exception: If experiment setup fails (individual item failures are handled gracefully)
2624
2625        Examples:
2626            Basic experiment with local data:
2627            ```python
2628            def summarize_text(*, item, **kwargs):
2629                return f"Summary: {item['input'][:50]}..."
2630
2631            def length_evaluator(*, input, output, expected_output=None, **kwargs):
2632                return {
2633                    "name": "output_length",
2634                    "value": len(output),
2635                    "comment": f"Output contains {len(output)} characters"
2636                }
2637
2638            result = langfuse.run_experiment(
2639                name="Text Summarization Test",
2640                description="Evaluate summarization quality and length",
2641                data=[
2642                    {"input": "Long article text...", "expected_output": "Expected summary"},
2643                    {"input": "Another article...", "expected_output": "Another summary"}
2644                ],
2645                task=summarize_text,
2646                evaluators=[length_evaluator]
2647            )
2648
2649            print(f"Processed {len(result.item_results)} items")
2650            for item_result in result.item_results:
2651                print(f"Input: {item_result.item['input']}")
2652                print(f"Output: {item_result.output}")
2653                print(f"Evaluations: {item_result.evaluations}")
2654            ```
2655
2656            Advanced experiment with async task and multiple evaluators:
2657            ```python
2658            async def llm_task(*, item, **kwargs):
2659                # Simulate async LLM call
2660                response = await openai_client.chat.completions.create(
2661                    model="gpt-4",
2662                    messages=[{"role": "user", "content": item["input"]}]
2663                )
2664                return response.choices[0].message.content
2665
2666            def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
2667                if expected_output and expected_output.lower() in output.lower():
2668                    return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
2669                return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}
2670
2671            def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
2672                # Simulate toxicity check
2673                toxicity_score = check_toxicity(output)  # Your toxicity checker
2674                return {
2675                    "name": "toxicity",
2676                    "value": toxicity_score,
2677                    "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
2678                }
2679
2680            def average_accuracy(*, item_results, **kwargs):
2681                accuracies = [
2682                    eval.value for result in item_results
2683                    for eval in result.evaluations
2684                    if eval.name == "accuracy"
2685                ]
2686                return {
2687                    "name": "average_accuracy",
2688                    "value": sum(accuracies) / len(accuracies) if accuracies else 0,
2689                    "comment": f"Average accuracy across {len(accuracies)} items"
2690                }
2691
2692            result = langfuse.run_experiment(
2693                name="LLM Safety and Accuracy Test",
2694                description="Evaluate model accuracy and safety across diverse prompts",
2695                data=test_dataset,  # Your dataset items
2696                task=llm_task,
2697                evaluators=[accuracy_evaluator, toxicity_evaluator],
2698                run_evaluators=[average_accuracy],
2699                max_concurrency=5,  # Limit concurrent API calls
2700                metadata={"model": "gpt-4", "temperature": 0.7}
2701            )
2702            ```
2703
2704            Using with Langfuse datasets:
2705            ```python
2706            # Get dataset from Langfuse
2707            dataset = langfuse.get_dataset("my-eval-dataset")
2708
2709            result = dataset.run_experiment(
2710                name="Production Model Evaluation",
2711                description="Monthly evaluation of production model performance",
2712                task=my_production_task,
2713                evaluators=[accuracy_evaluator, latency_evaluator]
2714            )
2715
2716            # Results automatically linked to dataset in Langfuse UI
2717            print(f"View results: {result['dataset_run_url']}")
2718            ```
2719
2720        Note:
2721            - Task and evaluator functions can be either synchronous or asynchronous
2722            - Individual item failures are logged but don't stop the experiment
2723            - All executions are automatically traced and visible in Langfuse UI
2724            - When using Langfuse datasets, results are automatically linked for easy comparison
2725            - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
2726            - Async execution is handled automatically with smart event loop detection
2727        """
2728        return cast(
2729            ExperimentResult,
2730            run_async_safely(
2731                self._run_experiment_async(
2732                    name=name,
2733                    run_name=self._create_experiment_run_name(
2734                        name=name, run_name=run_name
2735                    ),
2736                    description=description,
2737                    data=data,
2738                    task=task,
2739                    evaluators=evaluators or [],
2740                    composite_evaluator=composite_evaluator,
2741                    run_evaluators=run_evaluators or [],
2742                    max_concurrency=max_concurrency,
2743                    metadata=metadata,
2744                ),
2745            ),
2746        )

Run an experiment on a dataset with automatic tracing and evaluation.

This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.

The experiment system provides:

  • Automatic tracing of all task executions
  • Concurrent processing with configurable limits
  • Comprehensive error handling that isolates failures
  • Integration with Langfuse datasets for experiment tracking
  • Flexible evaluation framework supporting both sync and async evaluators
Arguments:
  • name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
  • run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the data contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
  • description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
  • data: Array of data items to process. Can be either:
    • List of dict-like items with 'input', 'expected_output', 'metadata' keys
    • List of Langfuse DatasetItem objects from dataset.items
  • task: Function that processes each data item and returns output. Must accept 'item' as a keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
  • evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
  • composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
  • run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
  • max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
  • metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If data are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:

ExperimentResult containing:

  • run_name: The experiment run name. This is equal to the dataset run name if the experiment was run on a Langfuse dataset.
  • item_results: List of results for each processed item with outputs and evaluations
  • run_evaluations: List of aggregate evaluation results for the entire run
  • dataset_run_id: ID of the dataset run (if using Langfuse datasets)
  • dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
  • ValueError: If required parameters are missing or invalid
  • Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:

Basic experiment with local data:

def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")

Advanced experiment with async task and multiple evaluators:

async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)

Using with Langfuse datasets:

# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
Note:
  • Task and evaluator functions can be either synchronous or asynchronous
  • Individual item failures are logged but don't stop the experiment
  • All executions are automatically traced and visible in Langfuse UI
  • When using Langfuse datasets, results are automatically linked for easy comparison
  • This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
  • Async execution is handled automatically with smart event loop detection
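
For illustration, the composite_evaluator parameter (described above but not shown in the examples) can fold the item-level evaluations into a single score. This is a minimal sketch reusing the names from the advanced example above; the pass_fail name and the 0.7 threshold are illustrative, not part of the SDK:

```python
def pass_fail_composite(*, input, output, expected_output=None, metadata=None, evaluations, **kwargs):
    # Hypothetical composite: an item "passes" only if accuracy is perfect
    # and toxicity stays below 0.7 (thresholds chosen for illustration).
    scores = {e.name: e.value for e in evaluations}
    passed = scores.get("accuracy", 0.0) >= 1.0 and scores.get("toxicity", 1.0) < 0.7
    return {
        "name": "pass_fail",
        "value": 1.0 if passed else 0.0,
        "comment": f"Based on {len(evaluations)} item-level evaluations",
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    data=test_dataset,
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    composite_evaluator=pass_fail_composite,
)
```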
def run_batched_evaluation( self, *, scope: Literal['traces', 'observations'], mapper: MapperFunction, filter: Optional[str] = None, fetch_batch_size: int = 50, max_items: Optional[int] = None, max_retries: int = 3, evaluators: List[langfuse.experiment.EvaluatorFunction], composite_evaluator: Optional[CompositeEvaluatorFunction] = None, max_concurrency: int = 50, metadata: Optional[Dict[str, Any]] = None, resume_from: Optional[BatchEvaluationResumeToken] = None, verbose: bool = False) -> BatchEvaluationResult:
3090    def run_batched_evaluation(
3091        self,
3092        *,
3093        scope: Literal["traces", "observations"],
3094        mapper: MapperFunction,
3095        filter: Optional[str] = None,
3096        fetch_batch_size: int = 50,
3097        max_items: Optional[int] = None,
3098        max_retries: int = 3,
3099        evaluators: List[EvaluatorFunction],
3100        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
3101        max_concurrency: int = 50,
3102        metadata: Optional[Dict[str, Any]] = None,
3103        resume_from: Optional[BatchEvaluationResumeToken] = None,
3104        verbose: bool = False,
3105    ) -> BatchEvaluationResult:
3106        """Fetch traces or observations and run evaluations on each item.
3107
3108        This method provides a powerful way to evaluate existing data in Langfuse at scale.
3109        It fetches items based on filters, transforms them using a mapper function, runs
3110        evaluators on each item, and creates scores that are linked back to the original
3111        entities. This is ideal for:
3112
3113        - Running evaluations on production traces after deployment
3114        - Backtesting new evaluation metrics on historical data
3115        - Batch scoring of observations for quality monitoring
3116        - Periodic evaluation runs on recent data
3117
3118        The method uses a streaming/pipeline approach to process items in batches, making
3119        it memory-efficient for large datasets. It includes comprehensive error handling,
3120        retry logic, and resume capability for long-running evaluations.
3121
3122        Args:
3123            scope: The type of items to evaluate. Must be one of:
3124                - "traces": Evaluate complete traces with all their observations
3125                - "observations": Evaluate individual observations (spans, generations, events)
3126            mapper: Function that transforms API response objects into evaluator inputs.
3127                Receives a trace/observation object and returns an EvaluatorInputs
3128                instance with input, output, expected_output, and metadata fields.
3129                Can be sync or async.
3130            evaluators: List of evaluation functions to run on each item. Each evaluator
3131                receives the mapped inputs and returns Evaluation object(s). Evaluator
3132                failures are logged but don't stop the batch evaluation.
3133            filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
3134                - '{"tags": ["production"]}'
3135                - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
3136                Default: None (fetches all items).
3137            fetch_batch_size: Number of items to fetch per API call and hold in memory.
3138                Larger values may be faster but use more memory. Default: 50.
3139            max_items: Maximum total number of items to process. If None, processes all
3140                items matching the filter. Useful for testing or limiting evaluation runs.
3141                Default: None (process all).
3142            max_concurrency: Maximum number of items to evaluate concurrently. Controls
3143                parallelism and resource usage. Default: 50.
3144            composite_evaluator: Optional function that creates a composite score from
3145                item-level evaluations. Receives the original item and its evaluations,
3146                returns a single Evaluation. Useful for weighted averages or combined metrics.
3147                Default: None.
3148            metadata: Optional metadata dict to add to all created scores. Useful for
3149                tracking evaluation runs, versions, or other context. Default: None.
3150            max_retries: Maximum number of retry attempts for failed batch fetches.
3151                Uses exponential backoff (1s, 2s, 4s). Default: 3.
3152            verbose: If True, logs progress information to console. Useful for monitoring
3153                long-running evaluations. Default: False.
3154            resume_from: Optional resume token from a previous incomplete run. Allows
3155                continuing evaluation after interruption or failure. Default: None.
3156
3157
3158        Returns:
3159            BatchEvaluationResult containing:
3160                - total_items_fetched: Number of items fetched from API
3161                - total_items_processed: Number of items successfully evaluated
3162                - total_items_failed: Number of items that failed evaluation
3163                - total_scores_created: Scores created by item-level evaluators
3164                - total_composite_scores_created: Scores created by composite evaluator
3165                - total_evaluations_failed: Individual evaluator failures
3166                - evaluator_stats: Per-evaluator statistics (success rate, scores created)
3167                - resume_token: Token for resuming if incomplete (None if completed)
3168                - completed: True if all items processed
3169                - duration_seconds: Total execution time
3170                - failed_item_ids: IDs of items that failed
3171                - error_summary: Error types and counts
3172                - has_more_items: True if max_items reached but more exist
3173
3174        Raises:
3175            ValueError: If invalid scope is provided.
3176
3177        Examples:
3178            Basic trace evaluation:
3179            ```python
3180            from langfuse import Langfuse, EvaluatorInputs, Evaluation
3181
3182            client = Langfuse()
3183
3184            # Define mapper to extract fields from traces
3185            def trace_mapper(trace):
3186                return EvaluatorInputs(
3187                    input=trace.input,
3188                    output=trace.output,
3189                    expected_output=None,
3190                    metadata={"trace_id": trace.id}
3191                )
3192
3193            # Define evaluator
3194            def length_evaluator(*, input, output, expected_output, metadata):
3195                return Evaluation(
3196                    name="output_length",
3197                    value=len(output) if output else 0
3198                )
3199
3200            # Run batch evaluation
3201            result = client.run_batched_evaluation(
3202                scope="traces",
3203                mapper=trace_mapper,
3204                evaluators=[length_evaluator],
3205                filter='{"tags": ["production"]}',
3206                max_items=1000,
3207                verbose=True
3208            )
3209
3210            print(f"Processed {result.total_items_processed} traces")
3211            print(f"Created {result.total_scores_created} scores")
3212            ```
3213
3214            Evaluation with composite scorer:
3215            ```python
3216            def accuracy_evaluator(*, input, output, expected_output, metadata):
3217                # ... evaluation logic
3218                return Evaluation(name="accuracy", value=0.85)
3219
3220            def relevance_evaluator(*, input, output, expected_output, metadata):
3221                # ... evaluation logic
3222                return Evaluation(name="relevance", value=0.92)
3223
3224            def composite_evaluator(*, item, evaluations):
3225                # Weighted average of evaluations
3226                weights = {"accuracy": 0.6, "relevance": 0.4}
3227                total = sum(
3228                    e.value * weights.get(e.name, 0)
3229                    for e in evaluations
3230                    if isinstance(e.value, (int, float))
3231                )
3232                return Evaluation(
3233                    name="composite_score",
3234                    value=total,
3235                    comment=f"Weighted average of {len(evaluations)} metrics"
3236                )
3237
3238            result = client.run_batched_evaluation(
3239                scope="traces",
3240                mapper=trace_mapper,
3241                evaluators=[accuracy_evaluator, relevance_evaluator],
3242                composite_evaluator=composite_evaluator,
3243                filter='{"user_id": "important_user"}',
3244                verbose=True
3245            )
3246            ```
3247
3248            Handling incomplete runs with resume:
3249            ```python
3250            # Initial run that may fail or timeout
3251            result = client.run_batched_evaluation(
3252                scope="observations",
3253                mapper=obs_mapper,
3254                evaluators=[my_evaluator],
3255                max_items=10000,
3256                verbose=True
3257            )
3258
3259            # Check if incomplete
3260            if not result.completed and result.resume_token:
3261                print(f"Processed {result.resume_token.items_processed} items before interruption")
3262
3263                # Resume from where it left off
3264                result = client.run_batched_evaluation(
3265                    scope="observations",
3266                    mapper=obs_mapper,
3267                    evaluators=[my_evaluator],
3268                    resume_from=result.resume_token,
3269                    verbose=True
3270                )
3271
3272            print(f"Total items processed: {result.total_items_processed}")
3273            ```
3274
3275            Monitoring evaluator performance:
3276            ```python
3277            result = client.run_batched_evaluation(...)
3278
3279            for stats in result.evaluator_stats:
3280                success_rate = stats.successful_runs / stats.total_runs
3281                print(f"{stats.name}:")
3282                print(f"  Success rate: {success_rate:.1%}")
3283                print(f"  Scores created: {stats.total_scores_created}")
3284
3285                if stats.failed_runs > 0:
3286                    print(f"  ⚠️  Failed {stats.failed_runs} times")
3287            ```
3288
3289        Note:
3290            - Evaluator failures are logged but don't stop the batch evaluation
3291            - Individual item failures are tracked but don't stop processing
3292            - Fetch failures are retried with exponential backoff
3293            - All scores are automatically flushed to Langfuse at the end
3294            - The resume mechanism uses timestamp-based filtering to avoid duplicates
3295        """
3296        runner = BatchEvaluationRunner(self)
3297
3298        return cast(
3299            BatchEvaluationResult,
3300            run_async_safely(
3301                runner.run_async(
3302                    scope=scope,
3303                    mapper=mapper,
3304                    evaluators=evaluators,
3305                    filter=filter,
3306                    fetch_batch_size=fetch_batch_size,
3307                    max_items=max_items,
3308                    max_concurrency=max_concurrency,
3309                    composite_evaluator=composite_evaluator,
3310                    metadata=metadata,
3311                    max_retries=max_retries,
3312                    verbose=verbose,
3313                    resume_from=resume_from,
3314                )
3315            ),
3316        )

Fetch traces or observations and run evaluations on each item.

This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:

  • Running evaluations on production traces after deployment
  • Backtesting new evaluation metrics on historical data
  • Batch scoring of observations for quality monitoring
  • Periodic evaluation runs on recent data

The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.

Arguments:
  • scope: The type of items to evaluate. Must be one of:
    • "traces": Evaluate complete traces with all their observations
    • "observations": Evaluate individual observations (spans, generations, events)
  • mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
  • evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
  • filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
    • '{"tags": ["production"]}'
    • '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
  • fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
  • max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
  • max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 50.
  • composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
  • metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
  • max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
  • verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
  • resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:

BatchEvaluationResult containing:

  • total_items_fetched: Number of items fetched from API
  • total_items_processed: Number of items successfully evaluated
  • total_items_failed: Number of items that failed evaluation
  • total_scores_created: Scores created by item-level evaluators
  • total_composite_scores_created: Scores created by composite evaluator
  • total_evaluations_failed: Individual evaluator failures
  • evaluator_stats: Per-evaluator statistics (success rate, scores created)
  • resume_token: Token for resuming if incomplete (None if completed)
  • completed: True if all items processed
  • duration_seconds: Total execution time
  • failed_item_ids: IDs of items that failed
  • error_summary: Error types and counts
  • has_more_items: True if max_items reached but more exist

Raises:
  • ValueError: If invalid scope is provided.
Examples:

Basic trace evaluation:

from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

# Define mapper to extract fields from traces
def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,
        metadata={"trace_id": trace.id}
    )

# Define evaluator
def length_evaluator(*, input, output, expected_output, metadata):
    return Evaluation(
        name="output_length",
        value=len(output) if output else 0
    )

# Run batch evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[length_evaluator],
    filter='{"tags": ["production"]}',
    max_items=1000,
    verbose=True
)

print(f"Processed {result.total_items_processed} traces")
print(f"Created {result.total_scores_created} scores")

Evaluation with composite scorer:

def accuracy_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="accuracy", value=0.85)

def relevance_evaluator(*, input, output, expected_output, metadata):
    # ... evaluation logic
    return Evaluation(name="relevance", value=0.92)

def composite_evaluator(*, item, evaluations):
    # Weighted average of evaluations
    weights = {"accuracy": 0.6, "relevance": 0.4}
    total = sum(
        e.value * weights.get(e.name, 0)
        for e in evaluations
        if isinstance(e.value, (int, float))
    )
    return Evaluation(
        name="composite_score",
        value=total,
        comment=f"Weighted average of {len(evaluations)} metrics"
    )

result = client.run_batched_evaluation(
    scope="traces",
    mapper=trace_mapper,
    evaluators=[accuracy_evaluator, relevance_evaluator],
    composite_evaluator=composite_evaluator,
    filter='{"user_id": "important_user"}',
    verbose=True
)

Handling incomplete runs with resume:

# Initial run that may fail or timeout
result = client.run_batched_evaluation(
    scope="observations",
    mapper=obs_mapper,
    evaluators=[my_evaluator],
    max_items=10000,
    verbose=True
)

# Check if incomplete
if not result.completed and result.resume_token:
    print(f"Processed {result.resume_token.items_processed} items before interruption")

    # Resume from where it left off
    result = client.run_batched_evaluation(
        scope="observations",
        mapper=obs_mapper,
        evaluators=[my_evaluator],
        resume_from=result.resume_token,
        verbose=True
    )

print(f"Total items processed: {result.total_items_processed}")

Monitoring evaluator performance:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs
    print(f"{stats.name}:")
    print(f"  Success rate: {success_rate:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️  Failed {stats.failed_runs} times")
Note:
  • Evaluator failures are logged but don't stop the batch evaluation
  • Individual item failures are tracked but don't stop processing
  • Fetch failures are retried with exponential backoff
  • All scores are automatically flushed to Langfuse at the end
  • The resume mechanism uses timestamp-based filtering to avoid duplicates
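
The examples above use scope="traces"; here is a minimal sketch of an observations-scope run. The exact fields available on the observation object (input, output, id) are assumptions based on the mapper contract described above:

```python
from langfuse import Langfuse, EvaluatorInputs, Evaluation

client = Langfuse()

def observation_mapper(observation):
    # Map a fetched observation into evaluator inputs; field names assumed.
    return EvaluatorInputs(
        input=observation.input,
        output=observation.output,
        expected_output=None,
        metadata={"observation_id": observation.id},
    )

def non_empty_output(*, input, output, expected_output, metadata):
    # Trivial evaluator: score 1.0 if the observation produced any output.
    return Evaluation(name="non_empty_output", value=1.0 if output else 0.0)

result = client.run_batched_evaluation(
    scope="observations",
    mapper=observation_mapper,
    evaluators=[non_empty_output],
    fetch_batch_size=100,
    max_items=500,
    verbose=True,
)
```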
def auth_check(self) -> bool:
3318    def auth_check(self) -> bool:
3319        """Check if the provided credentials (public and secret key) are valid.
3320
3321        Raises:
3322            Exception: If no projects were found for the provided credentials.
3323
3324        Note:
3325            This method is blocking. It is discouraged to use it in production code.
3326        """
3327        try:
3328            projects = self.api.projects.get()
3329            langfuse_logger.debug(
3330                f"Auth check successful, found {len(projects.data)} projects"
3331            )
3332            if len(projects.data) == 0:
3333                raise Exception(
3334                    "Auth check failed, no project found for the keys provided."
3335                )
3336            return True
3337
3338        except AttributeError as e:
3339            langfuse_logger.warning(
3340                f"Auth check failed: Client not properly initialized. Error: {e}"
3341            )
3342            return False
3343
3344        except Error as e:
3345            handle_fern_exception(e)
3346            raise e

Check if the provided credentials (public and secret key) are valid.

Raises:
  • Exception: If no projects were found for the provided credentials.
Note:

This method is blocking. Using it in production code is discouraged.
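
A typical use is a one-off credential check in a setup or test script (sketch):

```python
from langfuse import Langfuse

langfuse = Langfuse()

# Blocking call: verifies the configured keys against the Langfuse API.
if langfuse.auth_check():
    print("Langfuse credentials are valid")
```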

def create_dataset( self, *, name: str, description: Optional[str] = None, metadata: Optional[Any] = None, input_schema: Optional[Any] = None, expected_output_schema: Optional[Any] = None) -> langfuse.api.Dataset:
3348    def create_dataset(
3349        self,
3350        *,
3351        name: str,
3352        description: Optional[str] = None,
3353        metadata: Optional[Any] = None,
3354        input_schema: Optional[Any] = None,
3355        expected_output_schema: Optional[Any] = None,
3356    ) -> Dataset:
3357        """Create a dataset with the given name on Langfuse.
3358
3359        Args:
3360            name: Name of the dataset to create.
3361            description: Description of the dataset. Defaults to None.
3362            metadata: Additional metadata. Defaults to None.
3363            input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
3364            expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
3365
3366        Returns:
3367            Dataset: The created dataset as returned by the Langfuse API.
3368        """
3369        try:
3370            body = CreateDatasetRequest(
3371                name=name,
3372                description=description,
3373                metadata=metadata,
3374                inputSchema=input_schema,
3375                expectedOutputSchema=expected_output_schema,
3376            )
3377            langfuse_logger.debug(f"Creating datasets {body}")
3378
3379            return self.api.datasets.create(request=body)
3380
3381        except Error as e:
3382            handle_fern_exception(e)
3383            raise e

Create a dataset with the given name on Langfuse.

Arguments:
  • name: Name of the dataset to create.
  • description: Description of the dataset. Defaults to None.
  • metadata: Additional metadata. Defaults to None.
  • input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
  • expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:

Dataset: The created dataset as returned by the Langfuse API.
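
As a sketch, a dataset with item validation might be created as follows; the schema contents are hypothetical JSON Schema documents, not prescribed by the SDK:

```python
from langfuse import Langfuse

langfuse = Langfuse()

dataset = langfuse.create_dataset(
    name="capital_cities",
    description="Country -> capital question answering",
    metadata={"owner": "evals-team"},
    # Hypothetical JSON Schemas; new items will be validated against them.
    input_schema={
        "type": "object",
        "properties": {"country": {"type": "string"}},
        "required": ["country"],
    },
    expected_output_schema={"type": "string"},
)
```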

def create_dataset_item( self, *, dataset_name: str, input: Optional[Any] = None, expected_output: Optional[Any] = None, metadata: Optional[Any] = None, source_trace_id: Optional[str] = None, source_observation_id: Optional[str] = None, status: Optional[langfuse.api.DatasetStatus] = None, id: Optional[str] = None) -> langfuse.api.DatasetItem:
3385    def create_dataset_item(
3386        self,
3387        *,
3388        dataset_name: str,
3389        input: Optional[Any] = None,
3390        expected_output: Optional[Any] = None,
3391        metadata: Optional[Any] = None,
3392        source_trace_id: Optional[str] = None,
3393        source_observation_id: Optional[str] = None,
3394        status: Optional[DatasetStatus] = None,
3395        id: Optional[str] = None,
3396    ) -> DatasetItem:
3397        """Create a dataset item.
3398
3399        Upserts if an item with id already exists.
3400
3401        Args:
3402            dataset_name: Name of the dataset in which the dataset item should be created.
3403            input: Input data. Defaults to None. Can contain any dict, list or scalar.
3404            expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
3405            metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
3406            source_trace_id: Id of the source trace. Defaults to None.
3407            source_observation_id: Id of the source observation. Defaults to None.
3408            status: Status of the dataset item. Defaults to ACTIVE for newly created items.
3409            id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
3410
3411        Returns:
3412            DatasetItem: The created dataset item as returned by the Langfuse API.
3413
3414        Example:
3415            ```python
3416            from langfuse import Langfuse
3417
3418            langfuse = Langfuse()
3419
3420            # Uploading items to the Langfuse dataset named "capital_cities"
3421            langfuse.create_dataset_item(
3422                dataset_name="capital_cities",
3423                input={"input": {"country": "Italy"}},
3424                expected_output={"expected_output": "Rome"},
3425                metadata={"foo": "bar"}
3426            )
3427            ```
3428        """
3429        try:
3430            body = CreateDatasetItemRequest(
3431                datasetName=dataset_name,
3432                input=input,
3433                expectedOutput=expected_output,
3434                metadata=metadata,
3435                sourceTraceId=source_trace_id,
3436                sourceObservationId=source_observation_id,
3437                status=status,
3438                id=id,
3439            )
3440            langfuse_logger.debug(f"Creating dataset item {body}")
3441            return self.api.dataset_items.create(request=body)
3442        except Error as e:
3443            handle_fern_exception(e)
3444            raise e

Create a dataset item.

Upserts if an item with id already exists.

Arguments:
  • dataset_name: Name of the dataset in which the dataset item should be created.
  • input: Input data. Defaults to None. Can contain any dict, list or scalar.
  • expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
  • metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
  • source_trace_id: Id of the source trace. Defaults to None.
  • source_observation_id: Id of the source observation. Defaults to None.
  • status: Status of the dataset item. Defaults to ACTIVE for newly created items.
  • id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:

DatasetItem: The created dataset item as returned by the Langfuse API.

Example:
from langfuse import Langfuse

langfuse = Langfuse()

# Uploading items to the Langfuse dataset named "capital_cities"
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"input": {"country": "Italy"}},
    expected_output={"expected_output": "Rome"},
    metadata={"foo": "bar"}
)
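
A second sketch, linking an item to an existing trace and supplying a custom id for deduplication; the trace id and item id values are placeholders:

```python
langfuse.create_dataset_item(
    dataset_name="capital_cities",
    input={"country": "France"},
    expected_output="Paris",
    source_trace_id="trace-id-from-production",  # placeholder trace id
    id="capital_cities:france",  # custom id enables upsert/dedupe
)
```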
def resolve_media_references( self, *, obj: Any, resolve_with: Literal['base64_data_uri'], max_depth: int = 10, content_fetch_timeout_seconds: int = 5) -> Any:
3446    def resolve_media_references(
3447        self,
3448        *,
3449        obj: Any,
3450        resolve_with: Literal["base64_data_uri"],
3451        max_depth: int = 10,
3452        content_fetch_timeout_seconds: int = 5,
3453    ) -> Any:
3454        """Replace media reference strings in an object with base64 data URIs.
3455
3456        This method recursively traverses an object (up to max_depth) looking for media reference strings
3457        in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using
3458        the provided Langfuse client and replaces the reference string with a base64 data URI.
3459
3460        If fetching media content fails for a reference string, a warning is logged and the reference
3461        string is left unchanged.
3462
3463        Args:
3464            obj: The object to process. Can be a primitive value, array, or nested object.
3465                If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
3466            resolve_with: The representation of the media content to replace the media reference string with.
3467                Currently only "base64_data_uri" is supported.
3468            max_depth: int: The maximum depth to traverse the object. Default is 10.
3469            content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
3470
3471        Returns:
3472            A deep copy of the input object with all media references replaced with base64 data URIs where possible.
3473            If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
3474
3475        Example:
3476            obj = {
3477                "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
3478                "nested": {
3479                    "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
3480                }
3481            }
3482
3483            result = await LangfuseMedia.resolve_media_references(obj, langfuse_client)
3484
3485            # Result:
3486            # {
3487            #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
3488            #     "nested": {
3489            #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
3490            #     }
3491            # }
3492        """
3493        return LangfuseMedia.resolve_media_references(
3494            langfuse_client=self,
3495            obj=obj,
3496            resolve_with=resolve_with,
3497            max_depth=max_depth,
3498            content_fetch_timeout_seconds=content_fetch_timeout_seconds,
3499        )

Replace media reference strings in an object with base64 data URIs.

This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.

If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.

Arguments:
  • obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
  • resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
  • max_depth: int: The maximum depth to traverse the object. Default is 10.
  • content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:

A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.

Example:

obj = {
    "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
    "nested": {
        "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
    }
}

result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")

# Result:
# {
#     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
#     "nested": {
#         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
#     }
# }

def get_prompt( self, name: str, *, version: Optional[int] = None, label: Optional[str] = None, type: Literal['chat', 'text'] = 'text', cache_ttl_seconds: Optional[int] = None, fallback: Union[List[langfuse.model.ChatMessageDict], NoneType, str] = None, max_retries: Optional[int] = None, fetch_timeout_seconds: Optional[int] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3529    def get_prompt(
3530        self,
3531        name: str,
3532        *,
3533        version: Optional[int] = None,
3534        label: Optional[str] = None,
3535        type: Literal["chat", "text"] = "text",
3536        cache_ttl_seconds: Optional[int] = None,
3537        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
3538        max_retries: Optional[int] = None,
3539        fetch_timeout_seconds: Optional[int] = None,
3540    ) -> PromptClient:
3541        """Get a prompt.
3542
3543        This method attempts to fetch the requested prompt from the local cache. If the prompt is not found
3544        in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again
3545        and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will
3546        return the expired prompt as a fallback.
3547
3548        Args:
3549            name (str): The name of the prompt to retrieve.
3550
3551        Keyword Args:
3552            version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3553            label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
3554            cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
3555            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
3556            type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
3557            fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
3558            max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
3559            fetch_timeout_seconds: Optional[int]: The timeout in milliseconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
3560
3561        Returns:
3562            The prompt object retrieved from the cache or directly fetched if not cached or expired of type
3563            - TextPromptClient, if type argument is 'text'.
3564            - ChatPromptClient, if type argument is 'chat'.
3565
3566        Raises:
3567            Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an
3568            expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
3569        """
3570        if self._resources is None:
3571            raise Error(
3572                "SDK is not correctly initialized. Check the init logs for more details."
3573            )
3574        if version is not None and label is not None:
3575            raise ValueError("Cannot specify both version and label at the same time.")
3576
3577        if not name:
3578            raise ValueError("Prompt name cannot be empty.")
3579
3580        cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
3581        bounded_max_retries = self._get_bounded_max_retries(
3582            max_retries, default_max_retries=2, max_retries_upper_bound=4
3583        )
3584
3585        langfuse_logger.debug(f"Getting prompt '{cache_key}'")
3586        cached_prompt = self._resources.prompt_cache.get(cache_key)
3587
3588        if cached_prompt is None or cache_ttl_seconds == 0:
3589            langfuse_logger.debug(
3590                f"Prompt '{cache_key}' not found in cache or caching disabled."
3591            )
3592            try:
3593                return self._fetch_prompt_and_update_cache(
3594                    name,
3595                    version=version,
3596                    label=label,
3597                    ttl_seconds=cache_ttl_seconds,
3598                    max_retries=bounded_max_retries,
3599                    fetch_timeout_seconds=fetch_timeout_seconds,
3600                )
3601            except Exception as e:
3602                if fallback:
3603                    langfuse_logger.warning(
3604                        f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
3605                    )
3606
3607                    fallback_client_args: Dict[str, Any] = {
3608                        "name": name,
3609                        "prompt": fallback,
3610                        "type": type,
3611                        "version": version or 0,
3612                        "config": {},
3613                        "labels": [label] if label else [],
3614                        "tags": [],
3615                    }
3616
3617                    if type == "text":
3618                        return TextPromptClient(
3619                            prompt=Prompt_Text(**fallback_client_args),
3620                            is_fallback=True,
3621                        )
3622
3623                    if type == "chat":
3624                        return ChatPromptClient(
3625                            prompt=Prompt_Chat(**fallback_client_args),
3626                            is_fallback=True,
3627                        )
3628
3629                raise e
3630
3631        if cached_prompt.is_expired():
3632            langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
3633            try:
3634                # refresh prompt in background thread, refresh_prompt deduplicates tasks
3635                langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")
3636
3637                def refresh_task() -> None:
3638                    self._fetch_prompt_and_update_cache(
3639                        name,
3640                        version=version,
3641                        label=label,
3642                        ttl_seconds=cache_ttl_seconds,
3643                        max_retries=bounded_max_retries,
3644                        fetch_timeout_seconds=fetch_timeout_seconds,
3645                    )
3646
3647                self._resources.prompt_cache.add_refresh_prompt_task(
3648                    cache_key,
3649                    refresh_task,
3650                )
3651                langfuse_logger.debug(
3652                    f"Returning stale prompt '{cache_key}' from cache."
3653                )
3654                # return stale prompt
3655                return cached_prompt.value
3656
3657            except Exception as e:
3658                langfuse_logger.warning(
3659                    f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
3660                )
3661                # creation of refresh prompt task failed, return stale prompt
3662                return cached_prompt.value
3663
3664        return cached_prompt.value

Get a prompt.

This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.

Arguments:
  • name (str): The name of the prompt to retrieve.
Keyword Args:

  • version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the production label is returned. Specify either version or label, not both.
  • cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
  • type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
  • fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
  • max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
  • fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds by default.

Returns:

The prompt object retrieved from the cache or directly fetched if not cached or expired of type

  • TextPromptClient, if type argument is 'text'.
  • ChatPromptClient, if type argument is 'chat'.
Raises:
  • Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
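
For illustration, fetching a chat prompt with a fallback might look like the sketch below; the prompt name and variables are placeholders, and prompt.compile() is assumed to be the prompt client's templating helper rather than confirmed by this section:

```python
from langfuse import Langfuse

langfuse = Langfuse()

prompt = langfuse.get_prompt(
    "movie-critic",  # placeholder prompt name
    type="chat",
    label="production",
    cache_ttl_seconds=120,
    # Fallback used if the prompt cannot be fetched and nothing is cached.
    fallback=[{"role": "system", "content": "You are a movie critic for {{genre}} films."}],
)

# Assumed templating helper on the returned prompt client.
messages = prompt.compile(genre="sci-fi")
```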
def create_prompt( self, *, name: str, prompt: Union[str, List[Union[langfuse.model.ChatMessageDict, langfuse.model.ChatMessageWithPlaceholdersDict_Message, langfuse.model.ChatMessageWithPlaceholdersDict_Placeholder]]], labels: List[str] = [], tags: Optional[List[str]] = None, type: Optional[Literal['chat', 'text']] = 'text', config: Optional[Any] = None, commit_message: Optional[str] = None) -> Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient]:
3766    def create_prompt(
3767        self,
3768        *,
3769        name: str,
3770        prompt: Union[
3771            str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
3772        ],
3773        labels: List[str] = [],
3774        tags: Optional[List[str]] = None,
3775        type: Optional[Literal["chat", "text"]] = "text",
3776        config: Optional[Any] = None,
3777        commit_message: Optional[str] = None,
3778    ) -> PromptClient:
3779        """Create a new prompt in Langfuse.
3780
3781        Keyword Args:
3782            name : The name of the prompt to be created.
3783            prompt : The content of the prompt to be created.
3784            is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
3785            labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
3786            tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
3787            config: Additional structured data to be saved with the prompt. Defaults to None.
3788            type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
3789            commit_message: Optional string describing the change.
3790
3791        Returns:
3792            TextPromptClient: The prompt if type argument is 'text'.
3793            ChatPromptClient: The prompt if type argument is 'chat'.
3794        """
3795        try:
3796            langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")
3797
3798            if type == "chat":
3799                if not isinstance(prompt, list):
3800                    raise ValueError(
3801                        "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
3802                    )
3803                request: Union[CreatePromptRequest_Chat, CreatePromptRequest_Text] = (
3804                    CreatePromptRequest_Chat(
3805                        name=name,
3806                        prompt=cast(Any, prompt),
3807                        labels=labels,
3808                        tags=tags,
3809                        config=config or {},
3810                        commitMessage=commit_message,
3811                        type="chat",
3812                    )
3813                )
3814                server_prompt = self.api.prompts.create(request=request)
3815
3816                if self._resources is not None:
3817                    self._resources.prompt_cache.invalidate(name)
3818
3819                return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))
3820
3821            if not isinstance(prompt, str):
3822                raise ValueError("For 'text' type, 'prompt' must be a string.")
3823
3824            request = CreatePromptRequest_Text(
3825                name=name,
3826                prompt=prompt,
3827                labels=labels,
3828                tags=tags,
3829                config=config or {},
3830                commitMessage=commit_message,
3831                type="text",
3832            )
3833
3834            server_prompt = self.api.prompts.create(request=request)
3835
3836            if self._resources is not None:
3837                self._resources.prompt_cache.invalidate(name)
3838
3839            return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))
3840
3841        except Error as e:
3842            handle_fern_exception(e)
3843            raise e

Create a new prompt in Langfuse.

Keyword Args:

  • name: The name of the prompt to be created.
  • prompt: The content of the prompt to be created.
  • is_active [DEPRECATED]: A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
  • labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label.
  • tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
  • config: Additional structured data to be saved with the prompt. Defaults to None.
  • type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
  • commit_message: Optional string describing the change.

Returns:

  • TextPromptClient: The prompt if type argument is 'text'.
  • ChatPromptClient: The prompt if type argument is 'chat'.
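
A short sketch of creating a text prompt and a chat prompt (names, contents, and config values are illustrative):

```python
from langfuse import get_client

langfuse = get_client()

# Text prompt served as the default via the 'production' label.
langfuse.create_prompt(
    name="movie-critic",
    prompt="Summarize the movie {{title}} in one sentence.",
    labels=["production"],
    type="text",
)

# Chat prompt: a list of messages with role and content.
langfuse.create_prompt(
    name="movie-critic-chat",
    prompt=[
        {"role": "system", "content": "You are a film critic."},
        {"role": "user", "content": "Review {{title}}."},
    ],
    type="chat",
    config={"temperature": 0.7},  # arbitrary structured data stored with the prompt
    commit_message="Initial chat version",
)
```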

def update_prompt(self, *, name: str, version: int, new_labels: List[str] = []) -> Any:
3845    def update_prompt(
3846        self,
3847        *,
3848        name: str,
3849        version: int,
3850        new_labels: List[str] = [],
3851    ) -> Any:
3852        """Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
3853
3854        Args:
3855            name (str): The name of the prompt to update.
3856            version (int): The version number of the prompt to update.
3857            new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
3858
3859        Returns:
3860            Prompt: The updated prompt from the Langfuse API.
3861
3862        """
3863        updated_prompt = self.api.prompt_version.update(
3864            name=self._url_encode(name),
3865            version=version,
3866            new_labels=new_labels,
3867        )
3868
3869        if self._resources is not None:
3870            self._resources.prompt_cache.invalidate(name)
3871
3872        return updated_prompt

Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.

Arguments:
  • name (str): The name of the prompt to update.
  • version (int): The version number of the prompt to update.
  • new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:

Prompt: The updated prompt from the Langfuse API.
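
For instance, to promote a specific version to be the default-served one (the prompt name and version number are illustrative):

```python
from langfuse import get_client

langfuse = get_client()

# Move the 'production' label to version 3 of this prompt.
# Labels are unique across versions, so 'production' will point to this version afterwards.
langfuse.update_prompt(
    name="movie-critic",
    version=3,
    new_labels=["production"],
)
```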

def clear_prompt_cache(self) -> None:
3887    def clear_prompt_cache(self) -> None:
3888        """Clear the entire prompt cache, removing all cached prompts.
3889
3890        This method is useful when you want to force a complete refresh of all
3891        cached prompts, for example after major updates or when you need to
3892        ensure the latest versions are fetched from the server.
3893        """
3894        if self._resources is not None:
3895            self._resources.prompt_cache.clear()

Clear the entire prompt cache, removing all cached prompts.

This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
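
For example, after a bulk update of prompts you might force a refresh:

```python
from langfuse import get_client

langfuse = get_client()
langfuse.clear_prompt_cache()  # subsequent get_prompt calls fetch fresh versions from the server
```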

def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 61def get_client(*, public_key: Optional[str] = None) -> Langfuse:
 62    """Get or create a Langfuse client instance.
 63
 64    Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups,
 65    providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
 66
 67    Behavior:
 68    - Single project: Returns existing client or creates new one
 69    - Multi-project: Requires public_key to return specific client
 70    - No public_key in multi-project: Returns disabled client to prevent data leakage
 71
 72    The function uses a singleton pattern per public_key to conserve resources and maintain state.
 73
 74    Args:
 75        public_key (Optional[str]): Project identifier
 76            - With key: Returns client for that project
 77            - Without key: Returns single client or disabled client if multiple exist
 78
 79    Returns:
 80        Langfuse: Client instance in one of three states:
 81            1. Client for specified public_key
 82            2. Default client for single-project setup
 83            3. Disabled client when multiple projects exist without key
 84
 85    Security:
 86        Disables tracing when multiple projects exist without explicit key to prevent
 87        cross-project data leakage. Multi-project setups are experimental.
 88
 89    Example:
 90        ```python
 91        # Single project
 92        client = get_client()  # Default client
 93
 94        # In multi-project usage:
 95        client_a = get_client(public_key="project_a_key")  # Returns project A's client
 96        client_b = get_client(public_key="project_b_key")  # Returns project B's client
 97
 98        # Without specific key in multi-project setup:
 99        client = get_client()  # Returns disabled client for safety
100        ```
101    """
102    with LangfuseResourceManager._lock:
103        active_instances = LangfuseResourceManager._instances
104
105        # If no explicit public_key provided, check execution context
106        if not public_key:
107            public_key = _current_public_key.get(None)
108
109        if not public_key:
110            if len(active_instances) == 0:
111                # No clients initialized yet, create default instance
112                return Langfuse()
113
114            if len(active_instances) == 1:
115                # Only one client exists, safe to use without specifying key
116                instance = list(active_instances.values())[0]
117
118                # Initialize with the credentials bound to the instance
119                # This is important if the original instance was instantiated
120                # via constructor arguments
121                return _create_client_from_instance(instance)
122
123            else:
124                # Multiple clients exist but no key specified - disable tracing
125                # to prevent cross-project data leakage
126                langfuse_logger.warning(
127                    "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
128                )
129                return Langfuse(
130                    tracing_enabled=False, public_key="fake", secret_key="fake"
131                )
132
133        else:
134            # Specific key provided, look up existing instance
135            target_instance: Optional[LangfuseResourceManager] = active_instances.get(
136                public_key, None
137            )
138
139            if target_instance is None:
140                # No instance found with this key - client not initialized properly
141                langfuse_logger.warning(
142                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
143                )
144                return Langfuse(
145                    tracing_enabled=False, public_key="fake", secret_key="fake"
146                )
147
148            # target_instance is guaranteed to be not None at this point
149            return _create_client_from_instance(target_instance, public_key)

Get or create a Langfuse client instance.

Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.

Behavior:

  • Single project: Returns existing client or creates new one
  • Multi-project: Requires public_key to return specific client
  • No public_key in multi-project: Returns disabled client to prevent data leakage

The function uses a singleton pattern per public_key to conserve resources and maintain state.

Arguments:
  • public_key (Optional[str]): Project identifier
    • With key: Returns client for that project
    • Without key: Returns single client or disabled client if multiple exist
Returns:

Langfuse: Client instance in one of three states:
  1. Client for specified public_key
  2. Default client for single-project setup
  3. Disabled client when multiple projects exist without key

Security:

Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.

Example:
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
def observe( func: Optional[~F] = None, *, name: Optional[str] = None, as_type: Union[Literal['generation', 'embedding'], Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], NoneType] = None, capture_input: Optional[bool] = None, capture_output: Optional[bool] = None, transform_to_string: Optional[Callable[[Iterable], str]] = None) -> Union[~F, Callable[[~F], ~F]]:
 90    def observe(
 91        self,
 92        func: Optional[F] = None,
 93        *,
 94        name: Optional[str] = None,
 95        as_type: Optional[ObservationTypeLiteralNoEvent] = None,
 96        capture_input: Optional[bool] = None,
 97        capture_output: Optional[bool] = None,
 98        transform_to_string: Optional[Callable[[Iterable], str]] = None,
 99    ) -> Union[F, Callable[[F], F]]:
100        """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
101
102        This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
103        spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
104        intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
105
106        Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
107        enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
108
109        Args:
110            func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
111            name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
112            as_type (Optional[Literal]): Set the observation type. Supported values:
113                    "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
114                    Observation types are highlighted in the Langfuse UI for filtering and visualization.
115                    The types "generation" and "embedding" create a span on which additional attributes such as model metrics
116                    can be set.
117
118        Returns:
119            Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
120
121        Example:
122            For general function tracing with automatic naming:
123            ```python
124            @observe()
125            def process_user_request(user_id, query):
126                # Function is automatically traced with name "process_user_request"
127                return get_response(query)
128            ```
129
130            For language model generation tracking:
131            ```python
132            @observe(name="answer-generation", as_type="generation")
133            async def generate_answer(query):
134                # Creates a generation-type span with extended LLM metrics
135                response = await openai.chat.completions.create(
136                    model="gpt-4",
137                    messages=[{"role": "user", "content": query}]
138                )
139                return response.choices[0].message.content
140            ```
141
142            For trace context propagation between functions:
143            ```python
144            @observe()
145            def main_process():
146                # Parent span is created
147                return sub_process()  # Child span automatically connected to parent
148
149            @observe()
150            def sub_process():
151                # Automatically becomes a child span of main_process
152                return "result"
153            ```
154
155        Raises:
156            Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
157
158        Notes:
159            - The decorator preserves the original function's signature, docstring, and return type.
160            - Proper parent-child relationships between spans are automatically maintained.
161            - Special keyword arguments can be passed to control tracing:
162              - langfuse_trace_id: Explicitly set the trace ID for this function call
163              - langfuse_parent_observation_id: Explicitly set the parent span ID
164              - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
165            - For async functions, the decorator returns an async function wrapper.
166            - For sync functions, the decorator returns a synchronous wrapper.
167        """
168        valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
169        if as_type is not None and as_type not in valid_types:
170            self._log.warning(
171                f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
172            )
173            as_type = "span"
174
175        function_io_capture_enabled = os.environ.get(
176            LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
177        ).lower() not in ("false", "0")
178
179        should_capture_input = (
180            capture_input if capture_input is not None else function_io_capture_enabled
181        )
182
183        should_capture_output = (
184            capture_output
185            if capture_output is not None
186            else function_io_capture_enabled
187        )
188
189        def decorator(func: F) -> F:
190            return (
191                self._async_observe(
192                    func,
193                    name=name,
194                    as_type=as_type,
195                    capture_input=should_capture_input,
196                    capture_output=should_capture_output,
197                    transform_to_string=transform_to_string,
198                )
199                if asyncio.iscoroutinefunction(func)
200                else self._sync_observe(
201                    func,
202                    name=name,
203                    as_type=as_type,
204                    capture_input=should_capture_input,
205                    capture_output=should_capture_output,
206                    transform_to_string=transform_to_string,
207                )
208            )
209
210        """Handle decorator with or without parentheses.
211
212        This logic enables the decorator to work both with and without parentheses:
213        - @observe - Python passes the function directly to the decorator
214        - @observe() - Python calls the decorator first, which must return a function decorator
215
216        When called without arguments (@observe), the func parameter contains the function to decorate,
217        so we directly apply the decorator to it. When called with parentheses (@observe()),
218        func is None, so we return the decorator function itself for Python to apply in the next step.
219        """
220        if func is None:
221            return decorator
222        else:
223            return decorator(func)

Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.

Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

Arguments:
  • func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
  • name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
  • as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:

Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.

Example:

For general function tracing with automatic naming:

@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)

For language model generation tracking:

@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content

For trace context propagation between functions:

@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
Raises:
  • Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
  • The decorator preserves the original function's signature, docstring, and return type.
  • Proper parent-child relationships between spans are automatically maintained.
  • Special keyword arguments can be passed to control tracing (see the sketch after these notes):
    • langfuse_trace_id: Explicitly set the trace ID for this function call
    • langfuse_parent_observation_id: Explicitly set the parent span ID
    • langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
  • For async functions, the decorator returns an async function wrapper.
  • For sync functions, the decorator returns a synchronous wrapper.
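
A minimal sketch of these special keyword arguments passed at the call site (the trace ID helper and key value below are illustrative assumptions, not literal values to copy):

```python
from langfuse import Langfuse, observe


@observe()
def handle_request(query: str, **kwargs: str) -> str:
    # **kwargs absorbs any langfuse_* arguments in case they are forwarded to the function.
    return query.upper()


# Assumption: Langfuse.create_trace_id() returns a valid trace ID to pre-assign to this call.
trace_id = Langfuse.create_trace_id()

handle_request(
    "hello",
    langfuse_trace_id=trace_id,       # explicitly set the trace ID for this call
    langfuse_public_key="pk-lf-...",  # illustrative: route to a specific project when multiple clients exist
)
```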
def propagate_attributes( *, user_id: Optional[str] = None, session_id: Optional[str] = None, metadata: Optional[Dict[str, str]] = None, version: Optional[str] = None, tags: Optional[List[str]] = None, trace_name: Optional[str] = None, as_baggage: bool = False) -> opentelemetry.util._decorator._AgnosticContextManager[typing.Any]:
 76def propagate_attributes(
 77    *,
 78    user_id: Optional[str] = None,
 79    session_id: Optional[str] = None,
 80    metadata: Optional[Dict[str, str]] = None,
 81    version: Optional[str] = None,
 82    tags: Optional[List[str]] = None,
 83    trace_name: Optional[str] = None,
 84    as_baggage: bool = False,
 85) -> _AgnosticContextManager[Any]:
 86    """Propagate trace-level attributes to all spans created within this context.
 87
 88    This context manager sets attributes on the currently active span AND automatically
 89    propagates them to all new child spans created within the context. This is the
 90    recommended way to set trace-level attributes like user_id, session_id, and metadata
 91    dimensions that should be consistently applied across all observations in a trace.
 92
 93    **IMPORTANT**: Call this as early as possible within your trace/workflow. Only the
 94    currently active span and spans created after entering this context will have these
 95    attributes. Pre-existing spans will NOT be retroactively updated.
 96
 97    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
 98    filtering by session_id) only include observations that have the attribute set.
 99    If you call `propagate_attributes` late in your workflow, earlier spans won't be
100    included in aggregations for that attribute.
101
102    Args:
103        user_id: User identifier to associate with all spans in this context.
104            Must be US-ASCII string, ≤200 characters. Use this to track which user
105            generated each trace and enable e.g. per-user cost/performance analysis.
106        session_id: Session identifier to associate with all spans in this context.
107            Must be US-ASCII string, ≤200 characters. Use this to group related traces
108            within a user session (e.g., a conversation thread, multi-turn interaction).
109        metadata: Additional key-value metadata to propagate to all spans.
110            - Keys and values must be US-ASCII strings
111            - All values must be ≤200 characters
112            - Use for dimensions like internal correlating identifiers
113            - AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
114        version: Version identifier for parts of your application that are independently versioned, e.g. agents
115        tags: List of tags to categorize the group of observations
116        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
117            Use this to set a consistent trace name for all spans created within this context.
118        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
119            cross-process/service propagation. **Security warning**: When enabled,
120            attribute values are added to HTTP headers on ALL outbound requests.
121            Only enable if values are safe to transmit via HTTP headers and you need
122            cross-service tracing. Default: False.
123
124    Returns:
125        Context manager that propagates attributes to all child spans.
126
127    Example:
128        Basic usage with user and session tracking:
129
130        ```python
131        from langfuse import Langfuse
132
133        langfuse = Langfuse()
134
135        # Set attributes early in the trace
136        with langfuse.start_as_current_span(name="user_workflow") as span:
137            with langfuse.propagate_attributes(
138                user_id="user_123",
139                session_id="session_abc",
140                metadata={"experiment": "variant_a", "environment": "production"}
141            ):
142                # All spans created here will have user_id, session_id, and metadata
143                with langfuse.start_span(name="llm_call") as llm_span:
144                    # This span inherits: user_id, session_id, experiment, environment
145                    ...
146
147                with langfuse.start_generation(name="completion") as gen:
148                    # This span also inherits all attributes
149                    ...
150        ```
151
152        Late propagation (anti-pattern):
153
154        ```python
155        with langfuse.start_as_current_span(name="workflow") as span:
156            # These spans WON'T have user_id
157            early_span = langfuse.start_span(name="early_work")
158            early_span.end()
159
160            # Set attributes in the middle
161            with langfuse.propagate_attributes(user_id="user_123"):
162                # Only spans created AFTER this point will have user_id
163                late_span = langfuse.start_span(name="late_work")
164                late_span.end()
165
166            # Result: Aggregations by user_id will miss "early_work" span
167        ```
168
169        Cross-service propagation with baggage (advanced):
170
171        ```python
172        # Service A - originating service
173        with langfuse.start_as_current_span(name="api_request"):
174            with langfuse.propagate_attributes(
175                user_id="user_123",
176                session_id="session_abc",
177                as_baggage=True  # Propagate via HTTP headers
178            ):
179                # Make HTTP request to Service B
180                response = requests.get("https://service-b.example.com/api")
181                # user_id and session_id are now in HTTP headers
182
183        # Service B - downstream service
184        # OpenTelemetry will automatically extract baggage from HTTP headers
185        # and propagate to spans in Service B
186        ```
187
188    Note:
189        - **Validation**: All attribute values (user_id, session_id, metadata values)
190          must be strings ≤200 characters. Invalid values will be dropped with a
191          warning logged. Ensure values meet constraints before calling.
192        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
193          making it compatible with other OTel-instrumented libraries.
194
195    Raises:
196        No exceptions are raised. Invalid values are logged as warnings and dropped.
197    """
198    return _propagate_attributes(
199        user_id=user_id,
200        session_id=session_id,
201        metadata=metadata,
202        version=version,
203        tags=tags,
204        trace_name=trace_name,
205        as_baggage=as_baggage,
206    )

Propagate trace-level attributes to all spans created within this context.

This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.

IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.

Why this matters: Langfuse aggregation queries (e.g., total cost by user_id, filtering by session_id) only include observations that have the attribute set. If you call propagate_attributes late in your workflow, earlier spans won't be included in aggregations for that attribute.

Arguments:
  • user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
  • session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
  • metadata: Additional key-value metadata to propagate to all spans.
    • Keys and values must be US-ASCII strings
    • All values must be ≤200 characters
    • Use for dimensions like internal correlating identifiers
    • AVOID: large payloads, sensitive data, non-string values (will be dropped with warning)
  • version: Version identifier for parts of your application that are independently versioned, e.g. agents
  • tags: List of tags to categorize the group of observations
  • trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
  • as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:

Context manager that propagates attributes to all child spans.

Example:

Basic usage with user and session tracking:

from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_span(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_span(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...

Late propagation (anti-pattern):

with langfuse.start_as_current_span(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_span(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_span(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span

Cross-service propagation with baggage (advanced):

# Service A - originating service
with langfuse.start_as_current_span(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
Note:
  • Validation: All attribute values (user_id, session_id, metadata values) must be strings ≤200 characters. Invalid values will be dropped with a warning logged. Ensure values meet constraints before calling.
  • OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
  • No exceptions are raised. Invalid values are logged as warnings and dropped.
ObservationTypeLiteral = typing.Union[typing.Literal['generation', 'embedding'], typing.Literal['span', 'agent', 'tool', 'chain', 'retriever', 'evaluator', 'guardrail'], typing.Literal['event']]
class LangfuseSpan(langfuse._client.span.LangfuseObservationWrapper):
1166class LangfuseSpan(LangfuseObservationWrapper):
1167    """Standard span implementation for general operations in Langfuse.
1168
1169    This class represents a general-purpose span that can be used to trace
1170    any operation in your application. It extends the base LangfuseObservationWrapper
1171    with specific methods for creating child spans, generations, and updating
1172    span-specific attributes. If possible, use a more specific type for
1173    better observability and insights.
1174    """
1175
1176    def __init__(
1177        self,
1178        *,
1179        otel_span: otel_trace_api.Span,
1180        langfuse_client: "Langfuse",
1181        input: Optional[Any] = None,
1182        output: Optional[Any] = None,
1183        metadata: Optional[Any] = None,
1184        environment: Optional[str] = None,
1185        version: Optional[str] = None,
1186        level: Optional[SpanLevel] = None,
1187        status_message: Optional[str] = None,
1188    ):
1189        """Initialize a new LangfuseSpan.
1190
1191        Args:
1192            otel_span: The OpenTelemetry span to wrap
1193            langfuse_client: Reference to the parent Langfuse client
1194            input: Input data for the span (any JSON-serializable object)
1195            output: Output data from the span (any JSON-serializable object)
1196            metadata: Additional metadata to associate with the span
1197            environment: The tracing environment
1198            version: Version identifier for the code or component
1199            level: Importance level of the span (info, warning, error)
1200            status_message: Optional status message for the span
1201        """
1202        super().__init__(
1203            otel_span=otel_span,
1204            as_type="span",
1205            langfuse_client=langfuse_client,
1206            input=input,
1207            output=output,
1208            metadata=metadata,
1209            environment=environment,
1210            version=version,
1211            level=level,
1212            status_message=status_message,
1213        )
1214
1215    def start_span(
1216        self,
1217        name: str,
1218        input: Optional[Any] = None,
1219        output: Optional[Any] = None,
1220        metadata: Optional[Any] = None,
1221        version: Optional[str] = None,
1222        level: Optional[SpanLevel] = None,
1223        status_message: Optional[str] = None,
1224    ) -> "LangfuseSpan":
1225        """Create a new child span.
1226
1227        This method creates a new child span with this span as the parent.
1228        Unlike start_as_current_span(), this method does not set the new span
1229        as the current span in the context.
1230
1231        Args:
1232            name: Name of the span (e.g., function or operation name)
1233            input: Input data for the operation
1234            output: Output data from the operation
1235            metadata: Additional metadata to associate with the span
1236            version: Version identifier for the code or component
1237            level: Importance level of the span (info, warning, error)
1238            status_message: Optional status message for the span
1239
1240        Returns:
1241            A new LangfuseSpan that must be ended with .end() when complete
1242
1243        Example:
1244            ```python
1245            parent_span = langfuse.start_span(name="process-request")
1246            try:
1247                # Create a child span
1248                child_span = parent_span.start_span(name="validate-input")
1249                try:
1250                    # Do validation work
1251                    validation_result = validate(request_data)
1252                    child_span.update(output=validation_result)
1253                finally:
1254                    child_span.end()
1255
1256                # Continue with parent span
1257                result = process_validated_data(validation_result)
1258                parent_span.update(output=result)
1259            finally:
1260                parent_span.end()
1261            ```
1262        """
1263        return self.start_observation(
1264            name=name,
1265            as_type="span",
1266            input=input,
1267            output=output,
1268            metadata=metadata,
1269            version=version,
1270            level=level,
1271            status_message=status_message,
1272        )
1273
1274    def start_as_current_span(
1275        self,
1276        *,
1277        name: str,
1278        input: Optional[Any] = None,
1279        output: Optional[Any] = None,
1280        metadata: Optional[Any] = None,
1281        version: Optional[str] = None,
1282        level: Optional[SpanLevel] = None,
1283        status_message: Optional[str] = None,
1284    ) -> _AgnosticContextManager["LangfuseSpan"]:
1285        """[DEPRECATED] Create a new child span and set it as the current span in a context manager.
1286
1287        DEPRECATED: This method is deprecated and will be removed in a future version.
1288        Use start_as_current_observation(as_type='span') instead.
1289
1290        This method creates a new child span and sets it as the current span within
1291        a context manager. It should be used with a 'with' statement to automatically
1292        manage the span's lifecycle.
1293
1294        Args:
1295            name: Name of the span (e.g., function or operation name)
1296            input: Input data for the operation
1297            output: Output data from the operation
1298            metadata: Additional metadata to associate with the span
1299            version: Version identifier for the code or component
1300            level: Importance level of the span (info, warning, error)
1301            status_message: Optional status message for the span
1302
1303        Returns:
1304            A context manager that yields a new LangfuseSpan
1305
1306        Example:
1307            ```python
1308            with langfuse.start_as_current_span(name="process-request") as parent_span:
1309                # Parent span is active here
1310
1311                # Create a child span with context management
1312                with parent_span.start_as_current_span(name="validate-input") as child_span:
1313                    # Child span is active here
1314                    validation_result = validate(request_data)
1315                    child_span.update(output=validation_result)
1316
1317                # Back to parent span context
1318                result = process_validated_data(validation_result)
1319                parent_span.update(output=result)
1320            ```
1321        """
1322        warnings.warn(
1323            "start_as_current_span is deprecated and will be removed in a future version. "
1324            "Use start_as_current_observation(as_type='span') instead.",
1325            DeprecationWarning,
1326            stacklevel=2,
1327        )
1328        return self.start_as_current_observation(
1329            name=name,
1330            as_type="span",
1331            input=input,
1332            output=output,
1333            metadata=metadata,
1334            version=version,
1335            level=level,
1336            status_message=status_message,
1337        )
1338
1339    def start_generation(
1340        self,
1341        *,
1342        name: str,
1343        input: Optional[Any] = None,
1344        output: Optional[Any] = None,
1345        metadata: Optional[Any] = None,
1346        version: Optional[str] = None,
1347        level: Optional[SpanLevel] = None,
1348        status_message: Optional[str] = None,
1349        completion_start_time: Optional[datetime] = None,
1350        model: Optional[str] = None,
1351        model_parameters: Optional[Dict[str, MapValue]] = None,
1352        usage_details: Optional[Dict[str, int]] = None,
1353        cost_details: Optional[Dict[str, float]] = None,
1354        prompt: Optional[PromptClient] = None,
1355    ) -> "LangfuseGeneration":
1356        """[DEPRECATED] Create a new child generation span.
1357
1358        DEPRECATED: This method is deprecated and will be removed in a future version.
1359        Use start_observation(as_type='generation') instead.
1360
1361        This method creates a new child generation span with this span as the parent.
1362        Generation spans are specialized for AI/LLM operations and include additional
1363        fields for model information, usage stats, and costs.
1364
1365        Unlike start_as_current_generation(), this method does not set the new span
1366        as the current span in the context.
1367
1368        Args:
1369            name: Name of the generation operation
1370            input: Input data for the model (e.g., prompts)
1371            output: Output from the model (e.g., completions)
1372            metadata: Additional metadata to associate with the generation
1373            version: Version identifier for the model or component
1374            level: Importance level of the generation (info, warning, error)
1375            status_message: Optional status message for the generation
1376            completion_start_time: When the model started generating the response
1377            model: Name/identifier of the AI model used (e.g., "gpt-4")
1378            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1379            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1380            cost_details: Cost information for the model call
1381            prompt: Associated prompt template from Langfuse prompt management
1382
1383        Returns:
1384            A new LangfuseGeneration that must be ended with .end() when complete
1385
1386        Example:
1387            ```python
1388            span = langfuse.start_span(name="process-query")
1389            try:
1390                # Create a generation child span
1391                generation = span.start_generation(
1392                    name="generate-answer",
1393                    model="gpt-4",
1394                    input={"prompt": "Explain quantum computing"}
1395                )
1396                try:
1397                    # Call model API
1398                    response = llm.generate(...)
1399
1400                    generation.update(
1401                        output=response.text,
1402                        usage_details={
1403                            "prompt_tokens": response.usage.prompt_tokens,
1404                            "completion_tokens": response.usage.completion_tokens
1405                        }
1406                    )
1407                finally:
1408                    generation.end()
1409
1410                # Continue with parent span
1411                span.update(output={"answer": response.text, "source": "gpt-4"})
1412            finally:
1413                span.end()
1414            ```
1415        """
1416        warnings.warn(
1417            "start_generation is deprecated and will be removed in a future version. "
1418            "Use start_observation(as_type='generation') instead.",
1419            DeprecationWarning,
1420            stacklevel=2,
1421        )
1422        return self.start_observation(
1423            name=name,
1424            as_type="generation",
1425            input=input,
1426            output=output,
1427            metadata=metadata,
1428            version=version,
1429            level=level,
1430            status_message=status_message,
1431            completion_start_time=completion_start_time,
1432            model=model,
1433            model_parameters=model_parameters,
1434            usage_details=usage_details,
1435            cost_details=cost_details,
1436            prompt=prompt,
1437        )
1438
1439    def start_as_current_generation(
1440        self,
1441        *,
1442        name: str,
1443        input: Optional[Any] = None,
1444        output: Optional[Any] = None,
1445        metadata: Optional[Any] = None,
1446        version: Optional[str] = None,
1447        level: Optional[SpanLevel] = None,
1448        status_message: Optional[str] = None,
1449        completion_start_time: Optional[datetime] = None,
1450        model: Optional[str] = None,
1451        model_parameters: Optional[Dict[str, MapValue]] = None,
1452        usage_details: Optional[Dict[str, int]] = None,
1453        cost_details: Optional[Dict[str, float]] = None,
1454        prompt: Optional[PromptClient] = None,
1455    ) -> _AgnosticContextManager["LangfuseGeneration"]:
1456        """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
1457
1458        DEPRECATED: This method is deprecated and will be removed in a future version.
1459        Use start_as_current_observation(as_type='generation') instead.
1460
1461        This method creates a new child generation span and sets it as the current span
1462        within a context manager. Generation spans are specialized for AI/LLM operations
1463        and include additional fields for model information, usage stats, and costs.
1464
1465        Args:
1466            name: Name of the generation operation
1467            input: Input data for the model (e.g., prompts)
1468            output: Output from the model (e.g., completions)
1469            metadata: Additional metadata to associate with the generation
1470            version: Version identifier for the model or component
1471            level: Importance level of the generation (info, warning, error)
1472            status_message: Optional status message for the generation
1473            completion_start_time: When the model started generating the response
1474            model: Name/identifier of the AI model used (e.g., "gpt-4")
1475            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1476            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1477            cost_details: Cost information for the model call
1478            prompt: Associated prompt template from Langfuse prompt management
1479
1480        Returns:
1481            A context manager that yields a new LangfuseGeneration
1482
1483        Example:
1484            ```python
1485            with langfuse.start_as_current_span(name="process-request") as span:
1486                # Prepare data
1487                query = preprocess_user_query(user_input)
1488
1489                # Create a generation span with context management
1490                with span.start_as_current_generation(
1491                    name="generate-answer",
1492                    model="gpt-4",
1493                    input={"query": query}
1494                ) as generation:
1495                    # Generation span is active here
1496                    response = llm.generate(query)
1497
1498                    # Update with results
1499                    generation.update(
1500                        output=response.text,
1501                        usage_details={
1502                            "prompt_tokens": response.usage.prompt_tokens,
1503                            "completion_tokens": response.usage.completion_tokens
1504                        }
1505                    )
1506
1507                # Back to parent span context
1508                span.update(output={"answer": response.text, "source": "gpt-4"})
1509            ```
1510        """
1511        warnings.warn(
1512            "start_as_current_generation is deprecated and will be removed in a future version. "
1513            "Use start_as_current_observation(as_type='generation') instead.",
1514            DeprecationWarning,
1515            stacklevel=2,
1516        )
1517        return self.start_as_current_observation(
1518            name=name,
1519            as_type="generation",
1520            input=input,
1521            output=output,
1522            metadata=metadata,
1523            version=version,
1524            level=level,
1525            status_message=status_message,
1526            completion_start_time=completion_start_time,
1527            model=model,
1528            model_parameters=model_parameters,
1529            usage_details=usage_details,
1530            cost_details=cost_details,
1531            prompt=prompt,
1532        )
1533
1534    def create_event(
1535        self,
1536        *,
1537        name: str,
1538        input: Optional[Any] = None,
1539        output: Optional[Any] = None,
1540        metadata: Optional[Any] = None,
1541        version: Optional[str] = None,
1542        level: Optional[SpanLevel] = None,
1543        status_message: Optional[str] = None,
1544    ) -> "LangfuseEvent":
1545        """Create a new Langfuse observation of type 'EVENT'.
1546
1547        Args:
1548            name: Name of the span (e.g., function or operation name)
1549            input: Input data for the operation (can be any JSON-serializable object)
1550            output: Output data from the operation (can be any JSON-serializable object)
1551            metadata: Additional metadata to associate with the span
1552            version: Version identifier for the code or component
1553            level: Importance level of the span (info, warning, error)
1554            status_message: Optional status message for the span
1555
1556        Returns:
1557            The LangfuseEvent object
1558
1559        Example:
1560            ```python
1561            event = langfuse.create_event(name="process-event")
1562            ```
1563        """
1564        timestamp = time_ns()
1565
1566        with otel_trace_api.use_span(self._otel_span):
1567            new_otel_span = self._langfuse_client._otel_tracer.start_span(
1568                name=name, start_time=timestamp
1569            )
1570
1571        return cast(
1572            "LangfuseEvent",
1573            LangfuseEvent(
1574                otel_span=new_otel_span,
1575                langfuse_client=self._langfuse_client,
1576                input=input,
1577                output=output,
1578                metadata=metadata,
1579                environment=self._environment,
1580                version=version,
1581                level=level,
1582                status_message=status_message,
1583            ).end(end_time=timestamp),
1584        )

Standard span implementation for general operations in Langfuse.

This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
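
A brief sketch of nesting observations under a span; where a child has a clear role, a specific type via start_observation is preferable to a generic child span (names and outputs are illustrative):

```python
from langfuse import get_client

langfuse = get_client()

with langfuse.start_as_current_span(name="handle-query") as span:
    # Prefer a specific observation type (e.g. "retriever") over a generic child span where it fits.
    retrieval = span.start_observation(name="vector-search", as_type="retriever")
    try:
        retrieval.update(output={"documents": 3})
    finally:
        retrieval.end()

    span.update(output={"status": "ok"})
```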

LangfuseSpan( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1176    def __init__(
1177        self,
1178        *,
1179        otel_span: otel_trace_api.Span,
1180        langfuse_client: "Langfuse",
1181        input: Optional[Any] = None,
1182        output: Optional[Any] = None,
1183        metadata: Optional[Any] = None,
1184        environment: Optional[str] = None,
1185        version: Optional[str] = None,
1186        level: Optional[SpanLevel] = None,
1187        status_message: Optional[str] = None,
1188    ):
1189        """Initialize a new LangfuseSpan.
1190
1191        Args:
1192            otel_span: The OpenTelemetry span to wrap
1193            langfuse_client: Reference to the parent Langfuse client
1194            input: Input data for the span (any JSON-serializable object)
1195            output: Output data from the span (any JSON-serializable object)
1196            metadata: Additional metadata to associate with the span
1197            environment: The tracing environment
1198            version: Version identifier for the code or component
1199            level: Importance level of the span (info, warning, error)
1200            status_message: Optional status message for the span
1201        """
1202        super().__init__(
1203            otel_span=otel_span,
1204            as_type="span",
1205            langfuse_client=langfuse_client,
1206            input=input,
1207            output=output,
1208            metadata=metadata,
1209            environment=environment,
1210            version=version,
1211            level=level,
1212            status_message=status_message,
1213        )

Initialize a new LangfuseSpan.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the span (any JSON-serializable object)
  • output: Output data from the span (any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
def start_span( self, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseSpan:
1215    def start_span(
1216        self,
1217        name: str,
1218        input: Optional[Any] = None,
1219        output: Optional[Any] = None,
1220        metadata: Optional[Any] = None,
1221        version: Optional[str] = None,
1222        level: Optional[SpanLevel] = None,
1223        status_message: Optional[str] = None,
1224    ) -> "LangfuseSpan":
1225        """Create a new child span.
1226
1227        This method creates a new child span with this span as the parent.
1228        Unlike start_as_current_span(), this method does not set the new span
1229        as the current span in the context.
1230
1231        Args:
1232            name: Name of the span (e.g., function or operation name)
1233            input: Input data for the operation
1234            output: Output data from the operation
1235            metadata: Additional metadata to associate with the span
1236            version: Version identifier for the code or component
1237            level: Importance level of the span (info, warning, error)
1238            status_message: Optional status message for the span
1239
1240        Returns:
1241            A new LangfuseSpan that must be ended with .end() when complete
1242
1243        Example:
1244            ```python
1245            parent_span = langfuse.start_span(name="process-request")
1246            try:
1247                # Create a child span
1248                child_span = parent_span.start_span(name="validate-input")
1249                try:
1250                    # Do validation work
1251                    validation_result = validate(request_data)
1252                    child_span.update(output=validation_result)
1253                finally:
1254                    child_span.end()
1255
1256                # Continue with parent span
1257                result = process_validated_data(validation_result)
1258                parent_span.update(output=result)
1259            finally:
1260                parent_span.end()
1261            ```
1262        """
1263        return self.start_observation(
1264            name=name,
1265            as_type="span",
1266            input=input,
1267            output=output,
1268            metadata=metadata,
1269            version=version,
1270            level=level,
1271            status_message=status_message,
1272        )

Create a new child span.

This method creates a new child span with this span as the parent. Unlike start_as_current_span(), this method does not set the new span as the current span in the context.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A new LangfuseSpan that must be ended with .end() when complete

Example:
parent_span = langfuse.start_span(name="process-request")
try:
    # Create a child span
    child_span = parent_span.start_span(name="validate-input")
    try:
        # Do validation work
        validation_result = validate(request_data)
        child_span.update(output=validation_result)
    finally:
        child_span.end()

    # Continue with parent span
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
finally:
    parent_span.end()
def start_as_current_span( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseSpan]:
1274    def start_as_current_span(
1275        self,
1276        *,
1277        name: str,
1278        input: Optional[Any] = None,
1279        output: Optional[Any] = None,
1280        metadata: Optional[Any] = None,
1281        version: Optional[str] = None,
1282        level: Optional[SpanLevel] = None,
1283        status_message: Optional[str] = None,
1284    ) -> _AgnosticContextManager["LangfuseSpan"]:
1285        """[DEPRECATED] Create a new child span and set it as the current span in a context manager.
1286
1287        DEPRECATED: This method is deprecated and will be removed in a future version.
1288        Use start_as_current_observation(as_type='span') instead.
1289
1290        This method creates a new child span and sets it as the current span within
1291        a context manager. It should be used with a 'with' statement to automatically
1292        manage the span's lifecycle.
1293
1294        Args:
1295            name: Name of the span (e.g., function or operation name)
1296            input: Input data for the operation
1297            output: Output data from the operation
1298            metadata: Additional metadata to associate with the span
1299            version: Version identifier for the code or component
1300            level: Importance level of the span (info, warning, error)
1301            status_message: Optional status message for the span
1302
1303        Returns:
1304            A context manager that yields a new LangfuseSpan
1305
1306        Example:
1307            ```python
1308            with langfuse.start_as_current_span(name="process-request") as parent_span:
1309                # Parent span is active here
1310
1311                # Create a child span with context management
1312                with parent_span.start_as_current_span(name="validate-input") as child_span:
1313                    # Child span is active here
1314                    validation_result = validate(request_data)
1315                    child_span.update(output=validation_result)
1316
1317                # Back to parent span context
1318                result = process_validated_data(validation_result)
1319                parent_span.update(output=result)
1320            ```
1321        """
1322        warnings.warn(
1323            "start_as_current_span is deprecated and will be removed in a future version. "
1324            "Use start_as_current_observation(as_type='span') instead.",
1325            DeprecationWarning,
1326            stacklevel=2,
1327        )
1328        return self.start_as_current_observation(
1329            name=name,
1330            as_type="span",
1331            input=input,
1332            output=output,
1333            metadata=metadata,
1334            version=version,
1335            level=level,
1336            status_message=status_message,
1337        )

[DEPRECATED] Create a new child span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='span') instead.

This method creates a new child span and sets it as the current span within a context manager. It should be used with a 'with' statement to automatically manage the span's lifecycle.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation
  • output: Output data from the operation
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

A context manager that yields a new LangfuseSpan

Example:
with langfuse.start_as_current_span(name="process-request") as parent_span:
    # Parent span is active here

    # Create a child span with context management
    with parent_span.start_as_current_span(name="validate-input") as child_span:
        # Child span is active here
        validation_result = validate(request_data)
        child_span.update(output=validation_result)

    # Back to parent span context
    result = process_validated_data(validation_result)
    parent_span.update(output=result)
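Since the method above is deprecated, the same nesting can be written with the replacement named in the deprecation notice. A hedged sketch, assuming start_as_current_observation accepts the same keyword arguments plus as_type (validate, request_data, and process_validated_data are placeholders carried over from the example above):

```python
with langfuse.start_as_current_span(name="process-request") as parent_span:
    # Non-deprecated equivalent of parent_span.start_as_current_span(...)
    with parent_span.start_as_current_observation(
        name="validate-input", as_type="span"
    ) as child_span:
        validation_result = validate(request_data)  # placeholder validation step
        child_span.update(output=validation_result)

    result = process_validated_data(validation_result)  # placeholder follow-up
    parent_span.update(output=result)
```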
def start_generation( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> LangfuseGeneration:
1339    def start_generation(
1340        self,
1341        *,
1342        name: str,
1343        input: Optional[Any] = None,
1344        output: Optional[Any] = None,
1345        metadata: Optional[Any] = None,
1346        version: Optional[str] = None,
1347        level: Optional[SpanLevel] = None,
1348        status_message: Optional[str] = None,
1349        completion_start_time: Optional[datetime] = None,
1350        model: Optional[str] = None,
1351        model_parameters: Optional[Dict[str, MapValue]] = None,
1352        usage_details: Optional[Dict[str, int]] = None,
1353        cost_details: Optional[Dict[str, float]] = None,
1354        prompt: Optional[PromptClient] = None,
1355    ) -> "LangfuseGeneration":
1356        """[DEPRECATED] Create a new child generation span.
1357
1358        DEPRECATED: This method is deprecated and will be removed in a future version.
1359        Use start_observation(as_type='generation') instead.
1360
1361        This method creates a new child generation span with this span as the parent.
1362        Generation spans are specialized for AI/LLM operations and include additional
1363        fields for model information, usage stats, and costs.
1364
1365        Unlike start_as_current_generation(), this method does not set the new span
1366        as the current span in the context.
1367
1368        Args:
1369            name: Name of the generation operation
1370            input: Input data for the model (e.g., prompts)
1371            output: Output from the model (e.g., completions)
1372            metadata: Additional metadata to associate with the generation
1373            version: Version identifier for the model or component
1374            level: Importance level of the generation (info, warning, error)
1375            status_message: Optional status message for the generation
1376            completion_start_time: When the model started generating the response
1377            model: Name/identifier of the AI model used (e.g., "gpt-4")
1378            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1379            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1380            cost_details: Cost information for the model call
1381            prompt: Associated prompt template from Langfuse prompt management
1382
1383        Returns:
1384            A new LangfuseGeneration that must be ended with .end() when complete
1385
1386        Example:
1387            ```python
1388            span = langfuse.start_span(name="process-query")
1389            try:
1390                # Create a generation child span
1391                generation = span.start_generation(
1392                    name="generate-answer",
1393                    model="gpt-4",
1394                    input={"prompt": "Explain quantum computing"}
1395                )
1396                try:
1397                    # Call model API
1398                    response = llm.generate(...)
1399
1400                    generation.update(
1401                        output=response.text,
1402                        usage_details={
1403                            "prompt_tokens": response.usage.prompt_tokens,
1404                            "completion_tokens": response.usage.completion_tokens
1405                        }
1406                    )
1407                finally:
1408                    generation.end()
1409
1410                # Continue with parent span
1411                span.update(output={"answer": response.text, "source": "gpt-4"})
1412            finally:
1413                span.end()
1414            ```
1415        """
1416        warnings.warn(
1417            "start_generation is deprecated and will be removed in a future version. "
1418            "Use start_observation(as_type='generation') instead.",
1419            DeprecationWarning,
1420            stacklevel=2,
1421        )
1422        return self.start_observation(
1423            name=name,
1424            as_type="generation",
1425            input=input,
1426            output=output,
1427            metadata=metadata,
1428            version=version,
1429            level=level,
1430            status_message=status_message,
1431            completion_start_time=completion_start_time,
1432            model=model,
1433            model_parameters=model_parameters,
1434            usage_details=usage_details,
1435            cost_details=cost_details,
1436            prompt=prompt,
1437        )

[DEPRECATED] Create a new child generation span.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_observation(as_type='generation') instead.

This method creates a new child generation span with this span as the parent. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.

Unlike start_as_current_generation(), this method does not set the new span as the current span in the context.

Arguments:
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A new LangfuseGeneration that must be ended with .end() when complete

Example:
span = langfuse.start_span(name="process-query")
try:
    # Create a generation child span
    generation = span.start_generation(
        name="generate-answer",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"}
    )
    try:
        # Call model API
        response = llm.generate(...)

        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )
    finally:
        generation.end()

    # Continue with parent span
    span.update(output={"answer": response.text, "source": "gpt-4"})
finally:
    span.end()
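Because start_generation is deprecated, the equivalent child generation can be created with start_observation(as_type="generation") as suggested by the warning. A hedged sketch, where llm is a placeholder model client:

```python
span = langfuse.start_span(name="process-query")
try:
    # Non-deprecated equivalent of span.start_generation(...)
    generation = span.start_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"prompt": "Explain quantum computing"},
    )
    try:
        response = llm.generate("Explain quantum computing")  # placeholder LLM call
        generation.update(output=response.text)
    finally:
        generation.end()
finally:
    span.end()
```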
def start_as_current_generation( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None) -> opentelemetry.util._decorator._AgnosticContextManager[LangfuseGeneration]:
1439    def start_as_current_generation(
1440        self,
1441        *,
1442        name: str,
1443        input: Optional[Any] = None,
1444        output: Optional[Any] = None,
1445        metadata: Optional[Any] = None,
1446        version: Optional[str] = None,
1447        level: Optional[SpanLevel] = None,
1448        status_message: Optional[str] = None,
1449        completion_start_time: Optional[datetime] = None,
1450        model: Optional[str] = None,
1451        model_parameters: Optional[Dict[str, MapValue]] = None,
1452        usage_details: Optional[Dict[str, int]] = None,
1453        cost_details: Optional[Dict[str, float]] = None,
1454        prompt: Optional[PromptClient] = None,
1455    ) -> _AgnosticContextManager["LangfuseGeneration"]:
1456        """[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.
1457
1458        DEPRECATED: This method is deprecated and will be removed in a future version.
1459        Use start_as_current_observation(as_type='generation') instead.
1460
1461        This method creates a new child generation span and sets it as the current span
1462        within a context manager. Generation spans are specialized for AI/LLM operations
1463        and include additional fields for model information, usage stats, and costs.
1464
1465        Args:
1466            name: Name of the generation operation
1467            input: Input data for the model (e.g., prompts)
1468            output: Output from the model (e.g., completions)
1469            metadata: Additional metadata to associate with the generation
1470            version: Version identifier for the model or component
1471            level: Importance level of the generation (info, warning, error)
1472            status_message: Optional status message for the generation
1473            completion_start_time: When the model started generating the response
1474            model: Name/identifier of the AI model used (e.g., "gpt-4")
1475            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1476            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1477            cost_details: Cost information for the model call
1478            prompt: Associated prompt template from Langfuse prompt management
1479
1480        Returns:
1481            A context manager that yields a new LangfuseGeneration
1482
1483        Example:
1484            ```python
1485            with langfuse.start_as_current_span(name="process-request") as span:
1486                # Prepare data
1487                query = preprocess_user_query(user_input)
1488
1489                # Create a generation span with context management
1490                with span.start_as_current_generation(
1491                    name="generate-answer",
1492                    model="gpt-4",
1493                    input={"query": query}
1494                ) as generation:
1495                    # Generation span is active here
1496                    response = llm.generate(query)
1497
1498                    # Update with results
1499                    generation.update(
1500                        output=response.text,
1501                        usage_details={
1502                            "prompt_tokens": response.usage.prompt_tokens,
1503                            "completion_tokens": response.usage.completion_tokens
1504                        }
1505                    )
1506
1507                # Back to parent span context
1508                span.update(output={"answer": response.text, "source": "gpt-4"})
1509            ```
1510        """
1511        warnings.warn(
1512            "start_as_current_generation is deprecated and will be removed in a future version. "
1513            "Use start_as_current_observation(as_type='generation') instead.",
1514            DeprecationWarning,
1515            stacklevel=2,
1516        )
1517        return self.start_as_current_observation(
1518            name=name,
1519            as_type="generation",
1520            input=input,
1521            output=output,
1522            metadata=metadata,
1523            version=version,
1524            level=level,
1525            status_message=status_message,
1526            completion_start_time=completion_start_time,
1527            model=model,
1528            model_parameters=model_parameters,
1529            usage_details=usage_details,
1530            cost_details=cost_details,
1531            prompt=prompt,
1532        )

[DEPRECATED] Create a new child generation span and set it as the current span in a context manager.

DEPRECATED: This method is deprecated and will be removed in a future version. Use start_as_current_observation(as_type='generation') instead.

This method creates a new child generation span and sets it as the current span within a context manager. Generation spans are specialized for AI/LLM operations and include additional fields for model information, usage stats, and costs.

Arguments:
  • name: Name of the generation operation
  • input: Input data for the model (e.g., prompts)
  • output: Output from the model (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
Returns:

A context manager that yields a new LangfuseGeneration

Example:
with langfuse.start_as_current_span(name="process-request") as span:
    # Prepare data
    query = preprocess_user_query(user_input)

    # Create a generation span with context management
    with span.start_as_current_generation(
        name="generate-answer",
        model="gpt-4",
        input={"query": query}
    ) as generation:
        # Generation span is active here
        response = llm.generate(query)

        # Update with results
        generation.update(
            output=response.text,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )

    # Back to parent span context
    span.update(output={"answer": response.text, "source": "gpt-4"})
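As with the other deprecated helpers, the context-managed generation can instead be expressed via start_as_current_observation(as_type="generation"). A hedged sketch reusing the placeholders from the example above:

```python
with langfuse.start_as_current_span(name="process-request") as span:
    query = preprocess_user_query(user_input)  # placeholder preprocessing

    # Non-deprecated equivalent of span.start_as_current_generation(...)
    with span.start_as_current_observation(
        name="generate-answer",
        as_type="generation",
        model="gpt-4",
        input={"query": query},
    ) as generation:
        response = llm.generate(query)  # placeholder LLM call
        generation.update(output=response.text)

    span.update(output={"answer": response.text, "source": "gpt-4"})
```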
def create_event( self, *, name: str, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None) -> LangfuseEvent:
1534    def create_event(
1535        self,
1536        *,
1537        name: str,
1538        input: Optional[Any] = None,
1539        output: Optional[Any] = None,
1540        metadata: Optional[Any] = None,
1541        version: Optional[str] = None,
1542        level: Optional[SpanLevel] = None,
1543        status_message: Optional[str] = None,
1544    ) -> "LangfuseEvent":
1545        """Create a new Langfuse observation of type 'EVENT'.
1546
1547        Args:
1548            name: Name of the span (e.g., function or operation name)
1549            input: Input data for the operation (can be any JSON-serializable object)
1550            output: Output data from the operation (can be any JSON-serializable object)
1551            metadata: Additional metadata to associate with the span
1552            version: Version identifier for the code or component
1553            level: Importance level of the span (info, warning, error)
1554            status_message: Optional status message for the span
1555
1556        Returns:
1557            The LangfuseEvent object
1558
1559        Example:
1560            ```python
1561            event = langfuse.create_event(name="process-event")
1562            ```
1563        """
1564        timestamp = time_ns()
1565
1566        with otel_trace_api.use_span(self._otel_span):
1567            new_otel_span = self._langfuse_client._otel_tracer.start_span(
1568                name=name, start_time=timestamp
1569            )
1570
1571        return cast(
1572            "LangfuseEvent",
1573            LangfuseEvent(
1574                otel_span=new_otel_span,
1575                langfuse_client=self._langfuse_client,
1576                input=input,
1577                output=output,
1578                metadata=metadata,
1579                environment=self._environment,
1580                version=version,
1581                level=level,
1582                status_message=status_message,
1583            ).end(end_time=timestamp),
1584        )

Create a new Langfuse observation of type 'EVENT'.

Arguments:
  • name: Name of the span (e.g., function or operation name)
  • input: Input data for the operation (can be any JSON-serializable object)
  • output: Output data from the operation (can be any JSON-serializable object)
  • metadata: Additional metadata to associate with the span
  • version: Version identifier for the code or component
  • level: Importance level of the span (info, warning, error)
  • status_message: Optional status message for the span
Returns:

The LangfuseEvent object

Example:
event = langfuse.create_event(name="process-event")
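Since this create_event is a method on an existing span, a child event attached to that span is the more typical usage than the client-level call shown above. A minimal sketch (the cache-miss scenario is illustrative only):

```python
span = langfuse.start_span(name="process-request")
try:
    # Point-in-time child event recorded under the active span
    span.create_event(
        name="cache-miss",
        metadata={"key": "user:42"},
        level="WARNING",
    )
finally:
    span.end()
```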
class LangfuseGeneration(langfuse._client.span.LangfuseObservationWrapper):
1587class LangfuseGeneration(LangfuseObservationWrapper):
1588    """Specialized span implementation for AI model generations in Langfuse.
1589
1590    This class represents a generation span specifically designed for tracking
1591    AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized
1592    attributes for model details, token usage, and costs.
1593    """
1594
1595    def __init__(
1596        self,
1597        *,
1598        otel_span: otel_trace_api.Span,
1599        langfuse_client: "Langfuse",
1600        input: Optional[Any] = None,
1601        output: Optional[Any] = None,
1602        metadata: Optional[Any] = None,
1603        environment: Optional[str] = None,
1604        version: Optional[str] = None,
1605        level: Optional[SpanLevel] = None,
1606        status_message: Optional[str] = None,
1607        completion_start_time: Optional[datetime] = None,
1608        model: Optional[str] = None,
1609        model_parameters: Optional[Dict[str, MapValue]] = None,
1610        usage_details: Optional[Dict[str, int]] = None,
1611        cost_details: Optional[Dict[str, float]] = None,
1612        prompt: Optional[PromptClient] = None,
1613    ):
1614        """Initialize a new LangfuseGeneration span.
1615
1616        Args:
1617            otel_span: The OpenTelemetry span to wrap
1618            langfuse_client: Reference to the parent Langfuse client
1619            input: Input data for the generation (e.g., prompts)
1620            output: Output from the generation (e.g., completions)
1621            metadata: Additional metadata to associate with the generation
1622            environment: The tracing environment
1623            version: Version identifier for the model or component
1624            level: Importance level of the generation (info, warning, error)
1625            status_message: Optional status message for the generation
1626            completion_start_time: When the model started generating the response
1627            model: Name/identifier of the AI model used (e.g., "gpt-4")
1628            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1629            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1630            cost_details: Cost information for the model call
1631            prompt: Associated prompt template from Langfuse prompt management
1632        """
1633        super().__init__(
1634            as_type="generation",
1635            otel_span=otel_span,
1636            langfuse_client=langfuse_client,
1637            input=input,
1638            output=output,
1639            metadata=metadata,
1640            environment=environment,
1641            version=version,
1642            level=level,
1643            status_message=status_message,
1644            completion_start_time=completion_start_time,
1645            model=model,
1646            model_parameters=model_parameters,
1647            usage_details=usage_details,
1648            cost_details=cost_details,
1649            prompt=prompt,
1650        )

Specialized span implementation for AI model generations in Langfuse.

This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.

LangfuseGeneration( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None)
1595    def __init__(
1596        self,
1597        *,
1598        otel_span: otel_trace_api.Span,
1599        langfuse_client: "Langfuse",
1600        input: Optional[Any] = None,
1601        output: Optional[Any] = None,
1602        metadata: Optional[Any] = None,
1603        environment: Optional[str] = None,
1604        version: Optional[str] = None,
1605        level: Optional[SpanLevel] = None,
1606        status_message: Optional[str] = None,
1607        completion_start_time: Optional[datetime] = None,
1608        model: Optional[str] = None,
1609        model_parameters: Optional[Dict[str, MapValue]] = None,
1610        usage_details: Optional[Dict[str, int]] = None,
1611        cost_details: Optional[Dict[str, float]] = None,
1612        prompt: Optional[PromptClient] = None,
1613    ):
1614        """Initialize a new LangfuseGeneration span.
1615
1616        Args:
1617            otel_span: The OpenTelemetry span to wrap
1618            langfuse_client: Reference to the parent Langfuse client
1619            input: Input data for the generation (e.g., prompts)
1620            output: Output from the generation (e.g., completions)
1621            metadata: Additional metadata to associate with the generation
1622            environment: The tracing environment
1623            version: Version identifier for the model or component
1624            level: Importance level of the generation (info, warning, error)
1625            status_message: Optional status message for the generation
1626            completion_start_time: When the model started generating the response
1627            model: Name/identifier of the AI model used (e.g., "gpt-4")
1628            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
1629            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
1630            cost_details: Cost information for the model call
1631            prompt: Associated prompt template from Langfuse prompt management
1632        """
1633        super().__init__(
1634            as_type="generation",
1635            otel_span=otel_span,
1636            langfuse_client=langfuse_client,
1637            input=input,
1638            output=output,
1639            metadata=metadata,
1640            environment=environment,
1641            version=version,
1642            level=level,
1643            status_message=status_message,
1644            completion_start_time=completion_start_time,
1645            model=model,
1646            model_parameters=model_parameters,
1647            usage_details=usage_details,
1648            cost_details=cost_details,
1649            prompt=prompt,
1650        )

Initialize a new LangfuseGeneration span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the generation (e.g., prompts)
  • output: Output from the generation (e.g., completions)
  • metadata: Additional metadata to associate with the generation
  • environment: The tracing environment
  • version: Version identifier for the model or component
  • level: Importance level of the generation (info, warning, error)
  • status_message: Optional status message for the generation
  • completion_start_time: When the model started generating the response
  • model: Name/identifier of the AI model used (e.g., "gpt-4")
  • model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
  • usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
  • cost_details: Cost information for the model call
  • prompt: Associated prompt template from Langfuse prompt management
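In practice a LangfuseGeneration is obtained by asking the client or a parent span for an observation with as_type="generation" rather than by calling this constructor. A hedged sketch, assuming the client exposes start_as_current_observation with the generation-specific keyword arguments listed above (llm is a placeholder model client):

```python
with langfuse.start_as_current_observation(
    name="summarize",
    as_type="generation",
    model="gpt-4",
) as generation:
    completion = llm.generate("Summarize this document.")  # placeholder LLM call
    generation.update(
        output=completion.text,
        usage_details={"prompt_tokens": 12, "completion_tokens": 48},
    )
```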
class LangfuseEvent(langfuse._client.span.LangfuseObservationWrapper):
1653class LangfuseEvent(LangfuseObservationWrapper):
1654    """Specialized span implementation for Langfuse Events."""
1655
1656    def __init__(
1657        self,
1658        *,
1659        otel_span: otel_trace_api.Span,
1660        langfuse_client: "Langfuse",
1661        input: Optional[Any] = None,
1662        output: Optional[Any] = None,
1663        metadata: Optional[Any] = None,
1664        environment: Optional[str] = None,
1665        version: Optional[str] = None,
1666        level: Optional[SpanLevel] = None,
1667        status_message: Optional[str] = None,
1668    ):
1669        """Initialize a new LangfuseEvent span.
1670
1671        Args:
1672            otel_span: The OpenTelemetry span to wrap
1673            langfuse_client: Reference to the parent Langfuse client
1674            input: Input data for the event
1675            output: Output from the event
1676            metadata: Additional metadata to associate with the event
1677            environment: The tracing environment
1678            version: Version identifier for the code or component
1679            level: Importance level of the event (info, warning, error)
1680            status_message: Optional status message for the event
1681        """
1682        super().__init__(
1683            otel_span=otel_span,
1684            as_type="event",
1685            langfuse_client=langfuse_client,
1686            input=input,
1687            output=output,
1688            metadata=metadata,
1689            environment=environment,
1690            version=version,
1691            level=level,
1692            status_message=status_message,
1693        )
1694
1695    def update(
1696        self,
1697        *,
1698        name: Optional[str] = None,
1699        input: Optional[Any] = None,
1700        output: Optional[Any] = None,
1701        metadata: Optional[Any] = None,
1702        version: Optional[str] = None,
1703        level: Optional[SpanLevel] = None,
1704        status_message: Optional[str] = None,
1705        completion_start_time: Optional[datetime] = None,
1706        model: Optional[str] = None,
1707        model_parameters: Optional[Dict[str, MapValue]] = None,
1708        usage_details: Optional[Dict[str, int]] = None,
1709        cost_details: Optional[Dict[str, float]] = None,
1710        prompt: Optional[PromptClient] = None,
1711        **kwargs: Any,
1712    ) -> "LangfuseEvent":
1713        """Update is not allowed for LangfuseEvent because events cannot be updated.
1714
1715        This method logs a warning and returns self without making changes.
1716
1717        Returns:
1718            self: Returns the unchanged LangfuseEvent instance
1719        """
1720        langfuse_logger.warning(
1721            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1722        )
1723        return self

Specialized span implementation for Langfuse Events.

LangfuseEvent( *, otel_span: opentelemetry.trace.span.Span, langfuse_client: Langfuse, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, environment: Optional[str] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None)
1656    def __init__(
1657        self,
1658        *,
1659        otel_span: otel_trace_api.Span,
1660        langfuse_client: "Langfuse",
1661        input: Optional[Any] = None,
1662        output: Optional[Any] = None,
1663        metadata: Optional[Any] = None,
1664        environment: Optional[str] = None,
1665        version: Optional[str] = None,
1666        level: Optional[SpanLevel] = None,
1667        status_message: Optional[str] = None,
1668    ):
1669        """Initialize a new LangfuseEvent span.
1670
1671        Args:
1672            otel_span: The OpenTelemetry span to wrap
1673            langfuse_client: Reference to the parent Langfuse client
1674            input: Input data for the event
1675            output: Output from the event
1676            metadata: Additional metadata to associate with the event
1677            environment: The tracing environment
1678            version: Version identifier for the code or component
1679            level: Importance level of the event (info, warning, error)
1680            status_message: Optional status message for the event
1681        """
1682        super().__init__(
1683            otel_span=otel_span,
1684            as_type="event",
1685            langfuse_client=langfuse_client,
1686            input=input,
1687            output=output,
1688            metadata=metadata,
1689            environment=environment,
1690            version=version,
1691            level=level,
1692            status_message=status_message,
1693        )

Initialize a new LangfuseEvent span.

Arguments:
  • otel_span: The OpenTelemetry span to wrap
  • langfuse_client: Reference to the parent Langfuse client
  • input: Input data for the event
  • output: Output from the event
  • metadata: Additional metadata to associate with the event
  • environment: The tracing environment
  • version: Version identifier for the code or component
  • level: Importance level of the event (info, warning, error)
  • status_message: Optional status message for the event
def update( self, *, name: Optional[str] = None, input: Optional[Any] = None, output: Optional[Any] = None, metadata: Optional[Any] = None, version: Optional[str] = None, level: Optional[Literal['DEBUG', 'DEFAULT', 'WARNING', 'ERROR']] = None, status_message: Optional[str] = None, completion_start_time: Optional[datetime.datetime] = None, model: Optional[str] = None, model_parameters: Optional[Dict[str, Union[str, NoneType, int, bool, List[str]]]] = None, usage_details: Optional[Dict[str, int]] = None, cost_details: Optional[Dict[str, float]] = None, prompt: Union[langfuse.model.TextPromptClient, langfuse.model.ChatPromptClient, NoneType] = None, **kwargs: Any) -> LangfuseEvent:
1695    def update(
1696        self,
1697        *,
1698        name: Optional[str] = None,
1699        input: Optional[Any] = None,
1700        output: Optional[Any] = None,
1701        metadata: Optional[Any] = None,
1702        version: Optional[str] = None,
1703        level: Optional[SpanLevel] = None,
1704        status_message: Optional[str] = None,
1705        completion_start_time: Optional[datetime] = None,
1706        model: Optional[str] = None,
1707        model_parameters: Optional[Dict[str, MapValue]] = None,
1708        usage_details: Optional[Dict[str, int]] = None,
1709        cost_details: Optional[Dict[str, float]] = None,
1710        prompt: Optional[PromptClient] = None,
1711        **kwargs: Any,
1712    ) -> "LangfuseEvent":
1713        """Update is not allowed for LangfuseEvent because events cannot be updated.
1714
1715        This method logs a warning and returns self without making changes.
1716
1717        Returns:
1718            self: Returns the unchanged LangfuseEvent instance
1719        """
1720        langfuse_logger.warning(
1721            "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
1722        )
1723        return self

Update is not allowed for LangfuseEvent because events cannot be updated.

This method logs a warning and returns self without making changes.

Returns:

self: Returns the unchanged LangfuseEvent instance
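Because update() is a no-op for events, all fields should be supplied when the event is created. A minimal sketch using the client-level helper:

```python
# Supply every field up front; calling update() later only logs a warning
event = langfuse.create_event(
    name="user-feedback",
    input={"rating": 4},
    metadata={"source": "feedback-widget"},
)
```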

class LangfuseOtelSpanAttributes:
27class LangfuseOtelSpanAttributes:
28    # Langfuse-Trace attributes
29    TRACE_NAME = "langfuse.trace.name"
30    TRACE_USER_ID = "user.id"
31    TRACE_SESSION_ID = "session.id"
32    TRACE_TAGS = "langfuse.trace.tags"
33    TRACE_PUBLIC = "langfuse.trace.public"
34    TRACE_METADATA = "langfuse.trace.metadata"
35    TRACE_INPUT = "langfuse.trace.input"
36    TRACE_OUTPUT = "langfuse.trace.output"
37
38    # Langfuse-observation attributes
39    OBSERVATION_TYPE = "langfuse.observation.type"
40    OBSERVATION_METADATA = "langfuse.observation.metadata"
41    OBSERVATION_LEVEL = "langfuse.observation.level"
42    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
43    OBSERVATION_INPUT = "langfuse.observation.input"
44    OBSERVATION_OUTPUT = "langfuse.observation.output"
45
46    # Langfuse-observation of type Generation attributes
47    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
48    OBSERVATION_MODEL = "langfuse.observation.model.name"
49    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
50    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
51    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
52    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
53    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"
54
55    # General
56    ENVIRONMENT = "langfuse.environment"
57    RELEASE = "langfuse.release"
58    VERSION = "langfuse.version"
59
60    # Internal
61    AS_ROOT = "langfuse.internal.as_root"
62
63    # Experiments
64    EXPERIMENT_ID = "langfuse.experiment.id"
65    EXPERIMENT_NAME = "langfuse.experiment.name"
66    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
67    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
68    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
69    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
70    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
71    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
72    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
TRACE_NAME = 'langfuse.trace.name'
TRACE_USER_ID = 'user.id'
TRACE_SESSION_ID = 'session.id'
TRACE_TAGS = 'langfuse.trace.tags'
TRACE_PUBLIC = 'langfuse.trace.public'
TRACE_METADATA = 'langfuse.trace.metadata'
TRACE_INPUT = 'langfuse.trace.input'
TRACE_OUTPUT = 'langfuse.trace.output'
OBSERVATION_TYPE = 'langfuse.observation.type'
OBSERVATION_METADATA = 'langfuse.observation.metadata'
OBSERVATION_LEVEL = 'langfuse.observation.level'
OBSERVATION_STATUS_MESSAGE = 'langfuse.observation.status_message'
OBSERVATION_INPUT = 'langfuse.observation.input'
OBSERVATION_OUTPUT = 'langfuse.observation.output'
OBSERVATION_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'
OBSERVATION_MODEL = 'langfuse.observation.model.name'
OBSERVATION_MODEL_PARAMETERS = 'langfuse.observation.model.parameters'
OBSERVATION_USAGE_DETAILS = 'langfuse.observation.usage_details'
OBSERVATION_COST_DETAILS = 'langfuse.observation.cost_details'
OBSERVATION_PROMPT_NAME = 'langfuse.observation.prompt.name'
OBSERVATION_PROMPT_VERSION = 'langfuse.observation.prompt.version'
ENVIRONMENT = 'langfuse.environment'
RELEASE = 'langfuse.release'
VERSION = 'langfuse.version'
AS_ROOT = 'langfuse.internal.as_root'
EXPERIMENT_ID = 'langfuse.experiment.id'
EXPERIMENT_NAME = 'langfuse.experiment.name'
EXPERIMENT_DESCRIPTION = 'langfuse.experiment.description'
EXPERIMENT_METADATA = 'langfuse.experiment.metadata'
EXPERIMENT_DATASET_ID = 'langfuse.experiment.dataset.id'
EXPERIMENT_ITEM_ID = 'langfuse.experiment.item.id'
EXPERIMENT_ITEM_EXPECTED_OUTPUT = 'langfuse.experiment.item.expected_output'
EXPERIMENT_ITEM_METADATA = 'langfuse.experiment.item.metadata'
EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = 'langfuse.experiment.item.root_observation_id'
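These constants are the OpenTelemetry attribute keys the SDK writes onto its spans. As a hedged sketch (assuming the class is importable from the top-level package and the active span is one exported by Langfuse), they can also be set on the current OTel span directly:

```python
from opentelemetry import trace

from langfuse import LangfuseOtelSpanAttributes

# Tag the currently active OpenTelemetry span with Langfuse attribute keys
otel_span = trace.get_current_span()
otel_span.set_attribute(LangfuseOtelSpanAttributes.ENVIRONMENT, "production")
otel_span.set_attribute(LangfuseOtelSpanAttributes.VERSION, "1.4.2")
otel_span.set_attribute(LangfuseOtelSpanAttributes.OBSERVATION_LEVEL, "WARNING")
```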
class LangfuseAgent(langfuse._client.span.LangfuseObservationWrapper):
1726class LangfuseAgent(LangfuseObservationWrapper):
1727    """Agent observation for reasoning blocks that act on tools using LLM guidance."""
1728
1729    def __init__(self, **kwargs: Any) -> None:
1730        """Initialize a new LangfuseAgent span."""
1731        kwargs["as_type"] = "agent"
1732        super().__init__(**kwargs)

Agent observation for reasoning blocks that act on tools using LLM guidance.

LangfuseAgent(**kwargs: Any)
1729    def __init__(self, **kwargs: Any) -> None:
1730        """Initialize a new LangfuseAgent span."""
1731        kwargs["as_type"] = "agent"
1732        super().__init__(**kwargs)

Initialize a new LangfuseAgent span.

class LangfuseTool(langfuse._client.span.LangfuseObservationWrapper):
1735class LangfuseTool(LangfuseObservationWrapper):
1736    """Tool observation representing external tool calls, e.g., calling a weather API."""
1737
1738    def __init__(self, **kwargs: Any) -> None:
1739        """Initialize a new LangfuseTool span."""
1740        kwargs["as_type"] = "tool"
1741        super().__init__(**kwargs)

Tool observation representing external tool calls, e.g., calling a weather API.

LangfuseTool(**kwargs: Any)
1738    def __init__(self, **kwargs: Any) -> None:
1739        """Initialize a new LangfuseTool span."""
1740        kwargs["as_type"] = "tool"
1741        super().__init__(**kwargs)

Initialize a new LangfuseTool span.

class LangfuseChain(langfuse._client.span.LangfuseObservationWrapper):
1744class LangfuseChain(LangfuseObservationWrapper):
1745    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""
1746
1747    def __init__(self, **kwargs: Any) -> None:
1748        """Initialize a new LangfuseChain span."""
1749        kwargs["as_type"] = "chain"
1750        super().__init__(**kwargs)

Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.

LangfuseChain(**kwargs: Any)
1747    def __init__(self, **kwargs: Any) -> None:
1748        """Initialize a new LangfuseChain span."""
1749        kwargs["as_type"] = "chain"
1750        super().__init__(**kwargs)

Initialize a new LangfuseChain span.

class LangfuseEmbedding(langfuse._client.span.LangfuseObservationWrapper):
1762class LangfuseEmbedding(LangfuseObservationWrapper):
1763    """Embedding observation for LLM embedding calls, typically used before retrieval."""
1764
1765    def __init__(self, **kwargs: Any) -> None:
1766        """Initialize a new LangfuseEmbedding span."""
1767        kwargs["as_type"] = "embedding"
1768        super().__init__(**kwargs)

Embedding observation for LLM embedding calls, typically used before retrieval.

LangfuseEmbedding(**kwargs: Any)
1765    def __init__(self, **kwargs: Any) -> None:
1766        """Initialize a new LangfuseEmbedding span."""
1767        kwargs["as_type"] = "embedding"
1768        super().__init__(**kwargs)

Initialize a new LangfuseEmbedding span.

class LangfuseEvaluator(langfuse._client.span.LangfuseObservationWrapper):
1771class LangfuseEvaluator(LangfuseObservationWrapper):
1772    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""
1773
1774    def __init__(self, **kwargs: Any) -> None:
1775        """Initialize a new LangfuseEvaluator span."""
1776        kwargs["as_type"] = "evaluator"
1777        super().__init__(**kwargs)

Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.

LangfuseEvaluator(**kwargs: Any)
1774    def __init__(self, **kwargs: Any) -> None:
1775        """Initialize a new LangfuseEvaluator span."""
1776        kwargs["as_type"] = "evaluator"
1777        super().__init__(**kwargs)

Initialize a new LangfuseEvaluator span.

class LangfuseRetriever(langfuse._client.span.LangfuseObservationWrapper):
1753class LangfuseRetriever(LangfuseObservationWrapper):
1754    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""
1755
1756    def __init__(self, **kwargs: Any) -> None:
1757        """Initialize a new LangfuseRetriever span."""
1758        kwargs["as_type"] = "retriever"
1759        super().__init__(**kwargs)

Retriever observation for data retrieval steps, e.g. vector store or database queries.

LangfuseRetriever(**kwargs: Any)
1756    def __init__(self, **kwargs: Any) -> None:
1757        """Initialize a new LangfuseRetriever span."""
1758        kwargs["as_type"] = "retriever"
1759        super().__init__(**kwargs)

Initialize a new LangfuseRetriever span.

class LangfuseGuardrail(langfuse._client.span.LangfuseObservationWrapper):
1780class LangfuseGuardrail(LangfuseObservationWrapper):
1781    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""
1782
1783    def __init__(self, **kwargs: Any) -> None:
1784        """Initialize a new LangfuseGuardrail span."""
1785        kwargs["as_type"] = "guardrail"
1786        super().__init__(**kwargs)

Guardrail observation for protection e.g. against jailbreaks or offensive content.

LangfuseGuardrail(**kwargs: Any)
1783    def __init__(self, **kwargs: Any) -> None:
1784        """Initialize a new LangfuseGuardrail span."""
1785        kwargs["as_type"] = "guardrail"
1786        super().__init__(**kwargs)

Initialize a new LangfuseGuardrail span.
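The typed wrappers above (agent, tool, chain, retriever, embedding, evaluator, guardrail) are normally produced by passing the matching as_type to the observation helpers rather than instantiated directly. A hedged sketch, assuming these wrappers inherit start_as_current_observation from LangfuseObservationWrapper:

```python
with langfuse.start_as_current_observation(name="support-agent", as_type="agent") as agent:
    # Child observations keep their specialized type in the Langfuse UI
    with agent.start_as_current_observation(name="lookup-docs", as_type="retriever") as retriever:
        retriever.update(output={"documents": 3})

    with agent.start_as_current_observation(name="call-weather-api", as_type="tool") as tool:
        tool.update(output={"temp_c": 21})

    agent.update(output="resolved")
```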

class Evaluation:
 97class Evaluation:
 98    """Represents an evaluation result for an experiment item or an entire experiment run.
 99
100    This class provides a strongly-typed way to create evaluation results in evaluator functions.
101    Users must use keyword arguments when instantiating this class.
102
103    Attributes:
104        name: Unique identifier for the evaluation metric. Should be descriptive
105            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
106            Used for aggregation and comparison across experiment runs.
107        value: The evaluation score or result. Can be:
108            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
109            - String: For categorical results like "positive", "negative", "neutral"
110            - Boolean: For binary assessments like "passes_safety_check"
111        comment: Optional human-readable explanation of the evaluation result.
112            Useful for providing context, explaining scoring rationale, or noting
113            special conditions. Displayed in Langfuse UI for interpretability.
114        metadata: Optional structured metadata about the evaluation process.
115            Can include confidence scores, intermediate calculations, model versions,
116            or any other relevant technical details.
117        data_type: Optional score data type. Required if value is not NUMERIC.
118            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
119        config_id: Optional Langfuse score config ID.
120
121    Examples:
122        Basic accuracy evaluation:
123        ```python
124        from langfuse import Evaluation
125
126        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
127            if not expected_output:
128                return Evaluation(name="accuracy", value=None, comment="No expected output")
129
130            is_correct = output.strip().lower() == expected_output.strip().lower()
131            return Evaluation(
132                name="accuracy",
133                value=1.0 if is_correct else 0.0,
134                comment="Correct answer" if is_correct else "Incorrect answer"
135            )
136        ```
137
138        Multi-metric evaluator:
139        ```python
140        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
141            return [
142                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
143                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
144                Evaluation(
145                    name="quality",
146                    value=0.85,
147                    comment="High quality response",
148                    metadata={"confidence": 0.92, "model": "gpt-4"}
149                )
150            ]
151        ```
152
153        Categorical evaluation:
154        ```python
155        def sentiment_evaluator(*, input, output, **kwargs):
156            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
157            return Evaluation(
158                name="sentiment",
159                value=sentiment,
160                comment=f"Response expresses {sentiment} sentiment",
161                data_type="CATEGORICAL"
162            )
163        ```
164
165        Failed evaluation with error handling:
166        ```python
167        def external_api_evaluator(*, input, output, **kwargs):
168            try:
169                score = external_api.evaluate(output)
170                return Evaluation(name="external_score", value=score)
171            except Exception as e:
172                return Evaluation(
173                    name="external_score",
174                    value=None,
175                    comment=f"API unavailable: {e}",
176                    metadata={"error": str(e), "retry_count": 3}
177                )
178        ```
179
180    Note:
181        All arguments must be passed as keywords. Positional arguments are not allowed
182        to ensure code clarity and prevent errors from argument reordering.
183    """
184
185    def __init__(
186        self,
187        *,
188        name: str,
189        value: Union[int, float, str, bool],
190        comment: Optional[str] = None,
191        metadata: Optional[Dict[str, Any]] = None,
192        data_type: Optional[ScoreDataType] = None,
193        config_id: Optional[str] = None,
194    ):
195        """Initialize an Evaluation with the provided data.
196
197        Args:
198            name: Unique identifier for the evaluation metric.
199            value: The evaluation score or result.
200            comment: Optional human-readable explanation of the result.
201            metadata: Optional structured metadata about the evaluation process.
202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
203            config_id: Optional Langfuse score config ID.
204
205        Note:
206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
207        """
208        self.name = name
209        self.value = value
210        self.comment = comment
211        self.metadata = metadata
212        self.data_type = data_type
213        self.config_id = config_id

Represents an evaluation result for an experiment item or an entire experiment run.

This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.

Attributes:
  • name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
  • value: The evaluation score or result. Can be:
    • Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
    • String: For categorical results like "positive", "negative", "neutral"
    • Boolean: For binary assessments like "passes_safety_check"
  • comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
  • metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
  • data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
  • config_id: Optional Langfuse score config ID.
Examples:

Basic accuracy evaluation:

from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=None, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )

Multi-metric evaluator:

def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]

Categorical evaluation:

def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )

Failed evaluation with error handling:

def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=None,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
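
Boolean evaluation with an explicit data type (a minimal sketch; the score config ID is a hypothetical placeholder):

from langfuse import Evaluation

def safety_evaluator(*, input, output, **kwargs):
    passes = "unsafe" not in str(output).lower()  # simplistic placeholder check
    return Evaluation(
        name="passes_safety_check",
        value=passes,
        comment="Passed safety check" if passes else "Flagged as unsafe",
        data_type="BOOLEAN",  # required because the value is not numeric
        config_id="YOUR_SCORE_CONFIG_ID",  # hypothetical placeholder
    )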
Note:

All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.

Evaluation( *, name: str, value: Union[int, float, str, bool], comment: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, data_type: Optional[langfuse.api.ScoreDataType] = None, config_id: Optional[str] = None)
185    def __init__(
186        self,
187        *,
188        name: str,
189        value: Union[int, float, str, bool],
190        comment: Optional[str] = None,
191        metadata: Optional[Dict[str, Any]] = None,
192        data_type: Optional[ScoreDataType] = None,
193        config_id: Optional[str] = None,
194    ):
195        """Initialize an Evaluation with the provided data.
196
197        Args:
198            name: Unique identifier for the evaluation metric.
199            value: The evaluation score or result.
200            comment: Optional human-readable explanation of the result.
201            metadata: Optional structured metadata about the evaluation process.
202            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
203            config_id: Optional Langfuse score config ID.
204
205        Note:
206            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
207        """
208        self.name = name
209        self.value = value
210        self.comment = comment
211        self.metadata = metadata
212        self.data_type = data_type
213        self.config_id = config_id

Initialize an Evaluation with the provided data.

Arguments:
  • name: Unique identifier for the evaluation metric.
  • value: The evaluation score or result.
  • comment: Optional human-readable explanation of the result.
  • metadata: Optional structured metadata about the evaluation process.
  • data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
  • config_id: Optional Langfuse score config ID.
Note:

All arguments must be provided as keywords. Positional arguments will raise a TypeError.

name
value
comment
metadata
data_type
config_id
class EvaluatorInputs:
 38class EvaluatorInputs:
 39    """Input data structure for evaluators, returned by mapper functions.
 40
 41    This class provides a strongly-typed container for transforming API response
 42    objects (traces, observations) into the standardized format expected
 43    by evaluator functions. It ensures consistent access to input, output, expected
 44    output, and metadata regardless of the source entity type.
 45
 46    Attributes:
 47        input: The input data that was provided to generate the output being evaluated.
 48            For traces, this might be the initial prompt or request. For observations,
 49            this could be the span's input. The exact meaning depends on your use case.
 50        output: The actual output that was produced and needs to be evaluated.
 51            For traces, this is typically the final response. For observations,
 52            this might be the generation output or span result.
 53        expected_output: Optional ground truth or expected result for comparison.
 54            Used by evaluators to assess correctness. May be None if no ground truth
 55            is available for the entity being evaluated.
 56        metadata: Optional structured metadata providing additional context for evaluation.
 57            Can include information about the entity, execution context, user attributes,
 58            or any other relevant data that evaluators might use.
 59
 60    Examples:
 61        Simple mapper for traces:
 62        ```python
 63        from langfuse import EvaluatorInputs
 64
 65        def trace_mapper(trace):
 66            return EvaluatorInputs(
 67                input=trace.input,
 68                output=trace.output,
 69                expected_output=None,  # No ground truth available
 70                metadata={"user_id": trace.user_id, "tags": trace.tags}
 71            )
 72        ```
 73
 74        Mapper for observations extracting specific fields:
 75        ```python
 76        def observation_mapper(observation):
 77            # Extract input/output from observation's data
 78            input_data = observation.input if hasattr(observation, 'input') else None
 79            output_data = observation.output if hasattr(observation, 'output') else None
 80
 81            return EvaluatorInputs(
 82                input=input_data,
 83                output=output_data,
 84                expected_output=None,
 85                metadata={
 86                    "observation_type": observation.type,
 87                    "model": observation.model,
 88                    "latency_ms": observation.end_time - observation.start_time
 89                }
 90            )
 91        ```
 92
 93
 94    Note:
 95        All arguments must be passed as keywords when instantiating this class.
 96    """
 97
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Input data structure for evaluators, returned by mapper functions.

This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.

Attributes:
  • input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
  • output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
  • expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
  • metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:

Simple mapper for traces:

from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )

Mapper for observations extracting specific fields:

def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )

Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorInputs( *, input: Any, output: Any, expected_output: Any = None, metadata: Optional[Dict[str, Any]] = None)
 98    def __init__(
 99        self,
100        *,
101        input: Any,
102        output: Any,
103        expected_output: Any = None,
104        metadata: Optional[Dict[str, Any]] = None,
105    ):
106        """Initialize EvaluatorInputs with the provided data.
107
108        Args:
109            input: The input data for evaluation.
110            output: The output data to be evaluated.
111            expected_output: Optional ground truth for comparison.
112            metadata: Optional additional context for evaluation.
113
114        Note:
115            All arguments must be provided as keywords.
116        """
117        self.input = input
118        self.output = output
119        self.expected_output = expected_output
120        self.metadata = metadata

Initialize EvaluatorInputs with the provided data.

Arguments:
  • input: The input data for evaluation.
  • output: The output data to be evaluated.
  • expected_output: Optional ground truth for comparison.
  • metadata: Optional additional context for evaluation.
Note:

All arguments must be provided as keywords.

input
output
expected_output
metadata
class MapperFunction(typing.Protocol):
123class MapperFunction(Protocol):
124    """Protocol defining the interface for mapper functions in batch evaluation.
125
126    Mapper functions transform API response objects (traces or observations)
127    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
128    allows you to define how to extract and structure evaluation data from different
129    entity types.
130
131    Mapper functions must:
132    - Accept a single item parameter (trace, observation)
133    - Return an EvaluatorInputs instance with input, output, expected_output, metadata
134    - Can be either synchronous or asynchronous
135    - Should handle missing or malformed data gracefully
136    """
137
138    def __call__(
139        self,
140        *,
141        item: Union["TraceWithFullDetails", "ObservationsView"],
142        **kwargs: Dict[str, Any],
143    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
144        """Transform an API response object into evaluator inputs.
145
146        This method defines how to extract evaluation-relevant data from the raw
147        API response object. The implementation should map entity-specific fields
148        to the standardized input/output/expected_output/metadata structure.
149
150        Args:
151            item: The API response object to transform. The type depends on the scope:
152                - TraceWithFullDetails: When evaluating traces
153                - ObservationsView: When evaluating observations
154
155        Returns:
156            EvaluatorInputs: A structured container with:
157                - input: The input data that generated the output
158                - output: The output to be evaluated
159                - expected_output: Optional ground truth for comparison
160                - metadata: Optional additional context
161
162            Can return either a direct EvaluatorInputs instance or an awaitable
163            (for async mappers that need to fetch additional data).
164
165        Examples:
166            Basic trace mapper:
167            ```python
168            def map_trace(trace):
169                return EvaluatorInputs(
170                    input=trace.input,
171                    output=trace.output,
172                    expected_output=None,
173                    metadata={"trace_id": trace.id, "user": trace.user_id}
174                )
175            ```
176
177            Observation mapper with conditional logic:
178            ```python
179            def map_observation(observation):
180                # Extract fields based on observation type
181                if observation.type == "GENERATION":
182                    input_data = observation.input
183                    output_data = observation.output
184                else:
185                    # For other types, use different fields
186                    input_data = observation.metadata.get("input")
187                    output_data = observation.metadata.get("output")
188
189                return EvaluatorInputs(
190                    input=input_data,
191                    output=output_data,
192                    expected_output=None,
193                    metadata={"obs_id": observation.id, "type": observation.type}
194                )
195            ```
196
197            Async mapper (if additional processing needed):
198            ```python
199            async def map_trace_async(trace):
200                # Could do async processing here if needed
201                processed_output = await some_async_transformation(trace.output)
202
203                return EvaluatorInputs(
204                    input=trace.input,
205                    output=processed_output,
206                    expected_output=None,
207                    metadata={"trace_id": trace.id}
208                )
209            ```
210        """
211        ...

Protocol defining the interface for mapper functions in batch evaluation.

Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.

Mapper functions:

  • Must accept a single item parameter (a trace or an observation)
  • Must return an EvaluatorInputs instance with input, output, expected_output, and metadata
  • Can be either synchronous or asynchronous
  • Should handle missing or malformed data gracefully
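
A minimal sketch of a conforming mapper, assuming a trace-scoped run and reusing the accuracy_evaluator from the Evaluation examples; other parameters of run_batched_evaluation (e.g. filter, max_items) can be added as in the examples elsewhere on this page:

from langfuse import EvaluatorInputs, get_client

def minimal_trace_mapper(*, item, **kwargs):
    # item is a TraceWithFullDetails when scope="traces"
    return EvaluatorInputs(
        input=item.input,
        output=item.output,
        expected_output=None,
        metadata={"trace_id": item.id},
    )

client = get_client()
result = client.run_batched_evaluation(
    scope="traces",
    mapper=minimal_trace_mapper,
    evaluators=[accuracy_evaluator],
)
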
MapperFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(typing.Protocol):
214class CompositeEvaluatorFunction(Protocol):
215    """Protocol defining the interface for composite evaluator functions.
216
217    Composite evaluators create aggregate scores from multiple item-level evaluations.
218    This is commonly used to compute weighted averages, combined metrics, or other
219    composite assessments based on individual evaluation results.
220
221    Composite evaluators:
222    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
223      plus the list of evaluations
224    - Return either a single Evaluation, a list of Evaluations, or a dict
225    - Can be either synchronous or asynchronous
226    - Have access to both raw item data and evaluation results
227    """
228
229    def __call__(
230        self,
231        *,
232        input: Optional[Any] = None,
233        output: Optional[Any] = None,
234        expected_output: Optional[Any] = None,
235        metadata: Optional[Dict[str, Any]] = None,
236        evaluations: List[Evaluation],
237        **kwargs: Dict[str, Any],
238    ) -> Union[
239        Evaluation,
240        List[Evaluation],
241        Dict[str, Any],
242        Awaitable[Evaluation],
243        Awaitable[List[Evaluation]],
244        Awaitable[Dict[str, Any]],
245    ]:
246        r"""Create a composite evaluation from item-level evaluation results.
247
248        This method combines multiple evaluation scores into a single composite metric.
249        Common use cases include weighted averages, pass/fail decisions based on multiple
250        criteria, or custom scoring logic that considers multiple dimensions.
251
252        Args:
253            input: The input data that was provided to the system being evaluated.
254            output: The output generated by the system being evaluated.
255            expected_output: The expected/reference output for comparison (if available).
256            metadata: Additional metadata about the evaluation context.
257            evaluations: List of evaluation results from item-level evaluators.
258                Each evaluation contains name, value, comment, and metadata.
259
260        Returns:
261            Can return any of:
262            - Evaluation: A single composite evaluation result
263            - List[Evaluation]: Multiple composite evaluations
264            - Dict: A dict that will be converted to an Evaluation
265                - name: Identifier for the composite metric (e.g., "composite_score")
266                - value: The computed composite value
267                - comment: Optional explanation of how the score was computed
268                - metadata: Optional details about the composition logic
269
270            Can return either a direct Evaluation instance or an awaitable
271            (for async composite evaluators).
272
273        Examples:
274            Simple weighted average:
275            ```python
276            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
277                weights = {
278                    "accuracy": 0.5,
279                    "relevance": 0.3,
280                    "safety": 0.2
281                }
282
283                total_score = 0.0
284                total_weight = 0.0
285
286                for eval in evaluations:
287                    if eval.name in weights and isinstance(eval.value, (int, float)):
288                        total_score += eval.value * weights[eval.name]
289                        total_weight += weights[eval.name]
290
291                final_score = total_score / total_weight if total_weight > 0 else 0.0
292
293                return Evaluation(
294                    name="composite_score",
295                    value=final_score,
296                    comment=f"Weighted average of {len(evaluations)} metrics"
297                )
298            ```
299
300            Pass/fail composite based on thresholds:
301            ```python
302            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
303                # Must pass all criteria
304                thresholds = {
305                    "accuracy": 0.7,
306                    "safety": 0.9,
307                    "relevance": 0.6
308                }
309
310                passes = True
311                failing_metrics = []
312
313                for metric, threshold in thresholds.items():
314                    eval_result = next((e for e in evaluations if e.name == metric), None)
315                    if eval_result and isinstance(eval_result.value, (int, float)):
316                        if eval_result.value < threshold:
317                            passes = False
318                            failing_metrics.append(metric)
319
320                return Evaluation(
321                    name="passes_all_checks",
322                    value=passes,
323                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
324                    data_type="BOOLEAN"
325                )
326            ```
327
328            Async composite with external scoring:
329            ```python
330            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
331                # Use LLM to synthesize multiple evaluation results
332                eval_summary = "\n".join(
333                    f"- {e.name}: {e.value}" for e in evaluations
334                )
335
336                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
337                prompt += f"For the output: {output}\n"
338                prompt += "Provide an overall quality score from 0-1."
339
340                response = await openai.chat.completions.create(
341                    model="gpt-4",
342                    messages=[{"role": "user", "content": prompt}]
343                )
344
345                score = float(response.choices[0].message.content.strip())
346
347                return Evaluation(
348                    name="llm_composite_score",
349                    value=score,
350                    comment="LLM-synthesized composite score"
351                )
352            ```
353
354            Context-aware composite:
355            ```python
356            def context_composite(*, input, output, expected_output, metadata, evaluations):
357                # Adjust weighting based on metadata
358                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
359
360                # If metadata indicates high importance, prioritize accuracy
361                if metadata and metadata.get('importance') == 'high':
362                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
363                else:
364                    weights = base_weights
365
366                total = sum(
367                    e.value * weights.get(e.name, 0)
368                    for e in evaluations
369                    if isinstance(e.value, (int, float))
370                )
371
372                return Evaluation(
373                    name="weighted_composite",
374                    value=total,
375                    comment="Context-aware weighted composite"
376                )
377            ```
378        """
379        ...

Protocol defining the interface for composite evaluator functions.

Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.

Composite evaluators:

  • Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
  • Return either a single Evaluation, a list of Evaluations, or a dict
  • Can be either synchronous or asynchronous
  • Have access to both raw item data and evaluation results
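
A minimal sketch of a composite evaluator that averages the numeric item-level scores and returns a plain dict (with the name/value/comment keys described in __call__, converted to an Evaluation automatically):

def mean_composite(*, input, output, expected_output, metadata, evaluations, **kwargs):
    # Collect the numeric item-level scores and average them
    numeric = [e.value for e in evaluations if isinstance(e.value, (int, float))]
    mean = sum(numeric) / len(numeric) if numeric else 0.0
    return {
        "name": "mean_score",
        "value": mean,
        "comment": f"Mean of {len(numeric)} numeric evaluations",
    }
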
CompositeEvaluatorFunction(*args, **kwargs)
1927def _no_init_or_replace_init(self, *args, **kwargs):
1928    cls = type(self)
1929
1930    if cls._is_protocol:
1931        raise TypeError('Protocols cannot be instantiated')
1932
1933    # Already using a custom `__init__`. No need to calculate correct
1934    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1935    if cls.__init__ is not _no_init_or_replace_init:
1936        return
1937
1938    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1939    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1940    # searches for a proper new `__init__` in the MRO. The new `__init__`
1941    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1942    # instantiation of the protocol subclass will thus use the new
1943    # `__init__` and no longer call `_no_init_or_replace_init`.
1944    for base in cls.__mro__:
1945        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1946        if init is not _no_init_or_replace_init:
1947            cls.__init__ = init
1948            break
1949    else:
1950        # should not happen
1951        cls.__init__ = object.__init__
1952
1953    cls.__init__(self, *args, **kwargs)
class EvaluatorStats:
382class EvaluatorStats:
383    """Statistics for a single evaluator's performance during batch evaluation.
384
385    This class tracks detailed metrics about how a specific evaluator performed
386    across all items in a batch evaluation run. It helps identify evaluator issues,
387    understand reliability, and optimize evaluation pipelines.
388
389    Attributes:
390        name: The name of the evaluator function (extracted from __name__).
391        total_runs: Total number of times the evaluator was invoked.
392        successful_runs: Number of times the evaluator completed successfully.
393        failed_runs: Number of times the evaluator raised an exception or failed.
394        total_scores_created: Total number of evaluation scores created by this evaluator.
395            Can be higher than successful_runs if the evaluator returns multiple scores.
396
397    Examples:
398        Accessing evaluator stats from batch evaluation result:
399        ```python
400        result = client.run_batched_evaluation(...)
401
402        for stats in result.evaluator_stats:
403            print(f"Evaluator: {stats.name}")
404            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
405            print(f"  Scores created: {stats.total_scores_created}")
406
407            if stats.failed_runs > 0:
408                print(f"  ⚠️  Failed {stats.failed_runs} times")
409        ```
410
411        Identifying problematic evaluators:
412        ```python
413        result = client.run_batched_evaluation(...)
414
415        # Find evaluators with high failure rates
416        for stats in result.evaluator_stats:
417            failure_rate = stats.failed_runs / stats.total_runs
418            if failure_rate > 0.1:  # More than 10% failures
419                print(f"⚠️  {stats.name} has {failure_rate:.1%} failure rate")
420                print(f"    Consider debugging or removing this evaluator")
421        ```
422
423    Note:
424        All arguments must be passed as keywords when instantiating this class.
425    """
426
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Statistics for a single evaluator's performance during batch evaluation.

This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.

Attributes:
  • name: The name of the evaluator function (extracted from __name__).
  • total_runs: Total number of times the evaluator was invoked.
  • successful_runs: Number of times the evaluator completed successfully.
  • failed_runs: Number of times the evaluator raised an exception or failed.
  • total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:

Accessing evaluator stats from batch evaluation result:

result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  âš ī¸  Failed {stats.failed_runs} times")

Identifying problematic evaluators:

result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"âš ī¸  {stats.name} has {failure_rate:.1%} failure rate")
        print(f"    Consider debugging or removing this evaluator")
Note:

All arguments must be passed as keywords when instantiating this class.

EvaluatorStats( *, name: str, total_runs: int = 0, successful_runs: int = 0, failed_runs: int = 0, total_scores_created: int = 0)
427    def __init__(
428        self,
429        *,
430        name: str,
431        total_runs: int = 0,
432        successful_runs: int = 0,
433        failed_runs: int = 0,
434        total_scores_created: int = 0,
435    ):
436        """Initialize EvaluatorStats with the provided metrics.
437
438        Args:
439            name: The evaluator function name.
440            total_runs: Total number of evaluator invocations.
441            successful_runs: Number of successful completions.
442            failed_runs: Number of failures.
443            total_scores_created: Total scores created by this evaluator.
444
445        Note:
446            All arguments must be provided as keywords.
447        """
448        self.name = name
449        self.total_runs = total_runs
450        self.successful_runs = successful_runs
451        self.failed_runs = failed_runs
452        self.total_scores_created = total_scores_created

Initialize EvaluatorStats with the provided metrics.

Arguments:
  • name: The evaluator function name.
  • total_runs: Total number of evaluator invocations.
  • successful_runs: Number of successful completions.
  • failed_runs: Number of failures.
  • total_scores_created: Total scores created by this evaluator.
Note:

All arguments must be provided as keywords.

name
total_runs
successful_runs
failed_runs
total_scores_created
class BatchEvaluationResumeToken:
455class BatchEvaluationResumeToken:
456    """Token for resuming a failed batch evaluation run.
457
458    This class encapsulates all the information needed to resume a batch evaluation
459    that was interrupted or failed partway through. It uses timestamp-based filtering
460    to avoid re-processing items that were already evaluated, even if the underlying
461    dataset changed between runs.
462
463    Attributes:
464        scope: The type of items being evaluated ("traces", "observations").
465        filter: The original JSON filter string used to query items.
466        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
467            Used to construct a filter that only fetches items after this timestamp.
468        last_processed_id: The ID of the last successfully processed item, for reference.
469        items_processed: Count of items successfully processed before interruption.
470
471    Examples:
472        Resuming a failed batch evaluation:
473        ```python
474        # Initial run that fails partway through
475        try:
476            result = client.run_batched_evaluation(
477                scope="traces",
478                mapper=my_mapper,
479                evaluators=[evaluator1, evaluator2],
480                filter='{"tags": ["production"]}',
481                max_items=10000
482            )
483        except Exception as e:
484            print(f"Evaluation failed: {e}")
485
486            # Save the resume token
487            if result.resume_token:
488                # Store resume token for later (e.g., in a file or database)
489                import json
490                with open("resume_token.json", "w") as f:
491                    json.dump({
492                        "scope": result.resume_token.scope,
493                        "filter": result.resume_token.filter,
494                        "last_timestamp": result.resume_token.last_processed_timestamp,
495                        "last_id": result.resume_token.last_processed_id,
496                        "items_done": result.resume_token.items_processed
497                    }, f)
498
499        # Later, resume from where it left off
500        with open("resume_token.json") as f:
501            token_data = json.load(f)
502
503        resume_token = BatchEvaluationResumeToken(
504            scope=token_data["scope"],
505            filter=token_data["filter"],
506            last_processed_timestamp=token_data["last_timestamp"],
507            last_processed_id=token_data["last_id"],
508            items_processed=token_data["items_done"]
509        )
510
511        # Resume the evaluation
512        result = client.run_batched_evaluation(
513            scope="traces",
514            mapper=my_mapper,
515            evaluators=[evaluator1, evaluator2],
516            resume_from=resume_token
517        )
518
519        print(f"Processed {result.total_items_processed} additional items")
520        ```
521
522        Handling partial completion:
523        ```python
524        result = client.run_batched_evaluation(...)
525
526        if not result.completed:
527            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
528            print(f"Last item: {result.resume_token.last_processed_id}")
529            print(f"Resume from: {result.resume_token.last_processed_timestamp}")
530
531            # Optionally retry automatically
532            if result.resume_token:
533                print("Retrying...")
534                result = client.run_batched_evaluation(
535                    scope=result.resume_token.scope,
536                    mapper=my_mapper,
537                    evaluators=my_evaluators,
538                    resume_from=result.resume_token
539                )
540        ```
541
542    Note:
543        All arguments must be passed as keywords when instantiating this class.
544        The timestamp-based approach means that items created after the initial run
545        but before the timestamp will be skipped. This is intentional to avoid
546        duplicates and ensure consistent evaluation.
547    """
548
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Token for resuming a failed batch evaluation run.

This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.

Attributes:
  • scope: The type of items being evaluated ("traces", "observations").
  • filter: The original JSON filter string used to query items.
  • last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
  • last_processed_id: The ID of the last successfully processed item, for reference.
  • items_processed: Count of items successfully processed before interruption.
Examples:

Resuming a failed batch evaluation:

# Initial run that may fail partway through
result = None
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

# Save the resume token if the run returned a partial result
if result is not None and result.resume_token:
    # Store resume token for later (e.g., in a file or database)
    import json
    with open("resume_token.json", "w") as f:
        json.dump({
            "scope": result.resume_token.scope,
            "filter": result.resume_token.filter,
            "last_timestamp": result.resume_token.last_processed_timestamp,
            "last_id": result.resume_token.last_processed_id,
            "items_done": result.resume_token.items_processed
        }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")

Handling partial completion:

result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
Note:

All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.

BatchEvaluationResumeToken( *, scope: str, filter: Optional[str], last_processed_timestamp: str, last_processed_id: str, items_processed: int)
549    def __init__(
550        self,
551        *,
552        scope: str,
553        filter: Optional[str],
554        last_processed_timestamp: str,
555        last_processed_id: str,
556        items_processed: int,
557    ):
558        """Initialize BatchEvaluationResumeToken with the provided state.
559
560        Args:
561            scope: The scope type ("traces", "observations").
562            filter: The original JSON filter string.
563            last_processed_timestamp: ISO 8601 timestamp of last processed item.
564            last_processed_id: ID of last processed item.
565            items_processed: Count of items processed before interruption.
566
567        Note:
568            All arguments must be provided as keywords.
569        """
570        self.scope = scope
571        self.filter = filter
572        self.last_processed_timestamp = last_processed_timestamp
573        self.last_processed_id = last_processed_id
574        self.items_processed = items_processed

Initialize BatchEvaluationResumeToken with the provided state.

Arguments:
  • scope: The scope type ("traces", "observations").
  • filter: The original JSON filter string.
  • last_processed_timestamp: ISO 8601 timestamp of last processed item.
  • last_processed_id: ID of last processed item.
  • items_processed: Count of items processed before interruption.
Note:

All arguments must be provided as keywords.

scope
filter
last_processed_timestamp
last_processed_id
items_processed
class BatchEvaluationResult:
577class BatchEvaluationResult:
578    r"""Complete result structure for batch evaluation execution.
579
580    This class encapsulates comprehensive statistics and metadata about a batch
581    evaluation run, including counts, evaluator-specific metrics, timing information,
582    error details, and resume capability.
583
584    Attributes:
585        total_items_fetched: Total number of items fetched from the API.
586        total_items_processed: Number of items successfully evaluated.
587        total_items_failed: Number of items that failed during evaluation.
588        total_scores_created: Total scores created by all item-level evaluators.
589        total_composite_scores_created: Scores created by the composite evaluator.
590        total_evaluations_failed: Number of individual evaluator failures across all items.
591        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
592        resume_token: Token for resuming if evaluation was interrupted (None if completed).
593        completed: True if all items were processed, False if stopped early or failed.
594        duration_seconds: Total time taken to execute the batch evaluation.
595        failed_item_ids: List of IDs for items that failed evaluation.
596        error_summary: Dictionary mapping error types to occurrence counts.
597        has_more_items: True if max_items limit was reached but more items exist.
598        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
599
600    Examples:
601        Basic result inspection:
602        ```python
603        result = client.run_batched_evaluation(...)
604
605        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
606        print(f"Scores created: {result.total_scores_created}")
607        print(f"Duration: {result.duration_seconds:.2f}s")
608        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
609        ```
610
611        Detailed analysis with evaluator stats:
612        ```python
613        result = client.run_batched_evaluation(...)
614
615        print(f"\n📊 Batch Evaluation Results")
616        print(f"{'='*50}")
617        print(f"Items processed: {result.total_items_processed}")
618        print(f"Items failed: {result.total_items_failed}")
619        print(f"Scores created: {result.total_scores_created}")
620
621        if result.total_composite_scores_created > 0:
622            print(f"Composite scores: {result.total_composite_scores_created}")
623
624        print(f"\n📈 Evaluator Performance:")
625        for stats in result.evaluator_stats:
626            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
627            print(f"\n  {stats.name}:")
628            print(f"    Success rate: {success_rate:.1%}")
629            print(f"    Scores created: {stats.total_scores_created}")
630            if stats.failed_runs > 0:
631                print(f"    ⚠️  Failures: {stats.failed_runs}")
632
633        if result.error_summary:
634            print(f"\n⚠️  Errors encountered:")
635            for error_type, count in result.error_summary.items():
636                print(f"    {error_type}: {count}")
637        ```
638
639        Handling incomplete runs:
640        ```python
641        result = client.run_batched_evaluation(...)
642
643        if not result.completed:
644            print("⚠️  Evaluation incomplete!")
645
646            if result.resume_token:
647                print(f"Processed {result.resume_token.items_processed} items before failure")
648                print(f"Use resume_from parameter to continue from:")
649                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
650                print(f"  Last ID: {result.resume_token.last_processed_id}")
651
652        if result.has_more_items:
653            print(f"ℹ️  More items available beyond max_items limit")
654        ```
655
656        Performance monitoring:
657        ```python
658        result = client.run_batched_evaluation(...)
659
660        items_per_second = result.total_items_processed / result.duration_seconds
661        avg_scores_per_item = result.total_scores_created / result.total_items_processed
662
663        print(f"Performance metrics:")
664        print(f"  Throughput: {items_per_second:.2f} items/second")
665        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
666        print(f"  Total duration: {result.duration_seconds:.2f}s")
667
668        if result.total_evaluations_failed > 0:
669            failure_rate = result.total_evaluations_failed / (
670                result.total_items_processed * len(result.evaluator_stats)
671            )
672            print(f"  Evaluation failure rate: {failure_rate:.1%}")
673        ```
674
675    Note:
676        All arguments must be passed as keywords when instantiating this class.
677    """
678
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations
732
733    def __str__(self) -> str:
734        """Return a formatted string representation of the batch evaluation results.
735
736        Returns:
737            A multi-line string with a summary of the evaluation results.
738        """
739        lines = []
740        lines.append("=" * 60)
741        lines.append("Batch Evaluation Results")
742        lines.append("=" * 60)
743
744        # Summary statistics
745        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
746        lines.append(f"Duration: {self.duration_seconds:.2f}s")
747        lines.append(f"\nItems fetched: {self.total_items_fetched}")
748        lines.append(f"Items processed: {self.total_items_processed}")
749
750        if self.total_items_failed > 0:
751            lines.append(f"Items failed: {self.total_items_failed}")
752
753        # Success rate
754        if self.total_items_fetched > 0:
755            success_rate = self.total_items_processed / self.total_items_fetched * 100
756            lines.append(f"Success rate: {success_rate:.1f}%")
757
758        # Scores created
759        lines.append(f"\nScores created: {self.total_scores_created}")
760        if self.total_composite_scores_created > 0:
761            lines.append(f"Composite scores: {self.total_composite_scores_created}")
762
763        total_scores = self.total_scores_created + self.total_composite_scores_created
764        lines.append(f"Total scores: {total_scores}")
765
766        # Evaluator statistics
767        if self.evaluator_stats:
768            lines.append("\nEvaluator Performance:")
769            for stats in self.evaluator_stats:
770                lines.append(f"  {stats.name}:")
771                if stats.total_runs > 0:
772                    success_rate = (
773                        stats.successful_runs / stats.total_runs * 100
774                        if stats.total_runs > 0
775                        else 0
776                    )
777                    lines.append(
778                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
779                        f"({success_rate:.1f}% success)"
780                    )
781                    lines.append(f"    Scores created: {stats.total_scores_created}")
782                    if stats.failed_runs > 0:
783                        lines.append(f"    Failed runs: {stats.failed_runs}")
784
785        # Performance metrics
786        if self.total_items_processed > 0 and self.duration_seconds > 0:
787            items_per_sec = self.total_items_processed / self.duration_seconds
788            lines.append("\nPerformance:")
789            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
790            if self.total_scores_created > 0:
791                avg_scores = self.total_scores_created / self.total_items_processed
792                lines.append(f"  Avg scores per item: {avg_scores:.2f}")
793
794        # Errors and warnings
795        if self.error_summary:
796            lines.append("\nErrors encountered:")
797            for error_type, count in self.error_summary.items():
798                lines.append(f"  {error_type}: {count}")
799
800        # Incomplete run information
801        if not self.completed:
802            lines.append("\nWarning: Evaluation incomplete")
803            if self.resume_token:
804                lines.append(
805                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
806                )
807                lines.append(f"  Items processed: {self.resume_token.items_processed}")
808                lines.append("  Use resume_from parameter to continue")
809
810        if self.has_more_items:
811            lines.append("\nNote: More items available beyond max_items limit")
812
813        lines.append("=" * 60)
814        return "\n".join(lines)

Complete result structure for batch evaluation execution.

This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.

Attributes:
  • total_items_fetched: Total number of items fetched from the API.
  • total_items_processed: Number of items successfully evaluated.
  • total_items_failed: Number of items that failed during evaluation.
  • total_scores_created: Total scores created by all item-level evaluators.
  • total_composite_scores_created: Scores created by the composite evaluator.
  • total_evaluations_failed: Number of individual evaluator failures across all items.
  • evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
  • resume_token: Token for resuming if evaluation was interrupted (None if completed).
  • completed: True if all items were processed, False if stopped early or failed.
  • duration_seconds: Total time taken to execute the batch evaluation.
  • failed_item_ids: List of IDs for items that failed evaluation.
  • error_summary: Dictionary mapping error types to occurrence counts.
  • has_more_items: True if max_items limit was reached but more items exist.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:

Basic result inspection:

result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")

Detailed analysis with evaluator stats:

result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    âš ī¸  Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\nâš ī¸  Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"    {error_type}: {count}")

Handling incomplete runs:

result = client.run_batched_evaluation(...)

if not result.completed:
    print("âš ī¸  Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"â„šī¸  More items available beyond max_items limit")

Performance monitoring:

result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
Note:

All arguments must be passed as keywords when instantiating this class.

BatchEvaluationResult( *, total_items_fetched: int, total_items_processed: int, total_items_failed: int, total_scores_created: int, total_composite_scores_created: int, total_evaluations_failed: int, evaluator_stats: List[EvaluatorStats], resume_token: Optional[BatchEvaluationResumeToken], completed: bool, duration_seconds: float, failed_item_ids: List[str], error_summary: Dict[str, int], has_more_items: bool, item_evaluations: Dict[str, List[Evaluation]])
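
For illustration, a direct construction with keyword-only arguments (a sketch with made-up values; in practice this object is returned by run_batched_evaluation rather than created by hand):

result = BatchEvaluationResult(
    total_items_fetched=10,
    total_items_processed=9,
    total_items_failed=1,
    total_scores_created=18,
    total_composite_scores_created=9,
    total_evaluations_failed=0,
    evaluator_stats=[],
    resume_token=None,
    completed=True,
    duration_seconds=12.5,
    failed_item_ids=["item-3"],          # illustrative ID
    error_summary={"TimeoutError": 1},   # illustrative error type
    has_more_items=False,
    item_evaluations={},
)
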
679    def __init__(
680        self,
681        *,
682        total_items_fetched: int,
683        total_items_processed: int,
684        total_items_failed: int,
685        total_scores_created: int,
686        total_composite_scores_created: int,
687        total_evaluations_failed: int,
688        evaluator_stats: List[EvaluatorStats],
689        resume_token: Optional[BatchEvaluationResumeToken],
690        completed: bool,
691        duration_seconds: float,
692        failed_item_ids: List[str],
693        error_summary: Dict[str, int],
694        has_more_items: bool,
695        item_evaluations: Dict[str, List["Evaluation"]],
696    ):
697        """Initialize BatchEvaluationResult with comprehensive statistics.
698
699        Args:
700            total_items_fetched: Total items fetched from API.
701            total_items_processed: Items successfully evaluated.
702            total_items_failed: Items that failed evaluation.
703            total_scores_created: Scores from item-level evaluators.
704            total_composite_scores_created: Scores from composite evaluator.
705            total_evaluations_failed: Individual evaluator failures.
706            evaluator_stats: Per-evaluator statistics.
707            resume_token: Token for resuming (None if completed).
708            completed: Whether all items were processed.
709            duration_seconds: Total execution time.
710            failed_item_ids: IDs of failed items.
711            error_summary: Error types and counts.
712            has_more_items: Whether more items exist beyond max_items.
713            item_evaluations: Dictionary mapping item IDs to their evaluation results.
714
715        Note:
716            All arguments must be provided as keywords.
717        """
718        self.total_items_fetched = total_items_fetched
719        self.total_items_processed = total_items_processed
720        self.total_items_failed = total_items_failed
721        self.total_scores_created = total_scores_created
722        self.total_composite_scores_created = total_composite_scores_created
723        self.total_evaluations_failed = total_evaluations_failed
724        self.evaluator_stats = evaluator_stats
725        self.resume_token = resume_token
726        self.completed = completed
727        self.duration_seconds = duration_seconds
728        self.failed_item_ids = failed_item_ids
729        self.error_summary = error_summary
730        self.has_more_items = has_more_items
731        self.item_evaluations = item_evaluations

Initialize BatchEvaluationResult with comprehensive statistics.

Arguments:
  • total_items_fetched: Total items fetched from API.
  • total_items_processed: Items successfully evaluated.
  • total_items_failed: Items that failed evaluation.
  • total_scores_created: Scores from item-level evaluators.
  • total_composite_scores_created: Scores from composite evaluator.
  • total_evaluations_failed: Individual evaluator failures.
  • evaluator_stats: Per-evaluator statistics.
  • resume_token: Token for resuming (None if completed).
  • completed: Whether all items were processed.
  • duration_seconds: Total execution time.
  • failed_item_ids: IDs of failed items.
  • error_summary: Error types and counts.
  • has_more_items: Whether more items exist beyond max_items.
  • item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:

All arguments must be provided as keywords.

total_items_fetched
total_items_processed
total_items_failed
total_scores_created
total_composite_scores_created
total_evaluations_failed
evaluator_stats
resume_token
completed
duration_seconds
failed_item_ids
error_summary
has_more_items
item_evaluations