# llm_with_tracing.py

# Function: A production-ready LLM name wrapper with full observability.

# Each name is traced in Langfuse: enter, output, tokens, price, latency.

#

# Stipulations:

#   pip set up langfuse anthropic python-dotenv

#

# Setup:

#   1. Create a free account at https://cloud.langfuse.com

#   2. Get your keys from Settings > API Keys

#   3. Create a .env file with the variables beneath

#

# Run:

#   python llm_with_tracing.py

 

import os

import time

from dotenv import load_dotenv

import anthropic

from langfuse import Langfuse

 

# Load surroundings variables from .env file

load_dotenv()

 

# Required surroundings variables in your .env:

# LANGFUSE_PUBLIC_KEY=pk-lf-…

# LANGFUSE_SECRET_KEY=sk-lf-…

# LANGFUSE_HOST= (or your self-hosted URL)

# ANTHROPIC_API_KEY=sk-ant-…

 

# Initialize purchasers

langfuse_client = Langfuse()          # Reads keys mechanically from surroundings

anthropic_client = anthropic.Anthropic()  # Reads ANTHROPIC_API_KEY from surroundings

 

# ── Configuration ─────────────────────────────────────────────────────────────

# Retailer your immediate right here, not inline within the API name.

# This makes it versionable and testable independently.

SYSTEM_PROMPT = “”“You’re a useful buyer assist assistant.

Reply questions clearly and concisely.

If you happen to have no idea one thing, say so straight — don’t guess.”“”

 

MODEL = “claude-sonnet-4-20250514”

 

# Anthropic’s pricing as of mid-2026 (replace when pricing modifications)

# Used to calculate price per name for price monitoring

COST_PER_INPUT_TOKEN  = 3.00 / 1_000_000   # $3.00 per million enter tokens

COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000  # $15.00 per million output tokens

 

 

def call_llm_with_tracing(

    user_message: str,

    session_id: str = “default-session”,

    user_id: str = “nameless”

) -> str:

    “”

    Make a traced LLM name. Each name creates a Langfuse hint with:

    – Full enter and output

    – Token utilization (enter, output, whole)

    – Calculated price in USD

    – Latency in milliseconds

    – Mannequin used and session context

 

    Parameters:

        user_message : The message from the person

        session_id   : Teams associated calls into one dialog in Langfuse

        user_id      : Associates the decision with a particular person for analytics

 

    Returns:

        The LLM response as a string

    ““”

 

    # Create a top-level hint for this person interplay

    # The hint seems within the Langfuse dashboard as one unit of labor

    hint = langfuse_client.hint(

        identify=“customer-support-call”,

        session_id=session_id,

        user_id=user_id,

        enter={“user_message”: user_message, “system_prompt”: SYSTEM_PROMPT}

    )

 

    # Create a era span contained in the hint

    # This captures model-specific particulars: mannequin identify, tokens, price

    era = hint.era(

        identify=“claude-completion”,

        mannequin=MODEL,

        enter={

            “system”: SYSTEM_PROMPT,

            “messages”: [{“role”: “user”, “content”: user_message}]

        }

    )

 

    start_time = time.time()

 

    attempt:

        # Make the API name

        response = anthropic_client.messages.create(

            mannequin=MODEL,

            max_tokens=1024,

            system=SYSTEM_PROMPT,

            messages=[{“role”: “user”, “content”: user_message}]

        )

 

        latency_ms = int((time.time() start_time) * 1000)

 

        # Extract the response textual content

        response_text = response.content material[0].textual content

 

        # Extract token utilization from the response

        input_tokens  = response.utilization.input_tokens

        output_tokens = response.utilization.output_tokens

        total_tokens  = input_tokens + output_tokens

 

        # Calculate price for this name

        cost_usd = (

            input_tokens  * COST_PER_INPUT_TOKEN +

            output_tokens * COST_PER_OUTPUT_TOKEN

        )

 

        # Replace the era span with outcomes

        # This knowledge populates the Langfuse price and token dashboards

        era.finish(

            output=response_text,

            utilization={

                “enter”:  input_tokens,

                “output”: output_tokens,

                “whole”:  total_tokens,

                “unit”:   “TOKENS”

            },

            metadata={

                “latency_ms”: latency_ms,

                “cost_usd”:   spherical(cost_usd, 6),

                “mannequin”:      MODEL

            }

        )

 

        # Replace the hint with the ultimate output

        hint.replace(

            output={“response”: response_text},

            metadata={“total_cost_usd”: spherical(cost_usd, 6)}

        )

 

        # Print a abstract to stdout for native visibility

        print(f“n{‘─’ * 60}”)

        print(f“Consumer:    {user_message}”)

        print(f“Claude:  {response_text}”)

        print(f“Tokens:  {input_tokens} in / {output_tokens} out / {total_tokens} whole”)

        print(f“Value:    ${cost_usd:.6f}”)

        print(f“Latency: {latency_ms}ms”)

        print(f“Hint:   {langfuse_client.base_url}/hint/{hint.id}”)

        print(f“{‘─’ * 60}n”)

 

        return response_text

 

    besides Exception as e:

        # File the error within the hint so it exhibits up in Langfuse

        era.finish(

            output=None,

            metadata={“error”: str(e), “latency_ms”: int((time.time() start_time) * 1000)}

        )

        hint.replace(output={“error”: str(e)})

 

        # All the time flush earlier than elevating — ensures the error hint is distributed

        langfuse_client.flush()

        elevate

 

    lastly:

        # Flush sends all buffered occasions to Langfuse

        # In a long-running service, Langfuse flushes mechanically.

        # In a script, you could flush manually earlier than the method exits.

        langfuse_client.flush()

 

 

# ── Run an indication ────────────────────────────────────────────────────────

if __name__ == “__main__”:

    # Simulate two turns of a buyer assist dialog

    test_messages = [

        “What is your return policy for electronics?”,

        “Can I return an item I bought 45 days ago?”

    ]

 

    session = “demo-session-001”

 

    for i, message in enumerate(test_messages):

        print(f“nCall {i + 1}/{len(test_messages)}”)

        attempt:

            call_llm_with_tracing(

                user_message=message,

                session_id=session,

                user_id=“test-user-42”

            )

        besides Exception as e:

            print(f“Error on name {i + 1}: {e}”)



Supply hyperlink


Leave a Reply

Your email address will not be published. Required fields are marked *