The Roadmap for Mastering LLMOps in 2026

# llm_with_tracing.py

# Function: A production-ready LLM name wrapper with full observability.

# Each name is traced in Langfuse: enter, output, tokens, price, latency.

# Stipulations:

# pip set up langfuse anthropic python-dotenv

# Setup:

# 1. Create a free account at https://cloud.langfuse.com

# 2. Get your keys from Settings > API Keys

# 3. Create a .env file with the variables beneath

# Run:

# python llm_with_tracing.py

import os

import time

from dotenv import load_dotenv

import anthropic

from langfuse import Langfuse

# Load surroundings variables from .env file

load_dotenv()

# Required surroundings variables in your .env:

# LANGFUSE_PUBLIC_KEY=pk-lf-…

# LANGFUSE_SECRET_KEY=sk-lf-…

# LANGFUSE_HOST= (or your self-hosted URL)

# ANTHROPIC_API_KEY=sk-ant-…

# Initialize purchasers

langfuse_client = Langfuse() # Reads keys mechanically from surroundings

anthropic_client = anthropic.Anthropic() # Reads ANTHROPIC_API_KEY from surroundings

# ── Configuration ─────────────────────────────────────────────────────────────

# Retailer your immediate right here, not inline within the API name.

# This makes it versionable and testable independently.

SYSTEM_PROMPT = “”“You’re a useful buyer assist assistant.

Reply questions clearly and concisely.

If you happen to have no idea one thing, say so straight — don’t guess.”“”

MODEL = “claude-sonnet-4-20250514”

# Anthropic’s pricing as of mid-2026 (replace when pricing modifications)

# Used to calculate price per name for price monitoring

COST_PER_INPUT_TOKEN = 3.00 / 1_000_000 # $3.00 per million enter tokens

COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000 # $15.00 per million output tokens

def call_llm_with_tracing(

user_message: str,

session_id: str = “default-session”,

user_id: str = “nameless”

) -> str:

“”“

Make a traced LLM name. Each name creates a Langfuse hint with:

– Full enter and output

– Token utilization (enter, output, whole)

– Calculated price in USD

– Latency in milliseconds

– Mannequin used and session context

Parameters:

user_message : The message from the person

session_id : Teams associated calls into one dialog in Langfuse

user_id : Associates the decision with a particular person for analytics

Returns:

The LLM response as a string

““”

# Create a top-level hint for this person interplay

# The hint seems within the Langfuse dashboard as one unit of labor

hint = langfuse_client.hint(

identify=“customer-support-call”,

session_id=session_id,

user_id=user_id,

enter={“user_message”: user_message, “system_prompt”: SYSTEM_PROMPT}

)

# Create a era span contained in the hint

# This captures model-specific particulars: mannequin identify, tokens, price

era = hint.era(

identify=“claude-completion”,

mannequin=MODEL,

enter={

“system”: SYSTEM_PROMPT,

“messages”: [{“role”: “user”, “content”: user_message}]

}

)

start_time = time.time()

attempt:

# Make the API name

response = anthropic_client.messages.create(

mannequin=MODEL,

max_tokens=1024,

system=SYSTEM_PROMPT,

messages=[{“role”: “user”, “content”: user_message}]

)

latency_ms = int((time.time() – start_time) * 1000)

# Extract the response textual content

response_text = response.content material[0].textual content

# Extract token utilization from the response

input_tokens = response.utilization.input_tokens

output_tokens = response.utilization.output_tokens

total_tokens = input_tokens + output_tokens

# Calculate price for this name

cost_usd = (

input_tokens * COST_PER_INPUT_TOKEN +

output_tokens * COST_PER_OUTPUT_TOKEN

)

# Replace the era span with outcomes

# This knowledge populates the Langfuse price and token dashboards

era.finish(

output=response_text,

utilization={

“enter”: input_tokens,

“output”: output_tokens,

“whole”: total_tokens,

“unit”: “TOKENS”

metadata={

“latency_ms”: latency_ms,

“cost_usd”: spherical(cost_usd, 6),

“mannequin”: MODEL

}

)

# Replace the hint with the ultimate output

hint.replace(

output={“response”: response_text},

metadata={“total_cost_usd”: spherical(cost_usd, 6)}

)

# Print a abstract to stdout for native visibility

print(f“n{‘─’ * 60}”)

print(f“Consumer: {user_message}”)

print(f“Claude: {response_text}”)

print(f“Tokens: {input_tokens} in / {output_tokens} out / {total_tokens} whole”)

print(f“Value: ${cost_usd:.6f}”)

print(f“Latency: {latency_ms}ms”)

print(f“Hint: {langfuse_client.base_url}/hint/{hint.id}”)

print(f“{‘─’ * 60}n”)

return response_text

besides Exception as e:

# File the error within the hint so it exhibits up in Langfuse

era.finish(

output=None,

metadata={“error”: str(e), “latency_ms”: int((time.time() – start_time) * 1000)}

)

hint.replace(output={“error”: str(e)})

# All the time flush earlier than elevating — ensures the error hint is distributed

langfuse_client.flush()

elevate

lastly:

# Flush sends all buffered occasions to Langfuse

# In a long-running service, Langfuse flushes mechanically.

# In a script, you could flush manually earlier than the method exits.

langfuse_client.flush()

# ── Run an indication ────────────────────────────────────────────────────────

if __name__ == “__main__”:

# Simulate two turns of a buyer assist dialog

test_messages = [

“What is your return policy for electronics?”,

“Can I return an item I bought 45 days ago?”

]

session = “demo-session-001”

for i, message in enumerate(test_messages):

print(f“nCall {i + 1}/{len(test_messages)}”)

attempt:

call_llm_with_tracing(

user_message=message,

session_id=session,

user_id=“test-user-42”

)

besides Exception as e:

print(f“Error on name {i + 1}: {e}”)

Supply hyperlink

Leave a Reply Cancel reply