from lmnr import evaluate, HumanEvaluator
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
client = OpenAI()

def generate_customer_response(data: dict) -> str:
    """Generate a customer service response for the given inquiry."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful customer service representative.",
            },
            {
                "role": "user",
                "content": data["customer_inquiry"],
            },
        ],
    )
    return response.choices[0].message.content

def llm_judge_helpfulness(output: str, target: dict) -> float:
    """LLM-as-a-judge evaluator for helpfulness (1-3 scale, normalized to 0-1)."""
    response = client.chat.completions.create(
        model="o4-mini",
        messages=[
            {
                "role": "system",
                "content": """Rate the helpfulness of this customer service response on a scale of 1-3:
1 = Not helpful at all
2 = Moderately helpful
3 = Very helpful
Consider: Does it address the customer's concern? Is it clear and actionable?
Respond with only the number.""",
            },
            {
                "role": "user",
                "content": f"Customer inquiry: {target['customer_inquiry']}\n\nResponse: {output}",
            },
        ],
    )
    # Normalize the 1-3 rating to the 0-1 range so it can be compared with other scores.
    return int(response.choices[0].message.content.strip()) / 3

# Step 1: Create reference data with human evaluators
evaluate(
    data=[
        {
            "data": {"customer_inquiry": "My order hasn't arrived yet, it's been 2 weeks"},
            "target": {"customer_inquiry": "My order hasn't arrived yet, it's been 2 weeks"},
        },
        {
            "data": {"customer_inquiry": "I need to return a damaged product"},
            "target": {"customer_inquiry": "I need to return a damaged product"},
        },
    ],
    executor=generate_customer_response,
    evaluators={
        "human_helpfulness": HumanEvaluator(),           # Creates reference scores
        "llm_judge_helpfulness": llm_judge_helpfulness,  # LLM judge to validate
    },
    group_name="llm_judge_calibration",
)