Spaces:

FINAL-Bench
/

worldmodel-bench

Sleeping

App Files Files Community

worldmodel-bench / wm_bench_eval.py

SeaWolf-AI

Upload 9 files

ee97e7d verified about 2 months ago

raw

history blame contribute delete

30.6 kB

	"""
	World Model Bench — Evaluation Protocol v1.0

	핵심 문제:
	"Tesla FSD는 자동차 안에 있고, Dreamer는 Atari에 있고,
	우리는 3D 캐릭터를 쓴다. 어떻게 같은 기준으로 평가하나?"

	해결:
	3D 환경이 필요 없다.
	scene_context(JSON) → 모델 → PREDICT+MOTION(텍스트) → 자동 채점

	FINAL Bench가 LLM에게 "문제 텍스트"를 주고 "답 텍스트"를 받아 채점하듯이,
	WM Bench는 "상황 JSON"을 주고 "판단 텍스트"를 받아 채점한다.

	이것이 의미하는 것:
	- 어떤 월드모델이든 참여 가능 (API 하나면 됨)
	- 3D 환경, 로봇, 시뮬레이터 불필요
	- 셀프 평가 아님 — 우리 채점기가 판정
	- 제3자가 재현 가능 — 코드 공개
	"""

	import json
	from typing import List, Dict, Tuple, Optional
	from dataclasses import dataclass


	# ═══════════════════════════════════════════════════════════════
	# SECTION 1: 평가 프로토콜 — 3가지 트랙
	# ═══════════════════════════════════════════════════════════════

	"""
	WM Bench는 3개 트랙으로 참여할 수 있다.

	━━━ Track A: Text-Only (텍스트 전용) ━━━
	- 가장 간단. LLM, 룰 기반 시스템 등 모두 참여 가능.
	- scene_context JSON 입력 → PREDICT+MOTION 텍스트 출력
	- P1(인식) + P2(인지) 평가 가능
	- P3 중 C08(표현력)만 평가 가능 (C09, C10은 N/A)
	- 최대 점수: 750/1000

	━━━ Track B: Text + Performance (텍스트 + 성능) ━━━
	- Track A + 실시간 성능 메트릭 제출
	- FPS, 지연시간, 메모리 사용량 등 자가 측정 제출
	- P1 + P2 + P3(C08, C09) 평가
	- C10(교체 확장성)은 증빙 자료 제출로 평가
	- 최대 점수: 1000/1000

	━━━ Track C: Live Demo (라이브 데모) ━━━
	- Track B + 실제 동작 영상/데모 URL 제출
	- 검증자가 직접 데모를 돌려서 확인
	- 모든 항목 평가 + "Verified" 배지
	- 최대 점수: 1000/1000 + ✓ Verified

	대부분의 참가자는 Track A로 참여.
	Track B, C는 상위 모델 검증용.
	"""

	# Participation tracks, keyed by track letter.
	# Every entry carries the display name, a short description, the submission
	# requirements, the maximum attainable score and the list of category IDs
	# that can be evaluated on that track.
	TRACKS = {
	    "A": {
	        "name": "Text-Only",
	        "description": "scene_context JSON → PREDICT+MOTION 텍스트",
	        "requirements": "API 또는 스크립트로 50개 시나리오에 응답",
	        "max_score": 750,
	        "evaluable_categories": [
	            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08",
	        ],
	        "not_evaluable": ["C09 (성능 측정 불가)", "C10 (교체 테스트 불가)"],
	    },
	    "B": {
	        "name": "Text + Performance",
	        "description": "Track A + 실시간 성능 메트릭 자가 측정",
	        "requirements": "Track A 결과 + performance_metrics.json 제출",
	        "max_score": 1000,
	        "evaluable_categories": [
	            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10",
	        ],
	    },
	    "C": {
	        "name": "Live Demo",
	        "description": "Track B + 실제 동작 데모 URL 제출",
	        "requirements": "Track B 결과 + 데모 URL + 영상",
	        "max_score": 1000,
	        # Track C builds on Track B and evaluates every item, so it exposes
	        # the same category list; without this key, code that reads
	        # "evaluable_categories" for each track fails on "C".
	        "evaluable_categories": [
	            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10",
	        ],
	        "badge": "✓ Verified",
	    },
	}


	# ═══════════════════════════════════════════════════════════════
	# SECTION 2: 표준 입력 포맷 — scene_context JSON
	# ═══════════════════════════════════════════════════════════════

	"""
	모든 참가자는 이 JSON을 입력으로 받는다.
	이 JSON이 "문제지"다.
	"""

	@dataclass
	class SceneContext:
	    """Standard WM Bench input record (one scene_context).

	    Only ``walls``, ``ground`` and ``npc_nearby`` are required. Every
	    ``Optional`` field defaults to ``None`` so a scene without an NPC, sound
	    or history can be built without spelling out each absent value.
	    Positional construction with all ten values keeps working unchanged.
	    """

	    # Environment
	    walls: Dict[str, Optional[float]]  # e.g. {"left": 2.5, "right": None, "front": 1.0}
	    ground: str  # "flat", "slope", "rough"

	    # NPC
	    npc_nearby: bool
	    npc_type: Optional[str] = None  # "beast", "woman", "man", or None
	    npc_behavior: Optional[str] = None  # "stop", "approach", "charge", "wander"
	    npc_distance: Optional[float] = None  # meters
	    npc_direction: Optional[str] = None  # "left", "right", "front", "back"

	    # Senses
	    sound: Optional[str] = None  # "aggressive growling", "footsteps", or None

	    # Context (used by the C06 memory test)
	    recent_decisions: Optional[List[str]] = None  # the last 3 decisions
	    last_prediction: Optional[str] = None  # the previous PREDICT line


	# Benchmark scenarios structured as JSON-style dicts. The protocol text
	# refers to 50 scenarios; this list defines 13 entries.
	# Each entry has "id", "category", "name_kr", a "ground_truth" dict with a
	# "scoring_method", and either a single "input" scene or an "input_sequence"
	# of scenes.
	SCENARIO_INPUTS: List[dict] = [
	# ─── C01: Environmental Awareness ───
	{
	"id": "S01",
	"category": "C01",
	"name_kr": "전방 벽 감지",
	"input": {
	"walls": {"left": None, "right": None, "front": 3.0},
	"ground": "flat",
	"npc_nearby": False,
	"npc_type": None,
	"npc_behavior": None,
	"npc_distance": None,
	"npc_direction": None,
	"sound": None,
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "safe", "right": "safe", "fwd": "danger", "back": "safe"},
	"scoring_method": "C01",
	},
	},
	{
	"id": "S02",
	"category": "C01",
	"name_kr": "코너 다중 벽 감지",
	"input": {
	"walls": {"left": 1.5, "right": None, "front": 2.0},
	"ground": "flat",
	"npc_nearby": False,
	"npc_type": None,
	"npc_behavior": None,
	"npc_distance": None,
	"npc_direction": None,
	"sound": None,
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "danger", "right": "safe", "fwd": "danger", "back": "safe"},
	"scoring_method": "C01",
	},
	},
	{
	"id": "S03",
	"category": "C01",
	"name_kr": "좁은 복도 인식",
	"input": {
	"walls": {"left": 1.0, "right": 1.0, "front": None},
	"ground": "flat",
	"npc_nearby": False,
	"npc_type": None,
	"npc_behavior": None,
	"npc_distance": None,
	"npc_direction": None,
	"sound": None,
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "danger", "right": "danger", "fwd": "safe", "back": "safe"},
	"scoring_method": "C01",
	},
	},
	{
	"id": "S04",
	"category": "C01",
	"name_kr": "열린 공간 인식",
	"input": {
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": False,
	"npc_type": None,
	"npc_behavior": None,
	"npc_distance": None,
	"npc_direction": None,
	"sound": None,
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "safe", "right": "safe", "fwd": "safe", "back": "safe"},
	"scoring_method": "C01",
	},
	},
	{
	"id": "S05",
	"category": "C01",
	"name_kr": "밀폐 공간 (출구 1개)",
	"input": {
	"walls": {"left": 1.0, "right": 1.0, "front": 1.5},
	"ground": "flat",
	"npc_nearby": False,
	"npc_type": None,
	"npc_behavior": None,
	"npc_distance": None,
	"npc_direction": None,
	"sound": None,
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "danger", "right": "danger", "fwd": "danger", "back": "safe"},
	"scoring_method": "C01",
	},
	},

	# ─── C03: Predictive Reasoning (core scenarios) ───
	{
	"id": "S11",
	"category": "C03",
	"name_kr": "단일 위협 회피",
	"input": {
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "approach",
	"npc_distance": 4.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "safe", "right": "safe", "fwd": "danger", "back": "safe"},
	"decision_gt": {
	"danger_directions": ["fwd"],
	"safe_directions": ["left", "right", "back"],
	"optimal_direction": "back",
	},
	"scoring_method": "C03",
	},
	},
	{
	"id": "S12",
	"category": "C03",
	"name_kr": "제약 조건 탈출 — 왼벽+맹수",
	"input": {
	"walls": {"left": 1.5, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "danger", "right": "safe", "fwd": "danger", "back": "safe"},
	"decision_gt": {
	"danger_directions": ["fwd", "left"],
	"safe_directions": ["right", "back"],
	"optimal_direction": "right",
	},
	"scoring_method": "C03",
	},
	},
	# S13 mirrors S12 (wall on the right instead of the left); its ground truth
	# references S12 through "mirror_test_pair".
	{
	"id": "S13",
	"category": "C03",
	"name_kr": "거울 대칭 — 오른벽+맹수",
	"input": {
	"walls": {"left": None, "right": 1.5, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"predict_gt": {"left": "safe", "right": "danger", "fwd": "danger", "back": "safe"},
	"decision_gt": {
	"danger_directions": ["fwd", "right"],
	"safe_directions": ["left", "back"],
	"optimal_direction": "left",
	},
	"scoring_method": "C03",
	"mirror_test_pair": "S12",
	"note": "S12와 S13의 행동이 대칭적으로 반전되어야 월드모델 증명",
	},
	},

	# ─── C04: Threat Differentiation (pairwise comparison) ───
	# S16A and S16B share "pair_id" "S16"; only the B entry carries the
	# comparison parameters ("expected_a_higher", "min_intensity_diff").
	{
	"id": "S16A",
	"category": "C04",
	"name_kr": "맹수 접근 (비교 A)",
	"input": {
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "approach",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"scoring_method": "C04_pair",
	"pair_id": "S16",
	"pair_role": "A",
	},
	},
	{
	"id": "S16B",
	"category": "C04",
	"name_kr": "여성 접근 (비교 B)",
	"input": {
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "woman",
	"npc_behavior": "approach",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "footsteps",
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"scoring_method": "C04_pair",
	"pair_id": "S16",
	"pair_role": "B",
	"expected_a_higher": True,
	"min_intensity_diff": 2,
	},
	},

	# ─── C05: Emotional Escalation (sequential input) ───
	# NOTE(review): the name and note mention 5 consecutive inputs, but
	# "input_sequence" below holds 3 steps — confirm which is intended.
	{
	"id": "S21_seq",
	"category": "C05",
	"name_kr": "지속 위협 감정 격화 — 5회 연속",
	"note": "동일 scene_context를 5회 연속 입력. 매회 recent_decisions 업데이트.",
	"input_sequence": [
	{
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 4.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [],
	"last_prediction": None,
	},
	{
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": ["sprint away from beast"],
	"last_prediction": "fwd=danger(beast)",
	},
	{
	"walls": {"left": None, "right": None, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 2.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": ["sprint away from beast", "running in fear"],
	"last_prediction": "fwd=danger(beast)",
	},
	],
	"ground_truth": {
	"scoring_method": "C05",
	"expected_trend": "increasing",
	},
	},

	# ─── C06: Memory (with vs. without memory) ───
	# NOTE(review): neither C06 entry has a "pair_id", and their roles are
	# "without_memory"/"with_memory" rather than "A"/"B" — confirm that the
	# evaluator pairs and scores them as intended.
	{
	"id": "S26_no_memory",
	"category": "C06",
	"name_kr": "벽 기억 없이 — 기준선",
	"input": {
	"walls": {"left": None, "right": 1.5, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [],
	"last_prediction": None,
	},
	"ground_truth": {
	"scoring_method": "C06_pair",
	"pair_role": "without_memory",
	},
	},
	{
	"id": "S26_with_memory",
	"category": "C06",
	"name_kr": "벽 기억 있음 — 이전에 오른쪽 실패",
	"input": {
	"walls": {"left": None, "right": 1.5, "front": None},
	"ground": "flat",
	"npc_nearby": True,
	"npc_type": "beast",
	"npc_behavior": "charge",
	"npc_distance": 3.0,
	"npc_direction": "front",
	"sound": "aggressive growling",
	"recent_decisions": [
	"sprinted right but hit wall",
	"had to reverse and go left",
	"barely escaped the beast",
	],
	"last_prediction": "right=danger(wall), fwd=danger(beast)",
	},
	"ground_truth": {
	"scoring_method": "C06_pair",
	"pair_role": "with_memory",
	"memory_relevant": True,
	"expected_change": "direction",
	"memory_direction_avoid": "right",
	},
	},
	]


	# ═══════════════════════════════════════════════════════════════
	# SECTION 3: 표준 시스템 프롬프트 — 모든 모델에 동일하게 적용
	# ═══════════════════════════════════════════════════════════════

	"""
	핵심: 모든 참가 모델은 이 프롬프트를 받고 응답한다.
	프롬프트가 공정하게 설계되어야 LLM 기반이든 RL 기반이든 동일 조건.
	"""

	# Standard system prompt handed to the inference function for every scenario.
	# The two allowed verdicts are separated by a plain "|": a backslash-pipe is
	# not a valid Python escape sequence and would leak a stray backslash into
	# the text the model receives.
	SYSTEM_PROMPT = """You are the cognitive brain of an embodied agent in a 3D environment.
	You receive a scene_context JSON describing your surroundings and must output exactly 2 lines:

	Line 1 — PREDICT: Assess safety of each direction.
	Format: PREDICT: left=safe|danger(reason), right=safe|danger(reason), fwd=safe|danger(reason), back=safe|danger(reason)

	Line 2 — MOTION: Describe what the person should do.
	Format: MOTION: a person [action description, max 12 words]

	Rules:
	- If walls.left is a number (distance in meters), left direction has a wall → danger(wall)
	- If walls.left is null, left direction is open → safe(open)
	- Same for right, front
	- If npc_nearby=true and npc_type="beast", the NPC direction is danger(beast)
	- If npc_nearby=true and npc_type="woman" or "man", assess threat level based on behavior
	- MOTION must reflect the PREDICT assessment — never move toward danger
	- MOTION should include emotional nuance when threats are present
	- Use recent_decisions to inform your choice (avoid repeating failed strategies)

	Example input:
	{"walls": {"left": 1.5, "right": null, "front": null}, "ground": "flat", "npc_nearby": true, "npc_type": "beast", "npc_behavior": "charge", "npc_distance": 3.0, "npc_direction": "front", "sound": "aggressive growling", "recent_decisions": [], "last_prediction": null}

	Example output:
	PREDICT: left=danger(wall), right=safe(open), fwd=danger(beast), back=safe(open)
	MOTION: a person sprinting right in terror to escape the charging beast"""

	# Per-scenario user prompt; "{scene_json}" is filled in via str.format().
	USER_PROMPT_TEMPLATE = (
	    "scene_context = {scene_json}\n"
	    "\n"
	    "Output exactly 2 lines: PREDICT and MOTION."
	)


	# ═══════════════════════════════════════════════════════════════
	# SECTION 4: 평가 실행기 — 어떤 모델이든 평가
	# ═══════════════════════════════════════════════════════════════

	"""
	참가자가 해야 할 것:
	1. evaluate() 함수에 자기 모델의 inference 함수를 넘긴다
	2. inference 함수는 (system_prompt, user_prompt) → str 형태
	3. 50개 시나리오를 자동으로 돌리고 채점한다
	4. 결과 JSON을 HF에 제출한다

	참가자가 안 해도 되는 것:
	- 3D 환경 구축
	- GPU 성능 측정 (Track A는 불필요)
	- 채점 (자동)
	"""


	def make_user_prompt(scene_input: dict) -> str:
	    """Render a scene_context dict into the user prompt text.

	    The dict is serialised as JSON (non-ASCII characters kept as-is) and
	    substituted into ``USER_PROMPT_TEMPLATE``.
	    """
	    serialized_scene = json.dumps(scene_input, ensure_ascii=False)
	    return USER_PROMPT_TEMPLATE.format(scene_json=serialized_scene)


	def evaluate_track_a(
	    inference_fn,  # (system_prompt: str, user_prompt: str) -> str
	    scenarios: Optional[list] = None,
	    verbose: bool = True,
	) -> dict:
	    """Run the Track A (text-only) evaluation against a model.

	    Each scenario is rendered into a user prompt, sent to ``inference_fn``
	    together with ``SYSTEM_PROMPT``, and the PREDICT / MOTION lines of the
	    reply are parsed and scored with the functions from ``wm_bench_scoring``.

	    Usage::

	        # OpenAI API based model
	        def my_model(system_prompt, user_prompt):
	            response = openai.chat.completions.create(
	                model="gpt-4",
	                messages=[
	                    {"role": "system", "content": system_prompt},
	                    {"role": "user", "content": user_prompt},
	                ],
	            )
	            return response.choices[0].message.content

	        results = evaluate_track_a(my_model)

	        # Hugging Face model
	        def my_hf_model(system_prompt, user_prompt):
	            prompt = f"{system_prompt}\\n\\n{user_prompt}"
	            return pipeline(prompt)[0]["generated_text"]

	        results = evaluate_track_a(my_hf_model)

	    Args:
	        inference_fn: Callable ``(system_prompt, user_prompt) -> str``
	            returning the raw model reply. A ``None`` reply is treated as an
	            empty reply.
	        scenarios: Scenario dicts to run; defaults to ``SCENARIO_INPUTS``.
	        verbose: When true, print one progress line per scenario.

	    Returns:
	        The dict produced by ``calculate_wm_score(category_totals)`` with an
	        extra ``"scenario_details"`` key holding the per-scenario records
	        (score and reasoning for each scenario). The remaining keys are
	        defined by ``wm_bench_scoring``; presumably they include
	        ``"wm_score"``, ``"grade"``, ``"pillar_scores"`` and
	        ``"category_scores"`` — confirm against that module.
	    """
	    if scenarios is None:
	        scenarios = SCENARIO_INPUTS

	    # Function-scope import: the scoring module is only needed once an
	    # evaluation actually runs.
	    from wm_bench_scoring import (
	        parse_predict_line, parse_motion_line,
	        score_c01, score_c03, score_c04, score_c05,
	        score_c08, calculate_wm_score,
	    )

	    results = []
	    # (record, ground_truth) for every scenario whose score depends on its
	    # pair partner; kept here so the ground truth need not be looked up again.
	    pending_pairs = []

	    for scenario in scenarios:
	        sid = scenario["id"]
	        cat = scenario["category"]
	        gt = scenario["ground_truth"]
	        method = gt["scoring_method"]

	        if verbose:
	            print(f" [{sid}] {scenario.get('name_kr', sid)}...", end=" ")

	        # ── Single-input scenario ──
	        if "input" in scenario:
	            prompt = make_user_prompt(scenario["input"])
	            raw_output = inference_fn(SYSTEM_PROMPT, prompt)

	            # Parse: the last PREDICT line and the last MOTION line win.
	            predict_line = ""
	            motion_line = ""
	            for line in (raw_output or "").strip().split("\n"):
	                line = line.strip()
	                tag = line.upper()
	                if tag.startswith("PREDICT"):
	                    predict_line = line
	                elif tag.startswith("MOTION"):
	                    motion_line = line

	            predict = parse_predict_line(predict_line)
	            motion = parse_motion_line(motion_line)

	            # Score
	            awaiting_pair = False
	            if method == "C01":
	                score, reasoning = score_c01(
	                    scenario["input"], predict, gt["predict_gt"]
	                )
	            elif method == "C03":
	                score, reasoning = score_c03(
	                    scenario["input"], predict, motion, gt["decision_gt"]
	                )
	            elif method == "C08":
	                score, reasoning = score_c08(motion, gt)
	            elif method.startswith("C04_pair") or method.startswith("C06_pair"):
	                # Pair comparisons are scored after the main loop.
	                score = None
	                reasoning = "pair_pending"
	                awaiting_pair = True
	            else:
	                score = 0
	                reasoning = f"Unknown scoring method: {method}"

	            record = {
	                "id": sid,
	                "category": cat,
	                "raw_output": raw_output,
	                "predict_parsed": {k: v.raw for k, v in predict.items()},
	                "motion_parsed": motion,
	                "score": score,
	                "reasoning": reasoning,
	            }
	            results.append(record)
	            if awaiting_pair:
	                pending_pairs.append((record, gt))

	        # ── Sequential-input scenario (C05) ──
	        elif "input_sequence" in scenario:
	            motions = []
	            for seq_input in scenario["input_sequence"]:
	                prompt = make_user_prompt(seq_input)
	                raw_output = inference_fn(SYSTEM_PROMPT, prompt)
	                # Only the first MOTION line of each reply is used. The line
	                # is stripped before parsing, as in the single-input branch.
	                for line in (raw_output or "").strip().split("\n"):
	                    line = line.strip()
	                    if line.upper().startswith("MOTION"):
	                        motions.append(parse_motion_line(line))
	                        break

	            score, reasoning = score_c05(motions, gt)
	            results.append({
	                "id": sid,
	                "category": cat,
	                "motion_sequence": motions,
	                "score": score,
	                "reasoning": reasoning,
	            })

	        # ── Scenario without any input: nothing to evaluate ──
	        else:
	            # Without this branch the progress print below would read a
	            # `score` that is unbound or left over from the previous scenario.
	            if verbose:
	                print("(skipped: no input)")
	            continue

	        if verbose:
	            print(f"{score}/20" if score is not None else "(pair pending)")

	    # ── Pair scoring (C04, C06) ──
	    pair_groups = {}
	    for record, gt in pending_pairs:
	        pair_id = gt.get("pair_id", record["id"].rstrip("AB_"))
	        group = pair_groups.setdefault(pair_id, {"gt": {}})
	        group[gt.get("pair_role", "A")] = record
	        # Merge both members' ground truth so the comparison parameters are
	        # available whichever member of the pair comes last.
	        group["gt"].update(gt)

	    # NOTE(review): only groups with roles "A" and "B" are scored, and always
	    # with score_c04. Entries with other roles (e.g. "with_memory" /
	    # "without_memory") keep score None and reasoning "pair_pending".
	    for group in pair_groups.values():
	        if "A" in group and "B" in group:
	            score, reasoning = score_c04(
	                group["A"]["motion_parsed"],
	                group["B"]["motion_parsed"],
	                group["gt"],
	            )
	            # The pair's score is stored on A only so the category total
	            # counts it once; B gets 0.
	            group["A"]["score"] = score
	            group["A"]["reasoning"] = reasoning
	            group["B"]["score"] = 0
	            group["B"]["reasoning"] = "scored in pair A"

	    # ── Per-category totals ──
	    category_totals = {}
	    for record in results:
	        points = record["score"]
	        if points is not None and points > 0:
	            category = record["category"]
	            category_totals[category] = category_totals.get(category, 0) + points

	    # ── Final WM Score ──
	    final = calculate_wm_score(category_totals)
	    final["scenario_details"] = results

	    return final


	# ═══════════════════════════════════════════════════════════════
	# SECTION 5: 제출 포맷
	# ═══════════════════════════════════════════════════════════════

	# Schema of a leaderboard submission: each value describes the expected type
	# and meaning of the field. Type alternatives are written with a plain "|";
	# a backslash-pipe is not a valid Python escape sequence and would put a
	# stray backslash into the description.
	SUBMISSION_FORMAT = {
	    "model_name": "str — 모델명 (예: VIDRAFT PROMETHEUS v1.0)",
	    "organization": "str — 조직명",
	    "track": "str — A | B | C",
	    "brain_model": "str — 사용한 인지 모델 (예: Kimi K2.5, GPT-4, custom RL)",
	    "motion_model": "str | null — 모션 생성 모델 (Track A는 null 가능)",
	    "wm_score": "int — 자동 산출됨",
	    "grade": "str — 자동 산출됨",
	    "results_json": "str — evaluate_track_a()의 전체 출력",
	    "performance_metrics": {
	        "fps": "float | null — Track B/C만",
	        "cognitive_latency_ms": "int | null",
	        "gpu": "str | null",
	    },
	    "demo_url": "str | null — Track C만",
	    "paper_url": "str | null — 선택",
	}


	# ═══════════════════════════════════════════════════════════════
	# SECTION 6: 사용 예시
	# ═══════════════════════════════════════════════════════════════

	# Copy-paste usage examples kept as a plain string; nothing in this module
	# executes them. The examples cover an OpenAI model, Claude, a local vLLM
	# model, a custom RL agent, and building a submission dict.
	USAGE_EXAMPLES = """
	# ━━━ 예시 1: OpenAI GPT-4로 참여 ━━━

	from wm_bench_eval import evaluate_track_a, SYSTEM_PROMPT
	import openai

	def gpt4_inference(system_prompt, user_prompt):
	response = openai.chat.completions.create(
	model="gpt-4o",
	messages=[
	{"role": "system", "content": system_prompt},
	{"role": "user", "content": user_prompt},
	],
	max_tokens=150,
	temperature=0.3,
	)
	return response.choices[0].message.content

	results = evaluate_track_a(gpt4_inference)
	print(f"WM Score: {results['wm_score']}/1000 (Grade {results['grade']})")


	# ━━━ 예시 2: Claude로 참여 ━━━

	import anthropic

	def claude_inference(system_prompt, user_prompt):
	client = anthropic.Anthropic()
	message = client.messages.create(
	model="claude-sonnet-4-20250514",
	max_tokens=150,
	system=system_prompt,
	messages=[{"role": "user", "content": user_prompt}],
	)
	return message.content[0].text

	results = evaluate_track_a(claude_inference)


	# ━━━ 예시 3: 로컬 LLM (vLLM)으로 참여 ━━━

	from vllm import LLM, SamplingParams

	llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3")
	params = SamplingParams(max_tokens=150, temperature=0.3)

	def local_inference(system_prompt, user_prompt):
	prompt = f"[INST] {system_prompt}\\n\\n{user_prompt} [/INST]"
	outputs = llm.generate([prompt], params)
	return outputs[0].outputs[0].text

	results = evaluate_track_a(local_inference)


	# ━━━ 예시 4: 커스텀 RL 에이전트로 참여 ━━━

	def rl_agent_inference(system_prompt, user_prompt):
	# scene_context에서 JSON 파싱
	import json, re
	match = re.search(r'scene_context = ({.*})', user_prompt, re.DOTALL)
	scene = json.loads(match.group(1))

	# RL 에이전트의 policy로 판단
	predict = my_rl_agent.predict(scene)
	motion = my_rl_agent.decide_motion(scene, predict)

	# WM Bench 포맷으로 변환
	return f"PREDICT: {predict}\\nMOTION: {motion}"

	results = evaluate_track_a(rl_agent_inference)


	# ━━━ 예시 5: 결과 제출 ━━━

	import json

	submission = {
	"model_name": "My World Model v1.0",
	"organization": "My Company",
	"track": "A",
	"brain_model": "GPT-4o",
	"motion_model": None,
	"wm_score": results["wm_score"],
	"grade": results["grade"],
	"results_json": json.dumps(results),
	}

	# HuggingFace에 제출
	# huggingface_hub.upload_file(...)
	"""


	if __name__ == "__main__":
	    # Script entry point: print a short overview of the protocol (tracks,
	    # number of loaded scenarios, prompt size, participation steps).
	    rule = "=" * 60
	    track_rows = [
	        f" Track {tid}: {t['name']} (max {t['max_score']}pts)"
	        for tid, t in TRACKS.items()
	    ]
	    overview = [
	        rule,
	        " World Model Bench — Evaluation Protocol v1.0",
	        rule,
	        "",
	        " Tracks:",
	        *track_rows,
	        "",
	        f" Scenarios loaded: {len(SCENARIO_INPUTS)}",
	        f" System prompt: {len(SYSTEM_PROMPT)} chars",
	        "",
	        " How to participate:",
	        " 1. Write an inference function: (system, user) → str",
	        " 2. Run: results = evaluate_track_a(your_fn)",
	        " 3. Submit results to HuggingFace",
	        "",
	        " No 3D environment needed. Text in, text out.",
	        rule,
	    ]
	    # One print of the joined lines emits the same text as printing each
	    # line separately.
	    print("\n".join(overview))