VLLM Local Evaluation Guide for macOS with OrbStack
This document is a step-by-step guide for running a VLLM model in an OrbStack environment on macOS and performing LLM evaluations against it.
Table of Contents
- Prerequisites
- OrbStack Installation and Setup
- Running the VLLM Server
- Setting Up the Evaluation Environment
- Running the Deepeval Evaluation
- Running the Evalchemy Benchmarks
- Result Analysis and Visualization
- Troubleshooting
Prerequisites
System Requirements
- macOS: 13.0 or later (Apple Silicon recommended)
- RAM: 16 GB minimum, 32 GB recommended
- Disk: at least 20 GB of free space
- GPU: Apple Silicon GPU or NVIDIA GPU (optional)
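The requirements above can be checked quickly from a terminal; the commands below are a minimal sketch using standard macOS tools.
# Check macOS version, installed RAM, free disk space, and CPU type
sw_vers -productVersion
echo "$(($(sysctl -n hw.memsize) / 1024 / 1024 / 1024)) GB RAM"
df -h ~
sysctl -n machdep.cpu.brand_string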
Installing Required Tools
# Install Homebrew (if not already installed)
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
# Install the required tools
brew install python@3.11 git curl jq
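A quick version check confirms the tools are on the PATH (optional):
# Confirm the installed tool versions
python3.11 --version
git --version
jq --version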
OrbStack Installation and Setup
1. Install OrbStack
# Install OrbStack (recommended over Docker Desktop)
brew install --cask orbstack
# Start OrbStack
open -a OrbStack
# Wait for OrbStack to come up
while ! docker info > /dev/null 2>&1; do
  echo "Waiting for OrbStack to start..."
  sleep 3
done
echo "OrbStack started successfully."
2. Clone and Set Up the Project
# Clone the project
git clone https://github.com/your-org/vllm-eval.git
cd vllm-eval
# Create a Python virtual environment
python3.11 -m venv venv
source venv/bin/activate
# Install the required packages
pip install --upgrade pip
pip install -r requirements-dev.txt
pip install -r requirements-deepeval.txt
pip install -r requirements-evalchemy.txt
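Assuming the requirements files pin deepeval and the openai client (the packages the scripts below import), a quick sanity check of the virtual environment looks like this:
# Confirm the key evaluation packages are installed and importable
pip show deepeval
python -c "import openai, deepeval; print('evaluation packages OK')"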
Running the VLLM Server
1. Download and Run the Model
# Start the VLLM server (example: Qwen2-7B model)
docker run -d \
  --name vllm-server \
  --gpus all \
  -p 8000:8000 \
  vllm/vllm-openai:latest \
  --model "Qwen/Qwen2-7B-Instruct" \
  --served-model-name "qwen3-8b" \
  --host 0.0.0.0 \
  --port 8000
# Check the server logs
docker logs vllm-server
# Test the API
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-8b",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "What is the capital of Korea?"}
    ],
    "temperature": 0.7,
    "max_tokens": 256,
    "top_p": 0.95,
    "stream": false
  }'
2. Model Server Health Check
# List the available models
curl http://localhost:8000/v1/models | jq
# Check the server health endpoint
curl http://localhost:8000/health
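Model loading can take several minutes on first start, so it can help to poll the health endpoint before kicking off an evaluation; a minimal sketch:
# Poll the health endpoint until the server is ready
until curl -sf http://localhost:8000/health > /dev/null; do
  echo "Waiting for the VLLM server to become ready..."
  sleep 10
done
echo "VLLM server is ready."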
Setting Up the Evaluation Environment
1. Configure Environment Variables
# Create the .env file
cat > .env << 'EOF'
# VLLM model endpoint
VLLM_MODEL_ENDPOINT=http://localhost:8000/v1
MODEL_NAME=qwen3-8b
# Evaluation settings
EVAL_CONFIG_PATH=configs/evalchemy.json
OUTPUT_DIR=./test_results
RUN_ID=local_eval_$(date +%Y%m%d_%H%M%S)
# Logging
LOG_LEVEL=INFO
PYTHONPATH=.
EOF
# Load the environment variables
source .env
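A quick echo confirms the variables were loaded into the current shell:
# Verify the loaded environment variables
echo "Endpoint: $VLLM_MODEL_ENDPOINT"
echo "Model:    $MODEL_NAME"
echo "Run ID:   $RUN_ID"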
2. Prepare a Test Dataset
# Create the results directory
mkdir -p test_results
# Create a small test dataset
mkdir -p datasets/raw/local_test_dataset
cat > datasets/raw/local_test_dataset/test.jsonl << 'EOF'
{"input": "What is the capital of Korea?", "expected_output": "The capital of Korea is Seoul.", "context": "A question about Korean geography."}
{"input": "How do you sort a list in Python?", "expected_output": "To sort a list in Python, you can use the sort() method or the sorted() function.", "context": "A programming-related question."}
{"input": "What is the circumference of the Earth?", "expected_output": "The circumference of the Earth is about 40,075 km.", "context": "A question about Earth science."}
EOF
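Optionally, the dataset can be sanity-checked before running anything; jq exits with an error on the first malformed line:
# Validate that every line of the dataset is well-formed JSON
jq -c . datasets/raw/local_test_dataset/test.jsonl > /dev/null && echo "Dataset looks valid."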
Running the Deepeval Evaluation
1. Create a Custom Evaluation Script
# Create the local evaluation script
cat > scripts/run_local_deepeval.py << 'EOF'
#!/usr/bin/env python3
"""
Deepeval evaluation script that targets a local VLLM server.
"""
import os
import json
import asyncio
from typing import List, Dict, Any
from deepeval import evaluate
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    AnswerRelevancyMetric
)
import openai
import logging

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class VLLMModel(DeepEvalBaseLLM):
    """Model wrapper for the VLLM OpenAI-compatible API."""

    def __init__(self, model_name: str = "qwen3-8b", base_url: str = "http://localhost:8000/v1"):
        self.model_name = model_name
        self.client = openai.OpenAI(
            base_url=base_url,
            api_key="dummy"  # VLLM does not require an API key
        )

    def load_model(self):
        return self.model_name

    def generate(self, prompt: str, schema: Dict = None) -> str:
        """Generate text for a prompt."""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=512
            )
            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Generation failed: {e}")
            return ""

    async def a_generate(self, prompt: str, schema: Dict = None) -> str:
        """Asynchronous text generation (delegates to the synchronous path)."""
        return self.generate(prompt, schema)

    def get_model_name(self) -> str:
        return self.model_name


def load_test_dataset(file_path: str) -> List[Dict[str, Any]]:
    """Load a JSONL test dataset."""
    test_cases = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            test_cases.append(json.loads(line.strip()))
    return test_cases


def create_test_cases(dataset: List[Dict], model: VLLMModel) -> List[LLMTestCase]:
    """Create LLMTestCase objects for each dataset item."""
    test_cases = []
    for item in dataset:
        # Generate the actual response from the model
        actual_output = model.generate(item["input"])
        test_case = LLMTestCase(
            input=item["input"],
            actual_output=actual_output,
            expected_output=item["expected_output"],
            context=[item.get("context", "")]
        )
        test_cases.append(test_case)
        logger.info(f"Created test case: {item['input'][:50]}...")
    return test_cases


def main():
    """Run the evaluation end to end."""
    # Initialize the model
    model = VLLMModel()

    # Load the test dataset created earlier in this guide
    dataset_path = "datasets/raw/local_test_dataset/test.jsonl"
    dataset = load_test_dataset(dataset_path)

    # Create the test cases
    test_cases = create_test_cases(dataset, model)

    # Define the evaluation metrics
    metrics = [
        AnswerRelevancyMetric(
            threshold=0.7,
            model=model,
            include_reason=True
        ),
        ContextualRelevancyMetric(
            threshold=0.7,
            model=model,
            include_reason=True
        )
    ]

    # Run the evaluation
    logger.info("Starting evaluation...")
    results = evaluate(
        test_cases=test_cases,
        metrics=metrics,
        print_results=True
    )

    # Save the results
    output_dir = os.getenv("OUTPUT_DIR", "./test_results")
    os.makedirs(output_dir, exist_ok=True)
    results_file = f"{output_dir}/deepeval_results_{os.getenv('RUN_ID', 'local')}.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "test_results": [
                {
                    "input": tc.input,
                    "actual_output": tc.actual_output,
                    "expected_output": tc.expected_output,
                    "metrics": {
                        metric.__class__.__name__: {
                            "score": getattr(metric, 'score', None),
                            "threshold": getattr(metric, 'threshold', None),
                            "success": getattr(metric, 'success', None),
                            "reason": getattr(metric, 'reason', None)
                        }
                        for metric in metrics
                    }
                }
                for tc in test_cases
            ]
        }, f, ensure_ascii=False, indent=2)
    logger.info(f"Results saved to: {results_file}")
    return results


if __name__ == "__main__":
    main()
EOF
# Make the script executable
chmod +x scripts/run_local_deepeval.py
2. Run Deepeval
# Run the Deepeval evaluation
python scripts/run_local_deepeval.py
# Check the results
ls -la test_results/
cat test_results/deepeval_results_*.json | jq
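To look at just the metric scores, the per-test-case structure written by the script above can be filtered with jq:
# Show only the input and metric blocks for each test case
jq '.test_results[] | {input, metrics}' test_results/deepeval_results_*.json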
Running the Evalchemy Benchmarks
1. Local Evalchemy Configuration
# Create an Evalchemy configuration file for local runs
cat > eval/evalchemy/configs/local_eval_config.json << 'EOF'
{
  "benchmarks": {
    "arc_easy": {
      "enabled": true,
      "tasks": ["arc_easy"],
      "num_fewshot": 5,
      "batch_size": 4,
      "limit": 10,
      "description": "ARC Easy benchmark (local test)",
      "metrics": ["acc", "acc_norm"]
    },
    "hellaswag": {
      "enabled": true,
      "tasks": ["hellaswag"],
      "num_fewshot": 10,
      "batch_size": 4,
      "limit": 10,
      "description": "HellaSwag benchmark (local test)",
      "metrics": ["acc", "acc_norm"]
    }
  }
}
EOF
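The configuration can be validated and the enabled benchmarks listed before running anything; a quick check with jq:
# Validate the config file and list the configured benchmarks
jq -r '.benchmarks | keys[]' eval/evalchemy/configs/local_eval_config.json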
2. Local Evalchemy Runner Script
# Create the local Evalchemy runner script
cat > scripts/run_local_evalchemy.py << 'EOF'
#!/usr/bin/env python3
"""
Evalchemy benchmark runner that targets a local VLLM server.
"""
import os
import json
import subprocess
import logging
from typing import Dict, Any

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def run_evalchemy_benchmark(config_path: str, output_dir: str) -> Dict[str, Any]:
    """Run the Evalchemy benchmarks via lm_eval."""
    # Set up the environment
    env = os.environ.copy()
    env.update({
        "VLLM_MODEL_ENDPOINT": "http://localhost:8000/v1",
        "MODEL_NAME": "qwen3-8b",
        "OUTPUT_DIR": output_dir,
        "EVAL_CONFIG_PATH": config_path,
        # Some lm_eval OpenAI backends expect an API key to be set, even for local endpoints
        "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY", "dummy")
    })

    # Build the lm_eval command
    cmd = [
        "lm_eval",
        "--model", "openai-chat-completions",
        "--model_args", f"base_url=http://localhost:8000/v1,model={env['MODEL_NAME']},tokenizer={env['MODEL_NAME']}",
        "--tasks", "arc_easy,hellaswag",
        "--num_fewshot", "5",
        "--batch_size", "4",
        "--limit", "10",
        "--output_path", f"{output_dir}/evalchemy_results.json",
        "--log_samples"
    ]
    logger.info(f"Running command: {' '.join(cmd)}")

    try:
        # Run the benchmark
        result = subprocess.run(
            cmd,
            env=env,
            capture_output=True,
            text=True,
            timeout=3600  # 1-hour timeout
        )
        if result.returncode == 0:
            logger.info("Evalchemy benchmark completed successfully")
            # Read the results file
            results_file = f"{output_dir}/evalchemy_results.json"
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results = json.load(f)
                return results
            else:
                logger.warning("Results file not found")
                return {}
        else:
            logger.error(f"Benchmark failed with return code: {result.returncode}")
            logger.error(f"Error output: {result.stderr}")
            return {}
    except subprocess.TimeoutExpired:
        logger.error("Benchmark timed out")
        return {}
    except Exception as e:
        logger.error(f"Benchmark failed with exception: {e}")
        return {}


def main():
    """Entry point."""
    config_path = "eval/evalchemy/configs/local_eval_config.json"
    output_dir = os.getenv("OUTPUT_DIR", "./test_results")

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Run the benchmark
    results = run_evalchemy_benchmark(config_path, output_dir)

    if results:
        logger.info("Benchmark results:")
        for task, metrics in results.get("results", {}).items():
            logger.info(f"  {task}: {metrics}")
    else:
        logger.error("No results obtained")


if __name__ == "__main__":
    main()
EOF
# Make the script executable
chmod +x scripts/run_local_evalchemy.py
3. Run Evalchemy
# Install lm-evaluation-harness (if needed; quoted so zsh does not expand the brackets)
pip install "lm-eval[openai]"
# Run the Evalchemy benchmark
python scripts/run_local_evalchemy.py
# Check the results
ls -la test_results/
cat test_results/evalchemy_results.json | jq
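The exact layout of the lm_eval output file varies between versions, but the runner script above reads the per-task metrics from a top-level "results" key; assuming that layout, they can be inspected directly:
# Show only the per-task metrics (assumes the top-level "results" key)
jq '.results' test_results/evalchemy_results.json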
Result Analysis and Visualization
1. Result Aggregation Script
# Create the result aggregation script
cat > scripts/aggregate_local_results.py << 'EOF'
#!/usr/bin/env python3
"""
Aggregate and visualize the local evaluation results.
"""
import os
import json
from datetime import datetime
import logging

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_deepeval_results(results_dir: str) -> dict:
    """Load the Deepeval result files."""
    results = {}
    for file in os.listdir(results_dir):
        if file.startswith("deepeval_results_") and file.endswith(".json"):
            with open(os.path.join(results_dir, file), 'r') as f:
                results[file] = json.load(f)
    return results


def load_evalchemy_results(results_dir: str) -> dict:
    """Load the Evalchemy result files."""
    results = {}
    for file in os.listdir(results_dir):
        if file.startswith("evalchemy_results") and file.endswith(".json"):
            with open(os.path.join(results_dir, file), 'r') as f:
                results[file] = json.load(f)
    return results


def create_summary_report(deepeval_results: dict, evalchemy_results: dict, output_dir: str):
    """Build the combined summary report."""
    report = {
        "timestamp": datetime.now().isoformat(),
        "model_name": os.getenv("MODEL_NAME", "unknown"),
        "summary": {
            "deepeval": {},
            "evalchemy": {}
        }
    }

    # Summarize the Deepeval results
    if deepeval_results:
        for filename, data in deepeval_results.items():
            test_results = data.get("test_results", [])
            if test_results:
                # Collect scores per metric
                metrics_summary = {}
                for result in test_results:
                    for metric_name, metric_data in result.get("metrics", {}).items():
                        if metric_name not in metrics_summary:
                            metrics_summary[metric_name] = []
                        if metric_data.get("score") is not None:
                            metrics_summary[metric_name].append(metric_data["score"])
                # Compute averages
                avg_metrics = {}
                for metric_name, scores in metrics_summary.items():
                    if scores:
                        avg_metrics[metric_name] = {
                            "average_score": sum(scores) / len(scores),
                            "count": len(scores)
                        }
                report["summary"]["deepeval"][filename] = avg_metrics

    # Summarize the Evalchemy results
    if evalchemy_results:
        for filename, data in evalchemy_results.items():
            results = data.get("results", {})
            summary = {}
            for task, metrics in results.items():
                summary[task] = {
                    "accuracy": metrics.get("acc", 0),
                    "normalized_accuracy": metrics.get("acc_norm", 0)
                }
            report["summary"]["evalchemy"][filename] = summary

    # Save the report
    report_file = f"{output_dir}/evaluation_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    logger.info(f"Summary report saved to: {report_file}")
    return report


def create_visualizations(report: dict, output_dir: str):
    """Visualize the aggregated results."""
    try:
        # Imported lazily so the script still works without the plotting libraries
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Plot style
        plt.style.use('seaborn-v0_8')
        sns.set_palette("husl")

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle(f'Local VLLM Evaluation Results - {report["model_name"]}', fontsize=16)

        # Deepeval metric scores
        deepeval_data = report["summary"]["deepeval"]
        if deepeval_data:
            all_scores = []
            all_metrics = []
            for filename, metrics in deepeval_data.items():
                for metric_name, metric_data in metrics.items():
                    all_scores.append(metric_data["average_score"])
                    all_metrics.append(metric_name.replace("Metric", ""))
            if all_scores:
                axes[0, 0].bar(all_metrics, all_scores)
                axes[0, 0].set_title('Deepeval Metrics Scores')
                axes[0, 0].set_ylabel('Score')
                axes[0, 0].tick_params(axis='x', rotation=45)

        # Evalchemy benchmark accuracies
        evalchemy_data = report["summary"]["evalchemy"]
        if evalchemy_data:
            tasks = []
            accuracies = []
            for filename, results in evalchemy_data.items():
                for task, metrics in results.items():
                    tasks.append(task)
                    accuracies.append(metrics["accuracy"])
            if tasks:
                axes[0, 1].bar(tasks, accuracies)
                axes[0, 1].set_title('Evalchemy Benchmark Accuracies')
                axes[0, 1].set_ylabel('Accuracy')
                axes[0, 1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        chart_file = f"{output_dir}/evaluation_charts_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
        plt.savefig(chart_file, dpi=300, bbox_inches='tight')
        logger.info(f"Charts saved to: {chart_file}")
    except ImportError:
        logger.warning("matplotlib/seaborn not installed, skipping visualization")
    except Exception as e:
        logger.error(f"Visualization failed: {e}")


def main():
    """Entry point."""
    output_dir = os.getenv("OUTPUT_DIR", "./test_results")

    # Load the results
    deepeval_results = load_deepeval_results(output_dir)
    evalchemy_results = load_evalchemy_results(output_dir)

    # Build the combined report
    report = create_summary_report(deepeval_results, evalchemy_results, output_dir)

    # Visualize
    create_visualizations(report, output_dir)

    # Console output
    print("\n=== Local VLLM Evaluation Summary ===")
    print(f"Model: {report['model_name']}")
    print(f"Timestamp: {report['timestamp']}")

    if report["summary"]["deepeval"]:
        print("\n--- Deepeval Results ---")
        for filename, metrics in report["summary"]["deepeval"].items():
            print(f"File: {filename}")
            for metric_name, data in metrics.items():
                print(f"  {metric_name}: {data['average_score']:.3f} (n={data['count']})")

    if report["summary"]["evalchemy"]:
        print("\n--- Evalchemy Results ---")
        for filename, results in report["summary"]["evalchemy"].items():
            print(f"File: {filename}")
            for task, metrics in results.items():
                print(f"  {task}: {metrics['accuracy']:.3f}")


if __name__ == "__main__":
    main()
EOF
# Make the script executable
chmod +x scripts/aggregate_local_results.py
2. Run the Aggregation
# Install the visualization libraries
pip install matplotlib seaborn pandas
# Aggregate and visualize the results
python scripts/aggregate_local_results.py
# Check the generated files
ls -la test_results/evaluation_*
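The summary report follows the structure written by aggregate_local_results.py, so the aggregated numbers can be inspected directly, and the chart opened with the macOS open command:
# Inspect the aggregated summary and open the generated chart
jq '.summary' test_results/evaluation_summary_*.json
open test_results/evaluation_charts_*.png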
Integrated Execution Script
# Create a script that runs the full local evaluation
cat > scripts/run_full_local_evaluation.sh << 'EOF'
#!/bin/bash
set -e

echo "Starting the local VLLM evaluation"
echo "===================="

# Load the environment variables
source .env

# 1. Check the VLLM server
echo "Checking the VLLM server..."
if ! curl -f http://localhost:8000/health > /dev/null 2>&1; then
  echo "The VLLM server is not running. Please start the server first."
  exit 1
fi
echo "VLLM server is up."

# 2. Create the results directory
mkdir -p $OUTPUT_DIR

# 3. Run Deepeval
echo "Running the Deepeval evaluation..."
python scripts/run_local_deepeval.py
echo "Deepeval finished."

# 4. Run Evalchemy
echo "Running the Evalchemy benchmarks..."
python scripts/run_local_evalchemy.py
echo "Evalchemy finished."

# 5. Aggregate the results
echo "Aggregating and visualizing the results..."
python scripts/aggregate_local_results.py
echo "Aggregation finished."

# 6. Print a summary
echo ""
echo "Local VLLM evaluation complete!"
echo "===================="
echo "Results directory: $OUTPUT_DIR"
echo "Key files:"
echo "  - Deepeval results:  $OUTPUT_DIR/deepeval_results_*.json"
echo "  - Evalchemy results: $OUTPUT_DIR/evalchemy_results.json"
echo "  - Summary report:    $OUTPUT_DIR/evaluation_summary_*.json"
echo "  - Charts:            $OUTPUT_DIR/evaluation_charts_*.png"
EOF
# Make the script executable
chmod +x scripts/run_full_local_evaluation.sh
Troubleshooting
1. VLLM Server Issues
# Check the server logs
docker logs vllm-server
# Restart the server
docker restart vllm-server
# Check whether port 8000 is already in use
lsof -i :8000
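If the container is in a broken state or the container name is already taken, it may need to be stopped and removed before re-running the docker run command above; a minimal sketch:
# Stop and remove the existing container so the name can be reused
docker stop vllm-server || true
docker rm vllm-server || true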
2. Python Dependency Issues
# Recreate the virtual environment
deactivate
rm -rf venv
python3.11 -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -r requirements-dev.txt
3. Out-of-Memory Issues
# Limit the container's memory and GPU memory usage
docker run -d \
  --name vllm-server \
  --memory="8g" \
  --gpus all \
  -p 8000:8000 \
  vllm/vllm-openai:latest \
  --model "Qwen/Qwen2-7B-Instruct" \
  --gpu-memory-utilization 0.8
4. No Evaluation Results Produced
# Increase the log level
export LOG_LEVEL=DEBUG
# Debug the API connection step by step
python -c "
import openai
client = openai.OpenAI(base_url='http://localhost:8000/v1', api_key='dummy')
response = client.chat.completions.create(
    model='qwen3-8b',
    messages=[{'role': 'user', 'content': 'Hello!'}]
)
print(response.choices[0].message.content)
"
Execution Summary
Method 1: Use the Integrated Script (recommended)
# Run everything with a single command (auto-detects the VLLM server)
./scripts/run_complete_local_evaluation.sh
Method 2: Manual Step-by-Step Execution
# 1. Start the VLLM server (optional)
docker run -d \
  --name vllm-server \
  --gpus all \
  -p 8000:8000 \
  vllm/vllm-openai:latest \
  --model "Qwen/Qwen2-7B-Instruct" \
  --served-model-name "qwen3-8b"
# 2. Run individual tests
python scripts/run_simple_deepeval_test.py    # Mock test
python scripts/run_vllm_deepeval_test.py      # Test against the live VLLM server
python scripts/run_simple_evalchemy_test.py
# Or run the full evaluation
python scripts/run_complete_local_evalchemy.py
# 3. Check the results
cat test_results/*.json | jq
Example Execution Output
macOS OrbStack VLLM Local Evaluation - Integrated Run
=============================================
1. Checking the environment
2. Checking required packages
Required packages OK
3. Creating the results directory
Results directory created: ./test_results
4. Checking the VLLM server
VLLM server found: http://localhost:1234
5. Running the evaluation against the live VLLM server
Final results:
  Total tests: 5
  Average score: 0.50
  Success rate: 50.0%
Local VLLM evaluation complete!
Summary
With this guide, you can run a complete local evaluation of a VLLM model in a macOS OrbStack environment.
Key Features
- Automatic environment detection: finds a running VLLM server across several candidate ports
- Mock mode: tests can run even without a live server
- Integrated execution: the entire evaluation runs with a single command
- Detailed result analysis: structured results in JSON format
Generated Result Files
- simple_deepeval_results.json: Mock test results
- vllm_deepeval_results.json: VLLM server test results
Next Steps
Building on this local evaluation setup, the following extensions are possible:
- Adding custom evaluation metrics
- Comparative evaluation across multiple models
- Extending the performance benchmarks
- Integration into a CI/CD pipeline