Building an MLflow Container Environment with OrbStack - A Docker & Kubernetes Experimentation Guide
Overview
OrbStack is a modern virtualization platform for running Docker and Kubernetes quickly and efficiently on macOS. It is lighter and faster than Docker Desktop and offers native macOS performance.
This guide shows how to use OrbStack to build MLflow in a container environment and how to run an experiment-tracking system on both Docker Compose and Kubernetes.
Installing and Configuring OrbStack
1. Install OrbStack
# Install via Homebrew
brew install orbstack
# Or download from the official website
# https://orbstack.dev/
2. Initial OrbStack Setup
# Launch OrbStack
open -a OrbStack
# Check Docker contexts
docker context ls
# Use the OrbStack context
docker context use orbstack
# Enable the Kubernetes cluster
orb start k8s
3. Verify the Environment
# Check Docker status
docker info
# Check the Kubernetes cluster
kubectl cluster-info
# Check node status
kubectl get nodes
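If kubectl cluster-info points at a different cluster, switch the kubectl context first. A minimal check, assuming OrbStack registers its kubeconfig context under the name orbstack:
# List available kubectl contexts
kubectl config get-contexts
# Switch to the OrbStack cluster
kubectl config use-context orbstack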
Building the MLflow Environment with Docker Compose
1. Set Up the Project Structure
# Create the project directory
mkdir mlflow-orbstack
cd mlflow-orbstack
# Create the directory layout
mkdir -p {docker,kubernetes,notebooks,data,models}
2. Write the Docker Compose File
# docker-compose.yml
version: '3.8'
services:
postgresql:
image: postgres:15
container_name: mlflow-postgres
environment:
POSTGRES_DB: mlflow
POSTGRES_USER: mlflow
POSTGRES_PASSWORD: mlflow123
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U mlflow"]
interval: 30s
timeout: 10s
retries: 5
minio:
image: minio/minio:latest
container_name: mlflow-minio
environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin123
command: server /data --console-address ":9001"
volumes:
- minio_data:/data
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
mlflow:
build:
context: ./docker
dockerfile: Dockerfile.mlflow
container_name: mlflow-server
environment:
- MLFLOW_BACKEND_STORE_URI=postgresql://mlflow:mlflow123@postgresql:5432/mlflow
- MLFLOW_DEFAULT_ARTIFACT_ROOT=s3://mlflow-artifacts/
- AWS_ACCESS_KEY_ID=minioadmin
- AWS_SECRET_ACCESS_KEY=minioadmin123
- MLFLOW_S3_ENDPOINT_URL=http://minio:9000
ports:
- "5000:5000"
depends_on:
postgresql:
condition: service_healthy
minio:
condition: service_healthy
command: >
sh -c "
mlflow db upgrade postgresql://mlflow:mlflow123@postgresql:5432/mlflow &&
mlflow server
--backend-store-uri postgresql://mlflow:mlflow123@postgresql:5432/mlflow
--default-artifact-root s3://mlflow-artifacts/
--host 0.0.0.0
--port 5000
"
jupyter:
build:
context: ./docker
dockerfile: Dockerfile.jupyter
container_name: mlflow-jupyter
environment:
- MLFLOW_TRACKING_URI=http://mlflow:5000
- AWS_ACCESS_KEY_ID=minioadmin
- AWS_SECRET_ACCESS_KEY=minioadmin123
- MLFLOW_S3_ENDPOINT_URL=http://minio:9000
volumes:
- ./notebooks:/home/jovyan/notebooks
- ./data:/home/jovyan/data
ports:
- "8888:8888"
depends_on:
- mlflow
volumes:
postgres_data:
minio_data:
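The credentials above are hard-coded to keep the example readable. As a sketch, you could move them into a .env file next to docker-compose.yml and let Compose's variable interpolation substitute them; the file and variable names below are illustrative, not part of MLflow or MinIO:
# .env (hypothetical example)
POSTGRES_PASSWORD=mlflow123
MINIO_ROOT_USER=minioadmin
MINIO_ROOT_PASSWORD=minioadmin123
# then reference them in docker-compose.yml, e.g.
#   POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}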
3. Write the MLflow Dockerfile
# docker/Dockerfile.mlflow
FROM python:3.11-slim
WORKDIR /app
# Install system packages
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Install Python packages
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# MLflow setup
RUN mkdir -p /mlflow/artifacts
EXPOSE 5000
CMD ["mlflow", "server", "--host", "0.0.0.0", "--port", "5000"]
4. Write the Jupyter Dockerfile
# docker/Dockerfile.jupyter
FROM jupyter/scipy-notebook:latest
USER root
# Install system packages
RUN apt-get update && apt-get install -y \
build-essential \
&& rm -rf /var/lib/apt/lists/*
USER $NB_UID
# Install Python packages
COPY requirements-jupyter.txt /tmp/
RUN pip install --no-cache-dir -r /tmp/requirements-jupyter.txt
# Jupyter configuration
RUN jupyter lab --generate-config
EXPOSE 8888
CMD ["start-notebook.sh", "--NotebookApp.token=''", "--NotebookApp.password=''"]
5. Write the Requirements Files
# docker/requirements.txt
mlflow==2.8.1
psycopg2-binary==2.9.9
boto3==1.34.0
pymongo==4.6.0
scikit-learn==1.3.2
pandas==2.1.4
numpy==1.24.4
matplotlib==3.8.2
seaborn==0.13.0
plotly==5.17.0
# docker/requirements-jupyter.txt
mlflow==2.8.1
psycopg2-binary==2.9.9
boto3==1.34.0
scikit-learn==1.3.2
pandas==2.1.4
numpy==1.24.4
matplotlib==3.8.2
seaborn==0.13.0
plotly==5.17.0
jupyterlab==4.0.9
ipywidgets==8.1.1
6. Run the Containers
# Build and start the containers
docker compose up -d
# Check service status
docker compose ps
# Check the logs
docker compose logs mlflow
# Create the MinIO bucket (the mc client is run from the minio/mc image; the network name
# below assumes the default Compose network for the mlflow-orbstack project directory)
docker run --rm --network mlflow-orbstack_default minio/mc sh -c \
  "mc alias set minio http://minio:9000 minioadmin minioadmin123 && mc mb -p minio/mlflow-artifacts"
7. Verify Access
# Open the MLflow UI
open http://localhost:5000
# Open Jupyter Lab
open http://localhost:8888
# Open the MinIO Console
open http://localhost:9001
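Before moving on, it helps to confirm end-to-end tracking from the host. A minimal smoke test, assuming the mlflow and boto3 packages are installed locally and the ports above are published on localhost:
import os
import mlflow

# MinIO credentials and endpoint, as published by docker compose
os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin123"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("smoke_test")

with mlflow.start_run():
    mlflow.log_param("hello", "world")      # stored in PostgreSQL via the tracking server
    mlflow.log_metric("answer", 42)
    with open("/tmp/hello.txt", "w") as f:
        f.write("artifact upload check")
    mlflow.log_artifact("/tmp/hello.txt")   # stored in the MinIO bucket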
Deploying MLflow on Kubernetes
1. Create the Namespace
# kubernetes/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
name: mlflow
labels:
name: mlflow
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mlflow-config
namespace: mlflow
data:
MLFLOW_BACKEND_STORE_URI: "postgresql://mlflow:mlflow123@postgresql:5432/mlflow"
MLFLOW_DEFAULT_ARTIFACT_ROOT: "s3://mlflow-artifacts/"
AWS_ACCESS_KEY_ID: "minioadmin"
AWS_SECRET_ACCESS_KEY: "minioadmin123"
MLFLOW_S3_ENDPOINT_URL: "http://minio:9000"
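The ConfigMap above keeps credentials in plain text for simplicity; in practice a Secret is the better place for them. A sketch using kubectl (the secret name mlflow-s3-credentials is an illustrative choice, and the command assumes the namespace already exists):
kubectl create secret generic mlflow-s3-credentials -n mlflow \
  --from-literal=AWS_ACCESS_KEY_ID=minioadmin \
  --from-literal=AWS_SECRET_ACCESS_KEY=minioadmin123
# reference it from a Deployment via envFrom -> secretRef instead of configMapRef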
2. Deploy PostgreSQL
# kubernetes/postgresql.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: postgres-pvc
namespace: mlflow
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgresql
namespace: mlflow
spec:
replicas: 1
selector:
matchLabels:
app: postgresql
template:
metadata:
labels:
app: postgresql
spec:
containers:
- name: postgresql
image: postgres:15
env:
- name: POSTGRES_DB
value: "mlflow"
- name: POSTGRES_USER
value: "mlflow"
- name: POSTGRES_PASSWORD
value: "mlflow123"
ports:
- containerPort: 5432
volumeMounts:
- name: postgres-storage
mountPath: /var/lib/postgresql/data
livenessProbe:
exec:
command:
- pg_isready
- -U
- mlflow
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
exec:
command:
- pg_isready
- -U
- mlflow
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: postgres-storage
persistentVolumeClaim:
claimName: postgres-pvc
---
apiVersion: v1
kind: Service
metadata:
name: postgresql
namespace: mlflow
spec:
selector:
app: postgresql
ports:
- port: 5432
targetPort: 5432
type: ClusterIP
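Once the manifests are applied (step 5 below), you can wait for the rollout to finish and confirm that the database responds:
kubectl rollout status deployment/postgresql -n mlflow
kubectl exec -n mlflow deployment/postgresql -- pg_isready -U mlflow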
3. Deploy MinIO
# kubernetes/minio.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: minio-pvc
namespace: mlflow
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: minio
namespace: mlflow
spec:
replicas: 1
selector:
matchLabels:
app: minio
template:
metadata:
labels:
app: minio
spec:
containers:
- name: minio
image: minio/minio:latest
args:
- server
- /data
- --console-address
- ":9001"
env:
        - name: MINIO_ROOT_USER
          value: "minioadmin"
        - name: MINIO_ROOT_PASSWORD
          value: "minioadmin123"
ports:
- containerPort: 9000
- containerPort: 9001
volumeMounts:
- name: minio-storage
mountPath: /data
livenessProbe:
httpGet:
path: /minio/health/live
port: 9000
initialDelaySeconds: 120
periodSeconds: 20
readinessProbe:
httpGet:
path: /minio/health/ready
port: 9000
initialDelaySeconds: 120
periodSeconds: 20
volumes:
- name: minio-storage
persistentVolumeClaim:
claimName: minio-pvc
---
apiVersion: v1
kind: Service
metadata:
name: minio
namespace: mlflow
spec:
selector:
app: minio
ports:
- name: api
port: 9000
targetPort: 9000
- name: console
port: 9001
targetPort: 9001
type: LoadBalancer
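Unlike the Compose setup, nothing here creates the mlflow-artifacts bucket, so artifact uploads will fail until it exists. One option, as a sketch, is a one-off pod running the MinIO client image once MinIO is up:
kubectl run mc-bootstrap -n mlflow --rm -i --restart=Never --image=minio/mc --command -- sh -c \
  "mc alias set minio http://minio:9000 minioadmin minioadmin123 && mc mb -p minio/mlflow-artifacts"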
4. Deploy the MLflow Server
# kubernetes/mlflow.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: mlflow
namespace: mlflow
spec:
replicas: 1
selector:
matchLabels:
app: mlflow
template:
metadata:
labels:
app: mlflow
spec:
initContainers:
- name: db-migration
image: python:3.11-slim
command:
- sh
- -c
- |
pip install mlflow psycopg2-binary &&
mlflow db upgrade postgresql://mlflow:mlflow123@postgresql:5432/mlflow
env:
- name: MLFLOW_BACKEND_STORE_URI
value: "postgresql://mlflow:mlflow123@postgresql:5432/mlflow"
containers:
- name: mlflow
image: python:3.11-slim
command:
- sh
- -c
- |
pip install mlflow psycopg2-binary boto3 &&
mlflow server --backend-store-uri postgresql://mlflow:mlflow123@postgresql:5432/mlflow --default-artifact-root s3://mlflow-artifacts/ --host 0.0.0.0 --port 5000
envFrom:
- configMapRef:
name: mlflow-config
ports:
- containerPort: 5000
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 60
periodSeconds: 30
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
restartPolicy: Always
---
apiVersion: v1
kind: Service
metadata:
name: mlflow
namespace: mlflow
spec:
selector:
app: mlflow
ports:
- port: 5000
targetPort: 5000
type: LoadBalancer
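Installing MLflow with pip at container startup works, but it is slow and repeats on every restart. Since OrbStack's Kubernetes can use locally built Docker images without a registry (as described in the OrbStack docs), a sketch of reusing the image from the Compose setup (the tag name is illustrative):
# Build the image defined earlier
docker build -t mlflow-server:local -f docker/Dockerfile.mlflow docker/
# Then, in kubernetes/mlflow.yaml, replace the python:3.11-slim image and the pip install step with:
#   image: mlflow-server:local
#   imagePullPolicy: IfNotPresent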
5. Apply the Kubernetes Manifests
# Create the Namespace and ConfigMap
kubectl apply -f kubernetes/namespace.yaml
# Deploy PostgreSQL
kubectl apply -f kubernetes/postgresql.yaml
# Deploy MinIO
kubectl apply -f kubernetes/minio.yaml
# Deploy MLflow
kubectl apply -f kubernetes/mlflow.yaml
# Check deployment status
kubectl get all -n mlflow
# Check service endpoints
kubectl get svc -n mlflow
# Check the logs
kubectl logs -f deployment/mlflow -n mlflow
6. Set Up Port Forwarding
# Port-forward the MLflow UI
kubectl port-forward svc/mlflow 5000:5000 -n mlflow &
# Port-forward the MinIO Console
kubectl port-forward svc/minio 9001:9001 -n mlflow &
# Verify access
open http://localhost:5000
open http://localhost:9001
Example Experiments: Using MLflow in the Container Environment
1. Run an Experiment from a Jupyter Notebook
# notebooks/mlflow_experiment.ipynb
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import os
# MLflow configuration
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("housing_price_container")
# Environment variables (MinIO credentials for artifact storage)
os.environ['AWS_ACCESS_KEY_ID'] = 'minioadmin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minioadmin123'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://minio:9000'
def create_sample_data():
"""샘플 데이터 생성"""
np.random.seed(42)
n_samples = 1000
data = {
'area': np.random.normal(1500, 500, n_samples),
'bedrooms': np.random.randint(1, 6, n_samples),
'bathrooms': np.random.randint(1, 4, n_samples),
'age': np.random.randint(0, 50, n_samples),
'distance_to_city': np.random.normal(10, 5, n_samples)
}
df = pd.DataFrame(data)
df['price'] = (df['area'] * 100 +
df['bedrooms'] * 50000 +
df['bathrooms'] * 30000 -
df['age'] * 2000 -
df['distance_to_city'] * 5000 +
np.random.normal(0, 50000, n_samples))
return df
def run_container_experiment(n_estimators, max_depth, min_samples_split):
"""컨테이너 환경에서 실험 실행"""
# 데이터 생성
df = create_sample_data()
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
with mlflow.start_run():
        # Environment tags
mlflow.set_tag("environment", "container")
mlflow.set_tag("platform", "orbstack")
mlflow.set_tag("deployment", "docker-compose")
        # Log hyperparameters
mlflow.log_param("n_estimators", n_estimators)
mlflow.log_param("max_depth", max_depth)
mlflow.log_param("min_samples_split", min_samples_split)
        # Train the model
model = RandomForestRegressor(
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_split=min_samples_split,
random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
        # Compute metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
        # Log metrics
mlflow.log_metric("mse", mse)
mlflow.log_metric("r2_score", r2)
mlflow.log_metric("rmse", rmse)
        # Save the model
mlflow.sklearn.log_model(model, "model")
        # Create a visualization
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Container Environment - Predicted vs Actual')
plt.savefig('/tmp/prediction_plot.png')
mlflow.log_artifact('/tmp/prediction_plot.png')
plt.close()
print(f"Container experiment - R²: {r2:.4f}, RMSE: {rmse:.2f}")
return model, r2, rmse
# Run the experiments
experiments = [
(100, 10, 2),
(200, 15, 5),
(150, 12, 3)
]
for n_est, max_dep, min_split in experiments:
run_container_experiment(n_est, max_dep, min_split)
print("모든 컨테이너 실험이 완료되었습니다!")
2. Batch Experiment Script
# notebooks/batch_experiment.py
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging
import os
# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# MLflow configuration
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("batch_experiments_container")
# Environment variables (MinIO credentials)
os.environ['AWS_ACCESS_KEY_ID'] = 'minioadmin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minioadmin123'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://minio:9000'
def run_batch_experiments():
"""배치 실험 실행"""
# 데이터 로드
logger.info("데이터 생성 중...")
df = create_sample_data()
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
    # Hyperparameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [10, 15, 20],
'min_samples_split': [2, 5, 10]
}
with mlflow.start_run(run_name="batch_grid_search"):
        # Environment tags
mlflow.set_tag("environment", "container")
mlflow.set_tag("experiment_type", "batch")
mlflow.set_tag("method", "grid_search")
        # Run GridSearchCV
        logger.info("Running GridSearchCV...")
grid_search = GridSearchCV(
estimator=RandomForestRegressor(random_state=42),
param_grid=param_grid,
cv=3,
scoring='r2',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
        # Best-performing model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
        # Evaluate performance
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        # Log the results
mlflow.log_params(grid_search.best_params_)
mlflow.log_metric("best_r2_score", r2)
mlflow.log_metric("best_rmse", rmse)
mlflow.log_metric("cv_score", grid_search.best_score_)
        # Save the model
mlflow.sklearn.log_model(best_model, "best_model")
        # Save the grid-search results
results_df = pd.DataFrame(grid_search.cv_results_)
results_df.to_csv('/tmp/grid_search_results.csv', index=False)
mlflow.log_artifact('/tmp/grid_search_results.csv')
logger.info(f"배치 실험 완료 - 최고 R²: {r2:.4f}")
return best_model, grid_search.best_params_
if __name__ == "__main__":
run_batch_experiments()
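Since ./notebooks is mounted into the Jupyter container, the script can be executed inside the Compose network, where the mlflow and minio hostnames resolve:
docker exec mlflow-jupyter python /home/jovyan/notebooks/batch_experiment.py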
Advanced Configuration and Optimization
1. High-Availability MLflow Server Setup
# kubernetes/mlflow-ha.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: mlflow-ha
namespace: mlflow
spec:
replicas: 3
selector:
matchLabels:
app: mlflow-ha
template:
metadata:
labels:
app: mlflow-ha
spec:
containers:
- name: mlflow
image: your-registry/mlflow:latest
command:
- sh
- -c
- |
mlflow server \
--backend-store-uri postgresql://mlflow:mlflow123@postgresql:5432/mlflow \
--default-artifact-root s3://mlflow-artifacts/ \
--host 0.0.0.0 \
--port 5000 \
--workers 4
envFrom:
- configMapRef:
name: mlflow-config
ports:
- containerPort: 5000
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
livenessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 60
periodSeconds: 30
readinessProbe:
httpGet:
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
---
apiVersion: v1
kind: Service
metadata:
name: mlflow-ha
namespace: mlflow
spec:
selector:
app: mlflow-ha
ports:
- port: 5000
targetPort: 5000
type: LoadBalancer
sessionAffinity: ClientIP
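With resource requests defined, the replica count can also be driven by a HorizontalPodAutoscaler rather than staying fixed at 3. A sketch, assuming a metrics-server is available in the cluster:
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: mlflow-ha
  namespace: mlflow
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: mlflow-ha
  minReplicas: 3
  maxReplicas: 6
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70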
2. Monitoring and Logging
# kubernetes/monitoring.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: mlflow
data:
prometheus.yml: |
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'mlflow'
static_configs:
- targets: ['mlflow:5000']
metrics_path: '/metrics'
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: mlflow
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- name: prometheus
image: prom/prometheus:latest
ports:
- containerPort: 9090
volumeMounts:
- name: config
mountPath: /etc/prometheus
volumes:
- name: config
configMap:
name: prometheus-config
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: mlflow
spec:
selector:
app: prometheus
ports:
- port: 9090
targetPort: 9090
type: LoadBalancer
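Note that a stock MLflow server does not serve /metrics, so the scrape job above will find nothing by default. Recent MLflow versions offer an --expose-prometheus option on mlflow server that enables a Prometheus endpoint (check your version's CLI help for the exact argument); a sketch of the adjusted server command:
mlflow server \
  --backend-store-uri postgresql://mlflow:mlflow123@postgresql:5432/mlflow \
  --default-artifact-root s3://mlflow-artifacts/ \
  --host 0.0.0.0 --port 5000 \
  --expose-prometheus /prometheus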
3. Automated Backups
#!/bin/bash
# scripts/backup.sh
# Environment variables
BACKUP_DIR="/backups/$(date +%Y%m%d_%H%M%S)"
NAMESPACE="mlflow"
# Create the backup directory
mkdir -p $BACKUP_DIR
# PostgreSQL backup
echo "Backing up PostgreSQL..."
kubectl exec -n $NAMESPACE deployment/postgresql -- pg_dump -U mlflow mlflow > $BACKUP_DIR/mlflow_db.sql
# MinIO backup
echo "Backing up MinIO artifacts..."
MINIO_POD=$(kubectl get pod -n $NAMESPACE -l app=minio -o jsonpath='{.items[0].metadata.name}')
kubectl cp $NAMESPACE/$MINIO_POD:/data/mlflow-artifacts $BACKUP_DIR/artifacts
# Kubernetes resource backup
echo "Backing up Kubernetes resources..."
kubectl get all -n $NAMESPACE -o yaml > $BACKUP_DIR/k8s-resources.yaml
# Compress the backup
tar -czf $BACKUP_DIR.tar.gz -C $BACKUP_DIR .
rm -rf $BACKUP_DIR
echo "백업 완료: $BACKUP_DIR.tar.gz"
4. Performance Optimization
# kubernetes/optimization.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: mlflow-optimization
namespace: mlflow
data:
nginx.conf: |
upstream mlflow_backend {
least_conn;
server mlflow:5000 max_fails=3 fail_timeout=30s;
}
server {
listen 80;
client_max_body_size 100M;
location / {
proxy_pass http://mlflow_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_connect_timeout 60s;
proxy_send_timeout 60s;
proxy_read_timeout 60s;
}
location /static/ {
alias /static/;
expires 30d;
add_header Cache-Control "public, immutable";
}
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: nginx-proxy
namespace: mlflow
spec:
replicas: 2
selector:
matchLabels:
app: nginx-proxy
template:
metadata:
labels:
app: nginx-proxy
spec:
containers:
- name: nginx
image: nginx:alpine
ports:
- containerPort: 80
volumeMounts:
- name: config
mountPath: /etc/nginx/conf.d
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "200m"
volumes:
- name: config
configMap:
name: mlflow-optimization
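The proxy Deployment above has no Service of its own, so nothing routes traffic to it yet. A sketch of a matching Service that exposes the proxy instead of the MLflow pods directly:
apiVersion: v1
kind: Service
metadata:
  name: nginx-proxy
  namespace: mlflow
spec:
  selector:
    app: nginx-proxy
  ports:
  - port: 80
    targetPort: 80
  type: LoadBalancer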
Troubleshooting and Debugging
1. Resolving Common Issues
# Check container status
docker compose ps
docker compose logs mlflow
# Kubernetes troubleshooting
kubectl get pods -n mlflow
kubectl describe pod <pod-name> -n mlflow
kubectl logs <pod-name> -n mlflow
# Test network connectivity
kubectl exec -it <pod-name> -n mlflow -- nc -zv postgresql 5432
kubectl exec -it <pod-name> -n mlflow -- nc -zv minio 9000
# Check resource usage
kubectl top pods -n mlflow
kubectl top nodes
2. Performance Monitoring
# notebooks/performance_monitoring.py
import psutil
import time
import mlflow
import logging
from datetime import datetime
def monitor_system_performance():
"""시스템 성능 모니터링"""
mlflow.set_tracking_uri("http://mlflow:5000")
with mlflow.start_run(run_name=f"system_monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
        # CPU usage
cpu_percent = psutil.cpu_percent(interval=1)
mlflow.log_metric("cpu_usage_percent", cpu_percent)
        # Memory usage
memory = psutil.virtual_memory()
mlflow.log_metric("memory_usage_percent", memory.percent)
mlflow.log_metric("memory_available_gb", memory.available / (1024**3))
        # Disk usage
disk = psutil.disk_usage('/')
mlflow.log_metric("disk_usage_percent", (disk.used / disk.total) * 100)
mlflow.log_metric("disk_free_gb", disk.free / (1024**3))
        # Network statistics
network = psutil.net_io_counters()
mlflow.log_metric("network_bytes_sent", network.bytes_sent)
mlflow.log_metric("network_bytes_recv", network.bytes_recv)
        # Environment info
mlflow.set_tag("platform", "container")
mlflow.set_tag("monitoring_time", datetime.now().isoformat())
logging.info(f"시스템 모니터링 완료 - CPU: {cpu_percent}%, Memory: {memory.percent}%")
if __name__ == "__main__":
monitor_system_performance()
Operations Guide
1. Daily Operations Checklist
#!/bin/bash
# scripts/daily_check.sh
echo "=== MLflow 일일 운영 체크 ==="
echo "체크 시간: $(date)"
# 서비스 상태 확인
echo "1. 서비스 상태 확인"
kubectl get pods -n mlflow
kubectl get svc -n mlflow
# Check disk usage
echo "2. Checking disk usage"
kubectl exec -n mlflow deployment/postgresql -- df -h /var/lib/postgresql/data
kubectl exec -n mlflow deployment/minio -- df -h /data
# Check database connectivity
echo "3. Checking database connectivity"
kubectl exec -n mlflow deployment/postgresql -- psql -U mlflow -d mlflow -c "SELECT count(*) FROM experiments;"
# MLflow service health check
echo "4. Checking the MLflow service"
kubectl exec -n mlflow deployment/mlflow -- curl -f http://localhost:5000/health
# Check recent runs
echo "5. Checking recent runs"
kubectl exec -n mlflow deployment/mlflow -- python -c "
import mlflow
mlflow.set_tracking_uri('http://localhost:5000')
runs = mlflow.search_runs(max_results=5)
print(f'Most recent runs (up to 5): {len(runs)}')
"
echo "=== 체크 완료 ==="
2. 정기 메인터넌스
#!/bin/bash
# scripts/maintenance.sh
echo "=== MLflow 정기 메인터넌스 ==="
# 오래된 실험 정리 (30일 이전)
echo "1. 오래된 실험 정리"
kubectl exec -n mlflow deployment/mlflow -- python -c "
import mlflow
from datetime import datetime, timedelta, timezone
mlflow.set_tracking_uri('http://localhost:5000')
cutoff_date = datetime.now(timezone.utc) - timedelta(days=30)
experiments = mlflow.search_experiments()
for exp in experiments:
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])
    old_runs = runs[runs['start_time'] < cutoff_date]
for _, run in old_runs.iterrows():
mlflow.delete_run(run['run_id'])
        print(f'Deleted run: {run[\"run_id\"]}')
"
# Clean up artifacts
echo "2. Cleaning up artifacts"
kubectl exec -n mlflow deployment/minio -- mc rm --recursive --force minio/mlflow-artifacts/$(date -d '30 days ago' +%Y/%m/%d)/
# Database VACUUM
echo "3. Optimizing the database"
kubectl exec -n mlflow deployment/postgresql -- psql -U mlflow -d mlflow -c "VACUUM ANALYZE;"
echo "=== 메인터넌스 완료 ==="
Conclusion
An MLflow container environment built on OrbStack offers the following advantages:
- Performance: fast execution thanks to OrbStack's native macOS performance
- Scalability: horizontal scaling through Kubernetes
- Isolation: container-based, isolated experiment environments
- Reproducibility: experiments can be reproduced in identical environments
- Flexibility: supports both Docker Compose and Kubernetes
With this guide, you can make effective use of MLflow from local development all the way to production deployment.