Today's ML engineers must choose among a variety of serving systems, each with its own abstractions, terminology, and trade-offs. These platforms differ in how they approach a small set of fundamental concepts:
ModelArtifact ::= (code, weights, metadata)
Container ::= (ModelArtifact, runtime, deps)
Endpoint ::= (Container, scaling_config, routing)
Version ::= (Endpoint, traffic_weight)

package : ModelArtifact → Container
deploy : Container → Endpoint
scale : Endpoint × Config → Endpoint
route : Version × Version × Weight → Version
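Before looking at concrete platforms, it helps to see the grammar written down as code. The sketch below is an illustrative Python encoding of the four abstractions and four operations; all names are ours, and each platform below binds them to very different native objects.

from dataclasses import dataclass
from typing import Callable

# Illustrative encoding of the core grammar; field names are hypothetical.
@dataclass
class ModelArtifact:
    code: str        # path/URI to model code
    weights: str     # path/URI to model weights
    metadata: dict   # framework, version, etc.

@dataclass
class Container:
    artifact: ModelArtifact
    runtime: dict    # image, execution role, accelerators
    deps: dict       # environment variables, packages

@dataclass
class Endpoint:
    container: Container
    scaling_config: dict   # instance type/count, replica bounds
    routing: dict          # variants and traffic weights

@dataclass
class Version:
    endpoint: Endpoint
    traffic_weight: float  # share of traffic served by this version

# The four core operations every platform implements in some form.
package: Callable[[ModelArtifact], Container]
deploy: Callable[[Container], Endpoint]
scale: Callable[[Endpoint, dict], Endpoint]
route: Callable[[Version, Version, float], Version]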
SageMaker's approach closely mirrors our theoretical model, with explicit container building and endpoint management. Key mappings include:
Theoretical Representation:
# SageMaker strict implementation of core grammar
ModelArtifact ::= (
    code = "s3://bucket/model.tar.gz",   # Model code and artifacts
    weights = "s3://bucket/weights",     # Model weights
    metadata = {                         # Essential metadata only
        "framework": str,                # e.g., "huggingface"
        "version": str,                  # e.g., "4.37"
        "py_version": str                # e.g., "py310"
    }
)

Container ::= (
    ModelArtifact,
    runtime = {
        "image": str,             # ECR image URI
        "execution_role": str     # IAM role
    },
    deps = {
        "environment": dict,      # Environment variables
        "entry_point": str        # Inference script
    }
)

Endpoint ::= (
    Container,
    scaling_config = {
        "instance_count": int,
        "instance_type": str
    },
    routing = {
        "variants": list[str],    # Production variant names
        "weights": list[float]    # Traffic weights
    }
)

Version ::= (
    Endpoint,
    traffic_weight = float        # Simple weight for this version
)

# Core operations
package : ModelArtifact → Container            # Create SageMaker model
deploy : Container → Endpoint                  # Deploy to endpoint
scale : Endpoint × Config → Endpoint           # Update instance count/type
route : Version × Version × Weight → Version   # Update traffic split
Implementation:
from sagemaker.huggingface import (
    HuggingFaceModel,
    HuggingFacePredictor,
    get_huggingface_llm_image_uri,
)

# Define the model image
image_uri = get_huggingface_llm_image_uri(
    "huggingface",
    version="1.4.2"
)

# Create the model (packaging step).
# env, SAGEMAKER_ROLE, deployment, endpoint_name, and sagemaker_session
# are assumed to be defined earlier in the deployment script.
huggingface_model = HuggingFaceModel(
    env=env,                      # Environment variables for the container
    role=SAGEMAKER_ROLE,
    transformers_version="4.37",
    pytorch_version="2.1",
    py_version="py310",
    image_uri=image_uri
)

# Deploy the model (endpoint creation)
predictor = huggingface_model.deploy(
    initial_instance_count=deployment.instance_count,
    instance_type=deployment.instance_type,
    endpoint_name=endpoint_name,
)

# Inference invocation
predictor = HuggingFacePredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session
)
response = predictor.predict(data)  # data: JSON-serializable inference payload
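The scale and route operations do not appear in the high-level SDK snippet above; one way to express them is through the low-level boto3 SageMaker client. The sketch below assumes an endpoint that already has two production variants; the variant names ("blue", "green") and the chosen weights are illustrative.

import boto3

sm = boto3.client("sagemaker")

# scale : Endpoint × Config → Endpoint
# Adjust the instance count of one production variant in place.
sm.update_endpoint_weights_and_capacities(
    EndpointName=endpoint_name,
    DesiredWeightsAndCapacities=[
        {"VariantName": "blue", "DesiredInstanceCount": 2},
    ],
)

# route : Version × Version × Weight → Version
# Shift traffic between the two variants (a 90/10 split here).
sm.update_endpoint_weights_and_capacities(
    EndpointName=endpoint_name,
    DesiredWeightsAndCapacities=[
        {"VariantName": "blue", "DesiredWeight": 0.9},
        {"VariantName": "green", "DesiredWeight": 0.1},
    ],
)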
Azure ML implements a workspace-centric approach with managed online endpoints, emphasizing environment management and model registry integration.
Theoretical Representation:
# Azure ML implementation of our core grammar
ModelArtifact ::= (
    code = "model/path",            # Local or registry path
    weights = "weights/path",
    metadata = {
        "name": str,                # e.g., "hf-model"
        "type": AssetType,          # e.g., CUSTOM_MODEL
        "description": str,
        "registry": optional[str]   # e.g., "HuggingFace"
    }
)

Container ::= (
    ModelArtifact,
    runtime = {
        "image": str,               # e.g., "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04"
        "conda_file": {
            "channels": list[str],
            "dependencies": list[str]
        }
    },
    deps = {
        "environment_variables": dict[str, str],
        "pip_packages": list[str]
    }
)

Endpoint ::= (
    Container,
    scaling_config = {
        "instance_type": str,       # e.g., "Standard_DS3_v2"
        "instance_count": int,
        "min_replicas": int,
        "max_replicas": int
    },
    routing = {
        "deployment_name": str,
        "traffic_percentage": int
    }
)

Version ::= (
    Endpoint,
    traffic_weight = {
        "blue_green_config": {
            "active": str,          # blue or green
            "percentage": int,
            "evaluation_rules": dict
        }
    }
)

# Core operations
package : ModelArtifact → Container    # Creates Azure container environment
deploy : Container → Endpoint          # Deploys to Azure managed endpoint
scale : Endpoint × Config → Endpoint   # Updates endpoint scaling
route : Version × Weight → Version     # Updates traffic routing
Implementation:
import time

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import (
    Environment,
    Model,
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment
)
from azure.ai.ml.constants import AssetTypes

# Initialize workspace client.
# subscription_id, resource_group, workspace_name, and model_id
# are assumed to be defined elsewhere in the deployment script.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name
)

# Define environment with dependencies
environment = Environment(
    name="bert-env",
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file={
        "channels": ["conda-forge", "pytorch"],
        "dependencies": [
            "python=3.11",
            "pip",
            "pytorch",
            "transformers",
            "numpy"
        ]
    }
)

# Define the model asset (packaging step)
model = Model(
    path=f"hf://{model_id}",
    type=AssetTypes.CUSTOM_MODEL,
    name="hf-model",
    description="HuggingFace model from Model Hub"
)

# Create and configure endpoint
endpoint_name = f"hf-ep-{int(time.time())}"
ml_client.begin_create_or_update(
    ManagedOnlineEndpoint(name=endpoint_name)
).wait()

# Deploy model
deployment = ml_client.online_deployments.begin_create_or_update(
    ManagedOnlineDeployment(
        name="demo",
        endpoint_name=endpoint_name,
        model=model,
        environment=environment,
        instance_type="Standard_DS3_v2",
        instance_count=1,
    )
).result()

# Update traffic rules
endpoint = ml_client.online_endpoints.get(endpoint_name)
endpoint.traffic = {"demo": 100}
ml_client.begin_create_or_update(endpoint).result()
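The snippet above covers deploy and route; the scale operation is expressed by updating the deployment entity in place. A minimal sketch, reusing the ml_client, endpoint_name, and "demo" deployment from the previous snippet (the target instance count is illustrative):

# scale : Endpoint × Config → Endpoint
# Fetch the existing deployment, change its capacity, and reapply it.
deployment = ml_client.online_deployments.get(
    name="demo",
    endpoint_name=endpoint_name,
)
deployment.instance_count = 3  # illustrative target capacity
ml_client.online_deployments.begin_create_or_update(deployment).result()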
Vertex AI takes a streamlined approach to model deployment, tightly integrated with Google Cloud's container infrastructure and with an emphasis on GPU acceleration.
Theoretical Representation:
# Vertex AI implementation of our core grammar
ModelArtifact ::= (
    code = "gs://model/path",      # GCS path
    weights = "gs://weights/path",
    metadata = {
        "model_id": str,           # e.g., "hf-bert-base"
        "framework": str,          # e.g., "huggingface"
        "generation_config": dict
    }
)

Container ::= (
    ModelArtifact,
    runtime = {
        "image_uri": str,          # e.g., "us-docker.pkg.dev/vertex-ai/prediction/..."
        "accelerator": str         # e.g., "NVIDIA_TESLA_A100"
    },
    deps = {
        "env_vars": {
            "MODEL_ID": str,
            "MAX_INPUT_LENGTH": str,
            "MAX_TOTAL_TOKENS": str,
            "NUM_SHARD": str
        }
    }
)

Endpoint ::= (
    Container,
    scaling_config = {
        "machine_type": str,       # e.g., "a2-highgpu-4g"
        "min_replica_count": int,
        "max_replica_count": int,
        "accelerator_count": int
    },
    routing = {
        "traffic_split": dict[str, int],
        "prediction_config": dict
    }
)

Version ::= (
    Endpoint,
    traffic_weight = {
        "split_name": str,
        "percentage": int,
        "monitoring_config": dict
    }
)

# Core operations
package : ModelArtifact → Container    # Creates Vertex AI container
deploy : Container → Endpoint          # Deploys to Vertex endpoint
scale : Endpoint × Config → Endpoint   # Updates endpoint scaling
route : Version × Weight → Version     # Updates traffic routing
Implementation:
from google.cloud import aiplatform

def deploy_hf_model(
    project_id: str,
    location: str,
    model_id: str,
    machine_type: str = "a2-highgpu-4g",
):
    aiplatform.init(project=project_id, location=location)

    env_vars = {
        "MODEL_ID": model_id,
        "MAX_INPUT_LENGTH": "512",
        "MAX_TOTAL_TOKENS": "1024",
        "MAX_BATCH_PREFILL_TOKENS": "2048",
        "NUM_SHARD": "1"
    }

    # Upload model with container configuration (packaging step)
    model = aiplatform.Model.upload(
        display_name=f"hf-{model_id.replace('/', '-')}",
        serving_container_image_uri=(
            "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/"
            "huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
        ),
        serving_container_environment_variables=env_vars
    )

    # Deploy model with compute configuration (endpoint creation)
    endpoint = model.deploy(
        machine_type=machine_type,
        min_replica_count=1,
        max_replica_count=1,
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=1,
        sync=True
    )
    return endpoint

def create_completion(
    endpoint,
    prompt: str,
    max_tokens: int = 100,
    temperature: float = 0.7
):
    # Endpoint.predict expects a list of instances
    response = endpoint.predict(instances=[{
        "text": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "top_p": 0.95,
            "top_k": 40,
        }
    }])
    return response
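As with SageMaker, the route operation has no dedicated call in the snippet above. On Vertex AI, traffic is split across models deployed to the same endpoint; one way to express a canary rollout is sketched below, assuming a second uploaded model object (new_model, not defined above) and the endpoint returned by deploy_hf_model. Names and percentages are illustrative.

# route : Version × Weight → Version
# Deploy a second model version onto the existing endpoint and send it
# 10% of traffic; the remaining 90% stays with the current version.
new_model.deploy(
    endpoint=endpoint,
    machine_type="a2-highgpu-4g",
    min_replica_count=1,
    max_replica_count=1,
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    traffic_percentage=10,
    sync=True,
)

# Inspect the resulting split, keyed by deployed model ID.
print(endpoint.traffic_split)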
ServerlessML takes a radical approach by completely eliminating the concept of endpoints and containers, instead treating models as pure functions:
Theoretical Representation:
ModelArtifact ::= (code, weights, metadata, scaling_rules)
Function ::= (ModelArtifact, memory_size, timeout)
Invocation ::= (Function, cold_start_policy)

# Key innovation: No explicit container or endpoint
deploy : ModelArtifact → Function
invoke : Function → Response
scale : automatic, based on concurrent invocations
Implementation:
from serverlessml import MLFunction

model = MLFunction(
    model_path="model.pkl",
    framework="pytorch",
    memory_size="2GB",
    scaling_rules={
        "cold_start_policy": "eager_loading",
        "max_concurrent": 1000,
        "idle_timeout": "10m"
    }
)

# Deployment is implicit - the function is ready to serve
function_url = model.deploy()
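Because the deployed function is exposed as a plain HTTPS URL, the invoke operation reduces to an ordinary request. A hypothetical sketch follows; the payload shape is illustrative, not part of any established ServerlessML API.

import requests

# invoke : Function → Response
# The platform scales instances up and down behind this URL automatically.
resp = requests.post(
    function_url,
    json={"inputs": [[0.2, 0.7, 0.1]]},  # illustrative feature vector
    timeout=30,
)
print(resp.json())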
Pros:
- No infrastructure to manage: packaging, containers, and endpoints disappear behind a single deploy call
- Scaling is automatic and driven by concurrent invocations, so idle models cost nothing

Cons:
- Cold starts add latency unless weights are eagerly loaded and kept warm
- Memory and timeout limits constrain large models, and there is little control over the runtime
StatefulML introduces a novel approach by making model state and caching first-class concepts:
Theoretical Representation:
ModelArtifact ::= (code, weights, metadata)
ModelState ::= (cache, warm_weights, dynamic_config)
Container ::= (ModelArtifact, ModelState, runtime)
StateManager ::= (Container, caching_policy, update_strategy)

# Key innovation: Explicit state management
deploy : (ModelArtifact, StateManager) → Container
update_state : (Container, ModelState) → Container
cache_forward : (Container, Request) → Response
Implementation:
from statefulml import MLContainer, StateManager

state_manager = StateManager(
    caching_policy={
        "strategy": "predictive_cache",
        "cache_size": "4GB",
        "eviction_policy": "feature_based_lru"
    },
    update_strategy={
        "type": "incremental",
        "frequency": "5m",
        "warm_up": True
    }
)

model = MLContainer(
    model_path="model.pkl",
    framework="tensorflow",
    state_manager=state_manager,
    dynamic_config={
        "feature_importance_tracking": True,
        "automatic_cache_tuning": True
    }
)

endpoint = model.deploy()
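The grammar's cache_forward and update_state operations would then surface at inference time. A hypothetical sketch of how a request might flow through the managed state follows; the method names are illustrative, not an established StatefulML API.

# cache_forward : (Container, Request) → Response
# A request first consults the predictive cache; on a miss it falls
# through to the model and the result is cached for similar requests.
response = endpoint.predict({"features": [0.2, 0.7, 0.1]})

# update_state : (Container, ModelState) → Container
# Incremental state updates (new cache entries, refreshed weights) are
# applied without redeploying the container.
endpoint.update_state({"warm_weights": "weights-v2.pkl"})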
Pros:
- Predictive caching and warm weights cut latency for repeated or similar requests
- State can be updated incrementally without a full redeploy

Cons:
- Explicit state management adds operational complexity and new failure modes
- Cache correctness and consistency must be reasoned about alongside model behavior
Looking ahead, the same grammar suggests abstractions that none of today's platforms expose directly:

DynamicResource ::= (
    GranularAllocation,
    ElasticScaling,
    CostAwareScheduling
)

SharedModelConfig ::= (
    CrossEndpointSharing,
    DynamicModelLoading,
    ResourcePooling
)

EnhancedMonitoring ::= (
    PredictiveAlerts,
    AutomaticDiagnosis,
    AdaptiveOptimization
)
This framework provides a way to understand and compare ML serving systems. While current platforms differ significantly, many of those differences reflect platform-specific constraints rather than fundamental requirements of the domain. Future systems can draw on this analysis to provide more consistent and powerful abstractions for ML deployment.