AI Architecture & MLOps Platforms

Design scalable AI/ML systems with robust MLOps pipelines. From model development to production deployment, we build enterprise-grade AI architectures that deliver reliable, performant machine learning solutions.

AI Architecture Services

MLOps Platform Design

End-to-end MLOps platform architecture with automated training, deployment, and monitoring.

Model Serving Architecture

Scalable model serving infrastructure with real-time and batch inference capabilities.

Data Pipeline Architecture

ML data pipeline design with feature stores, data versioning, and quality monitoring.

AI Infrastructure Design

GPU clusters, distributed training, and compute resource optimization for AI workloads.

Model Governance

AI model governance, compliance frameworks, and responsible AI implementation.

Edge AI Architecture

Edge deployment strategies, model optimization, and distributed inference systems.

MLOps Platform Architecture


Comprehensive MLOps platform with automated training, deployment, and monitoring pipelines.

  • Automated ML pipelines
  • Model registry and versioning
  • A/B testing framework (see the sketch after this list)
  • Performance monitoring
  • Auto-scaling infrastructure
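
As a flavor of the A/B testing piece, here is a minimal traffic-splitting sketch; the variant names, weights, and registry flow are illustrative assumptions, not a fixed API:

# Illustrative A/B traffic splitter for two model variants
import random

class ABRouter:
    def __init__(self, variants):
        # variants: e.g. {"champion": 0.9, "challenger": 0.1}; weights sum to 1
        self.names = list(variants.keys())
        self.weights = list(variants.values())
    
    def route(self):
        """Pick a variant for an incoming request, weighted by traffic share"""
        variant = random.choices(self.names, weights=self.weights, k=1)[0]
        # A real platform would resolve the variant to a registry version and
        # log the assignment for later uplift analysis
        return variant

router = ABRouter({"champion": 0.9, "challenger": 0.1})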
Real-time AI Systems


High-performance real-time inference systems with millisecond-level latency targets.

  • Real-time model serving
  • Stream processing pipelines
  • Feature stores integration
  • Caching and optimization
  • Distributed inference

AI Architecture Patterns

Lambda Architecture for ML

Batch and stream processing architecture for ML data pipelines with real-time and historical analytics.
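
To make the pattern concrete, a compact sketch of a lambda-style feature read, assuming a batch view recomputed periodically and a speed layer of recent events (batch_view and speed_view are placeholder stores):

# Illustrative lambda-style read: batch view merged with streaming delta
def get_user_spend_feature(user_id: str, batch_view: dict, speed_view: dict) -> float:
    # Batch layer: precomputed aggregate, e.g. refreshed nightly by a Spark job
    batch_total = batch_view.get(user_id, 0.0)
    
    # Speed layer: events that arrived since the last batch recompute
    recent_events = speed_view.get(user_id, [])
    streaming_total = sum(event["amount"] for event in recent_events)
    
    # Serving layer merges both views for a near-real-time answer
    return batch_total + streaming_total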

Microservices for AI

AI microservices architecture with containerized models and API-driven inference services.

Edge AI Architecture

Distributed AI architecture with edge computing, model compression, and federated learning.

Multi-Model Serving

Platform architecture for serving multiple ML models with resource sharing and load balancing.
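
A minimal sketch of the routing idea; the replica objects and registration flow are assumptions for illustration:

# Illustrative multi-model router with round-robin load balancing
import itertools

class MultiModelRouter:
    def __init__(self):
        self.replica_pools = {}  # model name -> cycling iterator over replicas
    
    def register(self, model_name: str, replicas: list):
        # replicas: loaded model objects or backend endpoints sharing the node
        self.replica_pools[model_name] = itertools.cycle(replicas)
    
    def predict(self, model_name: str, features):
        if model_name not in self.replica_pools:
            raise KeyError(f"unknown model: {model_name}")
        replica = next(self.replica_pools[model_name])  # round-robin pick
        return replica.predict(features)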

AI Architecture Framework

MLOps Platform Architecture

ML Pipeline Design

Training Pipeline

training_pipeline:
  data_ingestion:
    sources:
      - data_lakes
      - streaming_data
      - external_apis
      - file_systems
    formats:
      - parquet
      - csv
      - json
      - avro
  
  data_preprocessing:
    validation:
      - schema_validation
      - data_quality_checks
      - outlier_detection
    transformation:
      - feature_engineering
      - data_cleaning
      - normalization
      - encoding
  
  model_training:
    frameworks:
      - tensorflow
      - pytorch
      - scikit_learn
      - xgboost
    compute:
      - gpu_clusters
      - distributed_training
      - hyperparameter_tuning
  
  model_evaluation:
    metrics:
      - accuracy
      - precision_recall
      - auc_roc
      - custom_metrics
    validation:
      - cross_validation
      - holdout_testing
      - temporal_validation

Model Registry Architecture

from datetime import datetime

class ModelRegistry:
    def __init__(self, storage_backend, metadata_db):
        self.storage = storage_backend
        self.metadata = metadata_db
    
    def register_model(self, model_info: ModelInfo):
        """Register new model version with metadata"""
        # Store model artifacts first so the registered version records
        # the actual upload location
        artifacts_path = self.storage.upload_artifacts(
            model_info.artifacts, 
            f"models/{model_info.name}/{model_info.version}"
        )
        
        model_version = ModelVersion(
            name=model_info.name,
            version=model_info.version,
            algorithm=model_info.algorithm,
            metrics=model_info.metrics,
            parameters=model_info.parameters,
            artifacts_path=artifacts_path,
            created_by=model_info.created_by,
            tags=model_info.tags
        )
        
        # Store metadata
        self.metadata.save_model_version(model_version)
        
        return model_version
    
    def get_model(self, name: str, version: str = None):
        """Retrieve model by name and version"""
        if version is None:
            version = self.get_latest_version(name)
        
        return self.metadata.get_model_version(name, version)
    
    def promote_model(self, name: str, version: str, stage: str):
        """Promote model to different stage (staging, production)"""
        model = self.get_model(name, version)
        model.stage = stage
        model.promoted_at = datetime.utcnow()
        
        self.metadata.update_model_version(model)
        
        # Trigger deployment pipeline
        self.trigger_deployment(model)

Model Serving Architecture

Real-time Inference System

# High-performance model serving with caching
import hashlib
import pickle
import time

import numpy as np
import redis
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()
# decode_responses must stay False: cached payloads are binary (pickled)
redis_client = redis.Redis(host='redis', port=6379, decode_responses=False)

class PredictionRequest(BaseModel):
    features: list
    model_name: str
    version: str = "latest"

class PredictionResponse(BaseModel):
    prediction: float
    confidence: float
    model_version: str
    inference_time_ms: float

class ModelServer:
    def __init__(self, model_registry):
        self.model_registry = model_registry  # client for the model registry
        self.models = {}  # in-process cache keyed by "model:name:version"
    
    async def load_model(self, name: str, version: str):
        """Load model from registry with caching"""
        cache_key = f"model:{name}:{version}"
        
        if cache_key in self.models:
            return self.models[cache_key]
        
        # Load from model registry
        model_artifacts = await self.model_registry.get_model(name, version)
        model = pickle.load(model_artifacts['model_file'])
        
        self.models[cache_key] = model
        return model
    
    async def predict(self, request: PredictionRequest):
        """Perform prediction with caching and monitoring"""
        start_time = time.time()
        
        # Feature preprocessing
        features = np.array(request.features).reshape(1, -1)
        feature_hash = hashlib.md5(str(features).encode()).hexdigest()
        
        # Check prediction cache
        cache_key = f"prediction:{request.model_name}:{feature_hash}"
        cached_prediction = redis_client.get(cache_key)
        
        if cached_prediction:
            return pickle.loads(cached_prediction)
        
        # Load model and predict
        model = await self.load_model(request.model_name, request.version)
        prediction = float(model.predict(features)[0])
        # predict_proba assumes a probabilistic classifier (e.g. scikit-learn)
        confidence = float(model.predict_proba(features).max())
        
        inference_time = (time.time() - start_time) * 1000
        
        response = PredictionResponse(
            prediction=prediction,
            confidence=confidence,
            model_version=request.version,
            inference_time_ms=inference_time
        )
        
        # Cache result
        redis_client.setex(cache_key, 3600, pickle.dumps(response))
        
        # Log metrics
        await self.log_prediction_metrics(request, response, inference_time)
        
        return response
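
For completeness, a minimal route wiring the server into FastAPI; the registry injection here is a placeholder, not part of the original server:

# Hypothetical endpoint wiring for ModelServer.predict
model_server = ModelServer(model_registry=None)  # inject a real registry client

@app.post("/predict", response_model=PredictionResponse)
async def predict_endpoint(request: PredictionRequest):
    try:
        return await model_server.predict(request)
    except KeyError:
        raise HTTPException(status_code=404, detail="Model not found")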

Batch Inference Architecture

batch_inference:
  data_sources:
    - data_warehouse
    - data_lake
    - streaming_data
  
  processing_framework:
    - apache_spark
    - apache_beam
    - dask
    - ray
  
  compute_resources:
    - kubernetes_jobs
    - aws_batch
    - azure_batch
    - google_cloud_dataflow
  
  output_destinations:
    - databases
    - data_warehouses
    - message_queues
    - file_systems
  
  scheduling:
    - cron_jobs
    - event_triggers
    - dependency_based
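
As one way to realize this configuration, a short PySpark scoring sketch; the paths and the load_model_from_registry helper are placeholders:

# Illustrative batch scoring job with Apache Spark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.appName("batch-inference").getOrCreate()

# Broadcast the trained model so each executor deserializes it once
model_bc = spark.sparkContext.broadcast(load_model_from_registry("churn", "1.0"))

@pandas_udf("double")
def score(features: pd.Series) -> pd.Series:
    model = model_bc.value
    return pd.Series(model.predict(np.vstack(features.values)))

df = spark.read.parquet("s3://lake/inference_input/")
df.withColumn("prediction", score(df["features"])) \
  .write.mode("overwrite").parquet("s3://lake/predictions/")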

Feature Store Architecture

Feature Pipeline Design

Online Feature Store

from datetime import datetime
from typing import Any, Dict, List
import pickle

class OnlineFeatureStore:
    def __init__(self, storage_backend):
        self.storage = storage_backend  # async key-value client, e.g. Redis or DynamoDB
    
    async def get_features(self, entity_ids: List[str], feature_names: List[str]):
        """Retrieve features for real-time inference"""
        features = {}
        
        for entity_id in entity_ids:
            entity_features = {}
            for feature_name in feature_names:
                key = f"feature:{entity_id}:{feature_name}"
                value = await self.storage.get(key)
                
                if value:
                    entity_features[feature_name] = pickle.loads(value)
                else:
                    # Fallback to offline computation or default value
                    entity_features[feature_name] = await self.compute_fallback_feature(
                        entity_id, feature_name
                    )
            
            features[entity_id] = entity_features
        
        return features
    
    async def write_features(self, features: Dict[str, Dict[str, Any]]):
        """Write features to online store"""
        pipeline = self.storage.pipeline()
        
        for entity_id, entity_features in features.items():
            for feature_name, feature_value in entity_features.items():
                key = f"feature:{entity_id}:{feature_name}"
                pipeline.setex(key, 86400, pickle.dumps(feature_value))  # 24h TTL
        
        await pipeline.execute()

# Offline feature store for training data
class OfflineFeatureStore:
    def __init__(self, data_warehouse):
        self.warehouse = data_warehouse
    
    def get_training_data(self, feature_view: FeatureView, start_date: datetime, end_date: datetime):
        """Generate training dataset with point-in-time correctness"""
        query = self.build_training_query(feature_view, start_date, end_date)
        return self.warehouse.execute_query(query)
    
    def build_training_query(self, feature_view: FeatureView, start_date: datetime, end_date: datetime):
        """Build SQL query with temporal joins for training data"""
        return f"""
        SELECT 
            e.entity_id,
            e.event_timestamp,
            {self.build_feature_columns(feature_view.features)}
        FROM {feature_view.entity_table} e
        {self.build_temporal_joins(feature_view.features)}
        WHERE e.event_timestamp BETWEEN '{start_date}' AND '{end_date}'
        ORDER BY e.entity_id, e.event_timestamp
        """

Feature Engineering Pipeline

Streaming Feature Computation

# Apache Kafka Streams for real-time feature engineering
from kafka import KafkaConsumer, KafkaProducer
import json

class StreamingFeatureProcessor:
    def __init__(self, input_topic: str, output_topic: str, feature_store: OnlineFeatureStore):
        self.consumer = KafkaConsumer(
            input_topic,
            bootstrap_servers=['kafka:9092'],
            auto_offset_reset='latest',
            value_deserializer=lambda x: json.loads(x.decode('utf-8'))
        )
        
        self.producer = KafkaProducer(
            bootstrap_servers=['kafka:9092'],
            value_serializer=lambda x: json.dumps(x).encode('utf-8')
        )
        
        self.output_topic = output_topic
        self.feature_store = feature_store  # injected online store (see above)
    
    async def process_events(self):
        """Process streaming events to compute real-time features.
        
        Note: KafkaConsumer iteration is blocking, so run this loop in a
        dedicated worker rather than on a shared event loop.
        """
        for message in self.consumer:
            event = message.value
            
            # Compute features from event
            features = await self.compute_features(event)
            
            # Write to feature store
            await self.feature_store.write_features({
                event['entity_id']: features
            })
            
            # Publish to output topic for downstream consumers
            self.producer.send(self.output_topic, {
                'entity_id': event['entity_id'],
                'features': features,
                'timestamp': event['timestamp']
            })
    
    async def compute_features(self, event: dict):
        """Compute features from raw event data"""
        features = {}
        
        # Example feature computations
        features['user_session_count'] = await self.get_session_count(event['user_id'])
        features['time_since_last_purchase'] = await self.time_since_last_purchase(event['user_id'])
        features['rolling_avg_transaction'] = await self.rolling_average(event['user_id'], 'transaction_amount', window='7d')
        
        return features

AI Infrastructure Architecture

GPU Cluster Management

Kubernetes for ML Workloads

# Kubernetes configuration for GPU-enabled ML training
apiVersion: v1
kind: Namespace
metadata:
  name: ml-workloads
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: model-training
  namespace: ml-workloads
spec:
  replicas: 1
  selector:
    matchLabels:
      app: model-training
  template:
    metadata:
      labels:
        app: model-training
    spec:
      containers:
      - name: trainer
        image: tensorflow/tensorflow:2.12.0-gpu
        resources:
          limits:
            nvidia.com/gpu: 2
            memory: "16Gi"
            cpu: "8"
          requests:
            nvidia.com/gpu: 2
            memory: "8Gi"
            cpu: "4"
        env:
        - name: CUDA_VISIBLE_DEVICES
          value: "0,1"
        volumeMounts:
        - name: model-data
          mountPath: /data
        - name: model-output
          mountPath: /output
      volumes:
      - name: model-data
        persistentVolumeClaim:
          claimName: ml-data-pvc
      - name: model-output
        persistentVolumeClaim:
          claimName: ml-output-pvc
      nodeSelector:
        accelerator: nvidia-tesla-v100

Distributed Training Architecture

# PyTorch distributed training setup
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

class DistributedTrainer:
    def __init__(self, model, train_loader, optimizer, criterion):
        self.model = model
        self.train_loader = train_loader
        self.optimizer = optimizer
        self.criterion = criterion
    
    def setup_distributed(self, rank, world_size):
        """Initialize distributed training"""
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'
        
        # Initialize the process group
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
        
        # Set device
        torch.cuda.set_device(rank)
        
        # Wrap model with DDP
        self.model = self.model.cuda(rank)
        self.model = DDP(self.model, device_ids=[rank])
    
    def train_epoch(self, epoch):
        """Train one epoch with distributed data parallel"""
        self.model.train()
        self.train_loader.sampler.set_epoch(epoch)
        
        for batch_idx, (data, target) in enumerate(self.train_loader):
            data, target = data.cuda(), target.cuda()
            
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.criterion(output, target)
            loss.backward()
            
            # Gradient synchronization happens automatically with DDP
            self.optimizer.step()
            
            if batch_idx % 100 == 0:
                print(f'Rank {dist.get_rank()}, Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.6f}')

def run_distributed_training(rank, world_size, model, train_loader, optimizer, criterion, epochs):
    trainer = DistributedTrainer(model, train_loader, optimizer, criterion)
    trainer.setup_distributed(rank, world_size)
    
    for epoch in range(epochs):
        trainer.train_epoch(epoch)
    
    # Cleanup
    dist.destroy_process_group()
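
A single-node launch sketch using mp.spawn; build_model and build_loader are placeholders, and in practice the model and DataLoader are often constructed inside each worker to avoid pickling them across processes:

# Illustrative launcher: one training process per local GPU
def main():
    world_size = torch.cuda.device_count()
    model = build_model()          # placeholder: construct your network
    train_loader = build_loader()  # placeholder: DataLoader with DistributedSampler
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()
    
    # mp.spawn calls run_distributed_training(rank, *args) in each process
    mp.spawn(
        run_distributed_training,
        args=(world_size, model, train_loader, optimizer, criterion, 10),
        nprocs=world_size,
        join=True
    )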

Model Monitoring and Observability

Performance Monitoring

Model Drift Detection

import numpy as np
from scipy import stats

class ModelDriftDetector:
    def __init__(self, reference_data, drift_threshold=0.05):
        self.reference_data = reference_data
        self.drift_threshold = drift_threshold
        self.baseline_stats = self.compute_baseline_stats()
    
    def compute_baseline_stats(self):
        """Compute baseline statistics from reference data"""
        return {
            'mean': np.mean(self.reference_data, axis=0),
            'std': np.std(self.reference_data, axis=0),
            'quantiles': np.quantile(self.reference_data, [0.25, 0.5, 0.75], axis=0)
        }
    
    def detect_drift(self, new_data):
        """Detect data drift using statistical tests"""
        drift_results = {}
        
        # Kolmogorov-Smirnov test for each feature
        for i in range(self.reference_data.shape[1]):
            ks_stat, p_value = stats.ks_2samp(
                self.reference_data[:, i], 
                new_data[:, i]
            )
            
            drift_results[f'feature_{i}'] = {
                'ks_statistic': ks_stat,
                'p_value': p_value,
                'drift_detected': p_value < self.drift_threshold
            }
        
        # Population Stability Index (PSI)
        psi_score = self.calculate_psi(self.reference_data, new_data)
        drift_results['psi_score'] = psi_score
        drift_results['psi_drift'] = psi_score > 0.2  # Common threshold
        
        return drift_results
    
    def calculate_psi(self, reference, new_data, bins=10):
        """Calculate Population Stability Index (computed here over the
        flattened array; in practice, compute it per feature)."""
        # Create bins based on reference data
        bin_edges = np.quantile(reference, np.linspace(0, 1, bins + 1))
        
        # Calculate expected and actual frequencies
        expected_freq = np.histogram(reference, bins=bin_edges)[0] / len(reference)
        actual_freq = np.histogram(new_data, bins=bin_edges)[0] / len(new_data)
        
        # Avoid division by zero
        expected_freq = np.where(expected_freq == 0, 0.0001, expected_freq)
        actual_freq = np.where(actual_freq == 0, 0.0001, actual_freq)
        
        # Calculate PSI
        psi = np.sum((actual_freq - expected_freq) * np.log(actual_freq / expected_freq))
        
        return psi

Model Performance Monitoring

from datetime import datetime

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

class ModelPerformanceMonitor:
    def __init__(self, model_name, metrics_backend):
        self.model_name = model_name
        self.metrics = metrics_backend
        self.alert_thresholds = {
            'accuracy': 0.85,
            'precision': 0.80,
            'recall': 0.80,
            'latency_p95': 100,  # milliseconds
            'error_rate': 0.05
        }
    
    def log_prediction(self, prediction_request, prediction_response, ground_truth=None):
        """Log prediction with performance metrics"""
        metrics = {
            'timestamp': datetime.utcnow(),
            'model_name': self.model_name,
            'model_version': prediction_response.model_version,
            'features': prediction_request.features,
            'prediction': prediction_response.prediction,
            'confidence': prediction_response.confidence,
            'inference_time_ms': prediction_response.inference_time_ms
        }
        
        if ground_truth is not None:
            metrics['ground_truth'] = ground_truth
            metrics['is_correct'] = (prediction_response.prediction == ground_truth)
        
        self.metrics.log_prediction(metrics)
    
    def compute_batch_metrics(self, predictions, ground_truth):
        """Compute batch performance metrics"""
        accuracy = accuracy_score(ground_truth, predictions)
        precision = precision_score(ground_truth, predictions, average='weighted')
        recall = recall_score(ground_truth, predictions, average='weighted')
        f1 = f1_score(ground_truth, predictions, average='weighted')
        
        batch_metrics = {
            'timestamp': datetime.utcnow(),
            'model_name': self.model_name,
            'sample_count': len(predictions),
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
        
        self.metrics.log_batch_metrics(batch_metrics)
        
        # Check for alerts
        self.check_performance_alerts(batch_metrics)
        
        return batch_metrics
    
    def check_performance_alerts(self, metrics):
        """Check if metrics breach alert thresholds"""
        alerts = []
        
        for metric_name, threshold in self.alert_thresholds.items():
            if metric_name in metrics:
                value = metrics[metric_name]
                
                # Different alert conditions for different metrics
                if metric_name in ['accuracy', 'precision', 'recall'] and value < threshold:
                    alerts.append(f"{metric_name.upper()} below threshold: {value:.3f} < {threshold}")
                elif metric_name in ['latency_p95', 'error_rate'] and value > threshold:
                    alerts.append(f"{metric_name.upper()} above threshold: {value:.3f} > {threshold}")
        
        if alerts:
            self.send_alerts(alerts)
    
    def send_alerts(self, alerts):
        """Send performance alerts to monitoring system"""
        for alert in alerts:
            self.metrics.send_alert({
                'severity': 'WARNING',
                'model_name': self.model_name,
                'message': alert,
                'timestamp': datetime.utcnow()
            })

Edge AI Architecture

Model Optimization for Edge Deployment

Model Compression Techniques

import tensorflow as tf

class ModelCompressor:
    def __init__(self, model):
        self.model = model
    
    def quantize_model(self, quantization_type='int8'):
        """Quantize model for edge deployment"""
        converter = tf.lite.TFLiteConverter.from_keras_model(self.model)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        
        if quantization_type == 'int8':
            # Full integer quantization additionally needs a calibration set:
            # converter.representative_dataset = <representative data fn>
            converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        # For 'dynamic', Optimize.DEFAULT alone applies dynamic range quantization
        
        return converter.convert()
    
    def prune_model(self, sparsity=0.5):
        """Apply magnitude-based pruning"""
        import tensorflow_model_optimization as tfmot
        
        pruning_params = {
            'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
                initial_sparsity=0.0,
                final_sparsity=sparsity,
                begin_step=0,
                end_step=1000
            )
        }
        
        pruned_model = tfmot.sparsity.keras.prune_low_magnitude(
            self.model, **pruning_params
        )
        
        return pruned_model
    
    def knowledge_distillation(self, teacher_model, temperature=3.0, alpha=0.7):
        """Return a distillation loss for training a smaller student model.
        
        Expects y_true to carry (teacher_logits, hard_labels); in practice the
        teacher logits are precomputed per batch (e.g. by mapping the training
        dataset through teacher_model), since a Keras loss only receives labels
        and predictions, not the raw inputs.
        """
        def distillation_loss(y_true, y_pred):
            teacher_logits, hard_labels = y_true
            
            # Soft targets from teacher, softened by temperature
            soft_targets = tf.nn.softmax(teacher_logits / temperature)
            soft_pred = tf.nn.softmax(y_pred / temperature)
            
            # Distillation loss on soft targets
            soft_loss = tf.keras.losses.categorical_crossentropy(soft_targets, soft_pred)
            
            # Standard loss on hard labels
            hard_loss = tf.keras.losses.categorical_crossentropy(hard_labels, y_pred)
            
            return alpha * soft_loss + (1 - alpha) * hard_loss
        
        return distillation_loss
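
Typical usage of the compressor; the trained Keras model is a stand-in for your own network:

# Illustrative usage: dynamic-range quantization of a trained model
compressor = ModelCompressor(trained_keras_model)  # placeholder model
tflite_bytes = compressor.quantize_model(quantization_type='dynamic')

with open('model.tflite', 'wb') as f:
    f.write(tflite_bytes)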

Federated Learning Architecture

import random

class FederatedLearningCoordinator:
    def __init__(self, global_model, client_fraction=0.1):
        self.global_model = global_model
        self.client_fraction = client_fraction
        self.clients = {}
        self.round_number = 0
    
    def register_client(self, client_id, client):
        """Register federated learning client"""
        self.clients[client_id] = client
    
    def federated_averaging(self, client_updates, client_weights):
        """Perform federated averaging of model updates"""
        # Weight updates by number of samples
        total_samples = sum(client_weights.values())
        
        # Initialize averaged weights
        averaged_weights = None
        
        for client_id, weights in client_updates.items():
            client_sample_weight = client_weights[client_id] / total_samples
            
            if averaged_weights is None:
                averaged_weights = [w * client_sample_weight for w in weights]
            else:
                for i, w in enumerate(weights):
                    averaged_weights[i] += w * client_sample_weight
        
        return averaged_weights
    
    def run_federated_round(self):
        """Execute one round of federated learning"""
        # Select random subset of clients
        selected_clients = random.sample(
            list(self.clients.keys()), 
            max(1, int(len(self.clients) * self.client_fraction))
        )
        
        # Send global model to selected clients
        global_weights = self.global_model.get_weights()
        client_updates = {}
        client_weights = {}
        
        for client_id in selected_clients:
            client = self.clients[client_id]
            
            # Client trains on local data
            local_weights, num_samples = client.train(global_weights)
            
            client_updates[client_id] = local_weights
            client_weights[client_id] = num_samples
        
        # Aggregate updates using federated averaging
        new_global_weights = self.federated_averaging(client_updates, client_weights)
        
        # Update global model
        self.global_model.set_weights(new_global_weights)
        
        self.round_number += 1
        
        return {
            'round': self.round_number,
            'selected_clients': selected_clients,
            'total_samples': sum(client_weights.values())
        }

class FederatedClient:
    def __init__(self, client_id, local_data, model_architecture):
        self.client_id = client_id
        self.local_data = local_data
        self.model = model_architecture()
    
    def train(self, global_weights, epochs=5):
        """Train local model on client data"""
        # Set global weights
        self.model.set_weights(global_weights)
        
        # Train on local data
        self.model.fit(
            self.local_data['x_train'],
            self.local_data['y_train'],
            epochs=epochs,
            batch_size=32,
            verbose=0
        )
        
        return self.model.get_weights(), len(self.local_data['x_train'])
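
A minimal driver loop tying coordinator and clients together; make_model and local_datasets are placeholders:

# Illustrative federated training driver
coordinator = FederatedLearningCoordinator(global_model=make_model())

for i, data in enumerate(local_datasets):
    client_id = f"client-{i}"
    coordinator.register_client(client_id, FederatedClient(client_id, data, make_model))

for _ in range(20):  # run 20 federated rounds
    summary = coordinator.run_federated_round()
    print(f"Round {summary['round']}: {len(summary['selected_clients'])} clients, "
          f"{summary['total_samples']} samples")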

Build Your AI Architecture

Ready to design scalable AI/ML systems with enterprise-grade MLOps? Let’s architect an AI platform that delivers reliable machine learning at scale.