Add flow-0.10.6 and update documentation

2026-03-23 22:03:39 +00:00
parent 03a232e342
commit ef328d64b0
3 changed files with 352 additions and 98 deletions
--- a/examples/values.yaml
+++ b/examples/values.yaml
@@ -136,6 +136,18 @@ global:
        connectionLifetime: 0
        # -- Connection idle timeout in seconds
        connectionIdleLifetime: 300
+      # -- Read replica configuration (for read/write splitting)
+      # When using HA PostgreSQL (internal or external), read-heavy queries can be
+      # routed to replicas to reduce load on the primary.
+      readReplica:
+        # -- Enable read replica connection (requires HA PostgreSQL)
+        enabled: false
+        # -- Read replica host (auto-resolved for internal HA, set FQDN for external)
+        # For internal HA: leave empty to auto-resolve to the read service
+        # For external: set to your read replica endpoint (e.g., "myserver-replica.postgres.database.azure.com")
+        host: ""
+        # -- Read replica port (0 = use the same port as the primary)
+        port: 0

  # -- RabbitMQ configuration
  rabbitmq:
@@ -163,8 +175,11 @@ global:
    existingSecretKey: "rabbitmq-password"
    # -- Virtual host
    vhost: "/"
-    # -- Prefetch count
-    prefetch: 1
+    # -- Prefetch count (maximum number of unacknowledged messages per consumer)
+    # Higher values increase throughput at the cost of less fair distribution.
+    # For enterprise workloads handling millions of messages, 20 provides a good
+    # balance between throughput and fair load distribution across consumers.
+    prefetch: 20
    # -- Activity exchange name
    activityExchange: "workflow.activities"
    # -- Completed exchange name
@@ -290,6 +305,16 @@ global:
    tenantRegistry: "http://{{ .Release.Name }}-tenant-registry.{{ .Release.Namespace }}.svc.cluster.local:80"
    aiAssistant: "http://{{ .Release.Name }}-ai-assistant.{{ .Release.Namespace }}.svc.cluster.local:80"

+# -- OpenTelemetry configuration for distributed tracing and metrics
+# Requires an OTLP-compatible collector (e.g., Grafana Alloy, Jaeger, OpenTelemetry Collector)
+openTelemetry:
+  # -- Enable OpenTelemetry tracing and metrics for all services
+  enabled: false
+  # -- OTLP exporter endpoint (4317 is the conventional OTLP/gRPC port; OTLP/HTTP typically uses 4318)
+  endpoint: "http://otel-collector:4317"
+  # -- OTLP protocol: grpc or http/protobuf
+  protocol: "grpc"
+
 # =============================================================================
 # Core Services
 # =============================================================================
@@ -321,17 +346,19 @@ workflowEngine:

  resources:
    limits:
-      cpu: 500m
-      memory: 512Mi
+      cpu: 1000m
+      memory: 1Gi
    requests:
-      cpu: 100m
-      memory: 256Mi
+      cpu: 250m
+      memory: 512Mi

  autoscaling:
    enabled: false
    minReplicas: 1
-    maxReplicas: 5
-    targetCPUUtilizationPercentage: 80
+    maxReplicas: 30
+    targetCPUUtilizationPercentage: 75
+    # -- Target memory utilization percentage (optional, enable for memory-intensive workloads)
+    # targetMemoryUtilizationPercentage: 80

  nodeSelector: {}
  tolerations: []
@@ -342,6 +369,10 @@ workflowEngine:
    concurrencyMaxRetries: 5
    concurrencyBaseDelayMs: 100
    concurrencyJitterMs: 50
+    # -- Maximum number of activity results to process concurrently per pod
+    # Higher values increase throughput but require more DB connections.
+    # Set this <= prefetch count and <= primary pool size.
+    maxConcurrentActivityResults: 10

  # -- Tenant settings provider configuration
  tenantSettings:
@@ -372,10 +403,33 @@ workflowEngine:
    # -- How often to run cleanup (in minutes, default 60 = 1 hour)
    checkIntervalMinutes: 60
    # -- Max workflows to delete per tenant per run
-    batchSize: 100
+    # For enterprise workloads with thousands of concurrent flows, a higher batch
+    # size keeps cleanup from falling behind the workflow creation rate.
+    batchSize: 1000
    # -- How long to hold leadership lease (in minutes)
    leaseDurationMinutes: 5

+  # -- Rate limiting configuration
+  rateLimiting:
+    # -- Enable per-tenant rate limiting
+    enabled: true
+    # -- Maximum requests per tenant per minute
+    requestsPerMinute: 1000
+    # -- Queue limit (0 = reject immediately when limit reached)
+    queueLimit: 0
+
+  # -- Load shedding configuration
+  loadShedding:
+    # -- Enable load shedding (reject requests when overloaded)
+    enabled: false
+    # -- Maximum concurrent requests before shedding
+    maxConcurrentRequests: 500
+
+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  # -- Additional environment variables
  extraEnv: []

@@ -389,12 +443,12 @@ workflowEngine:
 activityRegistry:
  enabled: true
  replicaCount: 1
-  
+
  image:
    repository: flow/activityregistry
    tag: ""
    pullPolicy: IfNotPresent
-  
+
  service:
    type: ClusterIP
    port: 80
@@ -412,22 +466,27 @@ activityRegistry:

  resources:
    limits:
-      cpu: 250m
-      memory: 256Mi
+      cpu: 500m
+      memory: 512Mi
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 100m
+      memory: 256Mi

  autoscaling:
    enabled: false
    minReplicas: 1
-    maxReplicas: 3
-    targetCPUUtilizationPercentage: 80
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 75

  nodeSelector: {}
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -436,12 +495,12 @@ activityRegistry:
 definitionStore:
  enabled: true
  replicaCount: 1
-  
+
  image:
    repository: flow/definitionstore
    tag: ""
    pullPolicy: IfNotPresent
-  
+
  service:
    type: ClusterIP
    port: 80
@@ -459,22 +518,27 @@ definitionStore:

  resources:
    limits:
-      cpu: 250m
-      memory: 256Mi
+      cpu: 500m
+      memory: 512Mi
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 100m
+      memory: 256Mi

  autoscaling:
    enabled: false
    minReplicas: 1
-    maxReplicas: 3
-    targetCPUUtilizationPercentage: 80
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 75

  nodeSelector: {}
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -483,12 +547,12 @@ definitionStore:
 workflowLogging:
  enabled: true
  replicaCount: 1
-  
+
  image:
    repository: flow/workflowlogging
    tag: ""
    pullPolicy: IfNotPresent
-  
+
  service:
    type: ClusterIP
    port: 80
@@ -506,22 +570,27 @@ workflowLogging:

  resources:
    limits:
-      cpu: 250m
-      memory: 256Mi
+      cpu: 500m
+      memory: 512Mi
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 100m
+      memory: 256Mi

  autoscaling:
    enabled: false
    minReplicas: 1
-    maxReplicas: 3
-    targetCPUUtilizationPercentage: 80
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 75

  nodeSelector: {}
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -530,12 +599,12 @@ workflowLogging:
 connectionStore:
  enabled: true
  replicaCount: 1
-  
+
  image:
    repository: flow/connectionstore
    tag: ""
    pullPolicy: IfNotPresent
-  
+
  service:
    type: ClusterIP
    port: 80
@@ -553,22 +622,27 @@ connectionStore:

  resources:
    limits:
-      cpu: 250m
-      memory: 256Mi
+      cpu: 500m
+      memory: 512Mi
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 100m
+      memory: 256Mi

  autoscaling:
    enabled: false
    minReplicas: 1
-    maxReplicas: 3
-    targetCPUUtilizationPercentage: 80
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 75

  nodeSelector: {}
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -605,6 +679,11 @@ aiAssistant:
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -613,12 +692,12 @@ aiAssistant:
 tenantRegistry:
  enabled: true
  replicaCount: 1
-  
+
  image:
    repository: flow/tenantregistry
    tag: ""
    pullPolicy: IfNotPresent
-  
+
  service:
    type: ClusterIP
    port: 80
@@ -636,22 +715,27 @@ tenantRegistry:

  resources:
    limits:
-      cpu: 250m
-      memory: 256Mi
+      cpu: 500m
+      memory: 512Mi
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 100m
+      memory: 256Mi

  autoscaling:
    enabled: false
    minReplicas: 1
-    maxReplicas: 3
-    targetCPUUtilizationPercentage: 80
+    maxReplicas: 10
+    targetCPUUtilizationPercentage: 75

  nodeSelector: {}
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -714,6 +798,11 @@ frontendWeb:
  tolerations: []
  affinity: {}

+  # -- Pod disruption budget
+  podDisruptionBudget:
+    enabled: false
+    minAvailable: 1
+
  extraEnv: []
  extraVolumeMounts: []
  extraVolumes: []
@@ -722,20 +811,48 @@ frontendWeb:
 # Activity Services
 # =============================================================================

+# -- Dedicated activity worker groups for tenant workload isolation
+# Each group deploys additional activity workers that consume from group-specific queues.
+# Configure tenants to route to a group via TenantSettings.QueueGroup in TenantRegistry.
+# Example: A tenant with QueueGroup="enterprise-a" will have messages routed to
+# queue "activity.{name}.execute.enterprise-a" instead of the shared queue.
+dedicatedActivityGroups: []
+# Example configuration:
+# dedicatedActivityGroups:
+#   - name: "enterprise-a"
+#     # Override common resources for this group
+#     resources:
+#       limits:
+#         cpu: 1000m
+#         memory: 1Gi
+#       requests:
+#         cpu: 250m
+#         memory: 512Mi
+#     # Activity workers to deploy for this group
+#     activities:
+#       - name: httprequest
+#         image:
+#           repository: flow/httprequestactivity
+#         replicaCount: 3
+#       - name: sql
+#         image:
+#           repository: flow/sqlactivity
+#         replicaCount: 2
+
 # -- Activity services common configuration
 activities:
  # -- Common image settings for activity services
  image:
    pullPolicy: IfNotPresent
-  
+
  # -- Common resource settings for activity services
  resources:
    limits:
-      cpu: 250m
-      memory: 256Mi
+      cpu: 500m
+      memory: 512Mi
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 100m
+      memory: 256Mi

  # -- Common service settings
  service:
@@ -987,6 +1104,9 @@ postgresql:
  # -- Primary node configuration
  primary:
    # -- PostgreSQL configuration parameters
+    # These defaults are tuned for standalone mode with 1Gi RAM.
+    # For HA/enterprise workloads, override with values appropriate for your resource limits.
+    # See values-enterprise.yaml for enterprise-tuned configuration.
    configuration: |
      max_connections = 200
      shared_buffers = 256MB
@@ -1037,23 +1157,23 @@ postgresql:
  
  # -- Replica/standby configuration (used when mode=ha)
  replica:
-    # -- Number of read replicas
-    replicaCount: 1
-    
+    # -- Number of read replicas (>= 2 recommended for enterprise workloads)
+    replicaCount: 2
+
    # -- Hot standby settings
    configuration: |
      hot_standby = on
      max_standby_streaming_delay = 30s
      wal_receiver_status_interval = 10s
      hot_standby_feedback = on
-    
+
    resources:
      limits:
-        cpu: 500m
-        memory: 512Mi
+        cpu: 1000m
+        memory: 1Gi
      requests:
-        cpu: 100m
-        memory: 256Mi
+        cpu: 250m
+        memory: 512Mi
    
    persistence:
      enabled: true
@@ -1368,6 +1488,35 @@ serviceAccount:
 # -- Pod annotations
 podAnnotations: {}

+# =============================================================================
+# KEDA Autoscaling (Optional)
+# =============================================================================
+# Requires KEDA to be installed in the cluster (https://keda.sh)
+# When enabled, KEDA ScaledObjects replace standard HPAs for message-driven scaling
+# based on RabbitMQ queue depth rather than CPU/memory metrics.
+
+keda:
+  # -- Enable KEDA-based autoscaling
+  enabled: false
+  # -- WorkflowEngine scaling based on results queue depth
+  workflowEngine:
+    # -- Queue to monitor for scaling decisions
+    queueName: "workflowengine.results"
+    # -- Target queue length per replica (scale up when exceeded)
+    queueLength: 50
+    # -- Seconds to wait before scaling down after queue drains
+    cooldownPeriod: 60
+    # -- How often KEDA checks queue depth (in seconds)
+    pollingInterval: 15
+  # -- Activity worker scaling based on execute queue depth
+  activities:
+    # -- Target queue length per replica
+    queueLength: 20
+    # -- Seconds to wait before scaling down
+    cooldownPeriod: 60
+    # -- How often KEDA checks queue depth (in seconds)
+    pollingInterval: 15
+
 # =============================================================================
 # Network Policies (SOC2/NIS2 Compliance)
 # =============================================================================
@@ -1808,3 +1957,60 @@ tls:
      additionalDnsNames: []
    frontendWeb:
      additionalDnsNames: []
+
+# =============================================================================
+# Monitoring (Prometheus Operator + Grafana)
+# =============================================================================
+
+# -- Prometheus monitoring integration
+# Requires Prometheus Operator (kube-prometheus-stack) installed in the cluster
+monitoring:
+  # -- Create PrometheusRule for alerting
+  prometheusRules:
+    enabled: false
+    # -- Additional labels for PrometheusRule (for rule selection in Prometheus)
+    additionalLabels: {}
+  # -- Create ServiceMonitor for auto-discovery
+  serviceMonitor:
+    enabled: false
+    # -- Scrape interval
+    interval: "30s"
+    # -- Additional labels for ServiceMonitor
+    additionalLabels: {}
+  # -- Deploy Grafana dashboards as ConfigMaps
+  grafanaDashboards:
+    enabled: false
+    # -- Additional labels for dashboard ConfigMaps
+    additionalLabels: {}
+
+# =============================================================================
+# External Secrets Operator
+# =============================================================================
+
+# -- External Secrets Operator integration for secrets management
+# Requires ESO (external-secrets.io) installed in the cluster
+externalSecrets:
+  # -- Enable ExternalSecret resources
+  enabled: false
+  # -- SecretStore or ClusterSecretStore name
+  secretStore: ""
+  # -- Kind of secret store: SecretStore or ClusterSecretStore
+  secretStoreKind: "ClusterSecretStore"
+  # -- How often to sync secrets
+  refreshInterval: "1h"
+  # -- PostgreSQL password reference
+  postgres:
+    key: "flow/database"
+    property: "password"
+  # -- RabbitMQ password reference
+  rabbitmq:
+    key: "flow/rabbitmq"
+    property: "password"
+  # -- Redis password reference
+  redis:
+    key: "flow/redis"
+    property: "password"
+  # -- Azure AD client secret reference
+  azureAd:
+    key: "flow/azure-ad"
+    property: "client-secret"