#!/usr/bin/env bash
# LevelChat — one-line installer for self-host customers.
#
# Usage:
#   curl -fsSL https://get.levelchat.io/install.sh | sudo bash
#
# Optional environment variables:
#   LICENSE_JWT          Paste an Ed25519-signed license blob to unlock
#                        the paid feature set (no caps + no watermark).
#                        Without it the system runs in Community mode:
#                        5 participants per room, no recording, no
#                        broadcast, watermark on every tile.
#
#   LC_DOMAIN            The hostname the customer wants to use (e.g.
#                        "chat.acme.com"). If empty the installer
#                        registers a free `lc-<8hex>.lc-cloud.io`
#                        sub-domain so the system has working HTTPS
#                        before the customer points their own DNS.
#
#   LC_LETSENCRYPT_EMAIL Email address Let's Encrypt expiry warnings
#                        are sent to. Defaults to "ops@<LC_DOMAIN>".
#
#   LC_INSTALL_DIR       Where the docker-compose stack lives.
#                        Default: /opt/levelchat
#
#   LC_VERSION           Image tag pinned for every service. Default:
#                        the release tag in LC_VERSION_DEFAULT below
#                        (never "latest"); see /releases on GitHub.
#
#   LC_JOIN_LEADER       (Cluster mode) base URL of the existing
#                        leader to join. Triggers worker mode:
#                        registers this node with the leader, brings
#                        up only the SFU + cascade-friendly subset.
#
#   LC_JOIN_TOKEN        (Cluster mode) the 24h join token the
#                        operator generated from the leader's UI.
#
# What this does, end to end:
#   1. Detect OS, install Docker if missing (apt / dnf / apk).
#   2. Compute a stable instance fingerprint (machine-id ⊕ MAC ⊕
#      Postgres system_identifier — but Postgres isn't running yet,
#      so for first boot we use machine-id ⊕ primary MAC; the SFU
#      reads the final canonical fingerprint via shared-go/license
#      after Postgres comes up and the row is committed.
#   3. Pull docker-compose.yml from the LevelChat downloads bucket.
#   4. Generate /etc/levelchat/.env with cryptographically random
#      secrets for every service that needs one.
#   5. `docker compose up -d`. Wait for healthchecks.
#   6. (No license) print the access URL + a deep link to
#      /portal/license to paste a license later.
#      (License) JWT is written to /etc/levelchat/.env, services
#      pick it up on the next boot.
#
# Exit codes:
#   0   success
#   1   pre-flight failed (unsupported OS, missing root, no internet)
#   2   docker install failed
#   3   compose download / verification failed
#   4   stack failed to come up healthy
#   5   cluster join failed (worker mode only)

# Strict mode: abort on unhandled errors, on unset variables, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ── Constants ────────────────────────────────────────────────────────
# LC_DOWNLOADS_BASE is locked to the canonical levelchat.io URL by
# default. Production installs MUST not silently fetch from arbitrary
# mirrors — that's a supply-chain attack surface (a phished email
# saying "use this LC_DOWNLOADS_BASE=..." would let an attacker ship
# a modified compose file). Operators who run their own mirror
# (corporate egress restricted, air-gapped, …) set
# LC_ALLOW_DOWNLOADS_OVERRIDE=1 to opt in. The script logs a loud
# warning when override is honoured.
# Resolve the download origin.
#   - Unset/empty LC_DOWNLOADS_BASE  -> canonical URL.
#   - A trailing "/" is stripped so "https://get.levelchat.io/" is still
#     recognised as canonical (and so "<base>/file" never contains "//").
#   - Any other value is refused unless LC_ALLOW_DOWNLOADS_OVERRIDE=1, in
#     which case a warning naming the mirror is printed to stderr.
LC_DOWNLOADS_BASE_DEFAULT="https://get.levelchat.io"
LC_DOWNLOADS_BASE="${LC_DOWNLOADS_BASE:-}"
LC_DOWNLOADS_BASE="${LC_DOWNLOADS_BASE%/}"
LC_DOWNLOADS_BASE="${LC_DOWNLOADS_BASE:-$LC_DOWNLOADS_BASE_DEFAULT}"
if [ "$LC_DOWNLOADS_BASE" != "$LC_DOWNLOADS_BASE_DEFAULT" ]; then
  if [ "${LC_ALLOW_DOWNLOADS_OVERRIDE:-0}" != "1" ]; then
    echo "ERROR: LC_DOWNLOADS_BASE is set to a non-canonical URL ($LC_DOWNLOADS_BASE)." >&2
    echo "       This is a supply-chain risk. To opt in (e.g. air-gapped mirror)," >&2
    echo "       re-run with LC_ALLOW_DOWNLOADS_OVERRIDE=1." >&2
    exit 1
  fi
  # The output helpers (warn/err) are defined further down the file, so
  # this early warning is written with plain printf.
  printf '%s\n' \
    "WARNING: LC_ALLOW_DOWNLOADS_OVERRIDE=1 — install artifacts will be fetched from a NON-CANONICAL origin:" \
    "         $LC_DOWNLOADS_BASE" \
    "         Only continue if you operate and trust this mirror." >&2
fi

# Directory that holds the docker-compose stack.
: "${LC_INSTALL_DIR:=/opt/levelchat}"

# Image tag used when the operator does not set LC_VERSION. This is a
# pinned release tag rather than 'latest' so that repeating the same
# install command months apart brings up the same images instead of
# floating to a newer, possibly wire-incompatible build.
#
# Release checklist when bumping it:
#   1. Tag the monorepo commit (e.g. `v0.1.0`).
#   2. Build + push every service image with that tag to the registry.
#   3. Update LC_VERSION_DEFAULT here and ship a new install.sh.
LC_VERSION_DEFAULT="v0.1.0"
: "${LC_VERSION:=$LC_VERSION_DEFAULT}"

# Optional operator inputs; each becomes the empty string when it is
# unset so later code can reference it safely under `set -u`.
: "${LC_DOMAIN:=}"
: "${LC_LETSENCRYPT_EMAIL:=}"
: "${LICENSE_JWT:=}"
: "${LC_JOIN_LEADER:=}"
: "${LC_JOIN_TOKEN:=}"

# Phone-home opt-out. The installer contacts api.levelchat.io for
# auto-DNS (W7.9) by default. LC_NO_PHONE_HOME=1 disables every outbound
# LevelChat call (auto-DNS, version probe, telemetry) for customers in
# regulated environments (KVKK, HIPAA-adjacent, EU Schrems-II zones);
# the installer then requires a bring-your-own LC_DOMAIN.
: "${LC_NO_PHONE_HOME:=0}"

# ── Pretty output ────────────────────────────────────────────────────
# Colors only when stdout is a TTY (pipe-to-bash hides them).
# Start colourless; switch the ANSI sequences on only when stdout is a
# terminal. The values are stored as backslash escapes and expanded by
# printf in the output helpers.
C_RESET='' C_BOLD='' C_DIM='' C_GREEN='' C_YELLOW='' C_RED='' C_BLUE=''
if [ -t 1 ]; then
  C_RESET='\033[0m'
  C_BOLD='\033[1m'
  C_DIM='\033[2m'
  C_RED='\033[31m'
  C_GREEN='\033[32m'
  C_YELLOW='\033[33m'
  C_BLUE='\033[34m'
fi

# Output helpers. Each takes the message as $1. Colour sequences are
# passed as %b arguments (which expands their backslash escapes) and
# the message as %s, so a '%' or '\' in the message is printed literally.

# Section heading: "==> message".
step() {
  printf '%b==>%b %b%s%b\n' "${C_BOLD}${C_BLUE}" "${C_RESET}" "${C_BOLD}" "$1" "${C_RESET}"
}

# Success line (stdout).
ok() {
  printf '    %b✔%b %s\n' "${C_GREEN}" "${C_RESET}" "$1"
}

# Warning line (stdout).
warn() {
  printf '    %b⚠%b %s\n' "${C_YELLOW}" "${C_RESET}" "$1"
}

# Error line (stderr).
err() {
  printf '    %b✗%b %s\n' "${C_RED}" "${C_RESET}" "$1" >&2
}

# Dimmed detail line (stdout).
info() {
  printf '    %b%s%b\n' "${C_DIM}" "$1" "${C_RESET}"
}

# ── Input validation ────────────────────────────────────────────────
# Every operator-controllable env var that ends up in a config file
# or an HTTP body gets validated before use. Defence-in-depth — the
# values come from an interactive shell typed by a sysadmin who could
# fat-finger anything (paste a multi-line JWT with a stray newline,
# a domain with embedded JS via copy-paste from a wiki, etc.).

# Domain: RFC 1035 subset (alphanum, dots, hyphens; labels of 1–63
# chars with no leading/trailing dash; ≤ 253 chars overall).
# Refuses anything with spaces, quotes, or control chars.
#
# Arguments: $1 - candidate hostname (may be empty)
# Returns:   0 if empty or well-formed, non-zero otherwise
#
# The match uses bash's `[[ =~ ]]`, which tests the WHOLE string. A
# `printf | grep '^…$'` check is line-based: a value such as
# $'junk"\nchat.acme.com' would pass because one of its lines matches.
validate_domain() {
  local d="$1"
  local re='^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$'
  [ -z "$d" ] && return 0   # empty = "use auto-DNS or fallback", not invalid
  [ "${#d}" -le 253 ] || return 1
  [[ "$d" =~ $re ]]
}

# JWT: three base64url-encoded segments separated by `.`, at most 8192
# chars. Doesn't verify the signature — that happens server-side — just
# shape.
#
# Arguments: $1 - candidate token (may be empty)
# Returns:   0 if empty or well-formed, non-zero otherwise
#
# Matched as a whole string with `[[ =~ ]]` so a pasted multi-line value
# cannot pass on the strength of a single well-formed line.
validate_jwt() {
  local t="$1"
  local re='^[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$'
  [ -z "$t" ] && return 0
  [ "${#t}" -le 8192 ] || return 1
  [[ "$t" =~ $re ]]
}

# Cluster join token: 16 to 8192 characters drawn from the url-safe
# alphabet plus dots and dashes (covers a JWT or an opaque blob).
#
# Arguments: $1 - candidate token (may be empty)
# Returns:   0 if empty or well-formed, non-zero otherwise
#
# Matched as a whole string with `[[ =~ ]]`; a line-based grep would
# accept a multi-line value as long as one line looked valid.
validate_join_token() {
  local t="$1"
  local re='^[A-Za-z0-9_.-]+$'
  [ -z "$t" ] && return 0
  [ "${#t}" -le 8192 ] || return 1
  [ "${#t}" -ge 16 ] || return 1
  [[ "$t" =~ $re ]]
}

# URL: http(s) only, no embedded credentials (no `@`), no whitespace,
# at most 2048 chars.
#
# Arguments: $1 - candidate URL (may be empty)
# Returns:   0 if empty or well-formed, non-zero otherwise
#
# Matched as a whole string with `[[ =~ ]]`; a line-based grep would let
# a value with an embedded newline through if any one line matched.
validate_url() {
  local u="$1"
  local re='^https?://[^@[:space:]]+$'
  [ -z "$u" ] && return 0
  [ "${#u}" -le 2048 ] || return 1
  [[ "$u" =~ $re ]]
}

# Run all input validators up front, fail fast with a clear message.
#
# Globals (read): LC_DOMAIN, LICENSE_JWT, LC_JOIN_LEADER, LC_JOIN_TOKEN,
#                 LC_LETSENCRYPT_EMAIL
# Exits:          1 on the first malformed value (after printing why)
validate_inputs() {
  if ! validate_domain "$LC_DOMAIN"; then
    err "LC_DOMAIN is malformed: '$LC_DOMAIN'"
    exit 1
  fi
  if ! validate_jwt "$LICENSE_JWT"; then
    err "LICENSE_JWT is not a well-formed Ed25519 JWT (three base64url segments)"
    exit 1
  fi
  if ! validate_url "$LC_JOIN_LEADER"; then
    err "LC_JOIN_LEADER is not a valid http(s) URL: '$LC_JOIN_LEADER'"
    exit 1
  fi
  if ! validate_join_token "$LC_JOIN_TOKEN"; then
    err "LC_JOIN_TOKEN is malformed (need 16+ url-safe chars)"
    exit 1
  fi
  # Email: loose `local@domain.tld` shape with no whitespace and a
  # single `@`. Tested against the whole string — a line-based grep
  # would accept a multi-line value if any one line looked like an
  # address, and the value is later written into the env file.
  local email_re='^[^@[:space:]]+@[^@[:space:]]+\.[^@[:space:]]+$'
  if [ -n "$LC_LETSENCRYPT_EMAIL" ] && ! [[ "$LC_LETSENCRYPT_EMAIL" =~ $email_re ]]; then
    err "LC_LETSENCRYPT_EMAIL is malformed: '$LC_LETSENCRYPT_EMAIL'"
    exit 1
  fi
}

# JSON-safe encoding of arbitrary strings into a single JSON object.
#
# Raw `cat <<JSON` interpolation IS a vulnerability when the values
# come from operator env (a token containing `"` would corrupt the
# payload and could in pathological cases inject extra fields), so all
# payloads are built by a real JSON encoder.
#
# This is a thin alias for json_kv, which does the encoding (jq when
# available, python3 otherwise). It was previously an unfinished stub
# that failed unconditionally.
#
# Usage:   json_obj key1 value1 key2 value2 ...
# Outputs: the JSON object on stdout
# Returns: json_kv's exit status
json_obj() {
  json_kv "$@"
}

# Build a JSON object from N key/value pairs; every value is encoded as
# a JSON string.
#
# Usage:   json_kv key1 value1 key2 value2 ...
# Outputs: a single-line JSON object on stdout
# Returns: 0 on success; 1 if the argument count is odd or neither jq
#          nor python3 is available; otherwise the encoder's own
#          non-zero status if it fails
json_kv() {
  # A dangling key means the caller's argument list is broken; refuse
  # rather than silently drop the key or crash the encoder.
  if [ $(($# % 2)) -ne 0 ]; then
    err "json_kv: expected key/value pairs but got $# argument(s)"
    return 1
  fi
  if command -v jq >/dev/null 2>&1; then
    local -a jq_args=()
    local filter="{"
    local sep=""
    local i=0
    while [ $# -ge 2 ]; do
      # Keys and values are both bound as jq variables with generated
      # names ($k0/$v0, $k1/$v1, …) and never spliced into the filter
      # text, so any key — hyphens, quotes, spaces — is handled.
      jq_args+=("--arg" "k$i" "$1" "--arg" "v$i" "$2")
      filter+="${sep}(\$k$i):\$v$i"
      sep=","
      i=$((i + 1))
      shift 2
    done
    filter+="}"
    # ${arr[@]+…} keeps an empty array from tripping `set -u` on older bash.
    jq -nc ${jq_args[@]+"${jq_args[@]}"} "$filter"
    return   # propagate jq's exit status
  fi
  if command -v python3 >/dev/null 2>&1; then
    python3 -c '
import json, sys
args = sys.argv[1:]
print(json.dumps(dict(zip(args[0::2], args[1::2])), separators=(",", ":")))
' "$@"
    return   # propagate python3's exit status
  fi
  err "Need jq or python3 to safely build JSON payloads"
  return 1
}

# ── Pre-flight ───────────────────────────────────────────────────────
# Pre-flight checks: root, curl, supported distribution, supported CPU
# architecture, and outbound connectivity.
#
# Globals (read):    LC_DOWNLOADS_BASE
# Globals (written): PKG_MGR (apt|dnf|apk), ARCH, LC_DOWNLOADS_BASE
#                    (switched to the GitHub raw URL when the configured
#                    base's /healthz is unreachable but github.com is),
#                    plus every variable /etc/os-release defines (ID,
#                    VERSION_CODENAME, PRETTY_NAME, …), which is sourced
#                    into the current shell.
# Exits:             1 on any failed check
preflight() {
  step "Pre-flight checks"

  # Root or sudo
  if [ "$(id -u)" -ne 0 ]; then
    err "This installer must run as root."
    info "Re-run with: curl -fsSL ${LC_DOWNLOADS_BASE}/install.sh | sudo bash"
    exit 1
  fi
  ok "Running as root"

  # curl is used for every download and probe below; without this check
  # a missing binary would be misreported as "No internet access".
  if ! command -v curl >/dev/null 2>&1; then
    err "curl is required but was not found in PATH."
    exit 1
  fi

  # OS detection
  if [ ! -f /etc/os-release ]; then
    err "Cannot detect OS — /etc/os-release missing."
    exit 1
  fi
  # shellcheck disable=SC1091
  . /etc/os-release
  # ID and PRETTY_NAME are optional fields; default them so `set -u`
  # does not abort with a bare "unbound variable" on a sparse file.
  local distro="${ID:-}"
  case "$distro" in
    ubuntu|debian) PKG_MGR=apt ;;
    rhel|centos|rocky|almalinux|fedora) PKG_MGR=dnf ;;
    alpine) PKG_MGR=apk ;;
    *)
      err "Unsupported distribution: ${distro:-unknown}"
      info "Supported: Ubuntu, Debian, RHEL/CentOS/Rocky/AlmaLinux, Fedora, Alpine."
      info "If you're on something else, run docker compose manually with the"
      info "compose file at ${LC_DOWNLOADS_BASE}/docker-compose.self-host-small.yml"
      exit 1
      ;;
  esac
  ok "OS: ${PRETTY_NAME:-$distro} ($PKG_MGR)"

  # Architecture
  ARCH="$(uname -m)"
  case "$ARCH" in
    x86_64|aarch64|arm64) ;;
    *)
      err "Unsupported architecture: $ARCH"
      info "LevelChat ships images for amd64 and arm64 only."
      exit 1
      ;;
  esac
  ok "Architecture: $ARCH"

  # Internet: probe the configured download base first; if that fails
  # but github.com answers, continue against the GitHub raw mirror.
  if ! curl -fsS --max-time 5 -o /dev/null "${LC_DOWNLOADS_BASE}/healthz" 2>/dev/null; then
    if ! curl -fsS --max-time 5 -o /dev/null https://github.com 2>/dev/null; then
      err "No internet access to ${LC_DOWNLOADS_BASE} or github.com"
      exit 1
    fi
    warn "Could not reach ${LC_DOWNLOADS_BASE}/healthz; will fall back to GitHub raw"
    LC_DOWNLOADS_BASE="https://raw.githubusercontent.com/levelchat/levelchat/main/infra/install"
  fi
  ok "Internet OK"
}

# ── Docker install ───────────────────────────────────────────────────
# Install Docker Engine + the compose plugin from Docker's official
# repository using the detected package manager. Every step is checked
# explicitly: this function is called from an `if`, where `set -e` is
# not in effect.
#
# Globals (read): PKG_MGR, ID, VERSION_CODENAME (from /etc/os-release)
# Returns:        0 on success, 1 on the first failing step
_lc_install_docker_pkgs() {
  local distro="${ID:-}"
  local codename repo_distro repo_url
  case "${PKG_MGR:-}" in
    apt)
      # VERSION_CODENAME is optional in os-release; fall back to
      # lsb_release before giving up with a readable message.
      codename="${VERSION_CODENAME:-}"
      if [ -z "$codename" ] && command -v lsb_release >/dev/null 2>&1; then
        codename="$(lsb_release -cs 2>/dev/null || true)"
      fi
      if [ -z "$codename" ]; then
        err "Cannot determine the distribution codename (VERSION_CODENAME missing from /etc/os-release)"
        return 1
      fi
      export DEBIAN_FRONTEND=noninteractive
      apt-get update -qq || return 1
      apt-get install -y -qq ca-certificates curl gnupg || return 1
      install -m 0755 -d /etc/apt/keyrings || return 1
      # --batch --yes: overwrite a keyring left by an earlier, interrupted
      # run instead of prompting (which fails without a TTY).
      curl -fsSL "https://download.docker.com/linux/${distro}/gpg" \
        | gpg --batch --yes --dearmor -o /etc/apt/keyrings/docker.gpg || return 1
      chmod a+r /etc/apt/keyrings/docker.gpg || return 1
      printf 'deb [arch=%s signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/%s %s stable\n' \
        "$(dpkg --print-architecture)" "$distro" "$codename" \
        > /etc/apt/sources.list.d/docker.list || return 1
      apt-get update -qq || return 1
      apt-get install -y -qq docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin || return 1
      ;;
    dnf)
      # Docker publishes repos for centos, fedora and rhel only; the
      # CentOS-compatible rebuilds use the centos repo.
      case "$distro" in
        rocky|almalinux) repo_distro=centos ;;
        *) repo_distro="$distro" ;;
      esac
      repo_url="https://download.docker.com/linux/${repo_distro}/docker-ce.repo"
      dnf install -y -q dnf-plugins-core || return 1
      # dnf4 spells it `--add-repo`; dnf5 spells it `addrepo --from-repofile`.
      dnf config-manager --add-repo "$repo_url" 2>/dev/null \
        || dnf config-manager addrepo --from-repofile="$repo_url" \
        || return 1
      dnf install -y -q docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin || return 1
      ;;
    apk)
      apk add --quiet docker docker-cli-compose || return 1
      ;;
    *)
      err "Unknown package manager: ${PKG_MGR:-<unset>}"
      return 1
      ;;
  esac
}

# Make sure Docker and the `docker compose` plugin are available,
# installing them when they are not.
#
# Globals (read): PKG_MGR, ID, VERSION_CODENAME
# Exits:          2 if installation fails or the compose plugin is
#                 still missing afterwards
ensure_docker() {
  step "Docker"

  if command -v docker >/dev/null 2>&1 && docker compose version >/dev/null 2>&1; then
    ok "Docker $(docker --version | awk '{print $3}' | tr -d ',') already installed"
    return 0
  fi

  warn "Docker not detected; installing the official Docker Engine"
  if ! _lc_install_docker_pkgs; then
    err "Docker installation failed (package manager: ${PKG_MGR:-unknown})"
    exit 2
  fi

  # systemd hosts first, OpenRC (Alpine) second; best-effort because the
  # compose check below is the real gate.
  systemctl enable --now docker 2>/dev/null || rc-service docker start 2>/dev/null || true

  if ! docker compose version >/dev/null 2>&1; then
    # Single quotes: inside double quotes the backticks would be a
    # command substitution and actually run `docker compose`.
    err 'Docker installed but `docker compose` plugin is missing'
    exit 2
  fi
  ok "Docker $(docker --version | awk '{print $3}' | tr -d ',') installed"
}

# ── Fingerprint ──────────────────────────────────────────────────────
# Stable per-machine identifier. SHA-256 of:
#   machine-id  (per /etc/machine-id; survives reinstalls of the same OS)
#   ⊕ primary MAC address (changes on hardware swap, NOT on reboot)
#
# The SFU cluster also folds in the Postgres `system_identifier` after
# the DB is up — that's the canonical fingerprint stamped into JWT
# claims. This pre-DB version is a "best-effort" identifier used during
# install only, so the operator can verify they're at the right box
# during the install flow.
# Compute the install-time instance fingerprint.
#
# Globals (written): LC_FINGERPRINT — hex SHA-256 of "<machine-id>|<mac>"
# Side effects:      writes a random /etc/machine-id when neither
#                    /etc/machine-id nor /var/lib/dbus/machine-id yields
#                    a value
compute_fingerprint() {
  step "Instance fingerprint"
  local machine_id iface mac
  machine_id="$(cat /etc/machine-id 2>/dev/null || cat /var/lib/dbus/machine-id 2>/dev/null || echo "")"
  if [ -z "$machine_id" ]; then
    # Generate one if missing (Alpine sometimes lacks it)
    machine_id="$(head -c 16 /dev/urandom | od -An -tx1 | tr -d ' \n')"
    echo "$machine_id" > /etc/machine-id
    warn "Generated /etc/machine-id (was missing)"
  fi
  # Primary interface: first entry of `ip -o link` that is not loopback
  # or a container/bridge device. The trailing `|| true` matters: under
  # `set -e` + `pipefail` a missing or failing `ip` would otherwise
  # abort the installer here instead of reaching the "unknown" fallback.
  iface="$(ip -o link 2>/dev/null \
    | awk -F': ' '$2 !~ /^(lo|docker|veth|br-|cni|virbr)/ {print $2; exit}' \
    || true)"
  if [ -z "$iface" ]; then
    mac="unknown"
  else
    mac="$(cat /sys/class/net/"$iface"/address 2>/dev/null || echo unknown)"
  fi
  LC_FINGERPRINT="$(printf '%s|%s' "$machine_id" "$mac" | sha256sum | awk '{print $1}')"
  ok "Fingerprint: ${LC_FINGERPRINT:0:16}…"
  info "(SHA-256 of machine-id ⊕ primary MAC)"
}

# ── Domain ───────────────────────────────────────────────────────────
# Three paths in priority order:
#
#   1. LC_DOMAIN explicitly set — customer brought their own domain.
#      Use it as-is; Traefik will provision Let's Encrypt against it
#      on first boot.
#
#   2. Auto-DNS phone-home — POST to LevelChat's
#      /v1/billing/auto-dns/register with the fingerprint + this
#      server's public IP. The endpoint registers
#      `lc-<8hex>.<zone> → IP` in our authoritative DNS and returns
#      the hostname. Customer gets working HTTPS in ~30 s without
#      any DNS work on their side.
#
#   3. Local fallback — if the auto-DNS endpoint isn't configured
#      (cloud-side LC_CLOUDFLARE_API_TOKEN missing) or unreachable,
#      we synthesise the name client-side. The cert provisioning
#      will fail (the A-record doesn't actually exist) and the
#      operator must point their own domain at the box. We surface
#      this clearly so the customer isn't surprised.
# Decide which hostname the stack is served on.
#
# Globals (read):    LC_DOMAIN, LC_NO_PHONE_HOME, LC_FINGERPRINT,
#                    LC_DOWNLOADS_BASE
# Globals (written): LC_DOMAIN
# Exits:             1 when phone-home is disabled and no LC_DOMAIN given
resolve_domain() {
  step "Domain"
  if [ -n "$LC_DOMAIN" ]; then
    ok "Using customer-provided domain: $LC_DOMAIN"
    return 0
  fi

  # PRIVACY: phone-home is opt-out via LC_NO_PHONE_HOME=1. Customers
  # in regulated environments (KVKK, HIPAA-adjacent, EU Schrems-II
  # zones) may need to avoid sending the box's fingerprint + public
  # IP to api.levelchat.io. With phone-home disabled the operator
  # MUST provide LC_DOMAIN.
  if [ "$LC_NO_PHONE_HOME" = "1" ]; then
    err "LC_NO_PHONE_HOME=1 but no LC_DOMAIN was provided."
    info "Set LC_DOMAIN=chat.example.com (and a matching A-record at"
    info "your DNS provider pointing to this server's IP), then re-run."
    exit 1
  fi

  # The phone-home body contains exactly two fields: this server's
  # fingerprint (see compute_fingerprint) and its detected public IP.
  # The body is printed as `req_body=...` before it is sent.

  # Detect the public IP: ifconfig.io first, ipify as the fallback.
  local public_ip
  public_ip="$(curl -fsS --max-time 5 https://ifconfig.io 2>/dev/null \
    || curl -fsS --max-time 5 https://api.ipify.org 2>/dev/null \
    || echo '')"
  # The answer comes from a third party: strip whitespace and accept
  # only something shaped like an IPv4/IPv6 literal before it is logged
  # or sent anywhere.
  public_ip="${public_ip//[[:space:]]/}"
  local ip_re='^[0-9A-Fa-f:.]{3,45}$'
  if [ -n "$public_ip" ] && ! [[ "$public_ip" =~ $ip_re ]]; then
    warn "Public-IP lookup returned something that is not an IP address; ignoring it"
    public_ip=""
  fi

  if [ -n "$public_ip" ]; then
    info "Public IP: $public_ip"
    info "Phoning home for auto-DNS…"
    local req_body resp http_code
    local ok_flag="false"
    local hostname=""
    if req_body="$(json_kv fingerprint "$LC_FINGERPRINT" ip "$public_ip")"; then
      info "req_body=$req_body"
      resp="$(mktemp)"
      # No --fail: with it, curl prints the status via -w AND exits
      # non-zero on HTTP errors, so an `|| echo 000` fallback produced
      # values like "404000". The fallback is now an assignment that
      # replaces the value when curl itself fails.
      http_code="$(curl -sS -o "$resp" -w '%{http_code}' --connect-timeout 10 \
        -H 'Content-Type: application/json' \
        -X POST -d "$req_body" \
        "https://api.levelchat.io/v1/billing/auto-dns/register" 2>/dev/null)" \
        || http_code=000
      if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
        # Parse failures (non-JSON body) degrade to "declined" rather
        # than aborting the installer.
        if command -v jq >/dev/null 2>&1; then
          ok_flag="$(jq -r '.ok // false' "$resp" 2>/dev/null || echo false)"
          hostname="$(jq -r '.hostname // empty' "$resp" 2>/dev/null || true)"
        elif command -v python3 >/dev/null 2>&1; then
          # Python renders booleans as True/False; normalise to the
          # lowercase form compared against below.
          ok_flag="$(python3 -c 'import sys,json; v=json.load(sys.stdin).get("ok"); print("true" if v is True or v == "true" else "false")' < "$resp" 2>/dev/null || echo false)"
          hostname="$(python3 -c 'import sys,json; print(json.load(sys.stdin).get("hostname") or "")' < "$resp" 2>/dev/null || true)"
        fi
        # The hostname ends up in the env file and in URLs, so it must
        # pass the same validation as an operator-supplied LC_DOMAIN.
        if [ "$ok_flag" = "true" ] && [ -n "$hostname" ] && validate_domain "$hostname"; then
          LC_DOMAIN="$hostname"
          ok "Auto-DNS registered: $LC_DOMAIN → $public_ip"
          rm -f "$resp"
          return 0
        else
          warn "Auto-DNS endpoint declined (provider may not be configured)"
        fi
      fi
      rm -f "$resp"
    else
      warn "Cannot build the auto-DNS request (jq/python3 missing); skipping phone-home"
    fi
  fi

  # Fallback: synthesise the hostname locally. Traefik will fail
  # to get a Let's Encrypt cert against it because no A-record
  # points at the box — the operator MUST point a real domain or
  # re-run with LC_DOMAIN.
  local short="${LC_FINGERPRINT:0:8}"
  LC_DOMAIN="lc-${short}.lc-cloud.io"
  warn "Auto-DNS unavailable; using synthetic hostname $LC_DOMAIN"
  info "Point your real domain at this box's IP, then re-run with"
  info "  curl -fsSL ${LC_DOWNLOADS_BASE}/install.sh | sudo LC_DOMAIN=chat.example.com bash"
  info "(or set up an A-record for $LC_DOMAIN manually)"
}

# ── Random secrets ───────────────────────────────────────────────────
# Print a fresh random secret on stdout: 32 bytes from /dev/urandom,
# base64-encoded, with padding and newlines removed (43 characters, no
# trailing newline).
gen_secret() {
  local encoded
  encoded="$(head -c 32 /dev/urandom | base64)"
  encoded="${encoded//$'\n'/}"
  printf '%s' "${encoded//=/}"
}

# Print the value of KEY from an existing env file (first `KEY=` line),
# or nothing when the file or the key is absent.
#
# Arguments: $1 - env file path, $2 - key name
_lc_env_prev() {
  local file="$1" key="$2"
  [ -f "$file" ] || return 0
  awk -v k="$key" 'index($0, k "=") == 1 { print substr($0, length(k) + 2); exit }' "$file"
}

# Write /etc/levelchat/.env (mode 600, directory mode 700).
#
# On a first install every secret is freshly generated. When the file
# already exists it is backed up and its secrets are CARRIED OVER: the
# installer tells operators to re-run it (e.g. with a new LC_DOMAIN),
# and regenerating the datastore passwords, the at-rest config key or
# the JWT signing key on such a re-run would no longer match what the
# already-initialised services were set up with.
#
# Globals (read): LC_DOMAIN, LC_LETSENCRYPT_EMAIL, LC_VERSION,
#                 LC_FINGERPRINT, LICENSE_JWT, LC_JOIN_LEADER,
#                 LC_JOIN_TOKEN
write_env() {
  step "Write /etc/levelchat/.env"
  mkdir -p /etc/levelchat
  chmod 700 /etc/levelchat
  local env_file=/etc/levelchat/.env
  local reused=0
  local backup
  if [ -f "$env_file" ]; then
    # One timestamp for both the message and the copy, so the name that
    # is announced is the name that is written.
    backup="$env_file.bak.$(date +%s)"
    warn "$env_file already exists; backing up to $backup"
    cp "$env_file" "$backup"
    chmod 600 "$backup"
    reused=1
  fi

  # Previous value if there is one, otherwise a new random value.
  local pg_pass nats_pass redis_pass minio_user minio_pass jwt_priv config_key internal_secret
  pg_pass="$(_lc_env_prev "$env_file" POSTGRES_PASSWORD)"
  [ -n "$pg_pass" ] || pg_pass="$(gen_secret)"
  nats_pass="$(_lc_env_prev "$env_file" NATS_PASSWORD)"
  [ -n "$nats_pass" ] || nats_pass="$(gen_secret)"
  redis_pass="$(_lc_env_prev "$env_file" REDIS_PASSWORD)"
  [ -n "$redis_pass" ] || redis_pass="$(gen_secret)"
  minio_user="$(_lc_env_prev "$env_file" MINIO_ROOT_USER)"
  [ -n "$minio_user" ] || minio_user="lc-$(head -c 4 /dev/urandom | od -An -tx1 | tr -d ' \n')"
  minio_pass="$(_lc_env_prev "$env_file" MINIO_ROOT_PASSWORD)"
  [ -n "$minio_pass" ] || minio_pass="$(gen_secret)"
  config_key="$(_lc_env_prev "$env_file" LEVELCHAT_CONFIG_KEY)"
  [ -n "$config_key" ] || config_key="$(head -c 32 /dev/urandom | base64)"
  internal_secret="$(_lc_env_prev "$env_file" SELF_HOST_INTERNAL_SECRET)"
  [ -n "$internal_secret" ] || internal_secret="$(gen_secret)"

  # JWT signing — Ed25519 keypair generated via OpenSSL. The PEM is
  # written into the env (newlines replaced by |) so services can read
  # it as JWT_ED25519_PRIVATE. On a re-run the previous line is kept
  # as-is, even when it was empty.
  if [ "$reused" -eq 1 ] && grep -q '^JWT_ED25519_PRIVATE=' "$env_file"; then
    jwt_priv="$(_lc_env_prev "$env_file" JWT_ED25519_PRIVATE)"
  elif command -v openssl >/dev/null 2>&1; then
    # An OpenSSL build without Ed25519 fails here; with pipefail that
    # would abort the installer, so fall back to an empty key instead.
    jwt_priv="$(openssl genpkey -algorithm Ed25519 2>/dev/null | tr '\n' '|')" || jwt_priv=""
    if [ -z "$jwt_priv" ]; then
      warn "openssl could not generate an Ed25519 key; JWT signing key will be auto-generated by auth-svc on first boot (less reproducible)"
    fi
  else
    jwt_priv=""
    warn "openssl not found; JWT signing key will be auto-generated by auth-svc on first boot (less reproducible)"
  fi

  # Community mode is allowed exactly when no license was supplied.
  local allow_community=0
  if [ -z "$LICENSE_JWT" ]; then
    allow_community=1
  fi

  # Create the file private from the first byte rather than relying on
  # the chmod that follows.
  local prev_umask
  prev_umask="$(umask)"
  umask 077
  cat > "$env_file" <<EOF
# Generated by install.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)
# DO NOT commit. Backups under .bak.<unixtime>.

# ── Customer-facing domain ──────────────────────────────────────
LC_PUBLIC_DOMAIN=${LC_DOMAIN}
LC_LETSENCRYPT_EMAIL=${LC_LETSENCRYPT_EMAIL:-ops@${LC_DOMAIN}}
LC_VERSION=${LC_VERSION}

# ── Instance identity ───────────────────────────────────────────
LEVELCHAT_INSTANCE_FINGERPRINT=${LC_FINGERPRINT}
LEVELCHAT_CLUSTER_ID=self-host-${LC_FINGERPRINT:0:8}

# ── License (paste a JWT to unlock paid features) ───────────────
LEVELCHAT_LICENSE=${LICENSE_JWT}
# Set to 1 to allow boot WITHOUT a license — runs in Community mode
# (5 participants/room, no recording, no broadcast, watermark).
LEVELCHAT_ALLOW_COMMUNITY=${allow_community}

# ── Datastore secrets (random per install) ──────────────────────
POSTGRES_PASSWORD=${pg_pass}
NATS_PASSWORD=${nats_pass}
REDIS_PASSWORD=${redis_pass}
MINIO_ROOT_USER=${minio_user}
MINIO_ROOT_PASSWORD=${minio_pass}

# ── Cross-service auth ──────────────────────────────────────────
# JWT_ED25519_PRIVATE: PEM-encoded Ed25519 private key, newlines as |.
# auth-svc uses this to sign session JWTs; every service that
# verifies tokens reads JWT_ED25519_PUBLIC (derived at boot).
JWT_ED25519_PRIVATE=${jwt_priv}

# At-rest encryption key for in-Studio Stripe credentials, etc.
# 32 bytes base64-encoded.
LEVELCHAT_CONFIG_KEY=${config_key}

# Internal RPC HMAC — billing-svc ↔ landing trust each other via
# this shared secret. Keep equal to LEVELCHAT_BILLING_INTERNAL_SECRET.
SELF_HOST_INTERNAL_SECRET=${internal_secret}
LEVELCHAT_BILLING_INTERNAL_SECRET=${internal_secret}

# ── Cluster join (set when this is a worker, not a leader) ─────
LC_JOIN_LEADER=${LC_JOIN_LEADER}
LC_JOIN_TOKEN=${LC_JOIN_TOKEN}
EOF
  umask "$prev_umask"
  chmod 600 "$env_file"
  ok "Wrote $env_file (mode 600)"
  if [ "$reused" -eq 1 ]; then
    info "Secrets from the previous $env_file were kept (remove the file before re-running to rotate them)."
  else
    info "Random secrets generated for Postgres / NATS / Redis / MinIO / JWT signing."
  fi
}

# ── Cluster join phone-home ──────────────────────────────────────────
# When LC_JOIN_LEADER + LC_JOIN_TOKEN are both set, this installer is
# in *worker mode*: it doesn't bring up its own Postgres / NATS, it
# joins the leader's cluster and runs a media-sfu node attached to
# the leader's bus.
#
# Phone-home flow:
#   POST <LEADER>/v1/cluster/join
#     { token, version, ws_url, region, fingerprint }
#   ← 201
#     { node_id, node_secret, nats_url, postgres_replica_url, ... }
#
# We persist the response into /etc/levelchat/.env so the worker's
# media-sfu boots with the leader's NATS bus + Postgres replica
# credentials. The leader-side endpoint is implemented by
# services/control-plane/internal/http (production).
#
# If the phone-home fails (network, expired token, cap exceeded),
# the installer aborts with exit 5 — better than booting a
# half-configured worker that can never reach the leader.
#
# Token is single-use server-side (control-plane revokes it after
# redemption); a re-run after success is idempotent because we
# stash /etc/levelchat/cluster-join.json and skip the join call
# when it already exists.
# Print one top-level field of a JSON file (empty when absent or null).
#
# Arguments: $1 - JSON file, $2 - field name
# Returns:   non-zero when the file cannot be parsed or no parser exists
_lc_json_field() {
  if command -v jq >/dev/null 2>&1; then
    jq -r --arg f "$2" '.[$f] // empty' "$1"
  elif command -v python3 >/dev/null 2>&1; then
    python3 -c '
import json, sys
v = json.load(open(sys.argv[1])).get(sys.argv[2])
print("" if v is None else v)
' "$1" "$2"
  else
    return 2
  fi
}

# Register this node with an existing leader (worker mode) and record
# the credentials it returns.
#
# Globals (read): LC_JOIN_LEADER, LC_JOIN_TOKEN, LC_VERSION, LC_DOMAIN,
#                 LC_REGION (optional, default eu-fsn), LC_FINGERPRINT
# Files:          appends LC_NODE_ID / LC_NODE_SECRET /
#                 LC_JOIN_LEADER_NATS / LC_JOIN_LEADER_POSTGRES to
#                 /etc/levelchat/.env; stores the raw response as
#                 /etc/levelchat/cluster-join.json
# Returns:        0 in solo mode or after a successful join
# Exits:          5 on any join failure
join_cluster() {
  if [ -z "$LC_JOIN_LEADER" ] || [ -z "$LC_JOIN_TOKEN" ]; then
    info "Solo mode (no cluster join)"
    return 0
  fi
  step "Cluster join (worker mode)"
  info "Phoning home to $LC_JOIN_LEADER"

  local env_file=/etc/levelchat/.env
  local join_file=/etc/levelchat/cluster-join.json
  local resp
  local tmp_resp=""   # set only when resp is a temp file we must clean up

  if [ -f "$join_file" ]; then
    warn "/etc/levelchat/cluster-join.json exists; skipping re-join"
    if grep -q '^LC_NODE_ID=' "$env_file" 2>/dev/null; then
      return 0
    fi
    # The env file may have been rewritten since the original join, in
    # which case the cluster keys are gone. The join call is still
    # skipped; the keys are restored from the stored response instead.
    info "Cluster keys missing from $env_file; restoring them from the stored join response"
    resp="$join_file"
  else
    # SECURITY: build the JSON payload via json_kv (jq/python) rather
    # than by interpolating operator-provided values into a string.
    local req_body http_code
    req_body=$(json_kv \
      token       "$LC_JOIN_TOKEN" \
      version     "$LC_VERSION" \
      ws_url      "wss://${LC_DOMAIN}/v1/rtc/signal" \
      region      "${LC_REGION:-eu-fsn}" \
      fingerprint "$LC_FINGERPRINT") || {
      err "Could not build join request body — jq/python3 missing"
      exit 5
    }
    tmp_resp="$(mktemp)"
    resp="$tmp_resp"
    # - No --fail: with it curl prints the status via -w AND exits
    #   non-zero on HTTP errors, so an `|| echo 000` fallback produced
    #   "401000"/"000000" and none of the branches below could match.
    #   It also discarded the error body shown in the catch-all branch.
    # - The body goes in on stdin so the join token is not visible in
    #   the process list.
    http_code="$(curl -sS -o "$resp" -w '%{http_code}' --connect-timeout 10 \
      -H 'Content-Type: application/json' \
      -X POST --data-binary @- \
      "${LC_JOIN_LEADER%/}/v1/cluster/join" 2>/dev/null <<<"$req_body")" || http_code=000

    case "$http_code" in
      201|200)
        ok "Joined cluster"
        ;;
      401|403)
        err "Join rejected: token invalid or expired (HTTP $http_code)"
        info "Mint a fresh token from the leader's admin (Cluster → Add node)."
        rm -f "$tmp_resp"
        exit 5
        ;;
      402)
        err "Cluster cap reached: leader's plan refuses additional nodes"
        info "Bump the leader's subscription, then re-run with a new token."
        rm -f "$tmp_resp"
        exit 5
        ;;
      409)
        err "Fingerprint conflict: this box already has a node_license under another subscription"
        info "Either revoke the existing license, or wipe /etc/machine-id and re-run."
        rm -f "$tmp_resp"
        exit 5
        ;;
      000)
        err "Could not reach $LC_JOIN_LEADER — is the leader's API exposed on 443?"
        rm -f "$tmp_resp"
        exit 5
        ;;
      *)
        err "Unexpected response from leader: HTTP $http_code"
        cat "$resp" >&2
        rm -f "$tmp_resp"
        exit 5
        ;;
    esac
  fi

  # Pull the values we need out of the response (jq preferred, python3
  # as the fallback).
  if ! command -v jq >/dev/null 2>&1 && ! command -v python3 >/dev/null 2>&1; then
    err "Need jq or python3 to parse the join response"
    [ -z "$tmp_resp" ] || rm -f "$tmp_resp"
    exit 5
  fi
  local node_id node_secret leader_nats leader_pg
  if ! { node_id="$(_lc_json_field "$resp" node_id 2>/dev/null)" \
      && node_secret="$(_lc_json_field "$resp" node_secret 2>/dev/null)" \
      && leader_nats="$(_lc_json_field "$resp" nats_url 2>/dev/null)" \
      && leader_pg="$(_lc_json_field "$resp" postgres_replica_url 2>/dev/null)"; }; then
    err "Could not parse the leader's join response as JSON"
    [ -z "$tmp_resp" ] || rm -f "$tmp_resp"
    exit 5
  fi

  if [ -z "$node_id" ] || [ -z "$node_secret" ]; then
    err "Leader's response missing node_id / node_secret"
    [ -z "$tmp_resp" ] || rm -f "$tmp_resp"
    exit 5
  fi

  # These values are written verbatim as KEY=value lines; a newline or
  # other control character would let the response add or clobber
  # unrelated keys in the env file.
  local field
  for field in "$node_id" "$node_secret" "$leader_nats" "$leader_pg"; do
    case "$field" in
      *[[:cntrl:]]*)
        err "Leader's response contains control characters; refusing to write it to $env_file"
        [ -z "$tmp_resp" ] || rm -f "$tmp_resp"
        exit 5
        ;;
    esac
  done

  # Append (never overwrite) the cluster-specific keys: the secrets
  # already in the env file stay untouched.
  cat >> "$env_file" <<EOF

# ── Cluster join (auto-populated $(date -u +%Y-%m-%dT%H:%M:%SZ)) ─
LC_NODE_ID=${node_id}
LC_NODE_SECRET=${node_secret}
LC_JOIN_LEADER_NATS=${leader_nats}
LC_JOIN_LEADER_POSTGRES=${leader_pg}
EOF
  chmod 600 "$env_file"

  # Keep the raw response: its presence is what makes a later re-run
  # skip the (single-use) join call.
  if [ -n "$tmp_resp" ]; then
    mv "$tmp_resp" "$join_file"
  fi
  chmod 600 "$join_file"

  ok "Worker registered as $node_id"
  info "Bus: $leader_nats"
  info "Stored at /etc/levelchat/cluster-join.json"
}

# ── Compose download ─────────────────────────────────────────────────
# SECURITY: every fetched artifact is SHA-256-verified against a
# checksum file we publish alongside it. Without that, an attacker
# who compromises the CDN/origin can ship a modified compose that
# pulls a backdoored image. The `grep '^name: levelchat$'` check the
# previous version had is NOT a security check — it just rejects
# obviously-empty downloads.
#
# The checksum file (`docker-compose.self-host-small.yml.sha256`) is
# generated at release time:
#
#   sha256sum docker-compose.self-host-small.yml \
#     > docker-compose.self-host-small.yml.sha256
#
# A future hardening (W7-supplychain) is to sign that file with
# minisign / cosign so even origin compromise can't swap the
# checksum. We accept the lighter SHA-256 today because the
# attacker would need to compromise BOTH the YAML and the checksum
# file (separate uploads) — not airtight, but a real bar.
# Download the compose file, verify it against its published SHA-256,
# and install it as $LC_INSTALL_DIR/docker-compose.yml.
#
# Globals (read): LC_DOWNLOADS_BASE, LC_INSTALL_DIR,
#                 LC_INSECURE_SKIP_SHA (optional; 1 = skip verification)
# Exits:          3 on download, checksum or sanity-check failure.
#                 Temporary files are removed on every failure path.
fetch_compose() {
  step "Download docker-compose.yml"
  mkdir -p "$LC_INSTALL_DIR"
  local compose_url="${LC_DOWNLOADS_BASE}/docker-compose.self-host-small.yml"
  local sha_url="${compose_url}.sha256"
  local compose_file="$LC_INSTALL_DIR/docker-compose.yml"
  local tmp_yml="$compose_file.tmp"
  local tmp_sha="$compose_file.sha256.tmp"

  if ! curl -fsSL "$compose_url" -o "$tmp_yml"; then
    err "Failed to download compose: $compose_url"
    rm -f "$tmp_yml"
    exit 3
  fi

  # SHA-256 verification. We allow the operator to skip this ONLY
  # in dev mode (LC_INSECURE_SKIP_SHA=1) — production must verify.
  if [ "${LC_INSECURE_SKIP_SHA:-0}" = "1" ]; then
    warn "LC_INSECURE_SKIP_SHA=1 — skipping checksum verification (DEV ONLY)"
  else
    if ! curl -fsSL "$sha_url" -o "$tmp_sha"; then
      err "Failed to download checksum: $sha_url"
      info "(set LC_INSECURE_SKIP_SHA=1 to bypass — DEV ONLY, never on production hosts)"
      rm -f "$tmp_yml" "$tmp_sha"
      exit 3
    fi
    # The checksum file format is the same as `sha256sum` output:
    #   <hex>  filename
    # Only the hex column of the FIRST line is used (later lines are
    # ignored rather than glued onto it), lower-cased so an upper-case
    # digest still compares equal.
    local expected_sha actual_sha
    local sha_re='^[0-9a-f]{64}$'
    expected_sha="$(awk 'NR == 1 { print tolower($1) }' "$tmp_sha")"
    if ! [[ "$expected_sha" =~ $sha_re ]]; then
      err "Checksum file has unexpected shape (not a 64-hex SHA-256)"
      rm -f "$tmp_yml" "$tmp_sha"
      exit 3
    fi
    # `|| true`: with pipefail a missing sha256sum would fail the whole
    # assignment and abort under `set -e` before the shasum fallback
    # below could run.
    actual_sha="$(sha256sum "$tmp_yml" 2>/dev/null | awk '{print $1}' || true)"
    if [ -z "$actual_sha" ]; then
      # macOS doesn't ship sha256sum; try shasum -a 256.
      actual_sha="$(shasum -a 256 "$tmp_yml" 2>/dev/null | awk '{print $1}' || true)"
    fi
    if [ -z "$actual_sha" ]; then
      err "Cannot compute SHA-256 (need sha256sum or shasum)"
      rm -f "$tmp_yml" "$tmp_sha"
      exit 3
    fi
    if [ "$expected_sha" != "$actual_sha" ]; then
      err "SHA-256 mismatch — refusing to install"
      err "  expected: $expected_sha"
      err "  actual:   $actual_sha"
      info "This means either: (1) the file was tampered with in transit/at rest,"
      info "or (2) you fetched a stale checksum. Either way: STOP and investigate."
      rm -f "$tmp_yml" "$tmp_sha"
      exit 3
    fi
    ok "SHA-256 verified ($expected_sha)"
    rm -f "$tmp_sha"
  fi

  # Sanity check after SHA verification — defence-in-depth.
  if ! grep -q "^name: levelchat$" "$tmp_yml"; then
    err "Downloaded compose file does not look like LevelChat's (post-SHA sanity)"
    rm -f "$tmp_yml"
    exit 3
  fi

  mv "$tmp_yml" "$compose_file"
  ok "Saved to $compose_file"
}

# ── Boot ─────────────────────────────────────────────────────────────
# Print how many readiness problems one `docker compose ps --format json`
# snapshot contains (0 = everything green). Requires jq.
# Arguments:
#   $1 - raw ps output; either one JSON object per line (newer Compose)
#        or a single JSON array (older Compose v2 releases)
#   $2 - newline-separated service names that must be present; may be empty
# Outputs: the problem count on stdout
# Returns: jq's exit status (non-zero when the snapshot cannot be parsed)
_boot_count_not_ready() {
  local snapshot=$1 expected=$2
  # A problem is either a listed container that is not green (exited, has
  # a healthcheck that is not `healthy`, or has no healthcheck and is not
  # `running`) or an expected service with no container in the snapshot.
  # A missing/null Health field is treated like "" (no healthcheck).
  printf '%s\n' "$snapshot" | jq -s --arg expected "$expected" '
    map(if type == "array" then .[] else . end) as $rows
    | ($expected | split("\n") | map(select(length > 0))) as $want
    | ([$rows[]
        | (.Health // "") as $health
        | select(.State == "exited"
                 or ($health != "" and $health != "healthy")
                 or ($health == "" and .State != "running"))]
       | length)
      + (($want - [$rows[].Service]) | length)'
}

# Looser, jq-free variant of _boot_count_not_ready: counts snapshot lines
# that mention an `unhealthy` or `exited` container, plus every expected
# service whose name does not appear in the snapshot at all.
# Arguments: same as _boot_count_not_ready
# Outputs: the problem count on stdout
_boot_count_not_ready_nojq() {
  local snapshot=$1 expected=$2 bad svc
  # grep -c exits 1 when it counts zero matches; that is not an error here.
  bad="$(printf '%s\n' "$snapshot" \
    | grep -cE '"Health":"unhealthy"|"State":"exited"' || true)"
  bad="${bad:-0}"
  while IFS= read -r svc; do
    [ -n "$svc" ] || continue
    case "$snapshot" in
      *"\"Service\":\"$svc\""*) ;;
      *) bad=$((bad + 1)) ;;
    esac
  done <<<"$expected"
  printf '%s\n' "$bad"
}

# Pull images, start the compose stack, then poll until every service is
# ready or the deadline passes.
# Globals:  LC_INSTALL_DIR (read)
# Outputs:  progress messages via step/ok/info/warn/err; one "." per poll
# Returns:  0 once the stack is ready
# Exits:    4 when the install dir cannot be entered, `up` fails, or the
#           stack is still not ready after the deadline
boot_stack() {
  step "Bring up the stack (this can take 1–2 minutes on first boot)"
  if ! cd "$LC_INSTALL_DIR"; then
    err "Cannot enter install dir: $LC_INSTALL_DIR"
    exit 4
  fi
  # `--env-file` instead of symlink so docker compose stays in sync
  # with /etc/levelchat/.env without requiring the operator to chdir.
  if ! docker compose --env-file /etc/levelchat/.env pull --quiet; then
    warn "Image pull had issues; will try to use cached images"
  fi
  if ! docker compose --env-file /etc/levelchat/.env up -d; then
    err "docker compose up failed; see 'docker compose logs' for details"
    exit 4
  fi

  local elapsed=0
  local deadline=120
  ok "Stack started"
  info "Waiting up to $deadline s for services to report healthy…"

  # SECURITY/CORRECTNESS: "0 unhealthy + 0 starting = healthy" is not
  # enough. `docker compose ps` only lists running containers by default,
  # so a service that exited simply disappears from the snapshot and
  # would be counted as fine — the installer would then print a happy
  # banner pointing the operator at a dead URL.
  #
  # Rule: every service that has a healthcheck declared must report
  # `healthy`; services without healthcheck must report `running`; and
  # every service the compose file declares must appear in the snapshot,
  # so a missing service keeps the stack "not ready" until the deadline.
  local expected_services
  expected_services="$(docker compose --env-file /etc/levelchat/.env \
    config --services 2>/dev/null || true)"
  if [ -z "$expected_services" ]; then
    warn "Could not list expected services; missing services will not be detected"
  fi

  # Probe for jq once instead of on every poll, so the warning is not
  # repeated every 3 s.
  local have_jq=1
  if ! command -v jq >/dev/null 2>&1; then
    have_jq=0
    warn "jq not available; falling back to looser healthcheck"
  fi

  local ps_json not_ready
  while [ "$elapsed" -lt "$deadline" ]; do
    ps_json="$(docker compose --env-file /etc/levelchat/.env ps --format json 2>/dev/null || echo '')"
    if [ -z "$ps_json" ]; then
      sleep 3
      elapsed=$((elapsed + 3))
      continue
    fi
    if [ "$have_jq" -eq 1 ]; then
      # An unparsable snapshot counts as "not ready" rather than aborting.
      if ! not_ready="$(_boot_count_not_ready "$ps_json" "$expected_services" 2>/dev/null)"; then
        not_ready=1
      fi
      if [ "$not_ready" = "0" ]; then
        ok "All services healthy"
        return 0
      fi
    else
      not_ready="$(_boot_count_not_ready_nojq "$ps_json" "$expected_services")"
      # The looser check cannot see `starting`, so only accept once the
      # stack has had at least 30 s to settle.
      if [ "$not_ready" -eq 0 ] && [ "$elapsed" -ge 30 ]; then
        ok "Stack reports no failures (looser check)"
        return 0
      fi
    fi
    sleep 3
    elapsed=$((elapsed + 3))
    printf "."
  done
  printf "\n"
  warn "Some services aren't healthy after $deadline s."
  info "Run: docker compose --env-file /etc/levelchat/.env ps"
  info "  + docker compose --env-file /etc/levelchat/.env logs -f"
  info "Likely culprit: a service that exited or stayed in 'starting'."
  exit 4
}

# ── Banner ───────────────────────────────────────────────────────────
# Print the post-install summary on stdout: the service URLs derived from
# LC_DOMAIN, the instance details, the licensing mode (Community when
# LICENSE_JWT is empty, Licensed otherwise) and a few handy commands.
# Globals (read): C_BOLD C_GREEN C_YELLOW C_BLUE C_DIM C_RESET
#                 LC_DOMAIN LC_FINGERPRINT LC_INSTALL_DIR LICENSE_JWT
banner() {
  local rule="${C_BOLD}${C_GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${C_RESET}"
  local compose="docker compose --env-file /etc/levelchat/.env"

  # Headline, endpoints and instance details.
  printf '%s\n' \
    "" \
    "$rule" \
    "${C_BOLD}  LevelChat is running.${C_RESET}" \
    "$rule" \
    "" \
    "  ${C_BOLD}Studio${C_RESET}        https://app.${LC_DOMAIN}/console" \
    "  ${C_BOLD}API${C_RESET}           https://api.${LC_DOMAIN}" \
    "  ${C_BOLD}Signaling${C_RESET}     wss://ws.${LC_DOMAIN}/v1/rtc/signal" \
    "  ${C_BOLD}Docs${C_RESET}          https://docs.${LC_DOMAIN}" \
    "" \
    "  ${C_BOLD}Fingerprint${C_RESET}   ${LC_FINGERPRINT}" \
    "  ${C_BOLD}Install dir${C_RESET}   ${LC_INSTALL_DIR}" \
    "  ${C_BOLD}Env file${C_RESET}      /etc/levelchat/.env" \
    ""

  # Licensing mode: a non-empty LICENSE_JWT is reported as Licensed.
  if [ -n "$LICENSE_JWT" ]; then
    printf '%s\n' \
      "  ${C_GREEN}Mode${C_RESET}          ${C_BOLD}Licensed${C_RESET}  (full feature set)" \
      ""
  else
    printf '%s\n' \
      "  ${C_YELLOW}Mode${C_RESET}          ${C_BOLD}Community${C_RESET}  (5 participants/room, no recording, watermark)" \
      "" \
      "  Buy a license to unlock the full feature set:" \
      "    ${C_BLUE}https://levelchat.io/self-host${C_RESET}" \
      "  Then paste the JWT under Settings → License in Studio," \
      "  or set LEVELCHAT_LICENSE in /etc/levelchat/.env and restart." \
      ""
  fi

  # Day-two commands and support contacts.
  printf '%s\n' \
    "  ${C_DIM}Useful commands:${C_RESET}" \
    "    ${compose} ps" \
    "    ${compose} logs -f" \
    "    ${compose} restart <service>" \
    "" \
    "  Need help?  ${C_BLUE}https://docs.levelchat.io/self-host${C_RESET}" \
    "              ${C_BLUE}support@levelchat.io${C_RESET}"
}

# ── Main ─────────────────────────────────────────────────────────────
# Installer entry point: print the version header, then run every
# install stage in a fixed order.
# Globals (read): C_BOLD C_DIM C_RESET LC_VERSION
main() {
  printf "${C_BOLD}LevelChat installer${C_RESET} ${C_DIM}%s${C_RESET}\n\n" "$LC_VERSION"

  # The order below is deliberate:
  #  * validate_inputs is first (SECURITY): every operator-controlled
  #    env var is checked BEFORE any of them is used. A malformed input
  #    discovered mid-install would leave a half-configured state on
  #    disk; failing fast keeps the host clean.
  #  * join_cluster comes AFTER write_env (which seeded random secrets)
  #    but BEFORE fetch_compose / boot_stack, so a worker's services
  #    find LC_NODE_ID + LC_JOIN_LEADER_NATS in /etc/levelchat/.env at
  #    first boot. Solo installs (no LC_JOIN_LEADER) skip it silently.
  local -a _lc_pipeline=(
    validate_inputs
    preflight
    ensure_docker
    compute_fingerprint
    resolve_domain
    write_env
    join_cluster
    fetch_compose
    boot_stack
    banner
  )
  local _lc_stage
  for _lc_stage in "${_lc_pipeline[@]}"; do
    "$_lc_stage"
  done
}

# Script entry point: run the installer, passing through any
# command-line arguments the script was invoked with.
main "$@"
