"""파이프라인 YAML dict 검증 모듈.

설계서 Section 3-2 기반 11개 항목 검증:
  1. schema_version 존재 + 지원 버전("1.0")
  2. gates 최소 1개 이상
  3. token_budget 존재 + 양수
  4. blast_radius 유효값("step"|"team"|"org")
  5. allowed_teams 비어있지 않음
  6. 순환 DAG 검출 (Kahn's algorithm, 자기 참조 포함)
  7. 시크릿 패턴 탐지
  8. 모든 step의 target_team이 allowed_teams 내에 존재
  9. 모든 depends_on 참조가 유효한 step id
  10. 모든 step의 task_desc가 injection_guard.check_content() 통과
  11. inject_context.source 경로가 WORKSPACE_ROOT 하위인지 검증 (path traversal 방지)

작성자: 토르 (dev2-team 백엔드 개발자)
날짜: 2026-03-24
"""

import os
import re
import sys
from collections import deque

sys.path.insert(0, "/home/jay/workspace")
from utils.injection_guard import InjectionBlockedError, check_content  # noqa: E402

# Directory that inject_context.source paths are joined onto and must stay inside
# (see check 11 in validate_pipeline).
WORKSPACE_ROOT = "/home/jay/workspace"
# schema_version values accepted by validate_pipeline; membership is tested
# against these exact string values.
_SUPPORTED_SCHEMA_VERSIONS = {"1.0"}
# Allowed values for the optional blast_radius field.
_VALID_BLAST_RADIUS = {"step", "team", "org"}
# Regular expressions that mark a string as containing a secret. scan_secrets()
# applies each one with re.search and re.IGNORECASE, so a match anywhere in the
# text, in any letter case, counts.
_SECRET_PATTERNS: list[str] = [
    r"AWS_ACCESS_KEY_ID=AKIA",
    r"PRIVATE_KEY",
    r"password=",
    r"secret=",
    r"BEGIN RSA PRIVATE KEY",
    r"API_KEY=",
]


def scan_secrets(text: str) -> bool:
    """Return True if *text* matches at least one known secret pattern.

    Every entry of ``_SECRET_PATTERNS`` is tried with ``re.search`` using
    ``re.IGNORECASE``; evaluation stops at the first pattern that matches.
    """
    return any(
        re.search(candidate, text, re.IGNORECASE) is not None
        for candidate in _SECRET_PATTERNS
    )


def validate_dag(steps: list[dict]) -> list[str]:
    """Detect circular dependencies between steps using Kahn's algorithm.

    Args:
        steps: Step dicts. Each may carry an ``"id"`` and a ``"depends_on"``
            list of step ids.

    Returns:
        A list of error messages; an empty list means no cycle was found.
        Self-references are reported one message per occurrence and
        short-circuit the general cycle check. Any other cycle yields a
        single generic message.

    Notes:
        * Steps without an ``"id"`` are ignored: nothing can depend on them,
          so they can never be part of a cycle. (Previously such a step with
          a valid dependency crashed with ``KeyError``.)
        * A missing or null ``depends_on`` is treated as "no dependencies".
        * Dependencies on unknown ids are ignored here; reporting them is
          left to the caller.
    """
    errors: list[str] = []

    # Only id-bearing steps are graph nodes; normalise depends_on so that a
    # YAML null (None) behaves like an empty list.
    nodes = [
        (step["id"], step.get("depends_on") or [])
        for step in steps
        if "id" in step
    ]
    step_ids = {sid for sid, _ in nodes}

    # Self-reference check: reported separately so the message can name the step.
    for sid, deps in nodes:
        for dep in deps:
            if dep == sid:
                errors.append(f"Step '{sid}' has a self-referencing cycle in depends_on.")

    if errors:
        return errors

    # Kahn's algorithm: an edge dep -> sid means "sid must run after dep".
    in_degree: dict[str, int] = dict.fromkeys(step_ids, 0)
    adjacency: dict[str, list[str]] = {sid: [] for sid in step_ids}

    for sid, deps in nodes:
        for dep in deps:
            if dep in step_ids:
                adjacency[dep].append(sid)
                in_degree[sid] += 1

    # Repeatedly remove nodes that have no unmet dependencies.
    queue: deque[str] = deque(node for node, deg in in_degree.items() if deg == 0)
    visited = 0
    while queue:
        node = queue.popleft()
        visited += 1
        for neighbor in adjacency[node]:
            in_degree[neighbor] -= 1
            if in_degree[neighbor] == 0:
                queue.append(neighbor)

    # Any node never reaching in-degree 0 sits on (or behind) a cycle.
    if visited != len(step_ids):
        errors.append(
            "Pipeline steps contain a circular dependency cycle (Kahn's algorithm detected a cycle)."
        )

    return errors


def _collect_all_strings(obj: object, strings: list[str]) -> None:
    """Append every string value found inside *obj* to *strings*.

    Walks dicts (values only, not keys) and lists recursively, depth first,
    in their natural iteration order. Any other type is ignored. The result
    is accumulated in place; nothing is returned.
    """
    if isinstance(obj, str):
        strings.append(obj)
        return

    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        # Numbers, None, tuples, etc. hold nothing to collect.
        return

    for child in children:
        _collect_all_strings(child, strings)


def _is_within_workspace(source: str) -> bool:
    """Return True if *source*, joined onto WORKSPACE_ROOT, stays inside it.

    The check is purely lexical (``os.path.normpath``); symlinks are not
    resolved.
    """
    root = os.path.normpath(WORKSPACE_ROOT)
    resolved = os.path.normpath(os.path.join(root, source))
    # Compare on a path-component boundary. A bare startswith(root) would also
    # accept sibling directories such as "<root>_other/...".
    return resolved == root or resolved.startswith(root.rstrip(os.sep) + os.sep)


def validate_pipeline(pipeline: dict) -> list[str]:
    """Validate a pipeline dict (as loaded from YAML) and return error messages.

    Checks performed, in reporting order:
      1. ``schema_version`` is present and supported.
      2. ``gates`` is a non-empty list.
      3. ``token_budget`` is a positive number (booleans and NaN are rejected).
      4. ``blast_radius``, when present, is one of the allowed values.
      5. ``allowed_teams`` is a non-empty list.
      7. No string value anywhere in the pipeline matches a secret pattern.
      6. Step dependencies contain no cycle (via ``validate_dag``).
      8. Each step's ``target_team`` is listed in ``allowed_teams``.
      9. Each ``depends_on`` entry names an existing step id.
      10. Each step's ``task_desc`` passes ``check_content``.
      11. Each step's ``inject_context.source`` stays under WORKSPACE_ROOT.

    Malformed structure (non-mapping pipeline, non-list ``steps``, non-mapping
    step entries) is reported as an error instead of raising. A null ``steps``
    or ``depends_on`` is treated as empty.

    Args:
        pipeline: The parsed pipeline document.

    Returns:
        A list of human-readable error strings; an empty list means valid.
    """
    # An empty YAML document loads as None; report instead of crashing on .get().
    if not isinstance(pipeline, dict):
        return [f"Pipeline must be a mapping, got: {type(pipeline).__name__}."]

    errors: list[str] = []

    # 1. schema_version
    schema_version = pipeline.get("schema_version")
    if schema_version is None:
        errors.append("Missing required field: schema_version.")
    elif schema_version not in _SUPPORTED_SCHEMA_VERSIONS:
        errors.append(
            f"Unsupported schema_version: '{schema_version}'. Supported: {_SUPPORTED_SCHEMA_VERSIONS}."
        )

    # 2. gates
    gates = pipeline.get("gates")
    if gates is None:
        errors.append("Missing required field: gates (must have at least one gate).")
    elif not isinstance(gates, list) or len(gates) == 0:
        errors.append("Field gates must be a non-empty list (at least one gate required).")

    # 3. token_budget
    token_budget = pipeline.get("token_budget")
    if token_budget is None:
        errors.append("Missing required field: token_budget.")
    elif (
        # bool is a subclass of int, so `token_budget: true` must be excluded explicitly.
        isinstance(token_budget, bool)
        or not isinstance(token_budget, (int, float))
        # `not x > 0` (rather than `x <= 0`) also rejects NaN.
        or not token_budget > 0
    ):
        errors.append(f"Field token_budget must be a positive number, got: {token_budget!r}.")

    # 4. blast_radius (optional; validated only when present)
    blast_radius = pipeline.get("blast_radius")
    if blast_radius is not None and blast_radius not in _VALID_BLAST_RADIUS:
        errors.append(
            f"Invalid blast_radius: '{blast_radius}'. Allowed values: {sorted(_VALID_BLAST_RADIUS)}."
        )

    # 5. allowed_teams
    allowed_teams = pipeline.get("allowed_teams", [])
    if not isinstance(allowed_teams, list) or len(allowed_teams) == 0:
        errors.append("Field allowed_teams must be a non-empty list.")

    allowed_teams_set = set(allowed_teams) if isinstance(allowed_teams, list) else set()

    # Normalise steps: null -> empty, wrong container -> error, and keep only
    # mapping entries so the per-step checks below can rely on .get().
    raw_steps = pipeline.get("steps")
    if raw_steps is None:
        raw_steps = []
    elif not isinstance(raw_steps, (list, tuple)):
        errors.append(f"Field steps must be a list, got: {type(raw_steps).__name__}.")
        raw_steps = []

    steps: list[dict] = []
    for index, entry in enumerate(raw_steps):
        if isinstance(entry, dict):
            steps.append(entry)
        else:
            errors.append(f"Step at index {index} must be a mapping, got: {type(entry).__name__}.")

    step_ids = {s["id"] for s in steps if "id" in s}

    # 7. Secret scan over every string value in the whole pipeline.
    # NOTE(review): the message echoes up to 80 characters of the matching
    # text, which may include the secret itself - confirm this is acceptable
    # wherever these errors are logged or displayed.
    all_strings: list[str] = []
    _collect_all_strings(pipeline, all_strings)
    for text in all_strings:
        if scan_secrets(text):
            errors.append(
                f"Secret pattern (secret/key) detected in pipeline content: {text[:80]!r}."
            )
            break  # one report is enough

    # 6. Cycle detection. Only id-bearing steps can take part in a cycle; pass
    # them with a normalised depends_on so null values cannot break the check.
    dag_steps = [
        {"id": s["id"], "depends_on": s.get("depends_on") or []}
        for s in steps
        if "id" in s
    ]
    if dag_steps:
        errors.extend(validate_dag(dag_steps))

    for step in steps:
        sid = step.get("id", "<unknown>")

        # 8. target_team must be one of allowed_teams
        target_team = step.get("target_team")
        if target_team is not None and target_team not in allowed_teams_set:
            errors.append(
                f"Step '{sid}': target_team '{target_team}' is not in allowed_teams {sorted(allowed_teams_set)}."
            )

        # 9. every depends_on entry must reference a known step id
        for dep in step.get("depends_on") or []:
            if dep not in step_ids:
                errors.append(f"Step '{sid}': depends_on references unknown step id '{dep}'.")

        # 10. task_desc must pass the injection guard
        task_desc = step.get("task_desc")
        if task_desc is not None:
            try:
                check_content(str(task_desc))
            except InjectionBlockedError as exc:
                errors.append(f"Step '{sid}': task_desc failed injection guard check: {exc}.")

        # 11. inject_context.source must not escape WORKSPACE_ROOT
        inject_context = step.get("inject_context")
        if isinstance(inject_context, dict):
            source = inject_context.get("source", "")
            if ".." in str(source):
                errors.append(
                    f"Step '{sid}': inject_context.source contains path traversal sequence '..': {source!r}."
                )
            elif not _is_within_workspace(str(source)):
                # Reached by absolute paths, which os.path.join lets override the root.
                errors.append(
                    f"Step '{sid}': inject_context.source path '{source}' is outside WORKSPACE_ROOT (path traversal rejected)."
                )

    return errors
