#!/usr/bin/env python3
"""
extract-changed-files.py - 작업 보고서에서 변경 파일 목록 자동 추출

팀 작업 완료 보고서(markdown)에서 '생성/수정 파일 목록' 섹션을 파싱하여
project-map.py --incremental --changed-files에 전달 가능한 형식으로 출력합니다.

Usage:
    python3 extract-changed-files.py /path/to/report.md
    python3 extract-changed-files.py /path/to/report.md --project insuwiki --auto-update

출력 예시:
    {"changed_files": ["functions/src/crawlYoutubeChannels.ts", "functions/src/whisperStt.ts"], "deleted_files": []}
"""

import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path

# Heading patterns (markdown levels 1-4) that mark the start of the
# changed-files section; matched case-insensitively by find_section_start.
SECTION_HEADER_PATTERNS = [
    r"^#{1,4}\s+생성/수정\s*파일\s*목록",
    r"^#{1,4}\s+변경\s*파일",
    r"^#{1,4}\s+수정\s*파일",
    r"^#{1,4}\s+생성.*파일",
    r"^#{1,4}\s+Modified\s+Files",
    r"^#{1,4}\s+Changed\s+Files",
    r"^#{1,4}\s+Created.*Files",
]

# Keyword patterns identifying a "delete" change type.
# They are unanchored, so they match anywhere inside the change-type text.
DELETE_TYPE_PATTERNS = [
    r"삭제",
    r"removed?",
    r"deleted?",
]

# Any level 1-4 heading: signals the next section, i.e. the end of the
# file-list section.
NEXT_SECTION_PATTERN = r"^#{1,4}\s+"

# Table row: | file path | change type | ... |
# Group 1 is the first cell, group 2 is everything after the first cell's
# closing pipe.
TABLE_ROW_PATTERN = re.compile(r"^\|\s*([^|]+?)\s*\|(.+)$")

# List item: "- path" or "* path" at the start of the line, with the path
# optionally wrapped in backticks. Group 2 is the path.
LIST_ITEM_PATTERN = re.compile(r"^[-*]\s+(`)?([^\s`|]+\.[a-zA-Z0-9_]+[^\s`]*)\1?")

# Code span: `path` (the span content must contain a dot).
CODE_SPAN_PATTERN = re.compile(r"`([^`]+\.[a-zA-Z0-9_]+[^`]*)`")

# Something that looks like a file path (with an extension).
# NOTE(review): not referenced by any function in the visible part of this file.
FILE_PATH_PATTERN = re.compile(
    r"(?:^|[\s`|])([a-zA-Z0-9가-힣_./-]+\.[a-zA-Z0-9]{1,10})(?:[\s`|]|$)"
)

# Table header/body separator row (e.g. |---|:---:|), which is skipped.
TABLE_SEPARATOR_PATTERN = re.compile(r"^\|[-| :]+\|")

# Whitelist of file extensions (typical source/config file extensions).
VALID_EXTENSIONS = {
    "ts", "tsx", "js", "jsx", "py", "sh", "md", "json", "yaml", "yml",
    "html", "css", "scss", "sass", "sql", "env", "toml", "cfg", "ini",
    "rs", "go", "java", "kt", "rb", "php", "swift", "c", "cpp", "h",
    "lock", "txt",
}


def is_delete_action(type_str: str) -> bool:
    """Return True when a change-type label denotes a deletion.

    The label is trimmed and lower-cased, then searched case-insensitively
    for every regex in DELETE_TYPE_PATTERNS; a single hit is enough.
    Empty or otherwise falsy labels are never treated as deletions.
    """
    if not type_str:
        return False
    label = type_str.strip().lower()
    return any(
        re.search(keyword, label, re.IGNORECASE) is not None
        for keyword in DELETE_TYPE_PATTERNS
    )


def looks_like_file_path(s: str) -> bool:
    """Heuristically decide whether a string looks like a local file path.

    Surrounding whitespace and backticks are ignored. A candidate is accepted
    when either:
      * the text after its last dot is one of VALID_EXTENSIONS (a trailing
        ")" on the extension is tolerated), or
      * it contains a "/" separator (directory-like paths without an
        extension are allowed).

    Anything that starts with a URL scheme ("scheme://") is rejected up front.
    Previously a URL was only rejected in the slash branch, so a URL ending in
    a whitelisted extension (e.g. "https://host/readme.md") was accepted, and
    the bare ``startswith("http")`` test also rejected local directories such
    as "http-client/src".

    Args:
        s: Raw candidate text (may still carry backticks/whitespace).

    Returns:
        True if the candidate looks like a file path, False otherwise.
    """
    candidate = s.strip().strip("`").strip()
    if not candidate:
        return False

    # URLs are never local paths, regardless of how they end.
    if re.match(r"[a-zA-Z][a-zA-Z0-9+.-]*://", candidate):
        return False

    # Needs at least a path separator or an extension dot.
    if "." not in candidate and "/" not in candidate:
        return False

    # Extension check: text after the last dot, minus a trailing ")".
    _, dot, ext = candidate.rpartition(".")
    if dot and ext.lower().rstrip(")") in VALID_EXTENSIONS:
        return True

    # Slash-separated path: accepted even without a known extension.
    return "/" in candidate


def clean_path(s: str) -> str:
    """Normalize a raw path string taken from a markdown report.

    Removes surrounding whitespace, surrounding backticks (code-span markers)
    and any trailing run of ``, ; : ( )`` characters.

    The clean-up is repeated until nothing changes. A single pass in the
    order "backticks, then punctuation" left a dangling backtick behind for
    inputs such as "`src/a.ts`," (result was "src/a.ts`").

    Args:
        s: Raw path text.

    Returns:
        The cleaned path; may be an empty string.
    """
    cleaned = s
    previous = None
    # Every pass can only shorten the string, so this terminates.
    while cleaned != previous:
        previous = cleaned
        cleaned = cleaned.strip().strip("`")
        cleaned = re.sub(r"[,;:()]+$", "", cleaned)
    return cleaned


def find_section_start(lines: list) -> int:
    """Locate the heading line that opens the file-list section.

    Each line is tested, in order, against every regex in
    SECTION_HEADER_PATTERNS (case-insensitive search).

    Returns:
        The 0-based index of the first matching line, or -1 if none matches.
    """
    hits = (
        index
        for index, text in enumerate(lines)
        if any(
            re.search(header, text, re.IGNORECASE)
            for header in SECTION_HEADER_PATTERNS
        )
    )
    return next(hits, -1)


def extract_from_table(lines: list, section_start: int) -> tuple:
    """Extract file paths from a markdown table inside the file-list section.

    Rows are read from the line after ``section_start`` until the next
    heading (NEXT_SECTION_PATTERN). The line immediately following the
    section heading is never treated as a terminating heading.

    For each table row the first cell supplies the path and the second cell
    supplies the change type:
      * Code spans inside the first cell are tried before the whole cell, so
        a cell such as "`src/a.ts` (new)" yields "src/a.ts" rather than the
        cell text with its annotation.
      * The change type is taken positionally from the second cell. Empty
        cells are not skipped, so an empty change type does not cause the
        description column to be read as the action.

    Args:
        lines: All lines of the report.
        section_start: Index of the section heading line.

    Returns:
        (changed_files, deleted_files), each a de-duplicated list in
        first-seen order.
    """
    changed = []
    deleted = []

    for i in range(section_start + 1, len(lines)):
        line = lines[i]

        # Next section heading -> stop.
        if i > section_start + 1 and re.match(NEXT_SECTION_PATTERN, line):
            break

        # Skip the |---|---| separator row.
        if TABLE_SEPARATOR_PATTERN.match(line):
            continue

        m = TABLE_ROW_PATTERN.match(line)
        if not m:
            continue

        raw_path = m.group(1).strip()
        rest = m.group(2)

        # Prefer explicit code spans; fall back to the whole cell.
        candidates = [
            clean_path(span) for span in CODE_SPAN_PATTERN.findall(raw_path)
        ]
        candidates.append(clean_path(raw_path))
        path = next((c for c in candidates if looks_like_file_path(c)), None)
        if path is None:
            # Header rows and non-path cells end up here.
            continue

        # `rest` starts right after the first cell's closing pipe, so the
        # text before its first "|" is exactly the second column.
        action_col = rest.split("|", 1)[0].strip()

        bucket = deleted if is_delete_action(action_col) else changed
        if path not in bucket:
            bucket.append(path)

    return changed, deleted


def extract_from_list(lines: list, section_start: int) -> tuple:
    """Extract file paths from a bullet list inside the file-list section.

    Lines are read from the line after ``section_start`` until the next
    heading (NEXT_SECTION_PATTERN). The line immediately following the
    section heading is never treated as a terminating heading.

    Two sources are recognized per line:
      * a top-level list item matched by LIST_ITEM_PATTERN, or otherwise
      * every code span on the line matched by CODE_SPAN_PATTERN.

    Deletion detection: previously every path was reported as changed and the
    deleted list was always empty. A path is now classified as deleted when
    the text that follows it (up to the next code span or end of line)
    contains a short tag in parentheses or square brackets, at most two
    words, that is_delete_action recognizes, e.g. "(삭제)" or "[deleted]".
    Longer bracketed descriptions are deliberately ignored to avoid
    misclassifying a modified file whose description merely mentions a
    removal.

    Args:
        lines: All lines of the report.
        section_start: Index of the section heading line.

    Returns:
        (changed_files, deleted_files), each a de-duplicated list in
        first-seen order.
    """
    changed = []
    deleted = []

    def _record(path: str, annotation: str) -> None:
        """Store `path` in the bucket chosen by the tag in `annotation`."""
        if not looks_like_file_path(path):
            return
        tags = re.findall(r"[(\[]([^()\[\]]*)[)\]]", annotation)
        is_deleted = any(
            len(tag.split()) <= 2 and is_delete_action(tag) for tag in tags
        )
        bucket = deleted if is_deleted else changed
        if path not in bucket:
            bucket.append(path)

    for i in range(section_start + 1, len(lines)):
        line = lines[i]

        # Next section heading -> stop.
        if i > section_start + 1 and re.match(NEXT_SECTION_PATTERN, line):
            break

        # Top-level list item: "- path" / "* path".
        m = LIST_ITEM_PATTERN.match(line)
        if m:
            _record(clean_path(m.group(2)), line[m.end():])
            continue

        # Otherwise fall back to code spans; each span's annotation is the
        # text between it and the next span (or the end of the line).
        spans = list(CODE_SPAN_PATTERN.finditer(line))
        for idx, span in enumerate(spans):
            if idx + 1 < len(spans):
                annotation_end = spans[idx + 1].start()
            else:
                annotation_end = len(line)
            _record(clean_path(span.group(1)), line[span.end():annotation_end])

    return changed, deleted


def parse_report(report_path: str) -> dict:
    """Parse a markdown report and return its changed/deleted file lists.

    Steps:
      1. Read the file as UTF-8. A leading BOM is dropped ("utf-8-sig") so a
         section heading on the very first line still matches the "^#"
         patterns.
      2. Find the file-list section heading. If there is none, scan the whole
         document for code spans that look like file paths and attach a
         "warning" entry.
      3. Otherwise inspect up to nine lines after the heading: if any is a
         table row (and not a separator row), parse the section as a table,
         else as a list.

    Args:
        report_path: Path to the markdown report.

    Returns:
        A dict with "changed_files" and "deleted_files" lists. On failure to
        read the file (missing, I/O error, or not valid UTF-8) both lists are
        empty and an "error" message is included instead of raising.
    """
    path = Path(report_path)
    if not path.exists():
        return {"error": f"파일 없음: {report_path}", "changed_files": [], "deleted_files": []}

    try:
        content = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an OSError; without catching it a
        # non-UTF-8 report crashed instead of producing the JSON error.
        return {"error": str(e), "changed_files": [], "deleted_files": []}

    lines = content.splitlines()

    section_start = find_section_start(lines)
    if section_start == -1:
        # No section heading: fall back to scanning every code span.
        changed = []
        for line in lines:
            for span in CODE_SPAN_PATTERN.findall(line):
                p = clean_path(span)
                if looks_like_file_path(p) and p not in changed:
                    changed.append(p)
        return {"changed_files": changed, "deleted_files": [], "warning": "파일 목록 섹션을 찾지 못해 전체 파일 스캔"}

    # Decide between table and list format from the first lines of the section.
    probe = lines[section_start + 1:section_start + 10]
    has_table = any(
        TABLE_ROW_PATTERN.match(candidate)
        and not TABLE_SEPARATOR_PATTERN.match(candidate)
        for candidate in probe
    )

    if has_table:
        changed, deleted = extract_from_table(lines, section_start)
    else:
        changed, deleted = extract_from_list(lines, section_start)

    return {"changed_files": changed, "deleted_files": deleted}


def run_auto_update(report_path: str, project_id: str, result: dict) -> dict:
    """Run project-map.py in incremental mode for the --auto-update option.

    The workspace root comes from the WORKSPACE_ROOT environment variable,
    falling back to the grandparent directory of this script. The child
    process is started with the current interpreter and a 120 second timeout.
    ``report_path`` is accepted but not used by this function.

    Args:
        report_path: Path of the parsed report (unused here).
        project_id: Project directory name under ``<workspace>/projects``.
        result: Parse result holding "changed_files" / "deleted_files".

    Returns:
        On a completed run: status ("ok" when the return code is 0, else
        "error"), returncode, stripped stdout/stderr and the space-joined
        command. Otherwise a dict with status "error" and a message (script
        missing, project path missing, timeout, or OS-level launch failure).
    """
    changed_files = result.get("changed_files", [])
    deleted_files = result.get("deleted_files", [])

    fallback_root = str(Path(__file__).resolve().parent.parent)
    workspace = os.environ.get("WORKSPACE_ROOT", fallback_root)

    project_map_script = f"{workspace}/scripts/project-map.py"
    if not Path(project_map_script).exists():
        return {"status": "error", "message": f"project-map.py 없음: {project_map_script}"}

    project_path = f"{workspace}/projects/{project_id}"
    if not Path(project_path).exists():
        return {"status": "error", "message": f"프로젝트 경로 없음: {project_path}"}

    # Make sure the directory for the generated map exists.
    map_output = f"{workspace}/memory/project-maps/{project_id}.md"
    Path(map_output).parent.mkdir(parents=True, exist_ok=True)

    cmd = [sys.executable, project_map_script, project_path]
    cmd.extend(["--output", map_output, "--incremental"])

    # Optional flags are only passed when there is something to pass.
    if changed_files:
        cmd.extend(["--changed-files", ",".join(changed_files)])
    if deleted_files:
        cmd.extend(["--deleted-files", ",".join(deleted_files)])
    drive_log = f"{workspace}/memory/drive-changes/{project_id}.jsonl"
    if Path(drive_log).exists():
        cmd.extend(["--drive-log", drive_log])

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        return {"status": "error", "message": "project-map.py 실행 타임아웃 (120초)"}
    except OSError as e:
        return {"status": "error", "message": f"project-map.py 실행 실패: {e}"}

    status = "error" if proc.returncode != 0 else "ok"
    return {
        "status": status,
        "returncode": proc.returncode,
        "stdout": proc.stdout.strip(),
        "stderr": proc.stderr.strip(),
        "cmd": " ".join(cmd),
    }


def parse_args():
    """Parse the command line and return the resulting namespace.

    Arguments:
        report_path    positional; path of the report file.
        --project      optional project ID (defaults to None).
        --auto-update  boolean flag (defaults to False).
    """
    cli = argparse.ArgumentParser(description="작업 보고서에서 변경 파일 목록 자동 추출")
    cli.add_argument("report_path", help="보고서 파일 경로 (.md)")
    cli.add_argument("--project", help="프로젝트 ID (--auto-update와 함께 사용)")
    cli.add_argument(
        "--auto-update",
        action="store_true",
        help="project-map.py --incremental 자동 실행",
    )
    return cli.parse_args()


def main():
    """CLI entry point: parse the report, optionally auto-update, print JSON.

    Output keys, in order: changed_files, deleted_files, then auto_update
    (only with --auto-update), then warning / error when the parse result
    carries them. Exits with status 1 when --auto-update is given without
    --project, or when parsing reported an error and found no changed files.
    """
    args = parse_args()

    # Parse the report first; the file lists are echoed even on CLI misuse.
    result = parse_report(args.report_path)
    file_lists = {
        "changed_files": result.get("changed_files", []),
        "deleted_files": result.get("deleted_files", []),
    }

    if args.auto_update and not args.project:
        failure = {
            "status": "error",
            "message": "--auto-update 사용 시 --project 옵션 필수",
            **file_lists,
        }
        print(json.dumps(failure, ensure_ascii=False, indent=2))
        sys.exit(1)

    output = dict(file_lists)
    if args.auto_update:
        output["auto_update"] = run_auto_update(args.report_path, args.project, result)

    # Carry over diagnostics from the parse step when present.
    for key in ("warning", "error"):
        if result.get(key):
            output[key] = result[key]

    print(json.dumps(output, ensure_ascii=False, indent=2))

    if result.get("error") and not result.get("changed_files"):
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()