"""
통합 테스트: utils/sanitize_gate.py
task-1837_5.1 - 닌기르수 작성
"""

import sys

sys.path.insert(0, "/home/jay/workspace")

from utils.sanitize_gate import (
    SANITIZE_PATTERNS,
    generate_sanitize_report,
    sanitize_file_content,
    sanitize_text,
    should_sanitize,
)


# ── 1. sanitize_text — 개별 PII 마스킹 ──────────────────────────────────────

def test_sanitize_text_masks_rrn():
    """A resident registration number (6 digits-7 digits) must become [RRN-REDACTED]."""
    source = "홍길동 주민번호: 901225-1234567 입니다."
    result, found = sanitize_text(source)

    # The raw number must be gone and the placeholder present.
    assert "901225-1234567" not in result
    assert "[RRN-REDACTED]" in result
    # Exactly one detection, and it is classified as an RRN.
    assert [hit["type"] for hit in found] == ["rrn"]


def test_sanitize_text_masks_phone():
    """A phone number (010-xxxx-xxxx) must become [PHONE-REDACTED]."""
    source = "연락처: 010-1234-5678 로 문의하세요."
    result, found = sanitize_text(source)

    assert "010-1234-5678" not in result
    assert "[PHONE-REDACTED]" in result
    # At least one detection must be reported with the phone type.
    assert "phone" in [hit["type"] for hit in found]


def test_sanitize_text_masks_email():
    """An e-mail address must become [EMAIL-REDACTED]."""
    source = "이메일: user@example.com 로 보내세요."
    result, found = sanitize_text(source)

    assert "user@example.com" not in result
    assert "[EMAIL-REDACTED]" in result
    # At least one detection must be reported with the email type.
    assert "email" in [hit["type"] for hit in found]


def test_sanitize_text_masks_apikey_sk():
    """An API key with the sk- prefix must become [APIKEY-REDACTED]."""
    source = "API 키: sk-abcdefghij1234567890 사용"
    result, found = sanitize_text(source)

    assert "sk-abcdefghij1234567890" not in result
    assert "[APIKEY-REDACTED]" in result
    # At least one detection must be reported with the apikey type.
    assert "apikey" in [hit["type"] for hit in found]


def test_sanitize_text_masks_account():
    """A bank account number (3-4 digits, 6 digits, 4 digits) must become [ACCOUNT-REDACTED]."""
    source = "계좌번호: 123-456789-1234 로 입금하세요."
    result, found = sanitize_text(source)

    assert "123-456789-1234" not in result
    assert "[ACCOUNT-REDACTED]" in result
    # At least one detection must be reported with the account type.
    assert "account" in [hit["type"] for hit in found]


def test_sanitize_text_masks_policy():
    """An insurance policy number (uppercase letters + digits) must become [POLICY-REDACTED]."""
    source = "증권번호: AB123456789 확인 바랍니다."
    result, found = sanitize_text(source)

    assert "AB123456789" not in result
    assert "[POLICY-REDACTED]" in result
    # At least one detection must be reported with the policy type.
    assert "policy" in [hit["type"] for hit in found]


def test_sanitize_text_mixed_pii_all_masked():
    """Text mixing several kinds of PII must have every sensitive value masked."""
    source = (
        "고객 정보:\n"
        "이름: 홍길동\n"
        "주민번호: 901225-1234567\n"
        "전화번호: 010-9876-5432\n"
        "이메일: hong@test.co.kr\n"
        "계좌번호: 110-123456-78901\n"
        "API 키: sk-MySecretKey12345678\n"
    )
    result, found = sanitize_text(source)

    # None of the raw sensitive values may survive in the output.
    for secret in (
        "901225-1234567",
        "010-9876-5432",
        "hong@test.co.kr",
        "110-123456-78901",
        "sk-MySecretKey12345678",
    ):
        assert secret not in result

    # At least 4 distinct PII types must be reported.
    detected_types = {hit["type"] for hit in found}
    assert len(detected_types) >= 4, f"감지된 유형: {detected_types}"


def test_sanitize_text_no_pii_returns_empty_detections():
    """Plain text without PII is returned unchanged with an empty detection list."""
    source = "이 텍스트에는 개인정보가 없습니다. 안녕하세요!"
    result, found = sanitize_text(source)

    assert found == []
    assert result == source


def test_sanitize_text_detection_record_structure():
    """Each detection record carries type, description, original and replacement keys."""
    _, found = sanitize_text("이메일: admin@corp.com")
    assert len(found) == 1

    record = found[0]
    # All four documented keys must be present on the record.
    for field in ("type", "description", "original", "replacement"):
        assert field in record

    # The record pairs the matched text with the placeholder that replaced it.
    assert record["original"] == "admin@corp.com"
    assert record["replacement"] == "[EMAIL-REDACTED]"


# ── 2. sanitize_file_content — 파일 읽기 + 마스킹 ───────────────────────────

def test_sanitize_file_content_masks_rrn_in_file(tmp_path):
    """A file under tmp_path containing an RRN is read and the RRN is masked."""
    pii_file = tmp_path / "test_pii.txt"
    pii_file.write_text("주민번호: 850101-2345678\n", encoding="utf-8")

    result, found = sanitize_file_content(str(pii_file))

    assert "850101-2345678" not in result
    assert "[RRN-REDACTED]" in result
    assert len(found) == 1


def test_sanitize_file_content_handles_multiple_pii(tmp_path):
    """A file holding several PII values has all of them masked."""
    pii_file = tmp_path / "multi_pii.txt"
    pii_file.write_text(
        "전화: 011-234-5678\n이메일: test@mail.com\n",
        encoding="utf-8",
    )

    result, found = sanitize_file_content(str(pii_file))

    for secret in ("011-234-5678", "test@mail.com"):
        assert secret not in result
    # One detection per value at minimum.
    assert len(found) >= 2


def test_sanitize_file_content_raises_on_missing_file(tmp_path):
    """A non-existent path must raise FileNotFoundError (any OSError is accepted)."""
    import pytest

    absent_path = tmp_path / "nonexistent.txt"
    # FileNotFoundError is a subclass of OSError, so OSError alone accepts
    # exactly the same exceptions as the (FileNotFoundError, OSError) pair.
    with pytest.raises(OSError):
        sanitize_file_content(str(absent_path))


# ── 3. should_sanitize — 레벨 판단 ──────────────────────────────────────────

def test_should_sanitize_below_3_returns_false():
    """Levels below 3 must yield False."""
    for level in (0, 1, 2):
        assert should_sanitize(level) is False


def test_should_sanitize_3_and_above_returns_true():
    """Levels of 3 or higher must yield True."""
    for level in (3, 4, 10):
        assert should_sanitize(level) is True


def test_should_sanitize_boundary_value():
    """Boundary check: level 2 yields False and level 3 yields True."""
    just_below = should_sanitize(2)
    assert just_below is False

    at_threshold = should_sanitize(3)
    assert at_threshold is True


# ── 4. generate_sanitize_report — 리포트 생성 ───────────────────────────────

def test_generate_sanitize_report_empty_detections():
    """With no detections the report must contain the '감지된 PII 없음' notice."""
    no_detections = []
    assert "감지된 PII 없음" in generate_sanitize_report(no_detections)


def test_generate_sanitize_report_with_detections_contains_table():
    """With detections the report must contain a markdown table, the type and the count."""
    email_hit = {
        "type": "email",
        "description": "이메일",
        "original": "test@example.com",
        "replacement": "[EMAIL-REDACTED]",
    }
    report = generate_sanitize_report([email_hit])

    # Table delimiter, detected type name, and the "1 item" count marker.
    for fragment in ("|", "email", "1건"):
        assert fragment in report


def test_generate_sanitize_report_original_partially_hidden():
    """The report shows the first 3 characters of the original but never the full value."""
    rrn_hit = {
        "type": "rrn",
        "description": "주민등록번호",
        "original": "901225-1234567",
        "replacement": "[RRN-REDACTED]",
    }
    report = generate_sanitize_report([rrn_hit])

    # The leading "901" stays visible ...
    assert "901" in report
    # ... but the complete registration number must not be exposed.
    assert "901225-1234567" not in report


def test_generate_sanitize_report_multiple_detections_count():
    """With several detections the total count must appear in the report."""
    email_hit = {
        "type": "email",
        "description": "이메일",
        "original": "a@b.com",
        "replacement": "[EMAIL-REDACTED]",
    }
    phone_hit = {
        "type": "phone",
        "description": "전화번호",
        "original": "010-1234-5678",
        "replacement": "[PHONE-REDACTED]",
    }

    # Two records in, so the report must show the "2 items" marker.
    assert "2건" in generate_sanitize_report([email_hit, phone_hit])


# ── 5. SANITIZE_PATTERNS — 패턴 구조 확인 ───────────────────────────────────

def test_sanitize_patterns_contains_all_required_keys():
    """SANITIZE_PATTERNS must define all six required pattern names."""
    required = {"rrn", "phone", "email", "apikey", "account", "policy"}
    defined = set(SANITIZE_PATTERNS.keys())
    # Extra patterns are allowed; only the required ones must all be present.
    assert required <= defined


def test_sanitize_patterns_each_has_pattern_replacement_description():
    """Every entry must have pattern, replacement and description keys, with a compiled pattern."""
    import re

    for name in SANITIZE_PATTERNS:
        spec = SANITIZE_PATTERNS[name]
        assert "pattern" in spec, f"{name} 에 pattern 없음"
        assert "replacement" in spec, f"{name} 에 replacement 없음"
        assert "description" in spec, f"{name} 에 description 없음"
        # The pattern must be pre-compiled, not a raw string.
        compiled = spec["pattern"]
        assert isinstance(compiled, re.Pattern), f"{name} pattern이 re.Pattern이 아님"