#!/usr/bin/env python3
"""Autoresearch core runner - automatic optimization of skill prompts.

Applies Karpathy's autoresearch methodology:
change exactly one thing -> test -> keep it if it is no worse, roll it back
if it is worse -> stop once the target has been reached.
"""

import argparse
import contextlib
import datetime
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, cast

# sys.path setup: add the scripts directory (e.g. /home/jay/workspace/scripts,
# the parent of this package) so that modules inside the autoresearch package
# can import one another.
_SCRIPTS_DIR = str(Path(__file__).resolve().parent.parent)
if _SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, _SCRIPTS_DIR)

_WORKSPACE_ROOT = os.environ.get(
    "WORKSPACE_ROOT", str(Path(__file__).resolve().parent.parent.parent)
)

from autoresearch.changelog import (  # noqa: E402
    add_round,
    create_log,
    finalize_log,
    get_recent_changelog,
    save_log,
)
from autoresearch.judge import judge_output, load_checklist  # noqa: E402
from autoresearch.mutator import generate_mutation  # noqa: E402
from autoresearch.skill_executor import execute_skill, load_skill  # noqa: E402


def _write_skill_file(skill_name: str, body: str, skills_dir: str, frontmatter: str) -> None:
    """Write ``body`` to ``<skills_dir>/<skill_name>/SKILL.md``.

    When ``frontmatter`` is non-empty it is re-wrapped in ``---`` delimiters
    and placed in front of the body; otherwise the body is written alone.
    """
    skill_path = Path(skills_dir) / skill_name / "SKILL.md"
    content = f"---\n{frontmatter}\n---\n{body}" if frontmatter else body
    skill_path.write_text(content, encoding="utf-8")


def _token_count(result: dict[str, Any], key: str) -> int:
    """Return ``result[key]`` as an int, treating a missing or None value as 0."""
    return int(result.get(key) or 0)


def _resolve_inputs(test_input: str | None, test_inputs: list[str] | None) -> list[str]:
    """Normalize the single/multi input arguments into one non-empty list.

    ``test_inputs`` takes precedence when both are supplied.

    Raises:
        ValueError: If neither argument is given, or the list is empty.
    """
    if test_inputs is not None:
        resolved = test_inputs
    elif test_input is not None:
        resolved = [test_input]
    else:
        raise ValueError("test_input 또는 test_inputs 중 하나를 제공해야 합니다.")
    if not resolved:
        # An empty list would score 0.0 everywhere, so every mutation would be
        # "kept" (0.0 >= 0.0) without ever having been evaluated.
        raise ValueError("test_inputs 리스트가 비어 있습니다.")
    return resolved


def backup_original(skill_name: str, skills_dir: str) -> str:
    """Back up the original SKILL.md.

    The copy is written to ``<skills_dir>/<skill_name>/evals/backup-original.md``;
    the ``evals`` directory is created if it does not exist.

    Returns:
        Path of the backup file (absolute path string).
    """
    skill_path = Path(skills_dir) / skill_name / "SKILL.md"
    evals_dir = Path(skills_dir) / skill_name / "evals"
    evals_dir.mkdir(parents=True, exist_ok=True)
    backup_path = evals_dir / "backup-original.md"
    shutil.copy2(str(skill_path), str(backup_path))
    return str(backup_path.resolve())


def restore_skill(skill_name: str, backup_content: str, skills_dir: str, frontmatter: str) -> None:
    """Restore SKILL.md to the given body (frontmatter preserved).

    Args:
        skill_name: Skill name.
        backup_content: Body text to restore.
        skills_dir: Skills directory path.
        frontmatter: YAML frontmatter string (without the ``---`` delimiters).
    """
    _write_skill_file(skill_name, backup_content, skills_dir, frontmatter)


def apply_mutation(skill_name: str, mutated_body: str, skills_dir: str, frontmatter: str) -> None:
    """Update SKILL.md with the mutated body (frontmatter preserved).

    Args:
        skill_name: Skill name.
        mutated_body: Mutated markdown body.
        skills_dir: Skills directory path.
        frontmatter: YAML frontmatter string (without the ``---`` delimiters).
    """
    _write_skill_file(skill_name, mutated_body, skills_dir, frontmatter)


def load_test_inputs(file_path: str) -> list[dict[str, Any]]:
    """Load the list of test inputs from a YAML file.

    Args:
        file_path: Path to the test-inputs.yaml file.

    Returns:
        List of input items (each item: {"id": ..., "text": ...}).

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the top level is not a mapping or the ``inputs`` list
            is empty.
    """
    import yaml as _yaml

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"테스트 입력 파일이 없습니다: {file_path}")

    with path.open(encoding="utf-8") as f:
        data = _yaml.safe_load(f)

    # A top-level list or scalar has no .get(); report it as a format error
    # instead of letting an AttributeError escape.
    if data is not None and not isinstance(data, dict):
        raise ValueError(f"YAML 최상위는 매핑이어야 합니다: {file_path}")

    inputs: list[dict[str, Any]] = data.get("inputs", []) if data else []
    if not inputs:
        raise ValueError(f"inputs 리스트가 비어 있습니다: {file_path}")
    return inputs


def run_round(
    skill_name: str,
    current_body: str,
    frontmatter: str,
    checklist: dict[str, Any],
    log: dict[str, Any],
    round_num: int,
    prev_score: float,
    skills_dir: str,
    model_mutate: str,
    model_judge: str,
    test_input: str | None = None,
    test_inputs: list[str] | None = None,
) -> tuple[str, float, dict[str, Any]]:
    """Run a single optimization round.

    Steps:
        1. get_recent_changelog(log) -> recent changelog
        2. generate_mutation(...) -> one proposed change to the skill body
        3. apply_mutation -> write the mutated SKILL.md
        4. execute_skill -> run the skill once per input
        5. judge_output -> score each output
        6. Compare the (average) score with ``prev_score``:
           - score >= prev_score -> KEEP
           - score <  prev_score -> REVERT (previous body restored on disk)
        7. add_round -> append the round to the log

    If mutation generation or judging fails, the round is skipped: the
    previous body is left on disk and the inputs are returned unchanged.
    If skill execution raises (or the run is interrupted), the previous body
    is restored on disk and the exception is re-raised.

    Returns:
        (current_body_after_round, score, updated_log)

    Raises:
        ValueError: If no test input is provided or the input list is empty.
    """
    import yaml as _yaml

    # Internally always work with a list of inputs.
    inputs_list = _resolve_inputs(test_input, test_inputs)

    recent_changelog = get_recent_changelog(log)

    # The mutator takes the checklist as a YAML string.
    checklist_yaml_str = _yaml.dump(checklist, allow_unicode=True, default_flow_style=False)

    try:
        mutation_result = generate_mutation(
            current_skill_md=current_body,
            checklist_yaml=checklist_yaml_str,
            recent_changelog=recent_changelog,
            model=model_mutate,
        )
    except Exception as exc:
        print(f" [Round {round_num}] mutation 생성 실패, 라운드 스킵: {exc}")
        return current_body, prev_score, log

    mutation_type: str = mutation_result["mutation_type"]
    mutation_description: str = mutation_result["mutation_description"]
    mutated_body: str = mutation_result["modified_skill_md"]
    mut_input_tokens = _token_count(mutation_result, "input_tokens")
    mut_output_tokens = _token_count(mutation_result, "output_tokens")

    # Apply the change on disk.
    apply_mutation(skill_name, mutated_body, skills_dir, frontmatter)

    # Execute and judge the skill for every input.
    scores: list[float] = []
    items_detail: list[Any] = []
    total_exec_input_tokens = 0
    total_exec_output_tokens = 0
    judge_input_tokens = 0
    judge_output_tokens = 0

    try:
        for inp in inputs_list:
            exec_result: dict[str, Any] = cast(
                dict[str, Any],
                execute_skill(
                    skill_body=mutated_body,
                    test_input=inp,
                    model=model_mutate,
                ),
            )
            skill_output: str = str(exec_result.get("output", ""))
            total_exec_input_tokens += _token_count(exec_result, "input_tokens")
            total_exec_output_tokens += _token_count(exec_result, "output_tokens")

            try:
                judge_result = judge_output(
                    checklist=checklist,
                    skill_output=skill_output,
                    model=model_judge,
                )
                scores.append(float(judge_result.get("total_score", 0.0)))
                items_detail.extend(judge_result.get("items", []))
                judge_input_tokens += _token_count(judge_result, "input_tokens")
                judge_output_tokens += _token_count(judge_result, "output_tokens")
            except Exception as exc:
                # Judging failed: undo the mutation and skip the round.
                print(f" [Round {round_num}] 채점 실패, 라운드 스킵: {exc}")
                restore_skill(skill_name, current_body, skills_dir, frontmatter)
                return current_body, prev_score, log
    except BaseException:
        # Execution error or interrupt: never leave an unevaluated mutation
        # in SKILL.md. The exception itself still propagates to the caller.
        restore_skill(skill_name, current_body, skills_dir, frontmatter)
        raise

    # Average score across inputs.
    new_score: float = sum(scores) / len(scores) if scores else 0.0

    # Per-input score line (multi-input runs only).
    if len(inputs_list) > 1:
        score_strs = ", ".join(f"{s:.2f}" for s in scores)
        print(f" [Round {round_num}] 입력별 점수: [{score_strs}] → 평균: {new_score:.2f}")

    # Compare and decide.
    delta = new_score - prev_score
    if new_score >= prev_score:
        decision = "kept"
        result_body = mutated_body
        result_score = new_score
        symbol = "✓ KEEP"
    else:
        decision = "reverted"
        result_body = current_body
        result_score = prev_score
        symbol = "✗ REVERT"
        restore_skill(skill_name, current_body, skills_dir, frontmatter)

    # Console output.
    print(f'[Round {round_num}] 변경: {mutation_type} - "{mutation_description}"')
    print(f" 점수: {prev_score:.2f} → {new_score:.2f} ({delta:+.2f}) {symbol}")

    # Record the round, with token usage broken down by phase.
    total_input_tokens = mut_input_tokens + total_exec_input_tokens + judge_input_tokens
    total_output_tokens = mut_output_tokens + total_exec_output_tokens + judge_output_tokens
    updated_log = add_round(
        log=log,
        round_num=round_num,
        mutation_type=mutation_type,
        mutation_description=mutation_description,
        score_before=prev_score,
        score_after=new_score,
        items_detail=items_detail,
        decision=decision,
        input_tokens=total_input_tokens,
        output_tokens=total_output_tokens,
        mutation_input_tokens=mut_input_tokens,
        mutation_output_tokens=mut_output_tokens,
        execution_input_tokens=total_exec_input_tokens,
        execution_output_tokens=total_exec_output_tokens,
        judge_input_tokens=judge_input_tokens,
        judge_output_tokens=judge_output_tokens,
    )

    return result_body, result_score, updated_log


def run(
    skill_name: str,
    checklist_path: str,
    test_input: str | None = None,
    test_inputs: list[str] | None = None,
    rounds: int = 50,
    target_score: float = 0.95,
    consecutive: int = 3,
    model_mutate: str = "claude-sonnet-4-6",
    model_judge: str = "claude-haiku-4-5-20251001",
    dry_run: bool = False,
    skills_dir: str = str(Path(_WORKSPACE_ROOT) / "skills"),
    background: bool = False,
) -> dict[str, Any]:
    """Main optimization loop.

    Flow:
        1. load_checklist(checklist_path)
        2. load_skill(skill_name, skills_dir) -> (frontmatter, body)
        3. backup_original(skill_name, skills_dir)
        4. create_log(skill_name)
        5. Initial run: execute_skill -> judge_output -> initial score
        6. for round_num in range(1, rounds + 1):
           a. run_round(...)
           b. in dry-run mode, restore the original after one round and stop
           c. stop once the score has been >= target_score for
              ``consecutive`` rounds in a row
        7. finalize_log(log, final_score)
        8. save_log(log, skill_name)

    ``background`` is accepted for the CLI but is not used inside this
    function; output redirection is handled by ``main``.

    Returns:
        The final log dict.

    Raises:
        ValueError: If no test input is provided or the input list is empty.
    """
    resolved_inputs = _resolve_inputs(test_input, test_inputs)

    # 1. Load the checklist.
    checklist = load_checklist(checklist_path)

    # 2. Load the skill.
    frontmatter, body = load_skill(skill_name, skills_dir)

    # 3. Back up the original.
    backup_original(skill_name, skills_dir)

    # 4. Create the log.
    log: dict[str, Any] = create_log(skill_name)

    # 5. Initial execution and judging (average over all inputs).
    print(f"초기 실행 중: {skill_name}")
    initial_scores: list[float] = []
    for inp in resolved_inputs:
        exec_result: dict[str, Any] = cast(
            dict[str, Any],
            execute_skill(
                skill_body=body,
                test_input=inp,
                model=model_mutate,
            ),
        )
        skill_output = str(exec_result.get("output", ""))
        judge_result = judge_output(
            checklist=checklist,
            skill_output=skill_output,
            model=model_judge,
        )
        initial_scores.append(float(judge_result.get("total_score", 0.0)))

    initial_score: float = sum(initial_scores) / len(initial_scores) if initial_scores else 0.0
    print(f"초기 점수: {initial_score:.2%}")

    current_body = body
    current_score = initial_score
    consecutive_count = 0

    # 6. Round loop.
    for round_num in range(1, rounds + 1):
        current_body, current_score, log = run_round(
            skill_name=skill_name,
            current_body=current_body,
            frontmatter=frontmatter,
            checklist=checklist,
            test_inputs=resolved_inputs,
            log=log,
            round_num=round_num,
            prev_score=current_score,
            skills_dir=skills_dir,
            model_mutate=model_mutate,
            model_judge=model_judge,
        )

        # Track how many rounds in a row have met the target.
        if current_score >= target_score:
            consecutive_count += 1
        else:
            consecutive_count = 0

        # Dry run: one round only, then put the original back and stop.
        if dry_run:
            print(" [dry-run] 1라운드 완료. 원본 복원 후 종료.")
            restore_skill(skill_name, body, skills_dir, frontmatter)
            break

        # Early stop after ``consecutive`` successful rounds in a row.
        if consecutive_count >= consecutive:
            print(f"목표 점수 {target_score:.2%}를 {consecutive}회 연속 달성. 조기 종료.")
            break

    # 7. Finalize the log.
    final_log = finalize_log(log, current_score)

    # 8. Save the log.
    save_log(final_log, skill_name)

    return final_log


def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Parses the arguments, runs the optimization (redirecting stdout/stderr to
    ``evals/autoresearch-stdout.log`` in background mode), optionally writes
    and sends a report, and prints a result summary.

    Returns:
        Process exit code (0 on success).
    """
    parser = argparse.ArgumentParser(description="Autoresearch: 스킬 프롬프트 자동 최적화")
    parser.add_argument("--skill", required=True, help="스킬명 (skills/ 하위 폴더명)")
    parser.add_argument("--checklist", required=True, help="체크리스트 YAML 경로")
    parser.add_argument("--rounds", type=int, default=50, help="최대 라운드 수 (기본 50)")
    parser.add_argument("--target-score", type=float, default=0.95, help="목표 점수 (기본 0.95)")
    parser.add_argument("--consecutive", type=int, default=3, help="연속 달성 횟수 (기본 3)")
    parser.add_argument("--model-mutate", default="claude-sonnet-4-6", help="변경 생성용 모델")
    parser.add_argument("--model-judge", default="claude-haiku-4-5-20251001", help="채점용 모델")
    parser.add_argument("--dry-run", action="store_true", help="실제 변경 없이 1라운드만 시뮬레이션")
    parser.add_argument("--skills-dir", default=str(Path(_WORKSPACE_ROOT) / "skills"), help="스킬 디렉토리 경로")
    parser.add_argument(
        "--background", action="store_true", help="백그라운드 모드: stdout 파일 리다이렉트 후 보고서 자동 생성"
    )
    parser.add_argument(
        "--notify", action="store_true", help="완료 시 cokacdir로 보고서 전송 (--background와 함께 사용)"
    )

    # --test-input / --test-inputs-file are mutually exclusive.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--test-input", help="테스트용 입력 텍스트 (단일)")
    input_group.add_argument("--test-inputs-file", help="테스트 입력 YAML 파일 경로 (멀티)")

    args = parser.parse_args(argv)

    run_kwargs: dict[str, Any] = {
        "skill_name": args.skill,
        "checklist_path": args.checklist,
        "rounds": args.rounds,
        "target_score": args.target_score,
        "consecutive": args.consecutive,
        "model_mutate": args.model_mutate,
        "model_judge": args.model_judge,
        "dry_run": args.dry_run,
        "skills_dir": args.skills_dir,
    }

    if args.test_input is not None:
        run_kwargs["test_input"] = args.test_input
    else:
        # --test-inputs-file was given.
        inputs_data = load_test_inputs(args.test_inputs_file)
        run_kwargs["test_inputs"] = [item["text"] for item in inputs_data]

    evals_dir = Path(args.skills_dir) / args.skill / "evals"

    if args.background:
        run_kwargs["background"] = True

        # Background mode: send stdout/stderr to a log file for the duration
        # of the run. The context managers restore both streams and close the
        # file even if run() raises.
        evals_dir.mkdir(parents=True, exist_ok=True)
        log_path = evals_dir / "autoresearch-stdout.log"
        with (
            log_path.open("w", encoding="utf-8") as log_file,
            contextlib.redirect_stdout(log_file),
            contextlib.redirect_stderr(log_file),
        ):
            log = run(**run_kwargs)

        # Background mode: generate the report.
        from autoresearch.reporter import generate_report

        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        report_path = str(evals_dir / f"report-{timestamp}.md")
        generate_report(log, report_path)
        print(f"보고서 저장: {report_path}")

        if args.notify:
            # SECURITY: the default key below is hard-coded in source and is
            # kept only for backward compatibility. Prefer setting
            # COKACDIR_KEY in the environment and rotating this value.
            notify_cmd = [
                "/usr/local/bin/cokacdir",
                "--sendfile",
                report_path,
                "--chat",
                os.environ.get("COKACDIR_CHAT_ID", "6937032012"),
                "--key",
                os.environ.get("COKACDIR_KEY", "109fa85250c6d46b"),
            ]
            try:
                # Best effort (check=False): a failed send must not fail the run.
                subprocess.run(notify_cmd, check=False)
            except OSError as exc:
                # e.g. the cokacdir binary is not installed on this machine.
                print(f"알림 전송 실패: {exc}", file=sys.stderr)
    else:
        log = run(**run_kwargs)

    # Result summary.
    print(f"\n{'='*50}")
    print(f"Autoresearch 완료: {args.skill}")
    print(f"라운드: {log['total_rounds']}")
    print(f"최종 점수: {log['final_score']:.2%}")
    print(f"KEEP: {log['kept']}, REVERT: {log['reverted']}")
    print(f"{'='*50}")

    return 0


if __name__ == "__main__":
    sys.exit(main())