
    Kip2                       d Z ddlmZ ddlZddlZddlZddlmZmZ ddl	m
Z
mZ  ej                  e      ZddZddZd	d
ddZ	 d	 	 	 	 	 	 	 ddZ	 	 d	 	 	 	 	 	 	 	 	 ddZ	 d	 	 	 	 	 	 	 ddZ	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZy)zHybrid search engine combining FTS5 (BM25) and vector embeddings.

Uses Reciprocal Rank Fusion (RRF) to merge results from full-text search
and semantic similarity, with query-aware kind boosting and context-file
boosting for relevance tuning.
    )annotationsN)AnyOptional   )
GraphStore_sanitize_namec                4   | j                   }|j                  d       |j                  d       |j                          |j                  d       |j                          |j                  d      j                         d   }t        j                  d|       |S )zRebuild the FTS5 index from the nodes table.

    Checks whether the ``nodes_fts`` virtual table exists, clears it, then
    repopulates it from every row in ``nodes``.

    Returns:
        Number of rows indexed.
    zDROP TABLE IF EXISTS nodes_ftsz
        CREATE VIRTUAL TABLE nodes_fts USING fts5(
            name, qualified_name, file_path, signature,
            tokenize='porter unicode61'
        )
    z
        INSERT INTO nodes_fts(rowid, name, qualified_name, file_path, signature)
        SELECT id, name, qualified_name, file_path, COALESCE(signature, '')
        FROM nodes
    zSELECT count(*) FROM nodes_ftsr   z"FTS index rebuilt: %d rows indexed)_connexecutecommitfetchoneloggerinfo)storeconncounts      d/home/jay/workspace/scripts/.codegraph-venv/lib/python3.12/site-packages/code_review_graph/search.pyrebuild_fts_indexr      s     ;;D 	LL12LL  	 	KKM 	LL  	
 	KKMLL9:CCEaHE
KK4e<L    c                   i }| r| j                         s|S | j                         }t        j                  d|      r|j                         s
d|d<   d|d<   d|v rt        j                  d|      rd|d<   d|v rd	|d
<   |S )ap  Detect query patterns and return kind-specific boost multipliers.

    Heuristics:
    - PascalCase queries (e.g. ``MyClass``) boost Class/Type by 1.5x
    - snake_case queries (e.g. ``get_users``) boost Function by 1.5x
    - Queries containing ``.`` boost qualified name matches by 2.0x

    Returns:
        Dict mapping node kind strings to boost multipliers.
    z^[A-Z][a-z]      ?ClassType_z[a-zA-Z]Function.       @
_qualified)striprematchisuppersearch)queryboostsqs      r   detect_query_kind_boostr'   B   s      "FA 
xx"199;wv axBIIk1- z ax"|Mr   <   )kc                    i }|D ]9  }t        |      D ])  \  }\  }}|j                  |d      d| |z   dz   z  z   ||<   + ; t        |j                         d d      }|S )a5  Merge multiple ranked result lists using Reciprocal Rank Fusion.

    Each input list contains ``(id, score)`` tuples, ordered by score
    descending. The RRF score for each item is the sum of
    ``1 / (k + rank + 1)`` across all lists it appears in, where rank is
    the 0-based position.

    Args:
        *result_lists: Variable number of ranked result lists.
        k: RRF constant (default 60). Higher values reduce the impact of
           rank differences.

    Returns:
        Merged list of ``(id, rrf_score)`` tuples sorted by score descending.
    g              ?r   c                    | d   S Nr    xs    r   <lambda>zrrf_merge.<locals>.<lambda>   s
    !A$ r   Tkeyreverse)	enumerategetsorteditems)r)   result_listsscoresresult_listrankitem_id_scoremergeds           r   	rrf_merger@   i   s{       "F# N'0'= 	N#D#7F$jj#6D19MMF7O	NN FLLNEFMr   c                $   d|j                  dd      z   dz   }	 | j                  d||f      j                         }|D cg c]  }|d   |d    f c}S c c}w # t        j                  $ r"}t
        j                  d|       g cY d}~S d}~ww xY w)zRun an FTS5 BM25 search against the nodes_fts table.

    Returns list of ``(node_id, bm25_score)`` tuples. The BM25 score is
    negated so higher = better (FTS5 returns negative BM25).
    "z""zOSELECT rowid, rank FROM nodes_fts WHERE nodes_fts MATCH ? ORDER BY rank LIMIT ?r   r   zFTS5 search failed: %sN)replacer   fetchallsqlite3OperationalErrorr   warning)r   r$   limit
safe_queryrowsrowes          r   _fts_searchrM      s     u}}S$//#5J
||$
 (*	 	 .22cQ#a&!222## /3	s/   &A  AA A B-B
B
Bc                
   	 ddl m} 	  || j                  |      }	 |j                  r|j                         dk(  rg |j                          S |j                  ||      }g }|D ]6  \  }}	| j                  |      }
|
s|j                  |
j                  |	f       8 ||j                          S # t        $ r g cY S w xY w# |j                          w xY w# t        $ r"}t        j                  d|       g cY d}~S d}~ww xY w)zRun a vector similarity search using the embedding store.

    Returns list of ``(node_id, similarity_score)`` tuples.
    Gracefully returns an empty list if embeddings are not available.
    r   )EmbeddingStore)modelr   rH   zEmbedding search failed: %sN)
embeddingsrO   ImportErrordb_path	availabler   closer#   get_nodeappendid	Exceptionr   rG   )r   r$   rH   rP   rO   	emb_storeresults	id_scoresqnscorenoderL   s               r   _embedding_searchra      s    ."5==>		&&)//*;q*@ OO  &&uE&:G13I$ 7	E~~b)$$dggu%567 OO%  	$ OO 4a8	sV   B1 C  C C 0C   C  C 1B?>B?CC 	D C=7D=Dc                h   |j                         j                         }|sg S g }g }|D ].  }|j                  d       |j                  d| dd| dg       0 dj	                  |      }|j                  |       d| d}	 | j                  ||      j                         }	|j                         }
g }|	D ]I  }|d   j                         }||
k(  rd}n|j                  |
      rd}nd	}|j                  |d
   |f       K |j                  d d       |S # t        j                  $ r g cY S w xY w)zFall back to simple LIKE keyword matching.

    Each word in the query must match independently (AND logic).
    Returns ``(node_id, score)`` tuples with a basic relevance score.
    z4(LOWER(name) LIKE ? OR LOWER(qualified_name) LIKE ?)%z AND z1SELECT id, name, qualified_name FROM nodes WHERE z LIMIT ?nameg      @r   r+   rY   c                    | d   S r-   r.   r/   s    r   r1   z!_keyword_search.<locals>.<lambda>   
    qt r   Tr2   )lowersplitrX   extendjoinr   rD   rE   rF   
startswithsort)r   r$   rH   words
conditionsparamswordwheresqlrJ   q_lowerr\   rK   
name_lowerr_   s                  r   _keyword_searchru      sU    KKM!E	J F 2B	
 	4&{avQK01	2 LL$E
MM%=eWH
MC||C(113
 kkmG')G +[&&(
 E""7+EED	5)*+ LL^TL2N# ## 	s    D D10D1c                t   |r|j                         sg S | j                  }|dz  }g }g }		 t        |||      }t        | |||      }	|s|	r1g }|r|j                  |       |	r|j                  |	       t        | }nt        |||      }|sg S |}t        |      }|rt        |      n	t               }|D cg c]  \  }}|	 }}}i }d}t        dt        |      |      D ]U  }||||z    }dj                  d	 |D              }|j                  d
| d|      j!                         }|D ]
  }|||d   <    W g }|D ]  \  }}|j#                  |      }|s|d   }|d   }|d   }d}||v r|||   z  }d|v r,d|v r(|j%                         |j%                         v r||d   z  }|r	||v r|dz  }|j                  |||z  f        |j'                  d d       g } |D ]  \  }}!t        |       |k\  r | S |j#                  |      }|s+|d   }|r||k7  r8| j                  t)        |d         t)        |d         ||d   |d   |d   |d   xs d|d   |d   d|j+                         v r|d   ndt-        |!d      d         | S # t        $ r!}
t        j                  d|
       Y d}
~
sd}
~
ww xY wc c}}w )!a]  Hybrid search combining FTS5 BM25 and vector embeddings via RRF.

    Attempts FTS5 + embedding search first, falling back to FTS5-only,
    then keyword LIKE matching if FTS5 is unavailable.

    Args:
        store: The graph store to search.
        query: Search query string.
        kind: Optional node kind filter (e.g. ``"Function"``, ``"Class"``).
        limit: Maximum results to return (default 20).
        context_files: Optional list of file paths. Nodes in these files
            receive a 1.5x score boost.

    Returns:
        List of dicts with node metadata and ``score`` field.
       rQ   z'FTS5 unavailable, will use fallback: %sN)rH   rP   i  r   ,c              3      K   | ]  }d   yw)?Nr.   ).0r   s     r   	<genexpr>z hybrid_search.<locals>.<genexpr>M  s     33s   z!SELECT * FROM nodes WHERE id IN ()rY   kind	file_pathqualified_namer+   r   r   r   c                    | d   S r-   r.   r/   s    r   r1   zhybrid_search.<locals>.<lambda>k  rf   r   Tr2   rd   
line_startline_endlanguage ro   return_type	signature   )rd   r   r~   r   r   r   r   ro   r   r   r_   )r   r
   rM   rZ   r   rG   ra   rX   r@   ru   r'   setrangelenrj   r   rD   r6   rg   rl   r   keysround)"r   r$   r~   rH   context_filesrP   r   fetch_limitfts_resultsemb_resultsrL   lists_to_merger?   keyword_resultskind_boostscontext_setnode_idr   candidate_ids	node_rows
batch_sizeibatchplaceholdersrJ   rK   boostedr_   	node_kindr   r   boostr\   final_scores"                                     r   hybrid_searchr     s]   0 	
 ;;D!)K ,.K+-KE!$[A
 $E55QK k!!+.!!+.N+ *$[II  *%0K(5#m$35K 066!W6M6 "IJ1c-(*5 'aJ/xx3U33||/~Q?
 (* 	  	'C#&Ic$i 	'' (*G  1mmG$K	$	-.#[++E;&3%<{{} 4 4 66\2293SLE/0%1( LL^TL2 %'G ' w<5 0 N- mmG$K	I%"3v;/,S1A-BC[)l+JJ-2(m}--8CHHJ-F[)D;*
 	4 Nw  E@!DDE4 7s   J 8J4	J1J,,J1)r   r   returnint)r$   strr   zdict[str, float])r9   list[tuple[int, float]]r)   r   r   r   )2   )r   zsqlite3.Connectionr$   r   rH   r   r   r   )r   N)
r   r   r$   r   rH   r   rP   z
str | Noner   r   )N   NN)r   r   r$   r   r~   Optional[str]rH   r   r   zOptional[list[str]]rP   r   r   zlist[dict[str, Any]])__doc__
__future__r   loggingr    rE   typingr   r   graphr   r   	getLogger__name__r   r   r'   r@   rM   ra   ru   r   r.   r   r   <module>r      s@   #  	    -			8	$!RN @B D 
  	D 	""" " 	"
 "Z -
-- - 	-p )-CCC C 	C
 'C C Cr   