
    KiF                       d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZmZ ddlmZ ddlmZ ddlmZmZmZ  ej*                  e      Z G d	 d
e      ZdZ G d de      Z G d de      Z G d de      ZddhZd dZ	 	 d!	 	 	 	 	 d"dZd#dZ dZ!d$dZ"d%dZ#d&dZ$d'dZ% G d d      Z&d(dZ'	 d)	 	 	 	 	 	 	 	 	 d*dZ(y)+a-  Vector embedding support for semantic code search.

Supports multiple providers:
1. Local (sentence-transformers) - Private, fast, offline.
2. Google Gemini - High-quality, cloud-based. Requires explicit opt-in.
3. MiniMax (embo-01) - High-quality 1536-dim cloud embeddings. Requires MINIMAX_API_KEY.
    )annotationsN)ABCabstractmethod)Path)Any   )	GraphNode
GraphStorenode_to_dictc                  h    e Zd Zedd       Zedd       Zeedd              Zeed	d              Zy)
EmbeddingProviderc                     y N )selftextss     h/home/jay/workspace/scripts/.codegraph-venv/lib/python3.12/site-packages/code_review_graph/embeddings.pyembedzEmbeddingProvider.embed    s        c                     y)zCEmbed a search query (may use a different task type than indexing).Nr   r   texts     r   embed_queryzEmbeddingProvider.embed_query$        	r   c                     y r   r   r   s    r   	dimensionzEmbeddingProvider.dimension)   r   r   c                     y r   r   r   s    r   namezEmbeddingProvider.name.   r   r   Nr   	list[str]returnlist[list[float]]r   strr"   list[float]r"   intr"   r%   )	__name__
__module____qualname__r   r   r   propertyr   r   r   r   r   r   r      s`             r   r   zall-MiniLM-L6-v2c                  P    e Zd Zdd	dZd Zd
dZddZedd       Zedd       Z	y)LocalEmbeddingProviderNc                l    |xs$ t         j                  j                  dt              | _        d | _        y )NCRG_EMBEDDING_MODEL)osenvirongetLOCAL_DEFAULT_MODEL_model_name_model)r   
model_names     r   __init__zLocalEmbeddingProvider.__init__8   s-    % 
!#6*
 r   c                    | j                   /	 ddlm}  || j                  dddi      | _         | j                   S | j                   S # t        $ r t	        d      w xY w)Nr   )SentenceTransformerTtrust_remote_code)r<   model_kwargszSsentence-transformers not installed. Run: pip install code-review-graph[embeddings])r7   sentence_transformersr;   r6   ImportError)r   r;   s     r   
_get_modelz!LocalEmbeddingProvider._get_model>   sm    ;;E1$$&*"5t!< {{t{{  !E s   "A Ac                    | j                         }|j                  |d      }|D cg c]  }|j                          c}S c c}w )NF)show_progress_bar)r@   encodetolist)r   r   modelvectorsvs        r   r   zLocalEmbeddingProvider.embedN   s:    !,,u,>$+,q
,,,s   Ac                ,    | j                  |g      d   S )Nr   )r   r   s     r   r   z"LocalEmbeddingProvider.embed_queryS   s    zz4&!!$$r   c                B    | j                         }|j                         S r   )r@    get_sentence_embedding_dimension)r   rE   s     r   r   z LocalEmbeddingProvider.dimensionV   s    !5577r   c                     d| j                    S )Nzlocal:)r6   r   s    r   r   zLocalEmbeddingProvider.name[   s    (()**r   r   )r8   
str | Noner"   Noner    r$   r'   r)   )
r*   r+   r,   r9   r@   r   r   r-   r   r   r   r   r   r/   r/   7   s>     -
% 8 8 + +r   r/   c                  ^    e Zd Zdd	dZd
dZeddd       ZddZedd       Z	edd       Z
y)GoogleEmbeddingProviderc                    	 ddl m} |j                  |      | _        || _        d | _        y # t        $ r t        d      w xY w)Nr   )genaiapi_keyzXgoogle-generativeai not installed. Run: pip install code-review-graph[google-embeddings])googlerQ   Client_clientrE   
_dimensionr?   )r   rS   rE   rQ   s       r   r9   z GoogleEmbeddingProvider.__init__a   sL    		$ <<<8DLDJ*.DO 	H 	s	   +. Ac                4    d}g }t        dt        |      |      D ]S  }||||z    } j                  |f fd	      }|j                  |j                  D cg c]  }|j
                   c}       U  j                  |rt        |d          _        |S c c}w )Nd   r   c                l    j                   j                  j                  j                  | ddi      S )N	task_typeRETRIEVAL_DOCUMENTrE   contentsconfigrV   modelsembed_contentrE   )br   s    r   <lambda>z/GoogleEmbeddingProvider.embed.<locals>.<lambda>s   s5     3 3 A A**')=> !B ! r   )rangelen_call_with_retryextend
embeddingsvaluesrW   )r   r   
batch_sizeresultsibatchresponsees   `       r   r   zGoogleEmbeddingProvider.embedm   s    
q#e*j1 		DA!A
N+E,, H NNh.A.ABAHHBC		D ??"w!'!*oDO Cs   B
c           	        t        |      D ]  }	  |        c S  y# t        $ rf}t        |      }d|v xs
 d|v xs d|v }|r||dz
  k(  r d|z  }t        j	                  d|dz   |||       t        j                  |       Y d}~yd}~ww xY w)z9Call fn with exponential backoff on transient API errors.429500503r      z5Gemini API error (attempt %d/%d), retrying in %ds: %sN)re   	Exceptionr%   loggerwarningtimesleep)fnmax_retriesattemptrp   err_stris_retryablewaits          r   rg   z(GoogleEmbeddingProvider._call_with_retry~   s     [) 	!G!t	!  	!a&$/W5G3CWuPWGW#w+/'AG|V&{KqB

4  	!s   	B
ABB
c                      j                   fd      }|j                  d   j                  } j                  t	        |       _        |S )Nc                 n     j                   j                  j                   j                  gddi      S )Nr[   RETRIEVAL_QUERYr]   r`   r   s   r   rd   z5GoogleEmbeddingProvider.embed_query.<locals>.<lambda>   s6    DLL''55jj#%67 6  r   r   )rg   ri   rj   rW   rf   )r   r   ro   vecs   ``  r   r   z#GoogleEmbeddingProvider.embed_query   sJ    ((
 !!!$++??"!#hDO
r   c                4    | j                   | j                   S y)Ni   )rW   r   s    r   r   z!GoogleEmbeddingProvider.dimension   s    ??&??"r   c                     d| j                    S )Nzgoogle:rE   r   s    r   r   zGoogleEmbeddingProvider.name   s    %%r   N)zgemini-embedding-001)rS   r%   rE   r%   r"   rM   r    )   )r|   r(   r$   r'   r)   )r*   r+   r,   r9   r   staticmethodrg   r   r-   r   r   r   r   r   rO   rO   `   sM    
" ! !    & &r   rO   c                  `    e Zd ZdZdZdZdZddZddZddZ	ddZ
edd	       Zedd
       Zy)MiniMaxEmbeddingProviderzMiniMax embo-01 embedding provider (1536 dimensions).

    Uses the MiniMax Embeddings API (https://api.minimax.io/v1/embeddings)
    with the embo-01 model. Requires the MINIMAX_API_KEY environment variable.
    z$https://api.minimax.io/v1/embeddingszembo-01i   c                    || _         y r   )_api_key)r   rS   s     r   r9   z!MiniMaxEmbeddingProvider.__init__   s	    r   c           	     n   dd l }dd l}|j                  | j                  ||d      j	                  d      }|j
                  j                  | j                  |dd| j                   d      }d}t        |      D ]  }	 dd l
}	|	j                         }
|j
                  j                  |d	|

      5 }|j                  |j                         j                  d            }d d d        j!                  di       }|j!                  dd      dk7  rt#        d|j!                  dd             |d   c S  g S # 1 sw Y   XxY w# t$        $ rg}t'        |      }d|v xs
 d|v xs d|v }|r||dz
  k(  r d|z  }t(        j+                  d|dz   |||       t-        j.                  |       Y d }~3d }~ww xY w)Nr   )rE   r   typezutf-8zapplication/jsonzBearer )zContent-TypeAuthorization)dataheadersr   <   )timeoutcontext	base_respstatus_codezMiniMax API error: 
status_msgunknownrF   rr   rs   rt   r   ru   z6MiniMax API error (attempt %d/%d), retrying in %ds: %s)jsonurllib.requestdumps_MODELrC   requestRequest	_ENDPOINTr   re   sslcreate_default_contexturlopenloadsreaddecoder4   RuntimeErrorrv   r%   rw   rx   ry   rz   )r   r   r[   _jsonurllibpayloadreqr|   r}   r   _ssl_ctxrespbodyr   rp   r~   r   r   s                     r   	_call_apiz"MiniMaxEmbeddingProvider._call_api   s   ++[[
  6'?	 	 nn$$NN 2#*4==/!: % 
 [) 	!G!557^^++CX+N DRV ;;tyy{'9'9''BCDD !HH["5	==2a7&-immL).T-UV  I&	!4 	-D D  
!a&$/W5G3CWuPWGW#w+/'AG|LaKdA 

4  
!s2    2E2/D8!AE8E	=E	F4AF//F4c                    d}g }t        dt        |      |      D ]+  }||||z    }|j                  | j                  |d             - |S )NrY   r   db)re   rf   rh   r   )r   r   rk   rl   rm   rn   s         r   r   zMiniMaxEmbeddingProvider.embed   sV    
%'q#e*j1 	8A!A
N+ENN4>>%67	8 r   c                .    | j                  |gd      d   S )Nqueryr   )r   r   s     r   r   z$MiniMaxEmbeddingProvider.embed_query   s    ~~tfg.q11r   c                    | j                   S r   )
_DIMENSIONr   s    r   r   z"MiniMaxEmbeddingProvider.dimension   s    r   c                     d| j                    S )Nzminimax:)r   r   s    r   r   zMiniMaxEmbeddingProvider.name   s    $++''r   N)rS   r%   r"   rM   )r   r!   r[   r%   r"   r#   r    r$   r'   r)   )r*   r+   r,   __doc__r   r   r   r9   r   r   r   r-   r   r   r   r   r   r   r      sU     7IFJ .`2   ( (r   r   rT   minimaxc                    t         j                  j                  dd      j                         dk(  ryt	        d|  dt
        j                         y)u  Print a stderr warning before a cloud embedding provider is used.

    The warning is suppressed when ``CRG_ACCEPT_CLOUD_EMBEDDINGS=1`` is
    set in the environment, so scripted / CI workloads can acknowledge
    once and move on. Use stderr (never stdin/input) to stay compatible
    with the MCP stdio transport — anything we write to stdout would
    corrupt the JSON-RPC stream. See: #174
    CRG_ACCEPT_CLOUD_EMBEDDINGS 1Nu9   
⚠️  code-review-graph: about to embed code via the 'az  ' cloud provider.
    Your source code (function names, docstrings, file paths) will be sent to an external API.
    This is necessary for semantic search with the cloud provider you selected.
    To skip this warning in future runs, set CRG_ACCEPT_CLOUD_EMBEDDINGS=1 in your environment.
    To stay fully offline, use the default 'local' provider instead (no API key needed).
)file)r2   r3   r4   stripprintsysstderr)provider_names    r   _warn_cloud_egressr      sK     
zz~~3R8>>@CG	
D]O 	T! 		! ZZr   c                   | dk(  rCt         j                  j                  d      }|st        d      t	        d       t        |      S | dk(  rMt         j                  j                  d      }|st        d      t	        d       	 t        dd|i|rd	|iS i S 	 t        |      S # t        $ r Y y
w xY w# t        $ r Y y
w xY w)a  Get an embedding provider by name.

    Args:
        provider: Provider name. One of "local", "google", "minimax", or None for local.
                  Google requires GOOGLE_API_KEY env var and explicit opt-in.
                  MiniMax requires MINIMAX_API_KEY env var and explicit opt-in.
                  Cloud providers emit a one-time stderr warning before use
                  unless ``CRG_ACCEPT_CLOUD_EMBEDDINGS=1`` is set. See: #174
        model: Model name/path to use. For local provider this is any
               sentence-transformers compatible model. Falls back to
               CRG_EMBEDDING_MODEL env var, then to all-MiniLM-L6-v2.
               For Google provider this is a Gemini model ID.
    r   MINIMAX_API_KEYzTMINIMAX_API_KEY environment variable is required for the MiniMax embedding provider.rR   rT   GOOGLE_API_KEYzRGOOGLE_API_KEY environment variable is required for the Google embedding provider.rS   rE   N)r8   r   )	r2   r3   r4   
ValueErrorr   r   rO   r?   r/   )providerrE   rS   s      r   get_providerr     s    " 9**..!232  	9%'888**..!121  	8$	* ',GU# 24 %77  		  s*   B( B( B7 (	B43B47	CCc                 ,    	 ddl } y# t        $ r Y yw xY w)z3Check whether local embedding support is available.r   NTF)r>   r?   )r>   s    r   _check_availabler   I  s    $ s    	z
CREATE TABLE IF NOT EXISTS embeddings (
    qualified_name TEXT PRIMARY KEY,
    vector BLOB NOT NULL,
    text_hash TEXT NOT NULL,
    provider TEXT NOT NULL DEFAULT 'unknown'
);
c                F    t        j                  t        |        dg|  S )z/Encode a float vector as a compact binary blob.f)structpackrf   )r   s    r   _encode_vectorr   `  s     ;;#c(1~,,,r   c                b    t        |       dz  }t        t        j                  | d|             S )z,Decode a binary blob back to a float vector.   r   )rf   listr   unpack)blobns     r   _decode_vectorr   e  s+    D	QA1gt,--r   c                    t        |       t        |      k7  ryt        d t        | |      D              }t        d | D              dz  }t        d |D              dz  }|dk(  s|dk(  ry|||z  z  S )z.Compute cosine similarity between two vectors.g        c              3  ,   K   | ]  \  }}||z    y wr   r   ).0xys      r   	<genexpr>z%_cosine_similarity.<locals>.<genexpr>o  s     *1a!e*s   c              3  &   K   | ]	  }||z    y wr   r   r   r   s     r   r   z%_cosine_similarity.<locals>.<genexpr>p       "1Q"   g      ?c              3  &   K   | ]	  }||z    y wr   r   r   s     r   r   z%_cosine_similarity.<locals>.<genexpr>q  r   r   r   )rf   sumzip)arc   dotnorm_anorm_bs        r   _cosine_similarityr   k  st    
1vQ
*Aq	*
*C"""c)F"""c)F{fk&6/""r   c                   | j                   g}| j                  dk7  r)|j                  | j                  j                                | j                  r|j                  d| j                          | j
                  r|j                  | j
                         | j                  r|j                  d| j                          | j                  r|j                  | j                         dj                  |      S )z3Convert a node to a searchable text representation.Filezin zreturns  )	r   kindappendlowerparent_nameparamsreturn_typelanguagejoin)nodepartss     r   _node_to_textr   w  s    YYKEyyFTYY__&'s4++,-.{{T[[!x 0 0123}}T]]#88E?r   c                  X    e Zd ZdZ	 	 d		 	 	 	 	 	 	 d
dZddZdddZdddZddZddZ	y)EmbeddingStorez4Manages vector embeddings for graph nodes in SQLite.Nc                    t        ||      | _        | j                  d u| _        t        |      | _        t        j                  t        | j                        ddd       | _        t
        j                  | j                  _
        | j                  j                  t               	 | j                  j                  d       | j                  j                          y # t
        j                  $ r | j                  j                  d       Y Kw xY w)Nr      F)r   check_same_threadisolation_levelz'SELECT provider FROM embeddings LIMIT 1zJALTER TABLE embeddings ADD COLUMN provider TEXT NOT NULL DEFAULT 'unknown')r   r   	availabler   db_pathsqlite3connectr%   _connRowrow_factoryexecutescript_EMBEDDINGS_SCHEMAexecuteOperationalErrorcommit)r   r   r   rE   s       r   r9   zEmbeddingStore.__init__  s     %XU;d2G}__rU 

 ")



  !34	JJHI 	

 '' 	JJ2	s   &C .DDc                8    | j                   j                          y r   )r  closer   s    r   r  zEmbeddingStore.close  s    

r   c                    | j                   syg }| j                   j                  }|D ]  }|j                  dk(  rt        |      }t	        j
                  |j                               j                         }| j                  j                  d|j                  f      j                         }|r|d   |k(  r	|d   |k(  r|j                  |||f        |sy|D 	
cg c]  \  }	}
}	|

 }}	}
| j                   j                  |      }t        ||      D ]>  \  \  }}}}t        |      }| j                  j                  d|j                  |||f       @ | j                  j!                          t#        |      S c c}
}	w )z1Compute and store embeddings for a list of nodes.r   r   zCSELECT text_hash, provider FROM embeddings WHERE qualified_name = ?	text_hashr   zvINSERT OR REPLACE INTO embeddings (qualified_name, vector, text_hash, provider)
                   VALUES (?, ?, ?, ?))r   r   r   r   hashlibsha256rC   	hexdigestr  r  qualified_namefetchoner   r   r   r   r	  rf   )r   nodesrk   to_embedr   r   r   r  existing_tr   rF   _textr   r   s                   r   embed_nodeszEmbeddingStore.embed_nodes  sw   }} 68** 	5DyyF" &Dt{{}5??AIzz))U$$& hj  Xk2i? ,=OOT434	5"  #++wq!Q++--%%e,-07-C 	)$T5)c!#&DJJ*$$dI}E	 	

8} ,s   E:c                   | j                   sg S | j                   j                  }| j                   j                  |      }g }| j                  j	                  d|f      }d}	 |j                  |      }|sn8|D ]2  }	t        |	d         }
t        ||
      }|j                  |	d   |f       4 L|j                  d d       |d| S )	z(Search for nodes by semantic similarity.z@SELECT qualified_name, vector FROM embeddings WHERE provider = ?i  Tvectorr  c                    | d   S )Nr   r   )r   s    r   rd   z'EmbeddingStore.search.<locals>.<lambda>  s
    !A$ r   )keyreverseN)
r   r   r   r  r  	fetchmanyr   r   r   sort)r   r   limitr   	query_vecscoredcursor
chunk_sizerowsrowr   sims               r   searchzEmbeddingStore.search  s    }}I**MM--e4	 +-##N
 
##J/D <$S]3(C8s#34c:;<	  	5fu~r   c                r    | j                   j                  d|f       | j                   j                          y )Nz/DELETE FROM embeddings WHERE qualified_name = ?)r  r  r	  )r   r  s     r   remove_nodezEmbeddingStore.remove_node  s.    

=?P	
 	

r   c                Z    | j                   j                  d      j                         d   S )NzSELECT COUNT(*) FROM embeddingsr   )r  r  r  r   s    r   countzEmbeddingStore.count  s'    zz!!"CDMMOPQRRr   NN)r   z
str | Pathr   rL   rE   rL   r"   rM   )r"   rM   )@   )r  zlist[GraphNode]rk   r(   r"   r(      )r   r%   r!  r(   r"   zlist[tuple[str, float]])r  r%   r"   rM   r'   )
r*   r+   r,   r   r9   r  r  r)  r+  r-  r   r   r   r   r     sS    >
  $ 	  	
 
6*X6Sr   r   c                    |j                   sy| j                         }g }|D ]"  }|j                  | j                  |             $ |j	                  |      S )z&Embed all non-file nodes in the graph.r   )r   get_all_filesrh   get_nodes_by_filer  )graph_storeembedding_store	all_files	all_nodesr   s        r   embed_all_nodesr9    s\    $$))+I!#I ;66q9:; &&y11r   c                b   |j                   rs|j                         dkD  r`|j                  | |      }g }|D ]D  \  }}|j                  |      }|st	        |      }	t        |d      |	d<   |j                  |	       F |S |j                  | |      }
|
D cg c]  }t	        |       c}S c c}w )zESearch nodes using vector similarity, falling back to keyword search.r   )r!  r   similarity_score)r   r-  r)  get_noder   roundr   search_nodes)r   r5  r6  r!  rl   outputqnscorer   dr  r   s               r   semantic_searchrC    s       _%:%:%<q%@!((e(<  	!IB''+D &(-eQ$%a 	!  $$U%$8E%*+LO+++s   B,)r   r%   r"   rM   r.  )r   rL   rE   rL   r"   zEmbeddingProvider | None)r"   bool)r   r&   r"   bytes)r   rE  r"   r&   )r   r&   rc   r&   r"   float)r   r	   r"   r%   )r5  r
   r6  r   r"   r(   r0  )
r   r%   r5  r
   r6  r   r!  r(   r"   zlist[dict[str, Any]]))r   
__future__r   r  loggingr2   r   r   r   ry   abcr   r   pathlibr   typingr   graphr	   r
   r   	getLoggerr*   rw   r   r5   r/   rO   r   CLOUD_PROVIDERSr   r   r   r  r   r   r   r   r   r9  rC  r   r   r   <module>rO     s-   #   	   
  #   6 6			8	$ * ) &+. &+RE&/ E&PO(0 O(d Y'6  /// /d -
.	# oS oSd
2" 	,,, $, 	,
 ,r   