prevent cached_content leak to non-extract prompts (#4089)
@@ -2,7 +2,7 @@ import dataclasses
 import json
 import time
 from asyncio import CancelledError
-from typing import Any, AsyncIterator
+from typing import Any, AsyncIterator, Protocol, runtime_checkable

 import litellm
 import structlog
@@ -40,6 +40,19 @@ LOG = structlog.get_logger()
 EXTRACT_ACTION_PROMPT_NAME = "extract-actions"


+@runtime_checkable
+class RouterWithModelList(Protocol):
+    model_list: list[dict[str, Any]]
+
+
+def _get_primary_model_dict(router: Any, main_model_group: str) -> dict[str, Any] | None:
+    if isinstance(router, RouterWithModelList):
+        for model_dict in router.model_list:
+            if model_dict.get("model_name") == main_model_group:
+                return model_dict
+    return None
+
+
 class LLMCallStats(BaseModel):
     input_tokens: int | None = None
     output_tokens: int | None = None
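As an aside on the new helper: `@runtime_checkable` makes `isinstance` perform a structural check, verifying only that a `model_list` attribute exists on the object, not its type. A minimal sketch with a hypothetical stand-in router (nothing below is from the codebase):

```python
from typing import Any, Protocol, runtime_checkable


@runtime_checkable
class RouterWithModelList(Protocol):
    model_list: list[dict[str, Any]]


class FakeRouter:
    """Hypothetical stand-in for litellm.Router; only the attribute matters."""

    def __init__(self) -> None:
        self.model_list = [{"model_name": "gemini-primary", "litellm_params": {}}]


# Structural check: True because the instance has a `model_list` attribute.
assert isinstance(FakeRouter(), RouterWithModelList)
# Objects without the attribute fail, so _get_primary_model_dict returns None.
assert not isinstance(object(), RouterWithModelList)
```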
@@ -366,6 +379,8 @@ class LLMAPIHandlerFactory:

         vertex_cache_attached = False
         cache_resource_name = getattr(context, "vertex_cache_name", None)
+        primary_model_dict = _get_primary_model_dict(router, main_model_group)
+
         # Add cached_content to primary model's litellm_params (not global parameters)
         # This ensures it's only passed to the Gemini primary, not to fallback models.
         # By setting it in the model-specific litellm_params, LiteLLM will only include it
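To make the comment concrete: a kwarg passed at call time is forwarded to whichever deployment the router ends up using, while `litellm_params` are per-deployment. A rough sketch of the intended `model_list` shape (the model names and cache resource are made up for illustration):

```python
# Per-deployment scoping: only the Gemini primary carries the cache reference,
# so a fallback to a non-Gemini model never receives `cached_content`.
model_list = [
    {
        "model_name": "gemini-primary",  # hypothetical group name
        "litellm_params": {
            "model": "vertex_ai/gemini-2.5-pro",
            "cached_content": "projects/p/locations/us-central1/cachedContents/c1",
        },
    },
    {
        "model_name": "openai-fallback",
        "litellm_params": {"model": "gpt-4o"},  # no cached_content here
    },
]
```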
@@ -376,23 +391,24 @@ class LLMAPIHandlerFactory:
             and prompt_name == EXTRACT_ACTION_PROMPT_NAME
             and getattr(context, "use_prompt_caching", False)
             and "gemini" in main_model_group.lower()
+            and primary_model_dict is not None
         ):
-            # Modify the router's model_list to add cached_content only to the primary model
-            # The router is created per-handler-instance, so this modification is safe
-            # and idempotent (setting the same value multiple times is fine)
-            for model_dict in router.model_list:
-                if model_dict.get("model_name") == main_model_group:
-                    if "litellm_params" not in model_dict:
-                        model_dict["litellm_params"] = {}
-                    model_dict["litellm_params"]["cached_content"] = cache_resource_name
-                    vertex_cache_attached = True
-                    LOG.info(
-                        "Adding Vertex AI cache reference to primary model in router",
-                        prompt_name=prompt_name,
-                        primary_model=main_model_group,
-                        fallback_model=llm_config.fallback_model_group,
-                    )
-                    break
+            litellm_params = primary_model_dict.setdefault("litellm_params", {})
+            litellm_params["cached_content"] = cache_resource_name
+            vertex_cache_attached = True
+            LOG.info(
+                "Adding Vertex AI cache reference to primary model in router",
+                prompt_name=prompt_name,
+                primary_model=main_model_group,
+                fallback_model=llm_config.fallback_model_group,
+            )
+        elif primary_model_dict and "litellm_params" in primary_model_dict:
+            if primary_model_dict["litellm_params"].pop("cached_content", None):
+                LOG.info(
+                    "Removed Vertex AI cache reference from primary model in router",
+                    prompt_name=prompt_name,
+                    primary_model=main_model_group,
+                )

         llm_request_payload = {
             "model": llm_key,
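The hunk's net effect can be exercised in isolation. Below is a sketch that mirrors the attach/detach logic on a single model-list entry; `sync_cache_reference` is a hypothetical name, not a function in this codebase:

```python
def sync_cache_reference(
    primary_model_dict: dict | None,
    cache_resource_name: str | None,
    should_attach: bool,
) -> bool:
    """Mirror of the hunk's logic: attach for extract prompts, strip otherwise."""
    if should_attach and cache_resource_name and primary_model_dict is not None:
        litellm_params = primary_model_dict.setdefault("litellm_params", {})
        litellm_params["cached_content"] = cache_resource_name
        return True
    if primary_model_dict and "litellm_params" in primary_model_dict:
        # pop() keeps non-extract prompts from reusing a previously attached cache.
        primary_model_dict["litellm_params"].pop("cached_content", None)
    return False


entry = {"model_name": "gemini-primary", "litellm_params": {}}
assert sync_cache_reference(entry, "cache-123", should_attach=True)
assert entry["litellm_params"]["cached_content"] == "cache-123"
# A later non-extract call removes the leftover reference:
assert not sync_cache_reference(entry, "cache-123", should_attach=False)
assert "cached_content" not in entry["litellm_params"]
```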
@@ -728,6 +744,14 @@ class LLMAPIHandlerFactory:
                 prompt_name=prompt_name,
                 cache_attached=True,
             )
+        elif "cached_content" in active_parameters:
+            removed_cache = active_parameters.pop("cached_content", None)
+            if removed_cache:
+                LOG.info(
+                    "Removed Vertex AI cache reference from request",
+                    prompt_name=prompt_name,
+                    cache_was_attached=True,
+                )

         llm_request_payload = {
             "model": model_name,
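This final hunk applies the same guard to the flat parameter dict; a minimal sketch, assuming `active_parameters` is a plain kwargs dict as the diff suggests:

```python
import structlog

LOG = structlog.get_logger()


def strip_stale_cache(active_parameters: dict, prompt_name: str) -> None:
    # Remove any cached_content left over from a previous extract call so it
    # is never sent alongside an unrelated prompt.
    if "cached_content" in active_parameters:
        removed_cache = active_parameters.pop("cached_content", None)
        if removed_cache:
            LOG.info(
                "Removed Vertex AI cache reference from request",
                prompt_name=prompt_name,
                cache_was_attached=True,
            )


params = {"temperature": 0.0, "cached_content": "cache-123"}
strip_stale_cache(params, prompt_name="summarize")
assert "cached_content" not in params
```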