Shyamnath committed

Commit 469eae6 · Parent: f526ba5

Push core package and essential files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. LICENSE +26 -0
  2. litellm/__init__.py +1084 -0
  3. litellm/_logging.py +167 -0
  4. litellm/_redis.py +333 -0
  5. litellm/_service_logger.py +311 -0
  6. litellm/_version.py +6 -0
  7. litellm/anthropic_interface/__init__.py +6 -0
  8. litellm/anthropic_interface/messages/__init__.py +117 -0
  9. litellm/anthropic_interface/readme.md +116 -0
  10. litellm/assistants/main.py +1484 -0
  11. litellm/assistants/utils.py +161 -0
  12. litellm/batch_completion/Readme.md +11 -0
  13. litellm/batch_completion/main.py +253 -0
  14. litellm/batches/batch_utils.py +182 -0
  15. litellm/batches/main.py +796 -0
  16. litellm/budget_manager.py +230 -0
  17. litellm/caching/Readme.md +40 -0
  18. litellm/caching/__init__.py +9 -0
  19. litellm/caching/_internal_lru_cache.py +30 -0
  20. litellm/caching/base_cache.py +55 -0
  21. litellm/caching/caching.py +818 -0
  22. litellm/caching/caching_handler.py +938 -0
  23. litellm/caching/disk_cache.py +88 -0
  24. litellm/caching/dual_cache.py +434 -0
  25. litellm/caching/in_memory_cache.py +203 -0
  26. litellm/caching/llm_caching_handler.py +39 -0
  27. litellm/caching/qdrant_semantic_cache.py +442 -0
  28. litellm/caching/redis_cache.py +1162 -0
  29. litellm/caching/redis_cluster_cache.py +59 -0
  30. litellm/caching/redis_semantic_cache.py +450 -0
  31. litellm/caching/s3_cache.py +159 -0
  32. litellm/constants.py +543 -0
  33. litellm/cost.json +5 -0
  34. litellm/cost_calculator.py +1378 -0
  35. litellm/exceptions.py +809 -0
  36. litellm/experimental_mcp_client/Readme.md +6 -0
  37. litellm/experimental_mcp_client/__init__.py +3 -0
  38. litellm/experimental_mcp_client/client.py +0 -0
  39. litellm/experimental_mcp_client/tools.py +111 -0
  40. litellm/files/main.py +891 -0
  41. litellm/fine_tuning/main.py +761 -0
  42. litellm/integrations/Readme.md +5 -0
  43. litellm/integrations/SlackAlerting/Readme.md +13 -0
  44. litellm/integrations/SlackAlerting/batching_handler.py +81 -0
  45. litellm/integrations/SlackAlerting/slack_alerting.py +1825 -0
  46. litellm/integrations/SlackAlerting/utils.py +92 -0
  47. litellm/integrations/__init__.py +1 -0
  48. litellm/integrations/_types/open_inference.py +389 -0
  49. litellm/integrations/additional_logging_utils.py +36 -0
  50. litellm/integrations/agentops/__init__.py +3 -0
LICENSE ADDED
@@ -0,0 +1,26 @@
1
+ Portions of this software are licensed as follows:
2
+
3
+ * All content that resides under the "enterprise/" directory of this repository, if that directory exists, is licensed under the license defined in "enterprise/LICENSE".
4
+ * Content outside of the above mentioned directories or restrictions above is available under the MIT license as defined below.
5
+ ---
6
+ MIT License
7
+
8
+ Copyright (c) 2023 Berri AI
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
litellm/__init__.py ADDED
@@ -0,0 +1,1084 @@
1
+ ### Hide pydantic namespace conflict warnings globally ###
2
+ import warnings
3
+
4
+ warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
5
+ ### INIT VARIABLES ###########
6
+ import threading
7
+ import os
8
+ from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
9
+ from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
10
+ from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
11
+ from litellm.caching.llm_caching_handler import LLMClientCache
12
+ from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
13
+ from litellm.types.utils import (
14
+ ImageObject,
15
+ BudgetConfig,
16
+ all_litellm_params,
17
+ all_litellm_params as _litellm_completion_params,
18
+ CredentialItem,
19
+ ) # maintain backwards compatibility for root param
20
+ from litellm._logging import (
21
+ set_verbose,
22
+ _turn_on_debug,
23
+ verbose_logger,
24
+ json_logs,
25
+ _turn_on_json,
26
+ log_level,
27
+ )
28
+ import re
29
+ from litellm.constants import (
30
+ DEFAULT_BATCH_SIZE,
31
+ DEFAULT_FLUSH_INTERVAL_SECONDS,
32
+ ROUTER_MAX_FALLBACKS,
33
+ DEFAULT_MAX_RETRIES,
34
+ DEFAULT_REPLICATE_POLLING_RETRIES,
35
+ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS,
36
+ LITELLM_CHAT_PROVIDERS,
37
+ HUMANLOOP_PROMPT_CACHE_TTL_SECONDS,
38
+ OPENAI_CHAT_COMPLETION_PARAMS,
39
+ OPENAI_CHAT_COMPLETION_PARAMS as _openai_completion_params, # backwards compatibility
40
+ OPENAI_FINISH_REASONS,
41
+ OPENAI_FINISH_REASONS as _openai_finish_reasons, # backwards compatibility
42
+ openai_compatible_endpoints,
43
+ openai_compatible_providers,
44
+ openai_text_completion_compatible_providers,
45
+ _openai_like_providers,
46
+ replicate_models,
47
+ clarifai_models,
48
+ huggingface_models,
49
+ empower_models,
50
+ together_ai_models,
51
+ baseten_models,
52
+ REPEATED_STREAMING_CHUNK_LIMIT,
53
+ request_timeout,
54
+ open_ai_embedding_models,
55
+ cohere_embedding_models,
56
+ bedrock_embedding_models,
57
+ known_tokenizer_config,
58
+ BEDROCK_INVOKE_PROVIDERS_LITERAL,
59
+ DEFAULT_MAX_TOKENS,
60
+ DEFAULT_SOFT_BUDGET,
61
+ DEFAULT_ALLOWED_FAILS,
62
+ )
63
+ from litellm.types.guardrails import GuardrailItem
64
+ from litellm.proxy._types import (
65
+ KeyManagementSystem,
66
+ KeyManagementSettings,
67
+ LiteLLM_UpperboundKeyGenerateParams,
68
+ )
69
+ from litellm.types.proxy.management_endpoints.ui_sso import DefaultTeamSSOParams
70
+ from litellm.types.utils import StandardKeyGenerationConfig, LlmProviders
71
+ from litellm.integrations.custom_logger import CustomLogger
72
+ from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
73
+ import httpx
74
+ import dotenv
75
+
76
+ litellm_mode = os.getenv("LITELLM_MODE", "DEV") # "PRODUCTION", "DEV"
77
+ if litellm_mode == "DEV":
78
+ dotenv.load_dotenv()
79
+ ################################################
80
+ if set_verbose == True:
81
+ _turn_on_debug()
82
+ ################################################
83
+ ### Callbacks /Logging / Success / Failure Handlers #####
84
+ CALLBACK_TYPES = Union[str, Callable, CustomLogger]
85
+ input_callback: List[CALLBACK_TYPES] = []
86
+ success_callback: List[CALLBACK_TYPES] = []
87
+ failure_callback: List[CALLBACK_TYPES] = []
88
+ service_callback: List[CALLBACK_TYPES] = []
89
+ logging_callback_manager = LoggingCallbackManager()
90
+ _custom_logger_compatible_callbacks_literal = Literal[
91
+ "lago",
92
+ "openmeter",
93
+ "logfire",
94
+ "literalai",
95
+ "dynamic_rate_limiter",
96
+ "langsmith",
97
+ "prometheus",
98
+ "otel",
99
+ "datadog",
100
+ "datadog_llm_observability",
101
+ "galileo",
102
+ "braintrust",
103
+ "arize",
104
+ "arize_phoenix",
105
+ "langtrace",
106
+ "gcs_bucket",
107
+ "azure_storage",
108
+ "opik",
109
+ "argilla",
110
+ "mlflow",
111
+ "langfuse",
112
+ "pagerduty",
113
+ "humanloop",
114
+ "gcs_pubsub",
115
+ "agentops",
116
+ "anthropic_cache_control_hook",
117
+ "bedrock_knowledgebase_hook",
118
+ ]
119
+ logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None
120
+ _known_custom_logger_compatible_callbacks: List = list(
121
+ get_args(_custom_logger_compatible_callbacks_literal)
122
+ )
123
+ callbacks: List[
124
+ Union[Callable, _custom_logger_compatible_callbacks_literal, CustomLogger]
125
+ ] = []
126
+ langfuse_default_tags: Optional[List[str]] = None
127
+ langsmith_batch_size: Optional[int] = None
128
+ prometheus_initialize_budget_metrics: Optional[bool] = False
129
+ require_auth_for_metrics_endpoint: Optional[bool] = False
130
+ argilla_batch_size: Optional[int] = None
131
+ datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload
132
+ gcs_pub_sub_use_v1: Optional[bool] = (
133
+ False # if you want to use v1 gcs pubsub logged payload
134
+ )
135
+ argilla_transformation_object: Optional[Dict[str, Any]] = None
136
+ _async_input_callback: List[Union[str, Callable, CustomLogger]] = (
137
+ []
138
+ ) # internal variable - async custom callbacks are routed here.
139
+ _async_success_callback: List[Union[str, Callable, CustomLogger]] = (
140
+ []
141
+ ) # internal variable - async custom callbacks are routed here.
142
+ _async_failure_callback: List[Union[str, Callable, CustomLogger]] = (
143
+ []
144
+ ) # internal variable - async custom callbacks are routed here.
145
+ pre_call_rules: List[Callable] = []
146
+ post_call_rules: List[Callable] = []
147
+ turn_off_message_logging: Optional[bool] = False
148
+ log_raw_request_response: bool = False
149
+ redact_messages_in_exceptions: Optional[bool] = False
150
+ redact_user_api_key_info: Optional[bool] = False
151
+ filter_invalid_headers: Optional[bool] = False
152
+ add_user_information_to_llm_headers: Optional[bool] = (
153
+ None # adds user_id, team_id, token hash (params from StandardLoggingMetadata) to request headers
154
+ )
155
+ store_audit_logs = False # Enterprise feature, allow users to see audit logs
156
+ ### end of callbacks #############
157
+
158
+ email: Optional[str] = (
159
+ None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
160
+ )
161
+ token: Optional[str] = (
162
+ None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
163
+ )
164
+ telemetry = True
165
+ max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults
166
+ drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
167
+ modify_params = bool(os.getenv("LITELLM_MODIFY_PARAMS", False))
168
+ retry = True
169
+ ### AUTH ###
170
+ api_key: Optional[str] = None
171
+ openai_key: Optional[str] = None
172
+ groq_key: Optional[str] = None
173
+ databricks_key: Optional[str] = None
174
+ openai_like_key: Optional[str] = None
175
+ azure_key: Optional[str] = None
176
+ anthropic_key: Optional[str] = None
177
+ replicate_key: Optional[str] = None
178
+ cohere_key: Optional[str] = None
179
+ infinity_key: Optional[str] = None
180
+ clarifai_key: Optional[str] = None
181
+ maritalk_key: Optional[str] = None
182
+ ai21_key: Optional[str] = None
183
+ ollama_key: Optional[str] = None
184
+ openrouter_key: Optional[str] = None
185
+ predibase_key: Optional[str] = None
186
+ huggingface_key: Optional[str] = None
187
+ vertex_project: Optional[str] = None
188
+ vertex_location: Optional[str] = None
189
+ predibase_tenant_id: Optional[str] = None
190
+ togetherai_api_key: Optional[str] = None
191
+ cloudflare_api_key: Optional[str] = None
192
+ baseten_key: Optional[str] = None
193
+ aleph_alpha_key: Optional[str] = None
194
+ nlp_cloud_key: Optional[str] = None
195
+ snowflake_key: Optional[str] = None
196
+ common_cloud_provider_auth_params: dict = {
197
+ "params": ["project", "region_name", "token"],
198
+ "providers": ["vertex_ai", "bedrock", "watsonx", "azure", "vertex_ai_beta"],
199
+ }
200
+ use_client: bool = False
201
+ ssl_verify: Union[str, bool] = True
202
+ ssl_certificate: Optional[str] = None
203
+ disable_streaming_logging: bool = False
204
+ disable_add_transform_inline_image_block: bool = False
205
+ in_memory_llm_clients_cache: LLMClientCache = LLMClientCache()
206
+ safe_memory_mode: bool = False
207
+ enable_azure_ad_token_refresh: Optional[bool] = False
208
+ ### DEFAULT AZURE API VERSION ###
209
+ AZURE_DEFAULT_API_VERSION = "2025-02-01-preview" # this is updated to the latest
210
+ ### DEFAULT WATSONX API VERSION ###
211
+ WATSONX_DEFAULT_API_VERSION = "2024-03-13"
212
+ ### COHERE EMBEDDINGS DEFAULT TYPE ###
213
+ COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
214
+ ### CREDENTIALS ###
215
+ credential_list: List[CredentialItem] = []
216
+ ### GUARDRAILS ###
217
+ llamaguard_model_name: Optional[str] = None
218
+ openai_moderations_model_name: Optional[str] = None
219
+ presidio_ad_hoc_recognizers: Optional[str] = None
220
+ google_moderation_confidence_threshold: Optional[float] = None
221
+ llamaguard_unsafe_content_categories: Optional[str] = None
222
+ blocked_user_list: Optional[Union[str, List]] = None
223
+ banned_keywords_list: Optional[Union[str, List]] = None
224
+ llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
225
+ guardrail_name_config_map: Dict[str, GuardrailItem] = {}
226
+ ##################
227
+ ### PREVIEW FEATURES ###
228
+ enable_preview_features: bool = False
229
+ return_response_headers: bool = (
230
+ False # get response headers from LLM Api providers - example x-remaining-requests,
231
+ )
232
+ enable_json_schema_validation: bool = False
233
+ ##################
234
+ logging: bool = True
235
+ enable_loadbalancing_on_batch_endpoints: Optional[bool] = None
236
+ enable_caching_on_provider_specific_optional_params: bool = (
237
+ False # feature-flag for caching on optional params - e.g. 'top_k'
238
+ )
239
+ caching: bool = (
240
+ False # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
241
+ )
242
+ caching_with_models: bool = (
243
+ False # # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
244
+ )
245
+ cache: Optional[Cache] = (
246
+ None # cache object <- use this - https://docs.litellm.ai/docs/caching
247
+ )
248
+ default_in_memory_ttl: Optional[float] = None
249
+ default_redis_ttl: Optional[float] = None
250
+ default_redis_batch_cache_expiry: Optional[float] = None
251
+ model_alias_map: Dict[str, str] = {}
252
+ model_group_alias_map: Dict[str, str] = {}
253
+ max_budget: float = 0.0 # set the max budget across all providers
254
+ budget_duration: Optional[str] = (
255
+ None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
256
+ )
257
+ default_soft_budget: float = (
258
+ DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0
259
+ )
260
+ forward_traceparent_to_llm_provider: bool = False
261
+
262
+
263
+ _current_cost = 0.0 # private variable, used if max budget is set
264
+ error_logs: Dict = {}
265
+ add_function_to_prompt: bool = (
266
+ False # if function calling not supported by api, append function call details to system prompt
267
+ )
268
+ client_session: Optional[httpx.Client] = None
269
+ aclient_session: Optional[httpx.AsyncClient] = None
270
+ model_fallbacks: Optional[List] = None # Deprecated for 'litellm.fallbacks'
271
+ model_cost_map_url: str = (
272
+ "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
273
+ )
274
+ suppress_debug_info = False
275
+ dynamodb_table_name: Optional[str] = None
276
+ s3_callback_params: Optional[Dict] = None
277
+ generic_logger_headers: Optional[Dict] = None
278
+ default_key_generate_params: Optional[Dict] = None
279
+ upperbound_key_generate_params: Optional[LiteLLM_UpperboundKeyGenerateParams] = None
280
+ key_generation_settings: Optional[StandardKeyGenerationConfig] = None
281
+ default_internal_user_params: Optional[Dict] = None
282
+ default_team_params: Optional[Union[DefaultTeamSSOParams, Dict]] = None
283
+ default_team_settings: Optional[List] = None
284
+ max_user_budget: Optional[float] = None
285
+ default_max_internal_user_budget: Optional[float] = None
286
+ max_internal_user_budget: Optional[float] = None
287
+ max_ui_session_budget: Optional[float] = 10 # $10 USD budgets for UI Chat sessions
288
+ internal_user_budget_duration: Optional[str] = None
289
+ tag_budget_config: Optional[Dict[str, BudgetConfig]] = None
290
+ max_end_user_budget: Optional[float] = None
291
+ disable_end_user_cost_tracking: Optional[bool] = None
292
+ disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None
293
+ custom_prometheus_metadata_labels: List[str] = []
294
+ #### REQUEST PRIORITIZATION ####
295
+ priority_reservation: Optional[Dict[str, float]] = None
296
+ force_ipv4: bool = (
297
+ False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6.
298
+ )
299
+ module_level_aclient = AsyncHTTPHandler(
300
+ timeout=request_timeout, client_alias="module level aclient"
301
+ )
302
+ module_level_client = HTTPHandler(timeout=request_timeout)
303
+
304
+ #### RETRIES ####
305
+ num_retries: Optional[int] = None # per model endpoint
306
+ max_fallbacks: Optional[int] = None
307
+ default_fallbacks: Optional[List] = None
308
+ fallbacks: Optional[List] = None
309
+ context_window_fallbacks: Optional[List] = None
310
+ content_policy_fallbacks: Optional[List] = None
311
+ allowed_fails: int = 3
312
+ num_retries_per_request: Optional[int] = (
313
+ None # for the request overall (incl. fallbacks + model retries)
314
+ )
315
+ ####### SECRET MANAGERS #####################
316
+ secret_manager_client: Optional[Any] = (
317
+ None # list of instantiated key management clients - e.g. azure kv, infisical, etc.
318
+ )
319
+ _google_kms_resource_name: Optional[str] = None
320
+ _key_management_system: Optional[KeyManagementSystem] = None
321
+ _key_management_settings: KeyManagementSettings = KeyManagementSettings()
322
+ #### PII MASKING ####
323
+ output_parse_pii: bool = False
324
+ #############################################
325
+ from litellm.litellm_core_utils.get_model_cost_map import get_model_cost_map
326
+
327
+ model_cost = get_model_cost_map(url=model_cost_map_url)
328
+ custom_prompt_dict: Dict[str, dict] = {}
329
+ check_provider_endpoint = False
330
+
331
+
332
+ ####### THREAD-SPECIFIC DATA ####################
333
+ class MyLocal(threading.local):
334
+ def __init__(self):
335
+ self.user = "Hello World"
336
+
337
+
338
+ _thread_context = MyLocal()
339
+
340
+
341
+ def identify(event_details):
342
+ # Store user in thread local data
343
+ if "user" in event_details:
344
+ _thread_context.user = event_details["user"]
345
+
346
+
347
+ ####### ADDITIONAL PARAMS ################### configurable params if you use proxy models like Helicone, map spend to org id, etc.
348
+ api_base: Optional[str] = None
349
+ headers = None
350
+ api_version = None
351
+ organization = None
352
+ project = None
353
+ config_path = None
354
+ vertex_ai_safety_settings: Optional[dict] = None
355
+ BEDROCK_CONVERSE_MODELS = [
356
+ "anthropic.claude-3-7-sonnet-20250219-v1:0",
357
+ "anthropic.claude-3-5-haiku-20241022-v1:0",
358
+ "anthropic.claude-3-5-sonnet-20241022-v2:0",
359
+ "anthropic.claude-3-5-sonnet-20240620-v1:0",
360
+ "anthropic.claude-3-opus-20240229-v1:0",
361
+ "anthropic.claude-3-sonnet-20240229-v1:0",
362
+ "anthropic.claude-3-haiku-20240307-v1:0",
363
+ "anthropic.claude-v2",
364
+ "anthropic.claude-v2:1",
365
+ "anthropic.claude-v1",
366
+ "anthropic.claude-instant-v1",
367
+ "ai21.jamba-instruct-v1:0",
368
+ "meta.llama3-70b-instruct-v1:0",
369
+ "meta.llama3-8b-instruct-v1:0",
370
+ "meta.llama3-1-8b-instruct-v1:0",
371
+ "meta.llama3-1-70b-instruct-v1:0",
372
+ "meta.llama3-1-405b-instruct-v1:0",
373
+ "meta.llama3-70b-instruct-v1:0",
374
+ "mistral.mistral-large-2407-v1:0",
375
+ "mistral.mistral-large-2402-v1:0",
376
+ "meta.llama3-2-1b-instruct-v1:0",
377
+ "meta.llama3-2-3b-instruct-v1:0",
378
+ "meta.llama3-2-11b-instruct-v1:0",
379
+ "meta.llama3-2-90b-instruct-v1:0",
380
+ ]
381
+
382
+ ####### COMPLETION MODELS ###################
383
+ open_ai_chat_completion_models: List = []
384
+ open_ai_text_completion_models: List = []
385
+ cohere_models: List = []
386
+ cohere_chat_models: List = []
387
+ mistral_chat_models: List = []
388
+ text_completion_codestral_models: List = []
389
+ anthropic_models: List = []
390
+ openrouter_models: List = []
391
+ vertex_language_models: List = []
392
+ vertex_vision_models: List = []
393
+ vertex_chat_models: List = []
394
+ vertex_code_chat_models: List = []
395
+ vertex_ai_image_models: List = []
396
+ vertex_text_models: List = []
397
+ vertex_code_text_models: List = []
398
+ vertex_embedding_models: List = []
399
+ vertex_anthropic_models: List = []
400
+ vertex_llama3_models: List = []
401
+ vertex_ai_ai21_models: List = []
402
+ vertex_mistral_models: List = []
403
+ ai21_models: List = []
404
+ ai21_chat_models: List = []
405
+ nlp_cloud_models: List = []
406
+ aleph_alpha_models: List = []
407
+ bedrock_models: List = []
408
+ bedrock_converse_models: List = BEDROCK_CONVERSE_MODELS
409
+ fireworks_ai_models: List = []
410
+ fireworks_ai_embedding_models: List = []
411
+ deepinfra_models: List = []
412
+ perplexity_models: List = []
413
+ watsonx_models: List = []
414
+ gemini_models: List = []
415
+ xai_models: List = []
416
+ deepseek_models: List = []
417
+ azure_ai_models: List = []
418
+ jina_ai_models: List = []
419
+ voyage_models: List = []
420
+ infinity_models: List = []
421
+ databricks_models: List = []
422
+ cloudflare_models: List = []
423
+ codestral_models: List = []
424
+ friendliai_models: List = []
425
+ palm_models: List = []
426
+ groq_models: List = []
427
+ azure_models: List = []
428
+ azure_text_models: List = []
429
+ anyscale_models: List = []
430
+ cerebras_models: List = []
431
+ galadriel_models: List = []
432
+ sambanova_models: List = []
433
+ assemblyai_models: List = []
434
+ snowflake_models: List = []
435
+
436
+
437
+ def is_bedrock_pricing_only_model(key: str) -> bool:
438
+ """
439
+ Excludes keys with the pattern 'bedrock/<region>/<model>'. These are in the model_prices_and_context_window.json file for pricing purposes only.
440
+
441
+ Args:
442
+ key (str): A key to filter.
443
+
444
+ Returns:
445
+ bool: True if the key matches the Bedrock pattern, False otherwise.
446
+ """
447
+ # Regex to match 'bedrock/<region>/<model>'
448
+ bedrock_pattern = re.compile(r"^bedrock/[a-zA-Z0-9_-]+/.+$")
449
+
450
+ if "month-commitment" in key:
451
+ return True
452
+
453
+ is_match = bedrock_pattern.match(key)
454
+ return is_match is not None
455
+
456
+
457
+ def is_openai_finetune_model(key: str) -> bool:
458
+ """
459
+ Excludes model cost keys with the pattern 'ft:<model>'. These are in the model_prices_and_context_window.json file for pricing purposes only.
460
+
461
+ Args:
462
+ key (str): A key to filter.
463
+
464
+ Returns:
465
+ bool: True if the key matches the OpenAI finetune pattern, False otherwise.
466
+ """
467
+ return key.startswith("ft:") and not key.count(":") > 1
468
+
469
+
470
+ def add_known_models():
471
+ for key, value in model_cost.items():
472
+ if value.get("litellm_provider") == "openai" and not is_openai_finetune_model(
473
+ key
474
+ ):
475
+ open_ai_chat_completion_models.append(key)
476
+ elif value.get("litellm_provider") == "text-completion-openai":
477
+ open_ai_text_completion_models.append(key)
478
+ elif value.get("litellm_provider") == "azure_text":
479
+ azure_text_models.append(key)
480
+ elif value.get("litellm_provider") == "cohere":
481
+ cohere_models.append(key)
482
+ elif value.get("litellm_provider") == "cohere_chat":
483
+ cohere_chat_models.append(key)
484
+ elif value.get("litellm_provider") == "mistral":
485
+ mistral_chat_models.append(key)
486
+ elif value.get("litellm_provider") == "anthropic":
487
+ anthropic_models.append(key)
488
+ elif value.get("litellm_provider") == "empower":
489
+ empower_models.append(key)
490
+ elif value.get("litellm_provider") == "openrouter":
491
+ openrouter_models.append(key)
492
+ elif value.get("litellm_provider") == "vertex_ai-text-models":
493
+ vertex_text_models.append(key)
494
+ elif value.get("litellm_provider") == "vertex_ai-code-text-models":
495
+ vertex_code_text_models.append(key)
496
+ elif value.get("litellm_provider") == "vertex_ai-language-models":
497
+ vertex_language_models.append(key)
498
+ elif value.get("litellm_provider") == "vertex_ai-vision-models":
499
+ vertex_vision_models.append(key)
500
+ elif value.get("litellm_provider") == "vertex_ai-chat-models":
501
+ vertex_chat_models.append(key)
502
+ elif value.get("litellm_provider") == "vertex_ai-code-chat-models":
503
+ vertex_code_chat_models.append(key)
504
+ elif value.get("litellm_provider") == "vertex_ai-embedding-models":
505
+ vertex_embedding_models.append(key)
506
+ elif value.get("litellm_provider") == "vertex_ai-anthropic_models":
507
+ key = key.replace("vertex_ai/", "")
508
+ vertex_anthropic_models.append(key)
509
+ elif value.get("litellm_provider") == "vertex_ai-llama_models":
510
+ key = key.replace("vertex_ai/", "")
511
+ vertex_llama3_models.append(key)
512
+ elif value.get("litellm_provider") == "vertex_ai-mistral_models":
513
+ key = key.replace("vertex_ai/", "")
514
+ vertex_mistral_models.append(key)
515
+ elif value.get("litellm_provider") == "vertex_ai-ai21_models":
516
+ key = key.replace("vertex_ai/", "")
517
+ vertex_ai_ai21_models.append(key)
518
+ elif value.get("litellm_provider") == "vertex_ai-image-models":
519
+ key = key.replace("vertex_ai/", "")
520
+ vertex_ai_image_models.append(key)
521
+ elif value.get("litellm_provider") == "ai21":
522
+ if value.get("mode") == "chat":
523
+ ai21_chat_models.append(key)
524
+ else:
525
+ ai21_models.append(key)
526
+ elif value.get("litellm_provider") == "nlp_cloud":
527
+ nlp_cloud_models.append(key)
528
+ elif value.get("litellm_provider") == "aleph_alpha":
529
+ aleph_alpha_models.append(key)
530
+ elif value.get(
531
+ "litellm_provider"
532
+ ) == "bedrock" and not is_bedrock_pricing_only_model(key):
533
+ bedrock_models.append(key)
534
+ elif value.get("litellm_provider") == "bedrock_converse":
535
+ bedrock_converse_models.append(key)
536
+ elif value.get("litellm_provider") == "deepinfra":
537
+ deepinfra_models.append(key)
538
+ elif value.get("litellm_provider") == "perplexity":
539
+ perplexity_models.append(key)
540
+ elif value.get("litellm_provider") == "watsonx":
541
+ watsonx_models.append(key)
542
+ elif value.get("litellm_provider") == "gemini":
543
+ gemini_models.append(key)
544
+ elif value.get("litellm_provider") == "fireworks_ai":
545
+ # ignore the 'up-to', '-to-' model names -> not real models. just for cost tracking based on model params.
546
+ if "-to-" not in key and "fireworks-ai-default" not in key:
547
+ fireworks_ai_models.append(key)
548
+ elif value.get("litellm_provider") == "fireworks_ai-embedding-models":
549
+ # ignore the 'up-to', '-to-' model names -> not real models. just for cost tracking based on model params.
550
+ if "-to-" not in key:
551
+ fireworks_ai_embedding_models.append(key)
552
+ elif value.get("litellm_provider") == "text-completion-codestral":
553
+ text_completion_codestral_models.append(key)
554
+ elif value.get("litellm_provider") == "xai":
555
+ xai_models.append(key)
556
+ elif value.get("litellm_provider") == "deepseek":
557
+ deepseek_models.append(key)
558
+ elif value.get("litellm_provider") == "azure_ai":
559
+ azure_ai_models.append(key)
560
+ elif value.get("litellm_provider") == "voyage":
561
+ voyage_models.append(key)
562
+ elif value.get("litellm_provider") == "infinity":
563
+ infinity_models.append(key)
564
+ elif value.get("litellm_provider") == "databricks":
565
+ databricks_models.append(key)
566
+ elif value.get("litellm_provider") == "cloudflare":
567
+ cloudflare_models.append(key)
568
+ elif value.get("litellm_provider") == "codestral":
569
+ codestral_models.append(key)
570
+ elif value.get("litellm_provider") == "friendliai":
571
+ friendliai_models.append(key)
572
+ elif value.get("litellm_provider") == "palm":
573
+ palm_models.append(key)
574
+ elif value.get("litellm_provider") == "groq":
575
+ groq_models.append(key)
576
+ elif value.get("litellm_provider") == "azure":
577
+ azure_models.append(key)
578
+ elif value.get("litellm_provider") == "anyscale":
579
+ anyscale_models.append(key)
580
+ elif value.get("litellm_provider") == "cerebras":
581
+ cerebras_models.append(key)
582
+ elif value.get("litellm_provider") == "galadriel":
583
+ galadriel_models.append(key)
584
+ elif value.get("litellm_provider") == "sambanova_models":
585
+ sambanova_models.append(key)
586
+ elif value.get("litellm_provider") == "assemblyai":
587
+ assemblyai_models.append(key)
588
+ elif value.get("litellm_provider") == "jina_ai":
589
+ jina_ai_models.append(key)
590
+ elif value.get("litellm_provider") == "snowflake":
591
+ snowflake_models.append(key)
592
+
593
+
594
+ add_known_models()
595
+ # known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
596
+
597
+ # this is maintained for Exception Mapping
598
+
599
+
600
+ # used for Cost Tracking & Token counting
601
+ # https://azure.microsoft.com/en-in/pricing/details/cognitive-services/openai-service/
602
+ # Azure returns gpt-35-turbo in their responses, we need to map this to azure/gpt-3.5-turbo for token counting
603
+ azure_llms = {
604
+ "gpt-35-turbo": "azure/gpt-35-turbo",
605
+ "gpt-35-turbo-16k": "azure/gpt-35-turbo-16k",
606
+ "gpt-35-turbo-instruct": "azure/gpt-35-turbo-instruct",
607
+ }
608
+
609
+ azure_embedding_models = {
610
+ "ada": "azure/ada",
611
+ }
612
+
613
+ petals_models = [
614
+ "petals-team/StableBeluga2",
615
+ ]
616
+
617
+ ollama_models = ["llama2"]
618
+
619
+ maritalk_models = ["maritalk"]
620
+
621
+
622
+ model_list = (
623
+ open_ai_chat_completion_models
624
+ + open_ai_text_completion_models
625
+ + cohere_models
626
+ + cohere_chat_models
627
+ + anthropic_models
628
+ + replicate_models
629
+ + openrouter_models
630
+ + huggingface_models
631
+ + vertex_chat_models
632
+ + vertex_text_models
633
+ + ai21_models
634
+ + ai21_chat_models
635
+ + together_ai_models
636
+ + baseten_models
637
+ + aleph_alpha_models
638
+ + nlp_cloud_models
639
+ + ollama_models
640
+ + bedrock_models
641
+ + deepinfra_models
642
+ + perplexity_models
643
+ + maritalk_models
644
+ + vertex_language_models
645
+ + watsonx_models
646
+ + gemini_models
647
+ + text_completion_codestral_models
648
+ + xai_models
649
+ + deepseek_models
650
+ + azure_ai_models
651
+ + voyage_models
652
+ + infinity_models
653
+ + databricks_models
654
+ + cloudflare_models
655
+ + codestral_models
656
+ + friendliai_models
657
+ + palm_models
658
+ + groq_models
659
+ + azure_models
660
+ + anyscale_models
661
+ + cerebras_models
662
+ + galadriel_models
663
+ + sambanova_models
664
+ + azure_text_models
665
+ + assemblyai_models
666
+ + jina_ai_models
667
+ + snowflake_models
668
+ )
669
+
670
+ model_list_set = set(model_list)
671
+
672
+ provider_list: List[Union[LlmProviders, str]] = list(LlmProviders)
673
+
674
+
675
+ models_by_provider: dict = {
676
+ "openai": open_ai_chat_completion_models + open_ai_text_completion_models,
677
+ "text-completion-openai": open_ai_text_completion_models,
678
+ "cohere": cohere_models + cohere_chat_models,
679
+ "cohere_chat": cohere_chat_models,
680
+ "anthropic": anthropic_models,
681
+ "replicate": replicate_models,
682
+ "huggingface": huggingface_models,
683
+ "together_ai": together_ai_models,
684
+ "baseten": baseten_models,
685
+ "openrouter": openrouter_models,
686
+ "vertex_ai": vertex_chat_models
687
+ + vertex_text_models
688
+ + vertex_anthropic_models
689
+ + vertex_vision_models
690
+ + vertex_language_models,
691
+ "ai21": ai21_models,
692
+ "bedrock": bedrock_models + bedrock_converse_models,
693
+ "petals": petals_models,
694
+ "ollama": ollama_models,
695
+ "deepinfra": deepinfra_models,
696
+ "perplexity": perplexity_models,
697
+ "maritalk": maritalk_models,
698
+ "watsonx": watsonx_models,
699
+ "gemini": gemini_models,
700
+ "fireworks_ai": fireworks_ai_models + fireworks_ai_embedding_models,
701
+ "aleph_alpha": aleph_alpha_models,
702
+ "text-completion-codestral": text_completion_codestral_models,
703
+ "xai": xai_models,
704
+ "deepseek": deepseek_models,
705
+ "mistral": mistral_chat_models,
706
+ "azure_ai": azure_ai_models,
707
+ "voyage": voyage_models,
708
+ "infinity": infinity_models,
709
+ "databricks": databricks_models,
710
+ "cloudflare": cloudflare_models,
711
+ "codestral": codestral_models,
712
+ "nlp_cloud": nlp_cloud_models,
713
+ "friendliai": friendliai_models,
714
+ "palm": palm_models,
715
+ "groq": groq_models,
716
+ "azure": azure_models + azure_text_models,
717
+ "azure_text": azure_text_models,
718
+ "anyscale": anyscale_models,
719
+ "cerebras": cerebras_models,
720
+ "galadriel": galadriel_models,
721
+ "sambanova": sambanova_models,
722
+ "assemblyai": assemblyai_models,
723
+ "jina_ai": jina_ai_models,
724
+ "snowflake": snowflake_models,
725
+ }
726
+
727
+ # mapping for those models which have larger equivalents
728
+ longer_context_model_fallback_dict: dict = {
729
+ # openai chat completion models
730
+ "gpt-3.5-turbo": "gpt-3.5-turbo-16k",
731
+ "gpt-3.5-turbo-0301": "gpt-3.5-turbo-16k-0301",
732
+ "gpt-3.5-turbo-0613": "gpt-3.5-turbo-16k-0613",
733
+ "gpt-4": "gpt-4-32k",
734
+ "gpt-4-0314": "gpt-4-32k-0314",
735
+ "gpt-4-0613": "gpt-4-32k-0613",
736
+ # anthropic
737
+ "claude-instant-1": "claude-2",
738
+ "claude-instant-1.2": "claude-2",
739
+ # vertexai
740
+ "chat-bison": "chat-bison-32k",
741
+ "chat-bison@001": "chat-bison-32k",
742
+ "codechat-bison": "codechat-bison-32k",
743
+ "codechat-bison@001": "codechat-bison-32k",
744
+ # openrouter
745
+ "openrouter/openai/gpt-3.5-turbo": "openrouter/openai/gpt-3.5-turbo-16k",
746
+ "openrouter/anthropic/claude-instant-v1": "openrouter/anthropic/claude-2",
747
+ }
748
+
749
+ ####### EMBEDDING MODELS ###################
750
+
751
+ all_embedding_models = (
752
+ open_ai_embedding_models
753
+ + cohere_embedding_models
754
+ + bedrock_embedding_models
755
+ + vertex_embedding_models
756
+ + fireworks_ai_embedding_models
757
+ )
758
+
759
+ ####### IMAGE GENERATION MODELS ###################
760
+ openai_image_generation_models = ["dall-e-2", "dall-e-3"]
761
+
762
+ from .timeout import timeout
763
+ from .cost_calculator import completion_cost
764
+ from litellm.litellm_core_utils.litellm_logging import Logging, modify_integration
765
+ from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
766
+ from litellm.litellm_core_utils.core_helpers import remove_index_from_tool_calls
767
+ from litellm.litellm_core_utils.token_counter import get_modified_max_tokens
768
+ from .utils import (
769
+ client,
770
+ exception_type,
771
+ get_optional_params,
772
+ get_response_string,
773
+ token_counter,
774
+ create_pretrained_tokenizer,
775
+ create_tokenizer,
776
+ supports_function_calling,
777
+ supports_web_search,
778
+ supports_response_schema,
779
+ supports_parallel_function_calling,
780
+ supports_vision,
781
+ supports_audio_input,
782
+ supports_audio_output,
783
+ supports_system_messages,
784
+ supports_reasoning,
785
+ get_litellm_params,
786
+ acreate,
787
+ get_max_tokens,
788
+ get_model_info,
789
+ register_prompt_template,
790
+ validate_environment,
791
+ check_valid_key,
792
+ register_model,
793
+ encode,
794
+ decode,
795
+ _calculate_retry_after,
796
+ _should_retry,
797
+ get_supported_openai_params,
798
+ get_api_base,
799
+ get_first_chars_messages,
800
+ ModelResponse,
801
+ ModelResponseStream,
802
+ EmbeddingResponse,
803
+ ImageResponse,
804
+ TranscriptionResponse,
805
+ TextCompletionResponse,
806
+ get_provider_fields,
807
+ ModelResponseListIterator,
808
+ )
809
+
810
+ ALL_LITELLM_RESPONSE_TYPES = [
811
+ ModelResponse,
812
+ EmbeddingResponse,
813
+ ImageResponse,
814
+ TranscriptionResponse,
815
+ TextCompletionResponse,
816
+ ]
817
+
818
+ from .llms.custom_llm import CustomLLM
819
+ from .llms.bedrock.chat.converse_transformation import AmazonConverseConfig
820
+ from .llms.openai_like.chat.handler import OpenAILikeChatConfig
821
+ from .llms.aiohttp_openai.chat.transformation import AiohttpOpenAIChatConfig
822
+ from .llms.galadriel.chat.transformation import GaladrielChatConfig
823
+ from .llms.github.chat.transformation import GithubChatConfig
824
+ from .llms.empower.chat.transformation import EmpowerChatConfig
825
+ from .llms.huggingface.chat.transformation import HuggingFaceChatConfig
826
+ from .llms.huggingface.embedding.transformation import HuggingFaceEmbeddingConfig
827
+ from .llms.oobabooga.chat.transformation import OobaboogaConfig
828
+ from .llms.maritalk import MaritalkConfig
829
+ from .llms.openrouter.chat.transformation import OpenrouterConfig
830
+ from .llms.anthropic.chat.transformation import AnthropicConfig
831
+ from .llms.anthropic.common_utils import AnthropicModelInfo
832
+ from .llms.groq.stt.transformation import GroqSTTConfig
833
+ from .llms.anthropic.completion.transformation import AnthropicTextConfig
834
+ from .llms.triton.completion.transformation import TritonConfig
835
+ from .llms.triton.completion.transformation import TritonGenerateConfig
836
+ from .llms.triton.completion.transformation import TritonInferConfig
837
+ from .llms.triton.embedding.transformation import TritonEmbeddingConfig
838
+ from .llms.databricks.chat.transformation import DatabricksConfig
839
+ from .llms.databricks.embed.transformation import DatabricksEmbeddingConfig
840
+ from .llms.predibase.chat.transformation import PredibaseConfig
841
+ from .llms.replicate.chat.transformation import ReplicateConfig
842
+ from .llms.cohere.completion.transformation import CohereTextConfig as CohereConfig
843
+ from .llms.snowflake.chat.transformation import SnowflakeConfig
844
+ from .llms.cohere.rerank.transformation import CohereRerankConfig
845
+ from .llms.cohere.rerank_v2.transformation import CohereRerankV2Config
846
+ from .llms.azure_ai.rerank.transformation import AzureAIRerankConfig
847
+ from .llms.infinity.rerank.transformation import InfinityRerankConfig
848
+ from .llms.jina_ai.rerank.transformation import JinaAIRerankConfig
849
+ from .llms.clarifai.chat.transformation import ClarifaiConfig
850
+ from .llms.ai21.chat.transformation import AI21ChatConfig, AI21ChatConfig as AI21Config
851
+ from .llms.anthropic.experimental_pass_through.messages.transformation import (
852
+ AnthropicMessagesConfig,
853
+ )
854
+ from .llms.together_ai.chat import TogetherAIConfig
855
+ from .llms.together_ai.completion.transformation import TogetherAITextCompletionConfig
856
+ from .llms.cloudflare.chat.transformation import CloudflareChatConfig
857
+ from .llms.deprecated_providers.palm import (
858
+ PalmConfig,
859
+ ) # here to prevent breaking changes
860
+ from .llms.nlp_cloud.chat.handler import NLPCloudConfig
861
+ from .llms.petals.completion.transformation import PetalsConfig
862
+ from .llms.deprecated_providers.aleph_alpha import AlephAlphaConfig
863
+ from .llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
864
+ VertexGeminiConfig,
865
+ VertexGeminiConfig as VertexAIConfig,
866
+ )
867
+ from .llms.gemini.common_utils import GeminiModelInfo
868
+ from .llms.gemini.chat.transformation import (
869
+ GoogleAIStudioGeminiConfig,
870
+ GoogleAIStudioGeminiConfig as GeminiConfig, # aliased to maintain backwards compatibility
871
+ )
872
+
873
+
874
+ from .llms.vertex_ai.vertex_embeddings.transformation import (
875
+ VertexAITextEmbeddingConfig,
876
+ )
877
+
878
+ vertexAITextEmbeddingConfig = VertexAITextEmbeddingConfig()
879
+
880
+ from .llms.vertex_ai.vertex_ai_partner_models.anthropic.transformation import (
881
+ VertexAIAnthropicConfig,
882
+ )
883
+ from .llms.vertex_ai.vertex_ai_partner_models.llama3.transformation import (
884
+ VertexAILlama3Config,
885
+ )
886
+ from .llms.vertex_ai.vertex_ai_partner_models.ai21.transformation import (
887
+ VertexAIAi21Config,
888
+ )
889
+
890
+ from .llms.ollama.completion.transformation import OllamaConfig
891
+ from .llms.sagemaker.completion.transformation import SagemakerConfig
892
+ from .llms.sagemaker.chat.transformation import SagemakerChatConfig
893
+ from .llms.ollama_chat import OllamaChatConfig
894
+ from .llms.bedrock.chat.invoke_handler import (
895
+ AmazonCohereChatConfig,
896
+ bedrock_tool_name_mappings,
897
+ )
898
+
899
+ from .llms.bedrock.common_utils import (
900
+ AmazonBedrockGlobalConfig,
901
+ )
902
+ from .llms.bedrock.chat.invoke_transformations.amazon_ai21_transformation import (
903
+ AmazonAI21Config,
904
+ )
905
+ from .llms.bedrock.chat.invoke_transformations.amazon_nova_transformation import (
906
+ AmazonInvokeNovaConfig,
907
+ )
908
+ from .llms.bedrock.chat.invoke_transformations.anthropic_claude2_transformation import (
909
+ AmazonAnthropicConfig,
910
+ )
911
+ from .llms.bedrock.chat.invoke_transformations.anthropic_claude3_transformation import (
912
+ AmazonAnthropicClaude3Config,
913
+ )
914
+ from .llms.bedrock.chat.invoke_transformations.amazon_cohere_transformation import (
915
+ AmazonCohereConfig,
916
+ )
917
+ from .llms.bedrock.chat.invoke_transformations.amazon_llama_transformation import (
918
+ AmazonLlamaConfig,
919
+ )
920
+ from .llms.bedrock.chat.invoke_transformations.amazon_deepseek_transformation import (
921
+ AmazonDeepSeekR1Config,
922
+ )
923
+ from .llms.bedrock.chat.invoke_transformations.amazon_mistral_transformation import (
924
+ AmazonMistralConfig,
925
+ )
926
+ from .llms.bedrock.chat.invoke_transformations.amazon_titan_transformation import (
927
+ AmazonTitanConfig,
928
+ )
929
+ from .llms.bedrock.chat.invoke_transformations.base_invoke_transformation import (
930
+ AmazonInvokeConfig,
931
+ )
932
+
933
+ from .llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig
934
+ from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config
935
+ from .llms.bedrock.image.amazon_nova_canvas_transformation import AmazonNovaCanvasConfig
936
+ from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config
937
+ from .llms.bedrock.embed.amazon_titan_multimodal_transformation import (
938
+ AmazonTitanMultimodalEmbeddingG1Config,
939
+ )
940
+ from .llms.bedrock.embed.amazon_titan_v2_transformation import (
941
+ AmazonTitanV2Config,
942
+ )
943
+ from .llms.cohere.chat.transformation import CohereChatConfig
944
+ from .llms.bedrock.embed.cohere_transformation import BedrockCohereEmbeddingConfig
945
+ from .llms.openai.openai import OpenAIConfig, MistralEmbeddingConfig
946
+ from .llms.openai.image_variations.transformation import OpenAIImageVariationConfig
947
+ from .llms.deepinfra.chat.transformation import DeepInfraConfig
948
+ from .llms.deepgram.audio_transcription.transformation import (
949
+ DeepgramAudioTranscriptionConfig,
950
+ )
951
+ from .llms.topaz.common_utils import TopazModelInfo
952
+ from .llms.topaz.image_variations.transformation import TopazImageVariationConfig
953
+ from litellm.llms.openai.completion.transformation import OpenAITextCompletionConfig
954
+ from .llms.groq.chat.transformation import GroqChatConfig
955
+ from .llms.voyage.embedding.transformation import VoyageEmbeddingConfig
956
+ from .llms.infinity.embedding.transformation import InfinityEmbeddingConfig
957
+ from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
958
+ from .llms.mistral.mistral_chat_transformation import MistralConfig
959
+ from .llms.openai.responses.transformation import OpenAIResponsesAPIConfig
960
+ from .llms.azure.responses.transformation import AzureOpenAIResponsesAPIConfig
961
+ from .llms.openai.chat.o_series_transformation import (
962
+ OpenAIOSeriesConfig as OpenAIO1Config, # maintain backwards compatibility
963
+ OpenAIOSeriesConfig,
964
+ )
965
+
966
+ from .llms.snowflake.chat.transformation import SnowflakeConfig
967
+
968
+ openaiOSeriesConfig = OpenAIOSeriesConfig()
969
+ from .llms.openai.chat.gpt_transformation import (
970
+ OpenAIGPTConfig,
971
+ )
972
+ from .llms.openai.transcriptions.whisper_transformation import (
973
+ OpenAIWhisperAudioTranscriptionConfig,
974
+ )
975
+ from .llms.openai.transcriptions.gpt_transformation import (
976
+ OpenAIGPTAudioTranscriptionConfig,
977
+ )
978
+
979
+ openAIGPTConfig = OpenAIGPTConfig()
980
+ from .llms.openai.chat.gpt_audio_transformation import (
981
+ OpenAIGPTAudioConfig,
982
+ )
983
+
984
+ openAIGPTAudioConfig = OpenAIGPTAudioConfig()
985
+
986
+ from .llms.nvidia_nim.chat import NvidiaNimConfig
987
+ from .llms.nvidia_nim.embed import NvidiaNimEmbeddingConfig
988
+
989
+ nvidiaNimConfig = NvidiaNimConfig()
990
+ nvidiaNimEmbeddingConfig = NvidiaNimEmbeddingConfig()
991
+
992
+ from .llms.cerebras.chat import CerebrasConfig
993
+ from .llms.sambanova.chat import SambanovaConfig
994
+ from .llms.ai21.chat.transformation import AI21ChatConfig
995
+ from .llms.fireworks_ai.chat.transformation import FireworksAIConfig
996
+ from .llms.fireworks_ai.completion.transformation import FireworksAITextCompletionConfig
997
+ from .llms.fireworks_ai.audio_transcription.transformation import (
998
+ FireworksAIAudioTranscriptionConfig,
999
+ )
1000
+ from .llms.fireworks_ai.embed.fireworks_ai_transformation import (
1001
+ FireworksAIEmbeddingConfig,
1002
+ )
1003
+ from .llms.friendliai.chat.transformation import FriendliaiChatConfig
1004
+ from .llms.jina_ai.embedding.transformation import JinaAIEmbeddingConfig
1005
+ from .llms.xai.chat.transformation import XAIChatConfig
1006
+ from .llms.xai.common_utils import XAIModelInfo
1007
+ from .llms.volcengine import VolcEngineConfig
1008
+ from .llms.codestral.completion.transformation import CodestralTextCompletionConfig
1009
+ from .llms.azure.azure import (
1010
+ AzureOpenAIError,
1011
+ AzureOpenAIAssistantsAPIConfig,
1012
+ )
1013
+
1014
+ from .llms.azure.chat.gpt_transformation import AzureOpenAIConfig
1015
+ from .llms.azure.completion.transformation import AzureOpenAITextConfig
1016
+ from .llms.hosted_vllm.chat.transformation import HostedVLLMChatConfig
1017
+ from .llms.llamafile.chat.transformation import LlamafileChatConfig
1018
+ from .llms.litellm_proxy.chat.transformation import LiteLLMProxyChatConfig
1019
+ from .llms.vllm.completion.transformation import VLLMConfig
1020
+ from .llms.deepseek.chat.transformation import DeepSeekChatConfig
1021
+ from .llms.lm_studio.chat.transformation import LMStudioChatConfig
1022
+ from .llms.lm_studio.embed.transformation import LmStudioEmbeddingConfig
1023
+ from .llms.perplexity.chat.transformation import PerplexityChatConfig
1024
+ from .llms.azure.chat.o_series_transformation import AzureOpenAIO1Config
1025
+ from .llms.watsonx.completion.transformation import IBMWatsonXAIConfig
1026
+ from .llms.watsonx.chat.transformation import IBMWatsonXChatConfig
1027
+ from .llms.watsonx.embed.transformation import IBMWatsonXEmbeddingConfig
1028
+ from .main import * # type: ignore
1029
+ from .integrations import *
1030
+ from .exceptions import (
1031
+ AuthenticationError,
1032
+ InvalidRequestError,
1033
+ BadRequestError,
1034
+ NotFoundError,
1035
+ RateLimitError,
1036
+ ServiceUnavailableError,
1037
+ OpenAIError,
1038
+ ContextWindowExceededError,
1039
+ ContentPolicyViolationError,
1040
+ BudgetExceededError,
1041
+ APIError,
1042
+ Timeout,
1043
+ APIConnectionError,
1044
+ UnsupportedParamsError,
1045
+ APIResponseValidationError,
1046
+ UnprocessableEntityError,
1047
+ InternalServerError,
1048
+ JSONSchemaValidationError,
1049
+ LITELLM_EXCEPTION_TYPES,
1050
+ MockException,
1051
+ )
1052
+ from .budget_manager import BudgetManager
1053
+ from .proxy.proxy_cli import run_server
1054
+ from .router import Router
1055
+ from .assistants.main import *
1056
+ from .batches.main import *
1057
+ from .batch_completion.main import * # type: ignore
1058
+ from .rerank_api.main import *
1059
+ from .llms.anthropic.experimental_pass_through.messages.handler import *
1060
+ from .responses.main import *
1061
+ from .realtime_api.main import _arealtime
1062
+ from .fine_tuning.main import *
1063
+ from .files.main import *
1064
+ from .scheduler import *
1065
+ from .cost_calculator import response_cost_calculator, cost_per_token
1066
+
1067
+ ### ADAPTERS ###
1068
+ from .types.adapter import AdapterItem
1069
+ import litellm.anthropic_interface as anthropic
1070
+
1071
+ adapters: List[AdapterItem] = []
1072
+
1073
+ ### CUSTOM LLMs ###
1074
+ from .types.llms.custom_llm import CustomLLMItem
1075
+ from .types.utils import GenericStreamingChunk
1076
+
1077
+ custom_provider_map: List[CustomLLMItem] = []
1078
+ _custom_providers: List[str] = (
1079
+ []
1080
+ ) # internal helper util, used to track names of custom providers
1081
+ disable_hf_tokenizer_download: Optional[bool] = (
1082
+ None # disable huggingface tokenizer download. Defaults to openai clk100
1083
+ )
1084
+ global_disable_no_log_param: bool = False
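
For context on how these module-level settings are typically consumed, here is a minimal usage sketch (not part of this commit). It assumes the package is importable as litellm and that completion() is re-exported by the `from .main import *` line above; the API key and model name are placeholders.

import litellm
from litellm import completion

# Module-level configuration read by later calls (see the variables defined above)
litellm.api_key = "sk-..."                 # placeholder provider API key
litellm.drop_params = True                 # drop provider-unsupported params instead of erroring
litellm.success_callback = ["langfuse"]    # one of the callback literals listed above

# A single chat completion routed through the configured provider
response = completion(
    model="gpt-3.5-turbo",                 # illustrative model name
    messages=[{"role": "user", "content": "Hello"}],
)
print(response)
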
litellm/_logging.py ADDED
@@ -0,0 +1,167 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import sys
5
+ from datetime import datetime
6
+ from logging import Formatter
7
+
8
+ set_verbose = False
9
+
10
+ if set_verbose is True:
11
+ logging.warning(
12
+ "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."
13
+ )
14
+ json_logs = bool(os.getenv("JSON_LOGS", False))
15
+ # Create a handler for the logger (you may need to adapt this based on your needs)
16
+ log_level = os.getenv("LITELLM_LOG", "DEBUG")
17
+ numeric_level: str = getattr(logging, log_level.upper())
18
+ handler = logging.StreamHandler()
19
+ handler.setLevel(numeric_level)
20
+
21
+
22
+ class JsonFormatter(Formatter):
23
+ def __init__(self):
24
+ super(JsonFormatter, self).__init__()
25
+
26
+ def formatTime(self, record, datefmt=None):
27
+ # Use datetime to format the timestamp in ISO 8601 format
28
+ dt = datetime.fromtimestamp(record.created)
29
+ return dt.isoformat()
30
+
31
+ def format(self, record):
32
+ json_record = {
33
+ "message": record.getMessage(),
34
+ "level": record.levelname,
35
+ "timestamp": self.formatTime(record),
36
+ }
37
+
38
+ if record.exc_info:
39
+ json_record["stacktrace"] = self.formatException(record.exc_info)
40
+
41
+ return json.dumps(json_record)
42
+
43
+
44
+ # Function to set up exception handlers for JSON logging
45
+ def _setup_json_exception_handlers(formatter):
46
+ # Create a handler with JSON formatting for exceptions
47
+ error_handler = logging.StreamHandler()
48
+ error_handler.setFormatter(formatter)
49
+
50
+ # Setup excepthook for uncaught exceptions
51
+ def json_excepthook(exc_type, exc_value, exc_traceback):
52
+ record = logging.LogRecord(
53
+ name="LiteLLM",
54
+ level=logging.ERROR,
55
+ pathname="",
56
+ lineno=0,
57
+ msg=str(exc_value),
58
+ args=(),
59
+ exc_info=(exc_type, exc_value, exc_traceback),
60
+ )
61
+ error_handler.handle(record)
62
+
63
+ sys.excepthook = json_excepthook
64
+
65
+ # Configure asyncio exception handler if possible
66
+ try:
67
+ import asyncio
68
+
69
+ def async_json_exception_handler(loop, context):
70
+ exception = context.get("exception")
71
+ if exception:
72
+ record = logging.LogRecord(
73
+ name="LiteLLM",
74
+ level=logging.ERROR,
75
+ pathname="",
76
+ lineno=0,
77
+ msg=str(exception),
78
+ args=(),
79
+ exc_info=None,
80
+ )
81
+ error_handler.handle(record)
82
+ else:
83
+ loop.default_exception_handler(context)
84
+
85
+ asyncio.get_event_loop().set_exception_handler(async_json_exception_handler)
86
+ except Exception:
87
+ pass
88
+
89
+
90
+ # Create a formatter and set it for the handler
91
+ if json_logs:
92
+ handler.setFormatter(JsonFormatter())
93
+ _setup_json_exception_handlers(JsonFormatter())
94
+ else:
95
+ formatter = logging.Formatter(
96
+ "\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s",
97
+ datefmt="%H:%M:%S",
98
+ )
99
+
100
+ handler.setFormatter(formatter)
101
+
102
+ verbose_proxy_logger = logging.getLogger("LiteLLM Proxy")
103
+ verbose_router_logger = logging.getLogger("LiteLLM Router")
104
+ verbose_logger = logging.getLogger("LiteLLM")
105
+
106
+ # Add the handler to the logger
107
+ verbose_router_logger.addHandler(handler)
108
+ verbose_proxy_logger.addHandler(handler)
109
+ verbose_logger.addHandler(handler)
110
+
111
+
112
+ def _turn_on_json():
113
+ handler = logging.StreamHandler()
114
+ handler.setFormatter(JsonFormatter())
115
+
116
+ # Define all loggers to update, including root logger
117
+ loggers = [logging.getLogger()] + [
118
+ verbose_router_logger,
119
+ verbose_proxy_logger,
120
+ verbose_logger,
121
+ ]
122
+
123
+ # Iterate through each logger and update its handlers
124
+ for logger in loggers:
125
+ # Remove all existing handlers
126
+ for h in logger.handlers[:]:
127
+ logger.removeHandler(h)
128
+ # Add the new handler
129
+ logger.addHandler(handler)
130
+
131
+ # Set up exception handlers
132
+ _setup_json_exception_handlers(JsonFormatter())
133
+
134
+
135
+ def _turn_on_debug():
136
+ verbose_logger.setLevel(level=logging.DEBUG) # set package log to debug
137
+ verbose_router_logger.setLevel(level=logging.DEBUG) # set router logs to debug
138
+ verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
139
+
140
+
141
+ def _disable_debugging():
142
+ verbose_logger.disabled = True
143
+ verbose_router_logger.disabled = True
144
+ verbose_proxy_logger.disabled = True
145
+
146
+
147
+ def _enable_debugging():
148
+ verbose_logger.disabled = False
149
+ verbose_router_logger.disabled = False
150
+ verbose_proxy_logger.disabled = False
151
+
152
+
153
+ def print_verbose(print_statement):
154
+ try:
155
+ if set_verbose:
156
+ print(print_statement) # noqa
157
+ except Exception:
158
+ pass
159
+
160
+
161
+ def _is_debugging_on() -> bool:
162
+ """
163
+ Returns True if debugging is on
164
+ """
165
+ if verbose_logger.isEnabledFor(logging.DEBUG) or set_verbose is True:
166
+ return True
167
+ return False
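The logging helpers above are self-contained, so a quick sketch helps show how they fit together. This is an illustrative example only (not part of the commit), and calling these private helpers directly is an assumption made for demonstration; in practice they are driven by litellm's own configuration flags.

```python
# Illustrative sketch only: exercising the helpers defined in litellm/_logging.py above.
# Directly calling these private helpers is an assumption for demonstration purposes.
from litellm._logging import _turn_on_debug, _turn_on_json, verbose_logger

_turn_on_debug()   # raise the package, router, and proxy loggers to DEBUG
_turn_on_json()    # replace every handler with a JSON-formatting StreamHandler

verbose_logger.debug("hello from litellm")  # emitted as a single JSON log record
```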
litellm/_redis.py ADDED
@@ -0,0 +1,333 @@
1
+ # +-----------------------------------------------+
2
+ # | |
3
+ # | Give Feedback / Get Help |
4
+ # | https://github.com/BerriAI/litellm/issues/new |
5
+ # | |
6
+ # +-----------------------------------------------+
7
+ #
8
+ # Thank you users! We ❤️ you! - Krrish & Ishaan
9
+
10
+ import inspect
11
+ import json
12
+
13
+ # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
14
+ import os
15
+ from typing import List, Optional, Union
16
+
17
+ import redis # type: ignore
18
+ import redis.asyncio as async_redis # type: ignore
19
+
20
+ from litellm import get_secret, get_secret_str
21
+ from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
22
+
23
+ from ._logging import verbose_logger
24
+
25
+
26
+ def _get_redis_kwargs():
27
+ arg_spec = inspect.getfullargspec(redis.Redis)
28
+
29
+ # Only allow primitive arguments
30
+ exclude_args = {
31
+ "self",
32
+ "connection_pool",
33
+ "retry",
34
+ }
35
+
36
+ include_args = ["url"]
37
+
38
+ available_args = [x for x in arg_spec.args if x not in exclude_args] + include_args
39
+
40
+ return available_args
41
+
42
+
43
+ def _get_redis_url_kwargs(client=None):
44
+ if client is None:
45
+ client = redis.Redis.from_url
46
+ arg_spec = inspect.getfullargspec(redis.Redis.from_url)
47
+
48
+ # Only allow primitive arguments
49
+ exclude_args = {
50
+ "self",
51
+ "connection_pool",
52
+ "retry",
53
+ }
54
+
55
+ include_args = ["url"]
56
+
57
+ available_args = [x for x in arg_spec.args if x not in exclude_args] + include_args
58
+
59
+ return available_args
60
+
61
+
62
+ def _get_redis_cluster_kwargs(client=None):
63
+ if client is None:
64
+ client = redis.Redis.from_url
65
+ arg_spec = inspect.getfullargspec(redis.RedisCluster)
66
+
67
+ # Only allow primitive arguments
68
+ exclude_args = {"self", "connection_pool", "retry", "host", "port", "startup_nodes"}
69
+
70
+ available_args = [x for x in arg_spec.args if x not in exclude_args]
71
+ available_args.append("password")
72
+ available_args.append("username")
73
+ available_args.append("ssl")
74
+
75
+ return available_args
76
+
77
+
78
+ def _get_redis_env_kwarg_mapping():
79
+ PREFIX = "REDIS_"
80
+
81
+ return {f"{PREFIX}{x.upper()}": x for x in _get_redis_kwargs()}
82
+
83
+
84
+ def _redis_kwargs_from_environment():
85
+ mapping = _get_redis_env_kwarg_mapping()
86
+
87
+ return_dict = {}
88
+ for k, v in mapping.items():
89
+ value = get_secret(k, default_value=None) # type: ignore
90
+ if value is not None:
91
+ return_dict[v] = value
92
+ return return_dict
93
+
94
+
95
+ def get_redis_url_from_environment():
96
+ if "REDIS_URL" in os.environ:
97
+ return os.environ["REDIS_URL"]
98
+
99
+ if "REDIS_HOST" not in os.environ or "REDIS_PORT" not in os.environ:
100
+ raise ValueError(
101
+ "Either 'REDIS_URL' or both 'REDIS_HOST' and 'REDIS_PORT' must be specified for Redis."
102
+ )
103
+
104
+ if "REDIS_PASSWORD" in os.environ:
105
+ redis_password = f":{os.environ['REDIS_PASSWORD']}@"
106
+ else:
107
+ redis_password = ""
108
+
109
+ return (
110
+ f"redis://{redis_password}{os.environ['REDIS_HOST']}:{os.environ['REDIS_PORT']}"
111
+ )
112
+
113
+
114
+ def _get_redis_client_logic(**env_overrides):
115
+ """
116
+ Common functionality across sync + async redis client implementations
117
+ """
118
+ ### check if "os.environ/<key-name>" passed in
119
+ for k, v in env_overrides.items():
120
+ if isinstance(v, str) and v.startswith("os.environ/"):
121
+ v = v.replace("os.environ/", "")
122
+ value = get_secret(v) # type: ignore
123
+ env_overrides[k] = value
124
+
125
+ redis_kwargs = {
126
+ **_redis_kwargs_from_environment(),
127
+ **env_overrides,
128
+ }
129
+
130
+ _startup_nodes: Optional[Union[str, list]] = redis_kwargs.get("startup_nodes", None) or get_secret( # type: ignore
131
+ "REDIS_CLUSTER_NODES"
132
+ )
133
+
134
+ if _startup_nodes is not None and isinstance(_startup_nodes, str):
135
+ redis_kwargs["startup_nodes"] = json.loads(_startup_nodes)
136
+
137
+ _sentinel_nodes: Optional[Union[str, list]] = redis_kwargs.get("sentinel_nodes", None) or get_secret( # type: ignore
138
+ "REDIS_SENTINEL_NODES"
139
+ )
140
+
141
+ if _sentinel_nodes is not None and isinstance(_sentinel_nodes, str):
142
+ redis_kwargs["sentinel_nodes"] = json.loads(_sentinel_nodes)
143
+
144
+ _sentinel_password: Optional[str] = redis_kwargs.get(
145
+ "sentinel_password", None
146
+ ) or get_secret_str("REDIS_SENTINEL_PASSWORD")
147
+
148
+ if _sentinel_password is not None:
149
+ redis_kwargs["sentinel_password"] = _sentinel_password
150
+
151
+ _service_name: Optional[str] = redis_kwargs.get("service_name", None) or get_secret( # type: ignore
152
+ "REDIS_SERVICE_NAME"
153
+ )
154
+
155
+ if _service_name is not None:
156
+ redis_kwargs["service_name"] = _service_name
157
+
158
+ if "url" in redis_kwargs and redis_kwargs["url"] is not None:
159
+ redis_kwargs.pop("host", None)
160
+ redis_kwargs.pop("port", None)
161
+ redis_kwargs.pop("db", None)
162
+ redis_kwargs.pop("password", None)
163
+ elif "startup_nodes" in redis_kwargs and redis_kwargs["startup_nodes"] is not None:
164
+ pass
165
+ elif (
166
+ "sentinel_nodes" in redis_kwargs and redis_kwargs["sentinel_nodes"] is not None
167
+ ):
168
+ pass
169
+ elif "host" not in redis_kwargs or redis_kwargs["host"] is None:
170
+ raise ValueError("Either 'host' or 'url' must be specified for redis.")
171
+
172
+ # litellm.print_verbose(f"redis_kwargs: {redis_kwargs}")
173
+ return redis_kwargs
174
+
175
+
176
+ def init_redis_cluster(redis_kwargs) -> redis.RedisCluster:
177
+ _redis_cluster_nodes_in_env: Optional[str] = get_secret("REDIS_CLUSTER_NODES") # type: ignore
178
+ if _redis_cluster_nodes_in_env is not None:
179
+ try:
180
+ redis_kwargs["startup_nodes"] = json.loads(_redis_cluster_nodes_in_env)
181
+ except json.JSONDecodeError:
182
+ raise ValueError(
183
+ "REDIS_CLUSTER_NODES environment variable is not valid JSON. Please ensure it's properly formatted."
184
+ )
185
+
186
+ verbose_logger.debug("init_redis_cluster: startup nodes are being initialized.")
187
+ from redis.cluster import ClusterNode
188
+
189
+ args = _get_redis_cluster_kwargs()
190
+ cluster_kwargs = {}
191
+ for arg in redis_kwargs:
192
+ if arg in args:
193
+ cluster_kwargs[arg] = redis_kwargs[arg]
194
+
195
+ new_startup_nodes: List[ClusterNode] = []
196
+
197
+ for item in redis_kwargs["startup_nodes"]:
198
+ new_startup_nodes.append(ClusterNode(**item))
199
+
200
+ redis_kwargs.pop("startup_nodes")
201
+ return redis.RedisCluster(startup_nodes=new_startup_nodes, **cluster_kwargs) # type: ignore
202
+
203
+
204
+ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
205
+ sentinel_nodes = redis_kwargs.get("sentinel_nodes")
206
+ sentinel_password = redis_kwargs.get("sentinel_password")
207
+ service_name = redis_kwargs.get("service_name")
208
+
209
+ if not sentinel_nodes or not service_name:
210
+ raise ValueError(
211
+ "Both 'sentinel_nodes' and 'service_name' are required for Redis Sentinel."
212
+ )
213
+
214
+ verbose_logger.debug("init_redis_sentinel: sentinel nodes are being initialized.")
215
+
216
+ # Set up the Sentinel client
217
+ sentinel = redis.Sentinel(
218
+ sentinel_nodes,
219
+ socket_timeout=REDIS_SOCKET_TIMEOUT,
220
+ password=sentinel_password,
221
+ )
222
+
223
+ # Return the master instance for the given service
224
+
225
+ return sentinel.master_for(service_name)
226
+
227
+
228
+ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
229
+ sentinel_nodes = redis_kwargs.get("sentinel_nodes")
230
+ sentinel_password = redis_kwargs.get("sentinel_password")
231
+ service_name = redis_kwargs.get("service_name")
232
+
233
+ if not sentinel_nodes or not service_name:
234
+ raise ValueError(
235
+ "Both 'sentinel_nodes' and 'service_name' are required for Redis Sentinel."
236
+ )
237
+
238
+ verbose_logger.debug("init_redis_sentinel: sentinel nodes are being initialized.")
239
+
240
+ # Set up the Sentinel client
241
+ sentinel = async_redis.Sentinel(
242
+ sentinel_nodes,
243
+ socket_timeout=REDIS_SOCKET_TIMEOUT,
244
+ password=sentinel_password,
245
+ )
246
+
247
+ # Return the master instance for the given service
248
+
249
+ return sentinel.master_for(service_name)
250
+
251
+
252
+ def get_redis_client(**env_overrides):
253
+ redis_kwargs = _get_redis_client_logic(**env_overrides)
254
+ if "url" in redis_kwargs and redis_kwargs["url"] is not None:
255
+ args = _get_redis_url_kwargs()
256
+ url_kwargs = {}
257
+ for arg in redis_kwargs:
258
+ if arg in args:
259
+ url_kwargs[arg] = redis_kwargs[arg]
260
+
261
+ return redis.Redis.from_url(**url_kwargs)
262
+
263
+ if "startup_nodes" in redis_kwargs or get_secret("REDIS_CLUSTER_NODES") is not None: # type: ignore
264
+ return init_redis_cluster(redis_kwargs)
265
+
266
+ # Check for Redis Sentinel
267
+ if "sentinel_nodes" in redis_kwargs and "service_name" in redis_kwargs:
268
+ return _init_redis_sentinel(redis_kwargs)
269
+
270
+ return redis.Redis(**redis_kwargs)
271
+
272
+
273
+ def get_redis_async_client(
274
+ **env_overrides,
275
+ ) -> async_redis.Redis:
276
+ redis_kwargs = _get_redis_client_logic(**env_overrides)
277
+ if "url" in redis_kwargs and redis_kwargs["url"] is not None:
278
+ args = _get_redis_url_kwargs(client=async_redis.Redis.from_url)
279
+ url_kwargs = {}
280
+ for arg in redis_kwargs:
281
+ if arg in args:
282
+ url_kwargs[arg] = redis_kwargs[arg]
283
+ else:
284
+ verbose_logger.debug(
285
+ "REDIS: ignoring argument: {}. Not an allowed async_redis.Redis.from_url arg.".format(
286
+ arg
287
+ )
288
+ )
289
+ return async_redis.Redis.from_url(**url_kwargs)
290
+
291
+ if "startup_nodes" in redis_kwargs:
292
+ from redis.cluster import ClusterNode
293
+
294
+ args = _get_redis_cluster_kwargs()
295
+ cluster_kwargs = {}
296
+ for arg in redis_kwargs:
297
+ if arg in args:
298
+ cluster_kwargs[arg] = redis_kwargs[arg]
299
+
300
+ new_startup_nodes: List[ClusterNode] = []
301
+
302
+ for item in redis_kwargs["startup_nodes"]:
303
+ new_startup_nodes.append(ClusterNode(**item))
304
+ redis_kwargs.pop("startup_nodes")
305
+ return async_redis.RedisCluster(
306
+ startup_nodes=new_startup_nodes, **cluster_kwargs # type: ignore
307
+ )
308
+
309
+ # Check for Redis Sentinel
310
+ if "sentinel_nodes" in redis_kwargs and "service_name" in redis_kwargs:
311
+ return _init_async_redis_sentinel(redis_kwargs)
312
+
313
+ return async_redis.Redis(
314
+ **redis_kwargs,
315
+ )
316
+
317
+
318
+ def get_redis_connection_pool(**env_overrides):
319
+ redis_kwargs = _get_redis_client_logic(**env_overrides)
320
+ verbose_logger.debug("get_redis_connection_pool: redis_kwargs: %s", redis_kwargs)
321
+ if "url" in redis_kwargs and redis_kwargs["url"] is not None:
322
+ return async_redis.BlockingConnectionPool.from_url(
323
+ timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
324
+ )
325
+ connection_class = async_redis.Connection
326
+ if "ssl" in redis_kwargs:
327
+ connection_class = async_redis.SSLConnection
328
+ redis_kwargs.pop("ssl", None)
329
+ redis_kwargs["connection_class"] = connection_class
330
+ redis_kwargs.pop("startup_nodes", None)
331
+ return async_redis.BlockingConnectionPool(
332
+ timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
333
+ )
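Since `litellm/_redis.py` resolves most of its configuration from `REDIS_*` environment variables, a short sketch makes the flow easier to follow. This is an illustrative example only (not part of the commit); the host and port values are placeholders, and a reachable Redis server is assumed for the final `ping()`.

```python
# Illustrative sketch only: building a Redis client from environment variables
# with the helpers defined in litellm/_redis.py above. Host/port are placeholders.
import os

from litellm._redis import get_redis_client, get_redis_url_from_environment

os.environ["REDIS_HOST"] = "localhost"  # placeholder
os.environ["REDIS_PORT"] = "6379"       # placeholder

url = get_redis_url_from_environment()
print(url)  # redis://localhost:6379 (a password is prepended if REDIS_PASSWORD is set)

# With a url, _get_redis_client_logic drops host/port/db/password and the client
# is created via redis.Redis.from_url under the hood.
client = get_redis_client(url=url)
client.ping()  # assumes a reachable Redis server
```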
litellm/_service_logger.py ADDED
@@ -0,0 +1,311 @@
1
+ import asyncio
2
+ from datetime import datetime, timedelta
3
+ from typing import TYPE_CHECKING, Any, Optional, Union
4
+
5
+ import litellm
6
+ from litellm._logging import verbose_logger
7
+ from litellm.proxy._types import UserAPIKeyAuth
8
+
9
+ from .integrations.custom_logger import CustomLogger
10
+ from .integrations.datadog.datadog import DataDogLogger
11
+ from .integrations.opentelemetry import OpenTelemetry
12
+ from .integrations.prometheus_services import PrometheusServicesLogger
13
+ from .types.services import ServiceLoggerPayload, ServiceTypes
14
+
15
+ if TYPE_CHECKING:
16
+ from opentelemetry.trace import Span as _Span
17
+
18
+ Span = Union[_Span, Any]
19
+ OTELClass = OpenTelemetry
20
+ else:
21
+ Span = Any
22
+ OTELClass = Any
23
+
24
+
25
+ class ServiceLogging(CustomLogger):
26
+ """
27
+ Separate class used for monitoring health of litellm-adjacent services (redis/postgres).
28
+ """
29
+
30
+ def __init__(self, mock_testing: bool = False) -> None:
31
+ self.mock_testing = mock_testing
32
+ self.mock_testing_sync_success_hook = 0
33
+ self.mock_testing_async_success_hook = 0
34
+ self.mock_testing_sync_failure_hook = 0
35
+ self.mock_testing_async_failure_hook = 0
36
+ if "prometheus_system" in litellm.service_callback:
37
+ self.prometheusServicesLogger = PrometheusServicesLogger()
38
+
39
+ def service_success_hook(
40
+ self,
41
+ service: ServiceTypes,
42
+ duration: float,
43
+ call_type: str,
44
+ parent_otel_span: Optional[Span] = None,
45
+ start_time: Optional[Union[datetime, float]] = None,
46
+ end_time: Optional[Union[float, datetime]] = None,
47
+ ):
48
+ """
49
+ Handles both sync and async monitoring by checking for existing event loop.
50
+ """
51
+
52
+ if self.mock_testing:
53
+ self.mock_testing_sync_success_hook += 1
54
+
55
+ try:
56
+ # Try to get the current event loop
57
+ loop = asyncio.get_event_loop()
58
+ # Check if the loop is running
59
+ if loop.is_running():
60
+ # If we're in a running loop, create a task
61
+ loop.create_task(
62
+ self.async_service_success_hook(
63
+ service=service,
64
+ duration=duration,
65
+ call_type=call_type,
66
+ parent_otel_span=parent_otel_span,
67
+ start_time=start_time,
68
+ end_time=end_time,
69
+ )
70
+ )
71
+ else:
72
+ # Loop exists but not running, we can use run_until_complete
73
+ loop.run_until_complete(
74
+ self.async_service_success_hook(
75
+ service=service,
76
+ duration=duration,
77
+ call_type=call_type,
78
+ parent_otel_span=parent_otel_span,
79
+ start_time=start_time,
80
+ end_time=end_time,
81
+ )
82
+ )
83
+ except RuntimeError:
84
+ # No event loop exists, create a new one and run
85
+ asyncio.run(
86
+ self.async_service_success_hook(
87
+ service=service,
88
+ duration=duration,
89
+ call_type=call_type,
90
+ parent_otel_span=parent_otel_span,
91
+ start_time=start_time,
92
+ end_time=end_time,
93
+ )
94
+ )
95
+
96
+ def service_failure_hook(
97
+ self, service: ServiceTypes, duration: float, error: Exception, call_type: str
98
+ ):
99
+ """
100
+ [TODO] Not implemented for sync calls yet. V0 is focused on async monitoring (used by proxy).
101
+ """
102
+ if self.mock_testing:
103
+ self.mock_testing_sync_failure_hook += 1
104
+
105
+ async def async_service_success_hook(
106
+ self,
107
+ service: ServiceTypes,
108
+ call_type: str,
109
+ duration: float,
110
+ parent_otel_span: Optional[Span] = None,
111
+ start_time: Optional[Union[datetime, float]] = None,
112
+ end_time: Optional[Union[datetime, float]] = None,
113
+ event_metadata: Optional[dict] = None,
114
+ ):
115
+ """
116
+ - For counting if the redis, postgres call is successful
117
+ """
118
+ if self.mock_testing:
119
+ self.mock_testing_async_success_hook += 1
120
+
121
+ payload = ServiceLoggerPayload(
122
+ is_error=False,
123
+ error=None,
124
+ service=service,
125
+ duration=duration,
126
+ call_type=call_type,
127
+ event_metadata=event_metadata,
128
+ )
129
+
130
+ for callback in litellm.service_callback:
131
+ if callback == "prometheus_system":
132
+ await self.init_prometheus_services_logger_if_none()
133
+ await self.prometheusServicesLogger.async_service_success_hook(
134
+ payload=payload
135
+ )
136
+ elif callback == "datadog" or isinstance(callback, DataDogLogger):
137
+ await self.init_datadog_logger_if_none()
138
+ await self.dd_logger.async_service_success_hook(
139
+ payload=payload,
140
+ parent_otel_span=parent_otel_span,
141
+ start_time=start_time,
142
+ end_time=end_time,
143
+ event_metadata=event_metadata,
144
+ )
145
+ elif callback == "otel" or isinstance(callback, OpenTelemetry):
146
+ from litellm.proxy.proxy_server import open_telemetry_logger
147
+
148
+ await self.init_otel_logger_if_none()
149
+
150
+ if (
151
+ parent_otel_span is not None
152
+ and open_telemetry_logger is not None
153
+ and isinstance(open_telemetry_logger, OpenTelemetry)
154
+ ):
155
+ await self.otel_logger.async_service_success_hook(
156
+ payload=payload,
157
+ parent_otel_span=parent_otel_span,
158
+ start_time=start_time,
159
+ end_time=end_time,
160
+ event_metadata=event_metadata,
161
+ )
162
+
163
+ async def init_prometheus_services_logger_if_none(self):
164
+ """
165
+ initializes prometheusServicesLogger if it is None or no attribute exists on ServiceLogging Object
166
+
167
+ """
168
+ if not hasattr(self, "prometheusServicesLogger"):
169
+ self.prometheusServicesLogger = PrometheusServicesLogger()
170
+ elif self.prometheusServicesLogger is None:
171
+ self.prometheusServicesLogger = PrometheusServicesLogger()
172
+ return
173
+
174
+ async def init_datadog_logger_if_none(self):
175
+ """
176
+ initializes dd_logger if it is None or no attribute exists on ServiceLogging Object
177
+
178
+ """
179
+ from litellm.integrations.datadog.datadog import DataDogLogger
180
+
181
+ if not hasattr(self, "dd_logger"):
182
+ self.dd_logger: DataDogLogger = DataDogLogger()
183
+
184
+ return
185
+
186
+ async def init_otel_logger_if_none(self):
187
+ """
188
+ initializes otel_logger if it is None or no attribute exists on ServiceLogging Object
189
+
190
+ """
191
+ from litellm.proxy.proxy_server import open_telemetry_logger
192
+
193
+ if not hasattr(self, "otel_logger"):
194
+ if open_telemetry_logger is not None and isinstance(
195
+ open_telemetry_logger, OpenTelemetry
196
+ ):
197
+ self.otel_logger: OpenTelemetry = open_telemetry_logger
198
+ else:
199
+ verbose_logger.warning(
200
+ "ServiceLogger: open_telemetry_logger is None or not an instance of OpenTelemetry"
201
+ )
202
+ return
203
+
204
+ async def async_service_failure_hook(
205
+ self,
206
+ service: ServiceTypes,
207
+ duration: float,
208
+ error: Union[str, Exception],
209
+ call_type: str,
210
+ parent_otel_span: Optional[Span] = None,
211
+ start_time: Optional[Union[datetime, float]] = None,
212
+ end_time: Optional[Union[float, datetime]] = None,
213
+ event_metadata: Optional[dict] = None,
214
+ ):
215
+ """
216
+ - For counting if the redis, postgres call is unsuccessful
217
+ """
218
+ if self.mock_testing:
219
+ self.mock_testing_async_failure_hook += 1
220
+
221
+ error_message = ""
222
+ if isinstance(error, Exception):
223
+ error_message = str(error)
224
+ elif isinstance(error, str):
225
+ error_message = error
226
+
227
+ payload = ServiceLoggerPayload(
228
+ is_error=True,
229
+ error=error_message,
230
+ service=service,
231
+ duration=duration,
232
+ call_type=call_type,
233
+ event_metadata=event_metadata,
234
+ )
235
+
236
+ for callback in litellm.service_callback:
237
+ if callback == "prometheus_system":
238
+ await self.init_prometheus_services_logger_if_none()
239
+ await self.prometheusServicesLogger.async_service_failure_hook(
240
+ payload=payload,
241
+ error=error,
242
+ )
243
+ elif callback == "datadog" or isinstance(callback, DataDogLogger):
244
+ await self.init_datadog_logger_if_none()
245
+ await self.dd_logger.async_service_failure_hook(
246
+ payload=payload,
247
+ error=error_message,
248
+ parent_otel_span=parent_otel_span,
249
+ start_time=start_time,
250
+ end_time=end_time,
251
+ event_metadata=event_metadata,
252
+ )
253
+ elif callback == "otel" or isinstance(callback, OpenTelemetry):
254
+ from litellm.proxy.proxy_server import open_telemetry_logger
255
+
256
+ await self.init_otel_logger_if_none()
257
+
258
+ if not isinstance(error, str):
259
+ error = str(error)
260
+
261
+ if (
262
+ parent_otel_span is not None
263
+ and open_telemetry_logger is not None
264
+ and isinstance(open_telemetry_logger, OpenTelemetry)
265
+ ):
266
+ await self.otel_logger.async_service_success_hook(
267
+ payload=payload,
268
+ parent_otel_span=parent_otel_span,
269
+ start_time=start_time,
270
+ end_time=end_time,
271
+ event_metadata=event_metadata,
272
+ )
273
+
274
+ async def async_post_call_failure_hook(
275
+ self,
276
+ request_data: dict,
277
+ original_exception: Exception,
278
+ user_api_key_dict: UserAPIKeyAuth,
279
+ ):
280
+ """
281
+ Hook to track failed litellm-service calls
282
+ """
283
+ return await super().async_post_call_failure_hook(
284
+ request_data,
285
+ original_exception,
286
+ user_api_key_dict,
287
+ )
288
+
289
+ async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
290
+ """
291
+ Hook to track latency for litellm proxy llm api calls
292
+ """
293
+ try:
294
+ _duration = end_time - start_time
295
+ if isinstance(_duration, timedelta):
296
+ _duration = _duration.total_seconds()
297
+ elif isinstance(_duration, float):
298
+ pass
299
+ else:
300
+ raise Exception(
301
+ "Duration={} is not a float or timedelta object. type={}".format(
302
+ _duration, type(_duration)
303
+ )
304
+ ) # invalid _duration value
305
+ await self.async_service_success_hook(
306
+ service=ServiceTypes.LITELLM,
307
+ duration=_duration,
308
+ call_type=kwargs["call_type"],
309
+ )
310
+ except Exception as e:
311
+ raise e
litellm/_version.py ADDED
@@ -0,0 +1,6 @@
1
+ import importlib_metadata
2
+
3
+ try:
4
+ version = importlib_metadata.version("litellm")
5
+ except Exception:
6
+ version = "unknown"
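For completeness, a tiny sketch (illustrative only, not part of the commit) of reading back the version string that `litellm/_version.py` resolves; it falls back to `"unknown"` when litellm is not installed as a distribution.

```python
# Illustrative sketch only: reading the version resolved in litellm/_version.py.
from litellm._version import version

print(version)  # e.g. "1.x.y", or "unknown" if the distribution metadata is missing
```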
litellm/anthropic_interface/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Anthropic module for LiteLLM
3
+ """
4
+ from .messages import acreate, create
5
+
6
+ __all__ = ["acreate", "create"]
litellm/anthropic_interface/messages/__init__.py ADDED
@@ -0,0 +1,117 @@
1
+ """
2
+ Interface for Anthropic's messages API
3
+
4
+ Use this to call LLMs in Anthropic /messages Request/Response format
5
+
6
+ This is an __init__.py file to allow the following interface
7
+
8
+ - litellm.messages.acreate
9
+ - litellm.messages.create
10
+
11
+ """
12
+
13
+ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
14
+
15
+ from litellm.llms.anthropic.experimental_pass_through.messages.handler import (
16
+ anthropic_messages as _async_anthropic_messages,
17
+ )
18
+ from litellm.types.llms.anthropic_messages.anthropic_response import (
19
+ AnthropicMessagesResponse,
20
+ )
21
+
22
+
23
+ async def acreate(
24
+ max_tokens: int,
25
+ messages: List[Dict],
26
+ model: str,
27
+ metadata: Optional[Dict] = None,
28
+ stop_sequences: Optional[List[str]] = None,
29
+ stream: Optional[bool] = False,
30
+ system: Optional[str] = None,
31
+ temperature: Optional[float] = 1.0,
32
+ thinking: Optional[Dict] = None,
33
+ tool_choice: Optional[Dict] = None,
34
+ tools: Optional[List[Dict]] = None,
35
+ top_k: Optional[int] = None,
36
+ top_p: Optional[float] = None,
37
+ **kwargs
38
+ ) -> Union[AnthropicMessagesResponse, AsyncIterator]:
39
+ """
40
+ Async wrapper for Anthropic's messages API
41
+
42
+ Args:
43
+ max_tokens (int): Maximum tokens to generate (required)
44
+ messages (List[Dict]): List of message objects with role and content (required)
45
+ model (str): Model name to use (required)
46
+ metadata (Dict, optional): Request metadata
47
+ stop_sequences (List[str], optional): Custom stop sequences
48
+ stream (bool, optional): Whether to stream the response
49
+ system (str, optional): System prompt
50
+ temperature (float, optional): Sampling temperature (0.0 to 1.0)
51
+ thinking (Dict, optional): Extended thinking configuration
52
+ tool_choice (Dict, optional): Tool choice configuration
53
+ tools (List[Dict], optional): List of tool definitions
54
+ top_k (int, optional): Top K sampling parameter
55
+ top_p (float, optional): Nucleus sampling parameter
56
+ **kwargs: Additional arguments
57
+
58
+ Returns:
59
+ Dict: Response from the API
60
+ """
61
+ return await _async_anthropic_messages(
62
+ max_tokens=max_tokens,
63
+ messages=messages,
64
+ model=model,
65
+ metadata=metadata,
66
+ stop_sequences=stop_sequences,
67
+ stream=stream,
68
+ system=system,
69
+ temperature=temperature,
70
+ thinking=thinking,
71
+ tool_choice=tool_choice,
72
+ tools=tools,
73
+ top_k=top_k,
74
+ top_p=top_p,
75
+ **kwargs,
76
+ )
77
+
78
+
79
+ async def create(
80
+ max_tokens: int,
81
+ messages: List[Dict],
82
+ model: str,
83
+ metadata: Optional[Dict] = None,
84
+ stop_sequences: Optional[List[str]] = None,
85
+ stream: Optional[bool] = False,
86
+ system: Optional[str] = None,
87
+ temperature: Optional[float] = 1.0,
88
+ thinking: Optional[Dict] = None,
89
+ tool_choice: Optional[Dict] = None,
90
+ tools: Optional[List[Dict]] = None,
91
+ top_k: Optional[int] = None,
92
+ top_p: Optional[float] = None,
93
+ **kwargs
94
+ ) -> Union[AnthropicMessagesResponse, Iterator]:
95
+ """
96
+ Async wrapper for Anthropic's messages API
97
+
98
+ Args:
99
+ max_tokens (int): Maximum tokens to generate (required)
100
+ messages (List[Dict]): List of message objects with role and content (required)
101
+ model (str): Model name to use (required)
102
+ metadata (Dict, optional): Request metadata
103
+ stop_sequences (List[str], optional): Custom stop sequences
104
+ stream (bool, optional): Whether to stream the response
105
+ system (str, optional): System prompt
106
+ temperature (float, optional): Sampling temperature (0.0 to 1.0)
107
+ thinking (Dict, optional): Extended thinking configuration
108
+ tool_choice (Dict, optional): Tool choice configuration
109
+ tools (List[Dict], optional): List of tool definitions
110
+ top_k (int, optional): Top K sampling parameter
111
+ top_p (float, optional): Nucleus sampling parameter
112
+ **kwargs: Additional arguments
113
+
114
+ Returns:
115
+ Dict: Response from the API
116
+ """
117
+ raise NotImplementedError("This function is not implemented")
litellm/anthropic_interface/readme.md ADDED
@@ -0,0 +1,116 @@
1
+ ## Use LLM API endpoints in Anthropic Interface
2
+
3
+ Note: This module is named `anthropic_interface` because `anthropic` is an existing Python package name, and reusing it was breaking mypy type checking.
4
+
5
+
6
+ ## Usage
7
+ ---
8
+
9
+ ### LiteLLM Python SDK
10
+
11
+ #### Non-streaming example
12
+ ```python showLineNumbers title="Example using LiteLLM Python SDK"
13
+ import litellm
14
+ response = await litellm.anthropic.messages.acreate(
15
+ messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
16
+ api_key=api_key,
17
+ model="anthropic/claude-3-haiku-20240307",
18
+ max_tokens=100,
19
+ )
20
+ ```
21
+
22
+ Example response:
23
+ ```json
24
+ {
25
+ "content": [
26
+ {
27
+ "text": "Hi! this is a very short joke",
28
+ "type": "text"
29
+ }
30
+ ],
31
+ "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
32
+ "model": "claude-3-7-sonnet-20250219",
33
+ "role": "assistant",
34
+ "stop_reason": "end_turn",
35
+ "stop_sequence": null,
36
+ "type": "message",
37
+ "usage": {
38
+ "input_tokens": 2095,
39
+ "output_tokens": 503,
40
+ "cache_creation_input_tokens": 2095,
41
+ "cache_read_input_tokens": 0
42
+ }
43
+ }
44
+ ```
45
+
46
+ #### Streaming example
47
+ ```python showLineNumbers title="Example using LiteLLM Python SDK"
48
+ import litellm
49
+ response = await litellm.anthropic.messages.acreate(
50
+ messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
51
+ api_key=api_key,
52
+ model="anthropic/claude-3-haiku-20240307",
53
+ max_tokens=100,
54
+ stream=True,
55
+ )
56
+ async for chunk in response:
57
+ print(chunk)
58
+ ```
59
+
60
+ ### LiteLLM Proxy Server
61
+
62
+
63
+ 1. Setup config.yaml
64
+
65
+ ```yaml
66
+ model_list:
67
+ - model_name: anthropic-claude
68
+ litellm_params:
69
+ model: claude-3-7-sonnet-latest
70
+ ```
71
+
72
+ 2. Start proxy
73
+
74
+ ```bash
75
+ litellm --config /path/to/config.yaml
76
+ ```
77
+
78
+ 3. Test it!
79
+
80
+ <Tabs>
81
+ <TabItem label="Anthropic Python SDK" value="python">
82
+
83
+ ```python showLineNumbers title="Example using LiteLLM Proxy Server"
84
+ import anthropic
85
+
86
+ # point anthropic sdk to litellm proxy
87
+ client = anthropic.Anthropic(
88
+ base_url="http://0.0.0.0:4000",
89
+ api_key="sk-1234",
90
+ )
91
+
92
+ response = client.messages.create(
93
+ messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}],
94
+ model="anthropic-claude",  # model_name from the proxy config above
95
+ max_tokens=100,
96
+ )
97
+ ```
98
+ </TabItem>
99
+ <TabItem label="curl" value="curl">
100
+
101
+ ```bash showLineNumbers title="Example using LiteLLM Proxy Server"
102
+ curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \
103
+ -H 'content-type: application/json' \
104
+ -H 'x-api-key: $LITELLM_API_KEY' \
105
+ -H 'anthropic-version: 2023-06-01' \
106
+ -d '{
107
+ "model": "anthropic-claude",
108
+ "messages": [
109
+ {
110
+ "role": "user",
111
+ "content": "Hello, can you tell me a short joke?"
112
+ }
113
+ ],
114
+ "max_tokens": 100
115
+ }'
116
+ ```
litellm/assistants/main.py ADDED
@@ -0,0 +1,1484 @@
1
+ # What is this?
2
+ ## Main file for assistants API logic
3
+ import asyncio
4
+ import contextvars
5
+ import os
6
+ from functools import partial
7
+ from typing import Any, Coroutine, Dict, Iterable, List, Literal, Optional, Union
8
+
9
+ import httpx
10
+ from openai import AsyncOpenAI, OpenAI
11
+ from openai.types.beta.assistant import Assistant
12
+ from openai.types.beta.assistant_deleted import AssistantDeleted
13
+
14
+ import litellm
15
+ from litellm.types.router import GenericLiteLLMParams
16
+ from litellm.utils import (
17
+ exception_type,
18
+ get_litellm_params,
19
+ get_llm_provider,
20
+ get_secret,
21
+ supports_httpx_timeout,
22
+ )
23
+
24
+ from ..llms.azure.assistants import AzureAssistantsAPI
25
+ from ..llms.openai.openai import OpenAIAssistantsAPI
26
+ from ..types.llms.openai import *
27
+ from ..types.router import *
28
+ from .utils import get_optional_params_add_message
29
+
30
+ ####### ENVIRONMENT VARIABLES ###################
31
+ openai_assistants_api = OpenAIAssistantsAPI()
32
+ azure_assistants_api = AzureAssistantsAPI()
33
+
34
+ ### ASSISTANTS ###
35
+
36
+
37
+ async def aget_assistants(
38
+ custom_llm_provider: Literal["openai", "azure"],
39
+ client: Optional[AsyncOpenAI] = None,
40
+ **kwargs,
41
+ ) -> AsyncCursorPage[Assistant]:
42
+ loop = asyncio.get_event_loop()
43
+ ### PASS ARGS TO GET ASSISTANTS ###
44
+ kwargs["aget_assistants"] = True
45
+ try:
46
+ # Use a partial function to pass your keyword arguments
47
+ func = partial(get_assistants, custom_llm_provider, client, **kwargs)
48
+
49
+ # Add the context to the function
50
+ ctx = contextvars.copy_context()
51
+ func_with_context = partial(ctx.run, func)
52
+
53
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
54
+ model="", custom_llm_provider=custom_llm_provider
55
+ ) # type: ignore
56
+
57
+ # Await normally
58
+ init_response = await loop.run_in_executor(None, func_with_context)
59
+ if asyncio.iscoroutine(init_response):
60
+ response = await init_response
61
+ else:
62
+ response = init_response
63
+ return response # type: ignore
64
+ except Exception as e:
65
+ raise exception_type(
66
+ model="",
67
+ custom_llm_provider=custom_llm_provider,
68
+ original_exception=e,
69
+ completion_kwargs={},
70
+ extra_kwargs=kwargs,
71
+ )
72
+
73
+
74
+ def get_assistants(
75
+ custom_llm_provider: Literal["openai", "azure"],
76
+ client: Optional[Any] = None,
77
+ api_key: Optional[str] = None,
78
+ api_base: Optional[str] = None,
79
+ api_version: Optional[str] = None,
80
+ **kwargs,
81
+ ) -> SyncCursorPage[Assistant]:
82
+ aget_assistants: Optional[bool] = kwargs.pop("aget_assistants", None)
83
+ if aget_assistants is not None and not isinstance(aget_assistants, bool):
84
+ raise Exception(
85
+ "Invalid value passed in for aget_assistants. Only bool or None allowed"
86
+ )
87
+ optional_params = GenericLiteLLMParams(
88
+ api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
89
+ )
90
+ litellm_params_dict = get_litellm_params(**kwargs)
91
+
92
+ ### TIMEOUT LOGIC ###
93
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
94
+ # set timeout for 10 minutes by default
95
+
96
+ if (
97
+ timeout is not None
98
+ and isinstance(timeout, httpx.Timeout)
99
+ and supports_httpx_timeout(custom_llm_provider) is False
100
+ ):
101
+ read_timeout = timeout.read or 600
102
+ timeout = read_timeout # default 10 min timeout
103
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
104
+ timeout = float(timeout) # type: ignore
105
+ elif timeout is None:
106
+ timeout = 600.0
107
+
108
+ response: Optional[SyncCursorPage[Assistant]] = None
109
+ if custom_llm_provider == "openai":
110
+ api_base = (
111
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
112
+ or litellm.api_base
113
+ or os.getenv("OPENAI_BASE_URL")
114
+ or os.getenv("OPENAI_API_BASE")
115
+ or "https://api.openai.com/v1"
116
+ )
117
+ organization = (
118
+ optional_params.organization
119
+ or litellm.organization
120
+ or os.getenv("OPENAI_ORGANIZATION", None)
121
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
122
+ )
123
+ # set API KEY
124
+ api_key = (
125
+ optional_params.api_key
126
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
127
+ or litellm.openai_key
128
+ or os.getenv("OPENAI_API_KEY")
129
+ )
130
+
131
+ response = openai_assistants_api.get_assistants(
132
+ api_base=api_base,
133
+ api_key=api_key,
134
+ timeout=timeout,
135
+ max_retries=optional_params.max_retries,
136
+ organization=organization,
137
+ client=client,
138
+ aget_assistants=aget_assistants, # type: ignore
139
+ ) # type: ignore
140
+ elif custom_llm_provider == "azure":
141
+ api_base = (
142
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
143
+ ) # type: ignore
144
+
145
+ api_version = (
146
+ optional_params.api_version
147
+ or litellm.api_version
148
+ or get_secret("AZURE_API_VERSION")
149
+ ) # type: ignore
150
+
151
+ api_key = (
152
+ optional_params.api_key
153
+ or litellm.api_key
154
+ or litellm.azure_key
155
+ or get_secret("AZURE_OPENAI_API_KEY")
156
+ or get_secret("AZURE_API_KEY")
157
+ ) # type: ignore
158
+
159
+ extra_body = optional_params.get("extra_body", {})
160
+ azure_ad_token: Optional[str] = None
161
+ if extra_body is not None:
162
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
163
+ else:
164
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
165
+
166
+ response = azure_assistants_api.get_assistants(
167
+ api_base=api_base,
168
+ api_key=api_key,
169
+ api_version=api_version,
170
+ azure_ad_token=azure_ad_token,
171
+ timeout=timeout,
172
+ max_retries=optional_params.max_retries,
173
+ client=client,
174
+ aget_assistants=aget_assistants, # type: ignore
175
+ litellm_params=litellm_params_dict,
176
+ )
177
+ else:
178
+ raise litellm.exceptions.BadRequestError(
179
+ message="LiteLLM doesn't support {} for 'get_assistants'. Only 'openai' and 'azure' are supported.".format(
180
+ custom_llm_provider
181
+ ),
182
+ model="n/a",
183
+ llm_provider=custom_llm_provider,
184
+ response=httpx.Response(
185
+ status_code=400,
186
+ content="Unsupported provider",
187
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
188
+ ),
189
+ )
190
+
191
+ if response is None:
192
+ raise litellm.exceptions.BadRequestError(
193
+ message="LiteLLM doesn't support {} for 'get_assistants'. Only 'openai' and 'azure' are supported.".format(
194
+ custom_llm_provider
195
+ ),
196
+ model="n/a",
197
+ llm_provider=custom_llm_provider,
198
+ response=httpx.Response(
199
+ status_code=400,
200
+ content="Unsupported provider",
201
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
202
+ ),
203
+ )
204
+
205
+ return response
206
+
207
+
208
+ async def acreate_assistants(
209
+ custom_llm_provider: Literal["openai", "azure"],
210
+ client: Optional[AsyncOpenAI] = None,
211
+ **kwargs,
212
+ ) -> Assistant:
213
+ loop = asyncio.get_event_loop()
214
+ ### PASS ARGS TO GET ASSISTANTS ###
215
+ kwargs["async_create_assistants"] = True
216
+ model = kwargs.pop("model", None)
217
+ try:
218
+ kwargs["client"] = client
219
+ # Use a partial function to pass your keyword arguments
220
+ func = partial(create_assistants, custom_llm_provider, model, **kwargs)
221
+
222
+ # Add the context to the function
223
+ ctx = contextvars.copy_context()
224
+ func_with_context = partial(ctx.run, func)
225
+
226
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
227
+ model=model, custom_llm_provider=custom_llm_provider
228
+ ) # type: ignore
229
+
230
+ # Await normally
231
+ init_response = await loop.run_in_executor(None, func_with_context)
232
+ if asyncio.iscoroutine(init_response):
233
+ response = await init_response
234
+ else:
235
+ response = init_response
236
+ return response # type: ignore
237
+ except Exception as e:
238
+ raise exception_type(
239
+ model=model,
240
+ custom_llm_provider=custom_llm_provider,
241
+ original_exception=e,
242
+ completion_kwargs={},
243
+ extra_kwargs=kwargs,
244
+ )
245
+
246
+
247
+ def create_assistants(
248
+ custom_llm_provider: Literal["openai", "azure"],
249
+ model: str,
250
+ name: Optional[str] = None,
251
+ description: Optional[str] = None,
252
+ instructions: Optional[str] = None,
253
+ tools: Optional[List[Dict[str, Any]]] = None,
254
+ tool_resources: Optional[Dict[str, Any]] = None,
255
+ metadata: Optional[Dict[str, str]] = None,
256
+ temperature: Optional[float] = None,
257
+ top_p: Optional[float] = None,
258
+ response_format: Optional[Union[str, Dict[str, str]]] = None,
259
+ client: Optional[Any] = None,
260
+ api_key: Optional[str] = None,
261
+ api_base: Optional[str] = None,
262
+ api_version: Optional[str] = None,
263
+ **kwargs,
264
+ ) -> Union[Assistant, Coroutine[Any, Any, Assistant]]:
265
+ async_create_assistants: Optional[bool] = kwargs.pop(
266
+ "async_create_assistants", None
267
+ )
268
+ if async_create_assistants is not None and not isinstance(
269
+ async_create_assistants, bool
270
+ ):
271
+ raise ValueError(
272
+ "Invalid value passed in for async_create_assistants. Only bool or None allowed"
273
+ )
274
+ optional_params = GenericLiteLLMParams(
275
+ api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
276
+ )
277
+ litellm_params_dict = get_litellm_params(**kwargs)
278
+
279
+ ### TIMEOUT LOGIC ###
280
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
281
+ # set timeout for 10 minutes by default
282
+
283
+ if (
284
+ timeout is not None
285
+ and isinstance(timeout, httpx.Timeout)
286
+ and supports_httpx_timeout(custom_llm_provider) is False
287
+ ):
288
+ read_timeout = timeout.read or 600
289
+ timeout = read_timeout # default 10 min timeout
290
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
291
+ timeout = float(timeout) # type: ignore
292
+ elif timeout is None:
293
+ timeout = 600.0
294
+
295
+ create_assistant_data = {
296
+ "model": model,
297
+ "name": name,
298
+ "description": description,
299
+ "instructions": instructions,
300
+ "tools": tools,
301
+ "tool_resources": tool_resources,
302
+ "metadata": metadata,
303
+ "temperature": temperature,
304
+ "top_p": top_p,
305
+ "response_format": response_format,
306
+ }
307
+
308
+ # only send params that are not None
309
+ create_assistant_data = {
310
+ k: v for k, v in create_assistant_data.items() if v is not None
311
+ }
312
+
313
+ response: Optional[Union[Coroutine[Any, Any, Assistant], Assistant]] = None
314
+ if custom_llm_provider == "openai":
315
+ api_base = (
316
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
317
+ or litellm.api_base
318
+ or os.getenv("OPENAI_BASE_URL")
319
+ or os.getenv("OPENAI_API_BASE")
320
+ or "https://api.openai.com/v1"
321
+ )
322
+ organization = (
323
+ optional_params.organization
324
+ or litellm.organization
325
+ or os.getenv("OPENAI_ORGANIZATION", None)
326
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
327
+ )
328
+ # set API KEY
329
+ api_key = (
330
+ optional_params.api_key
331
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
332
+ or litellm.openai_key
333
+ or os.getenv("OPENAI_API_KEY")
334
+ )
335
+
336
+ response = openai_assistants_api.create_assistants(
337
+ api_base=api_base,
338
+ api_key=api_key,
339
+ timeout=timeout,
340
+ max_retries=optional_params.max_retries,
341
+ organization=organization,
342
+ create_assistant_data=create_assistant_data,
343
+ client=client,
344
+ async_create_assistants=async_create_assistants, # type: ignore
345
+ ) # type: ignore
346
+ elif custom_llm_provider == "azure":
347
+ api_base = (
348
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
349
+ ) # type: ignore
350
+
351
+ api_version = (
352
+ optional_params.api_version
353
+ or litellm.api_version
354
+ or get_secret("AZURE_API_VERSION")
355
+ ) # type: ignore
356
+
357
+ api_key = (
358
+ optional_params.api_key
359
+ or litellm.api_key
360
+ or litellm.azure_key
361
+ or get_secret("AZURE_OPENAI_API_KEY")
362
+ or get_secret("AZURE_API_KEY")
363
+ ) # type: ignore
364
+
365
+ extra_body = optional_params.get("extra_body", {})
366
+ azure_ad_token: Optional[str] = None
367
+ if extra_body is not None:
368
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
369
+ else:
370
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
371
+
372
+ if isinstance(client, OpenAI):
373
+ client = None # only pass client if it's AzureOpenAI
374
+
375
+ response = azure_assistants_api.create_assistants(
376
+ api_base=api_base,
377
+ api_key=api_key,
378
+ azure_ad_token=azure_ad_token,
379
+ api_version=api_version,
380
+ timeout=timeout,
381
+ max_retries=optional_params.max_retries,
382
+ client=client,
383
+ async_create_assistants=async_create_assistants,
384
+ create_assistant_data=create_assistant_data,
385
+ litellm_params=litellm_params_dict,
386
+ )
387
+ else:
388
+ raise litellm.exceptions.BadRequestError(
389
+ message="LiteLLM doesn't support {} for 'create_assistants'. Only 'openai' and 'azure' are supported.".format(
390
+ custom_llm_provider
391
+ ),
392
+ model="n/a",
393
+ llm_provider=custom_llm_provider,
394
+ response=httpx.Response(
395
+ status_code=400,
396
+ content="Unsupported provider",
397
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
398
+ ),
399
+ )
400
+ if response is None:
401
+ raise litellm.exceptions.InternalServerError(
402
+ message="No response returned from 'create_assistants'",
403
+ model=model,
404
+ llm_provider=custom_llm_provider,
405
+ )
406
+ return response
407
+
408
+
409
+ async def adelete_assistant(
410
+ custom_llm_provider: Literal["openai", "azure"],
411
+ client: Optional[AsyncOpenAI] = None,
412
+ **kwargs,
413
+ ) -> AssistantDeleted:
414
+ loop = asyncio.get_event_loop()
415
+ ### PASS ARGS TO GET ASSISTANTS ###
416
+ kwargs["async_delete_assistants"] = True
417
+ try:
418
+ kwargs["client"] = client
419
+ # Use a partial function to pass your keyword arguments
420
+ func = partial(delete_assistant, custom_llm_provider, **kwargs)
421
+
422
+ # Add the context to the function
423
+ ctx = contextvars.copy_context()
424
+ func_with_context = partial(ctx.run, func)
425
+
426
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
427
+ model="", custom_llm_provider=custom_llm_provider
428
+ ) # type: ignore
429
+
430
+ # Await normally
431
+ init_response = await loop.run_in_executor(None, func_with_context)
432
+ if asyncio.iscoroutine(init_response):
433
+ response = await init_response
434
+ else:
435
+ response = init_response
436
+ return response # type: ignore
437
+ except Exception as e:
438
+ raise exception_type(
439
+ model="",
440
+ custom_llm_provider=custom_llm_provider,
441
+ original_exception=e,
442
+ completion_kwargs={},
443
+ extra_kwargs=kwargs,
444
+ )
445
+
446
+
447
+ def delete_assistant(
448
+ custom_llm_provider: Literal["openai", "azure"],
449
+ assistant_id: str,
450
+ client: Optional[Any] = None,
451
+ api_key: Optional[str] = None,
452
+ api_base: Optional[str] = None,
453
+ api_version: Optional[str] = None,
454
+ **kwargs,
455
+ ) -> Union[AssistantDeleted, Coroutine[Any, Any, AssistantDeleted]]:
456
+ optional_params = GenericLiteLLMParams(
457
+ api_key=api_key, api_base=api_base, api_version=api_version, **kwargs
458
+ )
459
+
460
+ litellm_params_dict = get_litellm_params(**kwargs)
461
+
462
+ async_delete_assistants: Optional[bool] = kwargs.pop(
463
+ "async_delete_assistants", None
464
+ )
465
+ if async_delete_assistants is not None and not isinstance(
466
+ async_delete_assistants, bool
467
+ ):
468
+ raise ValueError(
469
+ "Invalid value passed in for async_delete_assistants. Only bool or None allowed"
470
+ )
471
+
472
+ ### TIMEOUT LOGIC ###
473
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
474
+ # set timeout for 10 minutes by default
475
+
476
+ if (
477
+ timeout is not None
478
+ and isinstance(timeout, httpx.Timeout)
479
+ and supports_httpx_timeout(custom_llm_provider) is False
480
+ ):
481
+ read_timeout = timeout.read or 600
482
+ timeout = read_timeout # default 10 min timeout
483
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
484
+ timeout = float(timeout) # type: ignore
485
+ elif timeout is None:
486
+ timeout = 600.0
487
+
488
+ response: Optional[
489
+ Union[AssistantDeleted, Coroutine[Any, Any, AssistantDeleted]]
490
+ ] = None
491
+ if custom_llm_provider == "openai":
492
+ api_base = (
493
+ optional_params.api_base
494
+ or litellm.api_base
495
+ or os.getenv("OPENAI_BASE_URL")
496
+ or os.getenv("OPENAI_API_BASE")
497
+ or "https://api.openai.com/v1"
498
+ )
499
+ organization = (
500
+ optional_params.organization
501
+ or litellm.organization
502
+ or os.getenv("OPENAI_ORGANIZATION", None)
503
+ or None
504
+ )
505
+ # set API KEY
506
+ api_key = (
507
+ optional_params.api_key
508
+ or litellm.api_key
509
+ or litellm.openai_key
510
+ or os.getenv("OPENAI_API_KEY")
511
+ )
512
+
513
+ response = openai_assistants_api.delete_assistant(
514
+ api_base=api_base,
515
+ api_key=api_key,
516
+ timeout=timeout,
517
+ max_retries=optional_params.max_retries,
518
+ organization=organization,
519
+ assistant_id=assistant_id,
520
+ client=client,
521
+ async_delete_assistants=async_delete_assistants,
522
+ )
523
+ elif custom_llm_provider == "azure":
524
+ api_base = (
525
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
526
+ ) # type: ignore
527
+
528
+ api_version = (
529
+ optional_params.api_version
530
+ or litellm.api_version
531
+ or get_secret("AZURE_API_VERSION")
532
+ ) # type: ignore
533
+
534
+ api_key = (
535
+ optional_params.api_key
536
+ or litellm.api_key
537
+ or litellm.azure_key
538
+ or get_secret("AZURE_OPENAI_API_KEY")
539
+ or get_secret("AZURE_API_KEY")
540
+ ) # type: ignore
541
+
542
+ extra_body = optional_params.get("extra_body", {})
543
+ azure_ad_token: Optional[str] = None
544
+ if extra_body is not None:
545
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
546
+ else:
547
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
548
+
549
+ if isinstance(client, OpenAI):
550
+ client = None # only pass client if it's AzureOpenAI
551
+
552
+ response = azure_assistants_api.delete_assistant(
553
+ assistant_id=assistant_id,
554
+ api_base=api_base,
555
+ api_key=api_key,
556
+ azure_ad_token=azure_ad_token,
557
+ api_version=api_version,
558
+ timeout=timeout,
559
+ max_retries=optional_params.max_retries,
560
+ client=client,
561
+ async_delete_assistants=async_delete_assistants,
562
+ litellm_params=litellm_params_dict,
563
+ )
564
+ else:
565
+ raise litellm.exceptions.BadRequestError(
566
+ message="LiteLLM doesn't support {} for 'delete_assistant'. Only 'openai' and 'azure' are supported.".format(
567
+ custom_llm_provider
568
+ ),
569
+ model="n/a",
570
+ llm_provider=custom_llm_provider,
571
+ response=httpx.Response(
572
+ status_code=400,
573
+ content="Unsupported provider",
574
+ request=httpx.Request(
575
+ method="delete_assistant", url="https://github.com/BerriAI/litellm"
576
+ ),
577
+ ),
578
+ )
579
+ if response is None:
580
+ raise litellm.exceptions.InternalServerError(
581
+ message="No response returned from 'delete_assistant'",
582
+ model="n/a",
583
+ llm_provider=custom_llm_provider,
584
+ )
585
+ return response
586
+
587
+
588
+ ### THREADS ###
589
+
590
+
591
+ async def acreate_thread(
592
+ custom_llm_provider: Literal["openai", "azure"], **kwargs
593
+ ) -> Thread:
594
+ loop = asyncio.get_event_loop()
595
+ ### PASS ARGS TO GET ASSISTANTS ###
596
+ kwargs["acreate_thread"] = True
597
+ try:
598
+ # Use a partial function to pass your keyword arguments
599
+ func = partial(create_thread, custom_llm_provider, **kwargs)
600
+
601
+ # Add the context to the function
602
+ ctx = contextvars.copy_context()
603
+ func_with_context = partial(ctx.run, func)
604
+
605
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
606
+ model="", custom_llm_provider=custom_llm_provider
607
+ ) # type: ignore
608
+
609
+ # Await normally
610
+ init_response = await loop.run_in_executor(None, func_with_context)
611
+ if asyncio.iscoroutine(init_response):
612
+ response = await init_response
613
+ else:
614
+ response = init_response
615
+ return response # type: ignore
616
+ except Exception as e:
617
+ raise exception_type(
618
+ model="",
619
+ custom_llm_provider=custom_llm_provider,
620
+ original_exception=e,
621
+ completion_kwargs={},
622
+ extra_kwargs=kwargs,
623
+ )
624
+
625
+
626
+ def create_thread(
627
+ custom_llm_provider: Literal["openai", "azure"],
628
+ messages: Optional[Iterable[OpenAICreateThreadParamsMessage]] = None,
629
+ metadata: Optional[dict] = None,
630
+ tool_resources: Optional[OpenAICreateThreadParamsToolResources] = None,
631
+ client: Optional[OpenAI] = None,
632
+ **kwargs,
633
+ ) -> Thread:
634
+ """
635
+ - get the llm provider
636
+ - if openai - route it there
637
+ - pass through relevant params
638
+
639
+ ```
640
+ from litellm import create_thread
641
+
642
+ create_thread(
643
+ custom_llm_provider="openai",
644
+ ### OPTIONAL ###
645
+ messages = [{
646
+ "role": "user",
647
+ "content": "Hello, what is AI?"
648
+ },
649
+ {
650
+ "role": "user",
651
+ "content": "How does AI work? Explain it in simple terms."
652
+ }]
653
+ )
654
+ ```
655
+ """
656
+ acreate_thread = kwargs.get("acreate_thread", None)
657
+ optional_params = GenericLiteLLMParams(**kwargs)
658
+ litellm_params_dict = get_litellm_params(**kwargs)
659
+
660
+ ### TIMEOUT LOGIC ###
661
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
662
+ # set timeout for 10 minutes by default
663
+
664
+ if (
665
+ timeout is not None
666
+ and isinstance(timeout, httpx.Timeout)
667
+ and supports_httpx_timeout(custom_llm_provider) is False
668
+ ):
669
+ read_timeout = timeout.read or 600
670
+ timeout = read_timeout # default 10 min timeout
671
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
672
+ timeout = float(timeout) # type: ignore
673
+ elif timeout is None:
674
+ timeout = 600.0
675
+
676
+ api_base: Optional[str] = None
677
+ api_key: Optional[str] = None
678
+
679
+ response: Optional[Thread] = None
680
+ if custom_llm_provider == "openai":
681
+ api_base = (
682
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
683
+ or litellm.api_base
684
+ or os.getenv("OPENAI_BASE_URL")
685
+ or os.getenv("OPENAI_API_BASE")
686
+ or "https://api.openai.com/v1"
687
+ )
688
+ organization = (
689
+ optional_params.organization
690
+ or litellm.organization
691
+ or os.getenv("OPENAI_ORGANIZATION", None)
692
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
693
+ )
694
+ # set API KEY
695
+ api_key = (
696
+ optional_params.api_key
697
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
698
+ or litellm.openai_key
699
+ or os.getenv("OPENAI_API_KEY")
700
+ )
701
+ response = openai_assistants_api.create_thread(
702
+ messages=messages,
703
+ metadata=metadata,
704
+ api_base=api_base,
705
+ api_key=api_key,
706
+ timeout=timeout,
707
+ max_retries=optional_params.max_retries,
708
+ organization=organization,
709
+ client=client,
710
+ acreate_thread=acreate_thread,
711
+ )
712
+ elif custom_llm_provider == "azure":
713
+ api_base = (
714
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
715
+ ) # type: ignore
716
+
717
+ api_key = (
718
+ optional_params.api_key
719
+ or litellm.api_key
720
+ or litellm.azure_key
721
+ or get_secret("AZURE_OPENAI_API_KEY")
722
+ or get_secret("AZURE_API_KEY")
723
+ ) # type: ignore
724
+
725
+ api_version: Optional[str] = (
726
+ optional_params.api_version
727
+ or litellm.api_version
728
+ or get_secret("AZURE_API_VERSION")
729
+ ) # type: ignore
730
+
731
+ extra_body = optional_params.get("extra_body", {})
732
+ azure_ad_token: Optional[str] = None
733
+ if extra_body is not None:
734
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
735
+ else:
736
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
737
+
738
+ if isinstance(client, OpenAI):
739
+ client = None # only pass client if it's AzureOpenAI
740
+
741
+ response = azure_assistants_api.create_thread(
742
+ messages=messages,
743
+ metadata=metadata,
744
+ api_base=api_base,
745
+ api_key=api_key,
746
+ azure_ad_token=azure_ad_token,
747
+ api_version=api_version,
748
+ timeout=timeout,
749
+ max_retries=optional_params.max_retries,
750
+ client=client,
751
+ acreate_thread=acreate_thread,
752
+ litellm_params=litellm_params_dict,
753
+ )
754
+ else:
755
+ raise litellm.exceptions.BadRequestError(
756
+ message="LiteLLM doesn't support {} for 'create_thread'. Only 'openai' is supported.".format(
757
+ custom_llm_provider
758
+ ),
759
+ model="n/a",
760
+ llm_provider=custom_llm_provider,
761
+ response=httpx.Response(
762
+ status_code=400,
763
+ content="Unsupported provider",
764
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
765
+ ),
766
+ )
767
+ return response # type: ignore
768
+
769
+
770
+ async def aget_thread(
771
+ custom_llm_provider: Literal["openai", "azure"],
772
+ thread_id: str,
773
+ client: Optional[AsyncOpenAI] = None,
774
+ **kwargs,
775
+ ) -> Thread:
776
+ loop = asyncio.get_event_loop()
777
+ ### PASS ARGS TO GET THREAD ###
778
+ kwargs["aget_thread"] = True
779
+ try:
780
+ # Use a partial function to pass your keyword arguments
781
+ func = partial(get_thread, custom_llm_provider, thread_id, client, **kwargs)
782
+
783
+ # Add the context to the function
784
+ ctx = contextvars.copy_context()
785
+ func_with_context = partial(ctx.run, func)
786
+
787
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
788
+ model="", custom_llm_provider=custom_llm_provider
789
+ ) # type: ignore
790
+
791
+ # Await normally
792
+ init_response = await loop.run_in_executor(None, func_with_context)
793
+ if asyncio.iscoroutine(init_response):
794
+ response = await init_response
795
+ else:
796
+ response = init_response
797
+ return response # type: ignore
798
+ except Exception as e:
799
+ raise exception_type(
800
+ model="",
801
+ custom_llm_provider=custom_llm_provider,
802
+ original_exception=e,
803
+ completion_kwargs={},
804
+ extra_kwargs=kwargs,
805
+ )
806
+
807
+
808
+ def get_thread(
809
+ custom_llm_provider: Literal["openai", "azure"],
810
+ thread_id: str,
811
+ client=None,
812
+ **kwargs,
813
+ ) -> Thread:
814
+ """Get the thread object, given a thread_id"""
815
+ aget_thread = kwargs.pop("aget_thread", None)
816
+ optional_params = GenericLiteLLMParams(**kwargs)
817
+ litellm_params_dict = get_litellm_params(**kwargs)
818
+ ### TIMEOUT LOGIC ###
819
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
820
+ # set timeout for 10 minutes by default
821
+
822
+ if (
823
+ timeout is not None
824
+ and isinstance(timeout, httpx.Timeout)
825
+ and supports_httpx_timeout(custom_llm_provider) is False
826
+ ):
827
+ read_timeout = timeout.read or 600
828
+ timeout = read_timeout # default 10 min timeout
829
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
830
+ timeout = float(timeout) # type: ignore
831
+ elif timeout is None:
832
+ timeout = 600.0
833
+ api_base: Optional[str] = None
834
+ api_key: Optional[str] = None
835
+ response: Optional[Thread] = None
836
+ if custom_llm_provider == "openai":
837
+ api_base = (
838
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
839
+ or litellm.api_base
840
+ or os.getenv("OPENAI_BASE_URL")
841
+ or os.getenv("OPENAI_API_BASE")
842
+ or "https://api.openai.com/v1"
843
+ )
844
+ organization = (
845
+ optional_params.organization
846
+ or litellm.organization
847
+ or os.getenv("OPENAI_ORGANIZATION", None)
848
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
849
+ )
850
+ # set API KEY
851
+ api_key = (
852
+ optional_params.api_key
853
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
854
+ or litellm.openai_key
855
+ or os.getenv("OPENAI_API_KEY")
856
+ )
857
+
858
+ response = openai_assistants_api.get_thread(
859
+ thread_id=thread_id,
860
+ api_base=api_base,
861
+ api_key=api_key,
862
+ timeout=timeout,
863
+ max_retries=optional_params.max_retries,
864
+ organization=organization,
865
+ client=client,
866
+ aget_thread=aget_thread,
867
+ )
868
+ elif custom_llm_provider == "azure":
869
+ api_base = (
870
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
871
+ ) # type: ignore
872
+
873
+ api_version: Optional[str] = (
874
+ optional_params.api_version
875
+ or litellm.api_version
876
+ or get_secret("AZURE_API_VERSION")
877
+ ) # type: ignore
878
+
879
+ api_key = (
880
+ optional_params.api_key
881
+ or litellm.api_key
882
+ or litellm.azure_key
883
+ or get_secret("AZURE_OPENAI_API_KEY")
884
+ or get_secret("AZURE_API_KEY")
885
+ ) # type: ignore
886
+
887
+ extra_body = optional_params.get("extra_body", {})
888
+ azure_ad_token: Optional[str] = None
889
+ if extra_body is not None:
890
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
891
+ else:
892
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
893
+
894
+ if isinstance(client, OpenAI):
895
+ client = None # only pass client if it's AzureOpenAI
896
+
897
+ response = azure_assistants_api.get_thread(
898
+ thread_id=thread_id,
899
+ api_base=api_base,
900
+ api_key=api_key,
901
+ azure_ad_token=azure_ad_token,
902
+ api_version=api_version,
903
+ timeout=timeout,
904
+ max_retries=optional_params.max_retries,
905
+ client=client,
906
+ aget_thread=aget_thread,
907
+ litellm_params=litellm_params_dict,
908
+ )
909
+ else:
910
+ raise litellm.exceptions.BadRequestError(
911
+ message="LiteLLM doesn't support {} for 'get_thread'. Only 'openai' is supported.".format(
912
+ custom_llm_provider
913
+ ),
914
+ model="n/a",
915
+ llm_provider=custom_llm_provider,
916
+ response=httpx.Response(
917
+ status_code=400,
918
+ content="Unsupported provider",
919
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
920
+ ),
921
+ )
922
+ return response # type: ignore
923
+
924
+
925
+ ### MESSAGES ###
926
+
927
+
928
+ async def a_add_message(
929
+ custom_llm_provider: Literal["openai", "azure"],
930
+ thread_id: str,
931
+ role: Literal["user", "assistant"],
932
+ content: str,
933
+ attachments: Optional[List[Attachment]] = None,
934
+ metadata: Optional[dict] = None,
935
+ client=None,
936
+ **kwargs,
937
+ ) -> OpenAIMessage:
938
+ loop = asyncio.get_event_loop()
939
+ ### PASS ARGS TO ADD MESSAGE ###
940
+ kwargs["a_add_message"] = True
941
+ try:
942
+ # Use a partial function to pass your keyword arguments
943
+ func = partial(
944
+ add_message,
945
+ custom_llm_provider,
946
+ thread_id,
947
+ role,
948
+ content,
949
+ attachments,
950
+ metadata,
951
+ client,
952
+ **kwargs,
953
+ )
954
+
955
+ # Add the context to the function
956
+ ctx = contextvars.copy_context()
957
+ func_with_context = partial(ctx.run, func)
958
+
959
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
960
+ model="", custom_llm_provider=custom_llm_provider
961
+ ) # type: ignore
962
+
963
+ # Await normally
964
+ init_response = await loop.run_in_executor(None, func_with_context)
965
+ if asyncio.iscoroutine(init_response):
966
+ response = await init_response
967
+ else:
968
+ # Call the synchronous function using run_in_executor
969
+ response = init_response
970
+ return response # type: ignore
971
+ except Exception as e:
972
+ raise exception_type(
973
+ model="",
974
+ custom_llm_provider=custom_llm_provider,
975
+ original_exception=e,
976
+ completion_kwargs={},
977
+ extra_kwargs=kwargs,
978
+ )
979
+
980
+
981
+ def add_message(
982
+ custom_llm_provider: Literal["openai", "azure"],
983
+ thread_id: str,
984
+ role: Literal["user", "assistant"],
985
+ content: str,
986
+ attachments: Optional[List[Attachment]] = None,
987
+ metadata: Optional[dict] = None,
988
+ client=None,
989
+ **kwargs,
990
+ ) -> OpenAIMessage:
991
+ ### COMMON OBJECTS ###
992
+ a_add_message = kwargs.pop("a_add_message", None)
993
+ _message_data = MessageData(
994
+ role=role, content=content, attachments=attachments, metadata=metadata
995
+ )
996
+ litellm_params_dict = get_litellm_params(**kwargs)
997
+ optional_params = GenericLiteLLMParams(**kwargs)
998
+
999
+ message_data = get_optional_params_add_message(
1000
+ role=_message_data["role"],
1001
+ content=_message_data["content"],
1002
+ attachments=_message_data["attachments"],
1003
+ metadata=_message_data["metadata"],
1004
+ custom_llm_provider=custom_llm_provider,
1005
+ )
1006
+
1007
+ ### TIMEOUT LOGIC ###
1008
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
1009
+ # set timeout for 10 minutes by default
1010
+
1011
+ if (
1012
+ timeout is not None
1013
+ and isinstance(timeout, httpx.Timeout)
1014
+ and supports_httpx_timeout(custom_llm_provider) is False
1015
+ ):
1016
+ read_timeout = timeout.read or 600
1017
+ timeout = read_timeout # default 10 min timeout
1018
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
1019
+ timeout = float(timeout) # type: ignore
1020
+ elif timeout is None:
1021
+ timeout = 600.0
1022
+ api_key: Optional[str] = None
1023
+ api_base: Optional[str] = None
1024
+ response: Optional[OpenAIMessage] = None
1025
+ if custom_llm_provider == "openai":
1026
+ api_base = (
1027
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
1028
+ or litellm.api_base
1029
+ or os.getenv("OPENAI_BASE_URL")
1030
+ or os.getenv("OPENAI_API_BASE")
1031
+ or "https://api.openai.com/v1"
1032
+ )
1033
+ organization = (
1034
+ optional_params.organization
1035
+ or litellm.organization
1036
+ or os.getenv("OPENAI_ORGANIZATION", None)
1037
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
1038
+ )
1039
+ # set API KEY
1040
+ api_key = (
1041
+ optional_params.api_key
1042
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
1043
+ or litellm.openai_key
1044
+ or os.getenv("OPENAI_API_KEY")
1045
+ )
1046
+ response = openai_assistants_api.add_message(
1047
+ thread_id=thread_id,
1048
+ message_data=message_data,
1049
+ api_base=api_base,
1050
+ api_key=api_key,
1051
+ timeout=timeout,
1052
+ max_retries=optional_params.max_retries,
1053
+ organization=organization,
1054
+ client=client,
1055
+ a_add_message=a_add_message,
1056
+ )
1057
+ elif custom_llm_provider == "azure":
1058
+ api_base = (
1059
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
1060
+ ) # type: ignore
1061
+
1062
+ api_version: Optional[str] = (
1063
+ optional_params.api_version
1064
+ or litellm.api_version
1065
+ or get_secret("AZURE_API_VERSION")
1066
+ ) # type: ignore
1067
+
1068
+ api_key = (
1069
+ optional_params.api_key
1070
+ or litellm.api_key
1071
+ or litellm.azure_key
1072
+ or get_secret("AZURE_OPENAI_API_KEY")
1073
+ or get_secret("AZURE_API_KEY")
1074
+ ) # type: ignore
1075
+
1076
+ extra_body = optional_params.get("extra_body", {})
1077
+ azure_ad_token: Optional[str] = None
1078
+ if extra_body is not None:
1079
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
1080
+ else:
1081
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
1082
+
1083
+ response = azure_assistants_api.add_message(
1084
+ thread_id=thread_id,
1085
+ message_data=message_data,
1086
+ api_base=api_base,
1087
+ api_key=api_key,
1088
+ api_version=api_version,
1089
+ azure_ad_token=azure_ad_token,
1090
+ timeout=timeout,
1091
+ max_retries=optional_params.max_retries,
1092
+ client=client,
1093
+ a_add_message=a_add_message,
1094
+ litellm_params=litellm_params_dict,
1095
+ )
1096
+ else:
1097
+ raise litellm.exceptions.BadRequestError(
1098
+ message="LiteLLM doesn't support {} for 'create_thread'. Only 'openai' is supported.".format(
1099
+ custom_llm_provider
1100
+ ),
1101
+ model="n/a",
1102
+ llm_provider=custom_llm_provider,
1103
+ response=httpx.Response(
1104
+ status_code=400,
1105
+ content="Unsupported provider",
1106
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
1107
+ ),
1108
+ )
1109
+
1110
+ return response # type: ignore
1111
+
1112
+
1113
+ async def aget_messages(
1114
+ custom_llm_provider: Literal["openai", "azure"],
1115
+ thread_id: str,
1116
+ client: Optional[AsyncOpenAI] = None,
1117
+ **kwargs,
1118
+ ) -> AsyncCursorPage[OpenAIMessage]:
1119
+ loop = asyncio.get_event_loop()
1120
+ ### PASS ARGS TO GET MESSAGES ###
1121
+ kwargs["aget_messages"] = True
1122
+ try:
1123
+ # Use a partial function to pass your keyword arguments
1124
+ func = partial(
1125
+ get_messages,
1126
+ custom_llm_provider,
1127
+ thread_id,
1128
+ client,
1129
+ **kwargs,
1130
+ )
1131
+
1132
+ # Add the context to the function
1133
+ ctx = contextvars.copy_context()
1134
+ func_with_context = partial(ctx.run, func)
1135
+
1136
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
1137
+ model="", custom_llm_provider=custom_llm_provider
1138
+ ) # type: ignore
1139
+
1140
+ # Await normally
1141
+ init_response = await loop.run_in_executor(None, func_with_context)
1142
+ if asyncio.iscoroutine(init_response):
1143
+ response = await init_response
1144
+ else:
1145
+ # Call the synchronous function using run_in_executor
1146
+ response = init_response
1147
+ return response # type: ignore
1148
+ except Exception as e:
1149
+ raise exception_type(
1150
+ model="",
1151
+ custom_llm_provider=custom_llm_provider,
1152
+ original_exception=e,
1153
+ completion_kwargs={},
1154
+ extra_kwargs=kwargs,
1155
+ )
1156
+
1157
+
1158
+ def get_messages(
1159
+ custom_llm_provider: Literal["openai", "azure"],
1160
+ thread_id: str,
1161
+ client: Optional[Any] = None,
1162
+ **kwargs,
1163
+ ) -> SyncCursorPage[OpenAIMessage]:
1164
+ aget_messages = kwargs.pop("aget_messages", None)
1165
+ optional_params = GenericLiteLLMParams(**kwargs)
1166
+ litellm_params_dict = get_litellm_params(**kwargs)
1167
+
1168
+ ### TIMEOUT LOGIC ###
1169
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
1170
+ # set timeout for 10 minutes by default
1171
+
1172
+ if (
1173
+ timeout is not None
1174
+ and isinstance(timeout, httpx.Timeout)
1175
+ and supports_httpx_timeout(custom_llm_provider) is False
1176
+ ):
1177
+ read_timeout = timeout.read or 600
1178
+ timeout = read_timeout # default 10 min timeout
1179
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
1180
+ timeout = float(timeout) # type: ignore
1181
+ elif timeout is None:
1182
+ timeout = 600.0
1183
+
1184
+ response: Optional[SyncCursorPage[OpenAIMessage]] = None
1185
+ api_key: Optional[str] = None
1186
+ api_base: Optional[str] = None
1187
+ if custom_llm_provider == "openai":
1188
+ api_base = (
1189
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
1190
+ or litellm.api_base
1191
+ or os.getenv("OPENAI_BASE_URL")
1192
+ or os.getenv("OPENAI_API_BASE")
1193
+ or "https://api.openai.com/v1"
1194
+ )
1195
+ organization = (
1196
+ optional_params.organization
1197
+ or litellm.organization
1198
+ or os.getenv("OPENAI_ORGANIZATION", None)
1199
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
1200
+ )
1201
+ # set API KEY
1202
+ api_key = (
1203
+ optional_params.api_key
1204
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
1205
+ or litellm.openai_key
1206
+ or os.getenv("OPENAI_API_KEY")
1207
+ )
1208
+ response = openai_assistants_api.get_messages(
1209
+ thread_id=thread_id,
1210
+ api_base=api_base,
1211
+ api_key=api_key,
1212
+ timeout=timeout,
1213
+ max_retries=optional_params.max_retries,
1214
+ organization=organization,
1215
+ client=client,
1216
+ aget_messages=aget_messages,
1217
+ )
1218
+ elif custom_llm_provider == "azure":
1219
+ api_base = (
1220
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
1221
+ ) # type: ignore
1222
+
1223
+ api_version: Optional[str] = (
1224
+ optional_params.api_version
1225
+ or litellm.api_version
1226
+ or get_secret("AZURE_API_VERSION")
1227
+ ) # type: ignore
1228
+
1229
+ api_key = (
1230
+ optional_params.api_key
1231
+ or litellm.api_key
1232
+ or litellm.azure_key
1233
+ or get_secret("AZURE_OPENAI_API_KEY")
1234
+ or get_secret("AZURE_API_KEY")
1235
+ ) # type: ignore
1236
+
1237
+ extra_body = optional_params.get("extra_body", {})
1238
+ azure_ad_token: Optional[str] = None
1239
+ if extra_body is not None:
1240
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
1241
+ else:
1242
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
1243
+
1244
+ response = azure_assistants_api.get_messages(
1245
+ thread_id=thread_id,
1246
+ api_base=api_base,
1247
+ api_key=api_key,
1248
+ api_version=api_version,
1249
+ azure_ad_token=azure_ad_token,
1250
+ timeout=timeout,
1251
+ max_retries=optional_params.max_retries,
1252
+ client=client,
1253
+ aget_messages=aget_messages,
1254
+ litellm_params=litellm_params_dict,
1255
+ )
1256
+ else:
1257
+ raise litellm.exceptions.BadRequestError(
1258
+ message="LiteLLM doesn't support {} for 'get_messages'. Only 'openai' is supported.".format(
1259
+ custom_llm_provider
1260
+ ),
1261
+ model="n/a",
1262
+ llm_provider=custom_llm_provider,
1263
+ response=httpx.Response(
1264
+ status_code=400,
1265
+ content="Unsupported provider",
1266
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
1267
+ ),
1268
+ )
1269
+
1270
+ return response # type: ignore
1271
+
1272
+
1273
+ ### RUNS ###
1274
+ def arun_thread_stream(
1275
+ *,
1276
+ event_handler: Optional[AssistantEventHandler] = None,
1277
+ **kwargs,
1278
+ ) -> AsyncAssistantStreamManager[AsyncAssistantEventHandler]:
1279
+ kwargs["arun_thread"] = True
1280
+ return run_thread(stream=True, event_handler=event_handler, **kwargs) # type: ignore
1281
+
1282
+
1283
+ async def arun_thread(
1284
+ custom_llm_provider: Literal["openai", "azure"],
1285
+ thread_id: str,
1286
+ assistant_id: str,
1287
+ additional_instructions: Optional[str] = None,
1288
+ instructions: Optional[str] = None,
1289
+ metadata: Optional[dict] = None,
1290
+ model: Optional[str] = None,
1291
+ stream: Optional[bool] = None,
1292
+ tools: Optional[Iterable[AssistantToolParam]] = None,
1293
+ client: Optional[Any] = None,
1294
+ **kwargs,
1295
+ ) -> Run:
1296
+ loop = asyncio.get_event_loop()
1297
+ ### PASS ARGS TO RUN THREAD ###
1298
+ kwargs["arun_thread"] = True
1299
+ try:
1300
+ # Use a partial function to pass your keyword arguments
1301
+ func = partial(
1302
+ run_thread,
1303
+ custom_llm_provider,
1304
+ thread_id,
1305
+ assistant_id,
1306
+ additional_instructions,
1307
+ instructions,
1308
+ metadata,
1309
+ model,
1310
+ stream,
1311
+ tools,
1312
+ client,
1313
+ **kwargs,
1314
+ )
1315
+
1316
+ # Add the context to the function
1317
+ ctx = contextvars.copy_context()
1318
+ func_with_context = partial(ctx.run, func)
1319
+
1320
+ _, custom_llm_provider, _, _ = get_llm_provider( # type: ignore
1321
+ model="", custom_llm_provider=custom_llm_provider
1322
+ ) # type: ignore
1323
+
1324
+ # Await normally
1325
+ init_response = await loop.run_in_executor(None, func_with_context)
1326
+ if asyncio.iscoroutine(init_response):
1327
+ response = await init_response
1328
+ else:
1329
+ # Call the synchronous function using run_in_executor
1330
+ response = init_response
1331
+ return response # type: ignore
1332
+ except Exception as e:
1333
+ raise exception_type(
1334
+ model="",
1335
+ custom_llm_provider=custom_llm_provider,
1336
+ original_exception=e,
1337
+ completion_kwargs={},
1338
+ extra_kwargs=kwargs,
1339
+ )
1340
+
1341
+
1342
+ def run_thread_stream(
1343
+ *,
1344
+ event_handler: Optional[AssistantEventHandler] = None,
1345
+ **kwargs,
1346
+ ) -> AssistantStreamManager[AssistantEventHandler]:
1347
+ return run_thread(stream=True, event_handler=event_handler, **kwargs) # type: ignore
1348
+
1349
+
1350
+ def run_thread(
1351
+ custom_llm_provider: Literal["openai", "azure"],
1352
+ thread_id: str,
1353
+ assistant_id: str,
1354
+ additional_instructions: Optional[str] = None,
1355
+ instructions: Optional[str] = None,
1356
+ metadata: Optional[dict] = None,
1357
+ model: Optional[str] = None,
1358
+ stream: Optional[bool] = None,
1359
+ tools: Optional[Iterable[AssistantToolParam]] = None,
1360
+ client: Optional[Any] = None,
1361
+ event_handler: Optional[AssistantEventHandler] = None, # for stream=True calls
1362
+ **kwargs,
1363
+ ) -> Run:
1364
+ """Run a given thread + assistant."""
1365
+ arun_thread = kwargs.pop("arun_thread", None)
1366
+ optional_params = GenericLiteLLMParams(**kwargs)
1367
+ litellm_params_dict = get_litellm_params(**kwargs)
1368
+
1369
+ ### TIMEOUT LOGIC ###
1370
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
1371
+ # set timeout for 10 minutes by default
1372
+
1373
+ if (
1374
+ timeout is not None
1375
+ and isinstance(timeout, httpx.Timeout)
1376
+ and supports_httpx_timeout(custom_llm_provider) is False
1377
+ ):
1378
+ read_timeout = timeout.read or 600
1379
+ timeout = read_timeout # default 10 min timeout
1380
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
1381
+ timeout = float(timeout) # type: ignore
1382
+ elif timeout is None:
1383
+ timeout = 600.0
1384
+
1385
+ response: Optional[Run] = None
1386
+ if custom_llm_provider == "openai":
1387
+ api_base = (
1388
+ optional_params.api_base # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
1389
+ or litellm.api_base
1390
+ or os.getenv("OPENAI_BASE_URL")
1391
+ or os.getenv("OPENAI_API_BASE")
1392
+ or "https://api.openai.com/v1"
1393
+ )
1394
+ organization = (
1395
+ optional_params.organization
1396
+ or litellm.organization
1397
+ or os.getenv("OPENAI_ORGANIZATION", None)
1398
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
1399
+ )
1400
+ # set API KEY
1401
+ api_key = (
1402
+ optional_params.api_key
1403
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
1404
+ or litellm.openai_key
1405
+ or os.getenv("OPENAI_API_KEY")
1406
+ )
1407
+
1408
+ response = openai_assistants_api.run_thread(
1409
+ thread_id=thread_id,
1410
+ assistant_id=assistant_id,
1411
+ additional_instructions=additional_instructions,
1412
+ instructions=instructions,
1413
+ metadata=metadata,
1414
+ model=model,
1415
+ stream=stream,
1416
+ tools=tools,
1417
+ api_base=api_base,
1418
+ api_key=api_key,
1419
+ timeout=timeout,
1420
+ max_retries=optional_params.max_retries,
1421
+ organization=organization,
1422
+ client=client,
1423
+ arun_thread=arun_thread,
1424
+ event_handler=event_handler,
1425
+ )
1426
+ elif custom_llm_provider == "azure":
1427
+ api_base = (
1428
+ optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE")
1429
+ ) # type: ignore
1430
+
1431
+ api_version = (
1432
+ optional_params.api_version
1433
+ or litellm.api_version
1434
+ or get_secret("AZURE_API_VERSION")
1435
+ ) # type: ignore
1436
+
1437
+ api_key = (
1438
+ optional_params.api_key
1439
+ or litellm.api_key
1440
+ or litellm.azure_key
1441
+ or get_secret("AZURE_OPENAI_API_KEY")
1442
+ or get_secret("AZURE_API_KEY")
1443
+ ) # type: ignore
1444
+
1445
+ extra_body = optional_params.get("extra_body", {})
1446
+ azure_ad_token = None
1447
+ if extra_body is not None:
1448
+ azure_ad_token = extra_body.pop("azure_ad_token", None)
1449
+ else:
1450
+ azure_ad_token = get_secret("AZURE_AD_TOKEN") # type: ignore
1451
+
1452
+ response = azure_assistants_api.run_thread(
1453
+ thread_id=thread_id,
1454
+ assistant_id=assistant_id,
1455
+ additional_instructions=additional_instructions,
1456
+ instructions=instructions,
1457
+ metadata=metadata,
1458
+ model=model,
1459
+ stream=stream,
1460
+ tools=tools,
1461
+ api_base=str(api_base) if api_base is not None else None,
1462
+ api_key=str(api_key) if api_key is not None else None,
1463
+ api_version=str(api_version) if api_version is not None else None,
1464
+ azure_ad_token=str(azure_ad_token) if azure_ad_token is not None else None,
1465
+ timeout=timeout,
1466
+ max_retries=optional_params.max_retries,
1467
+ client=client,
1468
+ arun_thread=arun_thread,
1469
+ litellm_params=litellm_params_dict,
1470
+ ) # type: ignore
1471
+ else:
1472
+ raise litellm.exceptions.BadRequestError(
1473
+ message="LiteLLM doesn't support {} for 'run_thread'. Only 'openai' is supported.".format(
1474
+ custom_llm_provider
1475
+ ),
1476
+ model="n/a",
1477
+ llm_provider=custom_llm_provider,
1478
+ response=httpx.Response(
1479
+ status_code=400,
1480
+ content="Unsupported provider",
1481
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
1482
+ ),
1483
+ )
1484
+ return response # type: ignore
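A minimal end-to-end sketch of the thread APIs added above, assuming an OpenAI key in `OPENAI_API_KEY` and an already-created assistant (the `asst_...` id below is a placeholder, not a real value):

```python
# Sketch only: create a thread, add a message, then run it against an assistant.
# Substitute an assistant id created in your own account.
import litellm

thread = litellm.create_thread(
    custom_llm_provider="openai",
    messages=[{"role": "user", "content": "Hello, what is AI?"}],
)

litellm.add_message(
    custom_llm_provider="openai",
    thread_id=thread.id,
    role="user",
    content="How does AI work? Explain it in simple terms.",
)

run = litellm.run_thread(
    custom_llm_provider="openai",
    thread_id=thread.id,
    assistant_id="asst_placeholder",  # placeholder id
)
print(run.status)
```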
litellm/assistants/utils.py ADDED
@@ -0,0 +1,161 @@
1
+ from typing import Optional, Union
2
+
3
+ import litellm
4
+
5
+ from ..exceptions import UnsupportedParamsError
6
+ from ..types.llms.openai import *
7
+
8
+
9
+ def get_optional_params_add_message(
10
+ role: Optional[str],
11
+ content: Optional[
12
+ Union[
13
+ str,
14
+ List[
15
+ Union[
16
+ MessageContentTextObject,
17
+ MessageContentImageFileObject,
18
+ MessageContentImageURLObject,
19
+ ]
20
+ ],
21
+ ]
22
+ ],
23
+ attachments: Optional[List[Attachment]],
24
+ metadata: Optional[dict],
25
+ custom_llm_provider: str,
26
+ **kwargs,
27
+ ):
28
+ """
29
+ Azure doesn't support 'attachments' for creating a message
30
+
31
+ Reference - https://learn.microsoft.com/en-us/azure/ai-services/openai/assistants-reference-messages?tabs=python#create-message
32
+ """
33
+ passed_params = locals()
34
+ custom_llm_provider = passed_params.pop("custom_llm_provider")
35
+ special_params = passed_params.pop("kwargs")
36
+ for k, v in special_params.items():
37
+ passed_params[k] = v
38
+
39
+ default_params = {
40
+ "role": None,
41
+ "content": None,
42
+ "attachments": None,
43
+ "metadata": None,
44
+ }
45
+
46
+ non_default_params = {
47
+ k: v
48
+ for k, v in passed_params.items()
49
+ if (k in default_params and v != default_params[k])
50
+ }
51
+ optional_params = {}
52
+
53
+ ## raise exception if non-default value passed for non-openai/azure message calls
54
+ def _check_valid_arg(supported_params):
55
+ if len(non_default_params.keys()) > 0:
56
+ keys = list(non_default_params.keys())
57
+ for k in keys:
58
+ if (
59
+ litellm.drop_params is True and k not in supported_params
60
+ ): # drop the unsupported non-default values
61
+ non_default_params.pop(k, None)
62
+ elif k not in supported_params:
63
+ raise litellm.utils.UnsupportedParamsError(
64
+ status_code=500,
65
+ message="k={}, not supported by {}. Supported params={}. To drop it from the call, set `litellm.drop_params = True`.".format(
66
+ k, custom_llm_provider, supported_params
67
+ ),
68
+ )
69
+ return non_default_params
70
+
71
+ if custom_llm_provider == "openai":
72
+ optional_params = non_default_params
73
+ elif custom_llm_provider == "azure":
74
+ supported_params = (
75
+ litellm.AzureOpenAIAssistantsAPIConfig().get_supported_openai_create_message_params()
76
+ )
77
+ _check_valid_arg(supported_params=supported_params)
78
+ optional_params = litellm.AzureOpenAIAssistantsAPIConfig().map_openai_params_create_message_params(
79
+ non_default_params=non_default_params, optional_params=optional_params
80
+ )
81
+ for k in passed_params.keys():
82
+ if k not in default_params.keys():
83
+ optional_params[k] = passed_params[k]
84
+ return optional_params
85
+
86
+
87
+ def get_optional_params_image_gen(
88
+ n: Optional[int] = None,
89
+ quality: Optional[str] = None,
90
+ response_format: Optional[str] = None,
91
+ size: Optional[str] = None,
92
+ style: Optional[str] = None,
93
+ user: Optional[str] = None,
94
+ custom_llm_provider: Optional[str] = None,
95
+ **kwargs,
96
+ ):
97
+ # retrieve all parameters passed to the function
98
+ passed_params = locals()
99
+ custom_llm_provider = passed_params.pop("custom_llm_provider")
100
+ special_params = passed_params.pop("kwargs")
101
+ for k, v in special_params.items():
102
+ passed_params[k] = v
103
+
104
+ default_params = {
105
+ "n": None,
106
+ "quality": None,
107
+ "response_format": None,
108
+ "size": None,
109
+ "style": None,
110
+ "user": None,
111
+ }
112
+
113
+ non_default_params = {
114
+ k: v
115
+ for k, v in passed_params.items()
116
+ if (k in default_params and v != default_params[k])
117
+ }
118
+ optional_params = {}
119
+
120
+ ## raise exception if non-default value passed for non-openai/azure image generation calls
121
+ def _check_valid_arg(supported_params):
122
+ if len(non_default_params.keys()) > 0:
123
+ keys = list(non_default_params.keys())
124
+ for k in keys:
125
+ if (
126
+ litellm.drop_params is True and k not in supported_params
127
+ ): # drop the unsupported non-default values
128
+ non_default_params.pop(k, None)
129
+ elif k not in supported_params:
130
+ raise UnsupportedParamsError(
131
+ status_code=500,
132
+ message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.",
133
+ )
134
+ return non_default_params
135
+
136
+ if (
137
+ custom_llm_provider == "openai"
138
+ or custom_llm_provider == "azure"
139
+ or custom_llm_provider in litellm.openai_compatible_providers
140
+ ):
141
+ optional_params = non_default_params
142
+ elif custom_llm_provider == "bedrock":
143
+ supported_params = ["size"]
144
+ _check_valid_arg(supported_params=supported_params)
145
+ if size is not None:
146
+ width, height = size.split("x")
147
+ optional_params["width"] = int(width)
148
+ optional_params["height"] = int(height)
149
+ elif custom_llm_provider == "vertex_ai":
150
+ supported_params = ["n"]
151
+ """
152
+ All params here: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/imagegeneration?project=adroit-crow-413218
153
+ """
154
+ _check_valid_arg(supported_params=supported_params)
155
+ if n is not None:
156
+ optional_params["sampleCount"] = int(n)
157
+
158
+ for k in passed_params.keys():
159
+ if k not in default_params.keys():
160
+ optional_params[k] = passed_params[k]
161
+ return optional_params
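Both helpers above follow the same pattern: collect the non-default params, then either pass them through (OpenAI) or validate them against the provider's supported list, dropping or rejecting the rest. A small sketch of the `drop_params` behaviour, using only the helper defined in this file; the metadata value is illustrative:

```python
# Sketch: with litellm.drop_params enabled, params outside the provider's
# supported list are dropped instead of raising UnsupportedParamsError.
import litellm
from litellm.assistants.utils import get_optional_params_add_message

litellm.drop_params = True
params = get_optional_params_add_message(
    role="user",
    content="Hello",
    attachments=None,
    metadata={"source": "docs-example"},  # illustrative metadata
    custom_llm_provider="azure",
)
print(params)
```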
litellm/batch_completion/Readme.md ADDED
@@ -0,0 +1,11 @@
1
+ # Implementation of `litellm.batch_completion`, `litellm.batch_completion_models`, `litellm.batch_completion_models_all_responses`
2
+
3
+ Doc: https://docs.litellm.ai/docs/completion/batching
4
+
5
+
6
+ The LiteLLM Python SDK allows you to:
7
+ 1. `litellm.batch_completion`: batch the `litellm.completion` call for a given model over a list of message lists.
8
+ 2. `litellm.batch_completion_models`: send a request to multiple language models concurrently and return the response
9
+ as soon as one of the models responds.
10
+ 3. `litellm.batch_completion_models_all_responses`: send a request to multiple language models concurrently and return a list of responses
11
+ from all models that respond.
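A usage sketch of the three helpers listed above; the model names are illustrative and assume the corresponding provider keys are set in the environment:

```python
import litellm

# 1. One model, many conversations
responses = litellm.batch_completion(
    model="gpt-3.5-turbo",
    messages=[
        [{"role": "user", "content": "Hi"}],
        [{"role": "user", "content": "Tell me a joke"}],
    ],
)

# 2. Many models, return the first response (checked in the order given)
fastest = litellm.batch_completion_models(
    models=["gpt-3.5-turbo", "claude-3-haiku-20240307"],
    messages=[{"role": "user", "content": "Hi"}],
)

# 3. Many models, collect every response
all_responses = litellm.batch_completion_models_all_responses(
    models=["gpt-3.5-turbo", "claude-3-haiku-20240307"],
    messages=[{"role": "user", "content": "Hi"}],
)
```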
litellm/batch_completion/main.py ADDED
@@ -0,0 +1,253 @@
1
+ from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
2
+ from typing import List, Optional
3
+
4
+ import litellm
5
+ from litellm._logging import print_verbose
6
+ from litellm.utils import get_optional_params
7
+
8
+ from ..llms.vllm.completion import handler as vllm_handler
9
+
10
+
11
+ def batch_completion(
12
+ model: str,
13
+ # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
14
+ messages: List = [],
15
+ functions: Optional[List] = None,
16
+ function_call: Optional[str] = None,
17
+ temperature: Optional[float] = None,
18
+ top_p: Optional[float] = None,
19
+ n: Optional[int] = None,
20
+ stream: Optional[bool] = None,
21
+ stop=None,
22
+ max_tokens: Optional[int] = None,
23
+ presence_penalty: Optional[float] = None,
24
+ frequency_penalty: Optional[float] = None,
25
+ logit_bias: Optional[dict] = None,
26
+ user: Optional[str] = None,
27
+ deployment_id=None,
28
+ request_timeout: Optional[int] = None,
29
+ timeout: Optional[int] = 600,
30
+ max_workers: Optional[int] = 100,
31
+ # Optional liteLLM function params
32
+ **kwargs,
33
+ ):
34
+ """
35
+ Batch litellm.completion function for a given model.
36
+
37
+ Args:
38
+ model (str): The model to use for generating completions.
39
+ messages (List, optional): List of messages to use as input for generating completions. Defaults to [].
40
+ functions (List, optional): List of functions to use as input for generating completions. Defaults to [].
41
+ function_call (str, optional): The function call to use as input for generating completions. Defaults to "".
42
+ temperature (float, optional): The temperature parameter for generating completions. Defaults to None.
43
+ top_p (float, optional): The top-p parameter for generating completions. Defaults to None.
44
+ n (int, optional): The number of completions to generate. Defaults to None.
45
+ stream (bool, optional): Whether to stream completions or not. Defaults to None.
46
+ stop (optional): The stop parameter for generating completions. Defaults to None.
47
+ max_tokens (float, optional): The maximum number of tokens to generate. Defaults to None.
48
+ presence_penalty (float, optional): The presence penalty for generating completions. Defaults to None.
49
+ frequency_penalty (float, optional): The frequency penalty for generating completions. Defaults to None.
50
+ logit_bias (dict, optional): The logit bias for generating completions. Defaults to {}.
51
+ user (str, optional): The user string for generating completions. Defaults to "".
52
+ deployment_id (optional): The deployment ID for generating completions. Defaults to None.
53
+ request_timeout (int, optional): The request timeout for generating completions. Defaults to None.
54
+ max_workers (int,optional): The maximum number of threads to use for parallel processing.
55
+
56
+ Returns:
57
+ list: A list of completion results.
58
+ """
59
+ args = locals()
60
+
61
+ batch_messages = messages
62
+ completions = []
63
+ model = model
64
+ custom_llm_provider = None
65
+ if model.split("/", 1)[0] in litellm.provider_list:
66
+ custom_llm_provider = model.split("/", 1)[0]
67
+ model = model.split("/", 1)[1]
68
+ if custom_llm_provider == "vllm":
69
+ optional_params = get_optional_params(
70
+ functions=functions,
71
+ function_call=function_call,
72
+ temperature=temperature,
73
+ top_p=top_p,
74
+ n=n,
75
+ stream=stream or False,
76
+ stop=stop,
77
+ max_tokens=max_tokens,
78
+ presence_penalty=presence_penalty,
79
+ frequency_penalty=frequency_penalty,
80
+ logit_bias=logit_bias,
81
+ user=user,
82
+ # params to identify the model
83
+ model=model,
84
+ custom_llm_provider=custom_llm_provider,
85
+ )
86
+ results = vllm_handler.batch_completions(
87
+ model=model,
88
+ messages=batch_messages,
89
+ custom_prompt_dict=litellm.custom_prompt_dict,
90
+ optional_params=optional_params,
91
+ )
92
+ # all non VLLM models for batch completion models
93
+ else:
94
+
95
+ def chunks(lst, n):
96
+ """Yield successive n-sized chunks from lst."""
97
+ for i in range(0, len(lst), n):
98
+ yield lst[i : i + n]
99
+
100
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
101
+ for sub_batch in chunks(batch_messages, 100):
102
+ for message_list in sub_batch:
103
+ kwargs_modified = args.copy()
104
+ kwargs_modified.pop("max_workers")
105
+ kwargs_modified["messages"] = message_list
106
+ original_kwargs = {}
107
+ if "kwargs" in kwargs_modified:
108
+ original_kwargs = kwargs_modified.pop("kwargs")
109
+ future = executor.submit(
110
+ litellm.completion, **kwargs_modified, **original_kwargs
111
+ )
112
+ completions.append(future)
113
+
114
+ # Retrieve the results from the futures
115
+ # results = [future.result() for future in completions]
116
+ # return exceptions if any
117
+ results = []
118
+ for future in completions:
119
+ try:
120
+ results.append(future.result())
121
+ except Exception as exc:
122
+ results.append(exc)
123
+
124
+ return results
125
+
126
+
127
+ # send one request to multiple models
128
+ # return as soon as one of the llms responds
129
+ def batch_completion_models(*args, **kwargs):
130
+ """
131
+ Send a request to multiple language models concurrently and return the response
132
+ as soon as one of the models responds.
133
+
134
+ Args:
135
+ *args: Variable-length positional arguments passed to the completion function.
136
+ **kwargs: Additional keyword arguments:
137
+ - models (str or list of str): The language models to send requests to.
138
+ - Other keyword arguments to be passed to the completion function.
139
+
140
+ Returns:
141
+ str or None: The response from one of the language models, or None if no response is received.
142
+
143
+ Note:
144
+ This function utilizes a ThreadPoolExecutor to parallelize requests to multiple models.
145
+ It sends requests concurrently and returns the response from the first model that responds.
146
+ """
147
+
148
+ if "model" in kwargs:
149
+ kwargs.pop("model")
150
+ if "models" in kwargs:
151
+ models = kwargs["models"]
152
+ kwargs.pop("models")
153
+ futures = {}
154
+ with ThreadPoolExecutor(max_workers=len(models)) as executor:
155
+ for model in models:
156
+ futures[model] = executor.submit(
157
+ litellm.completion, *args, model=model, **kwargs
158
+ )
159
+
160
+ for model, future in sorted(
161
+ futures.items(), key=lambda x: models.index(x[0])
162
+ ):
163
+ if future.result() is not None:
164
+ return future.result()
165
+ elif "deployments" in kwargs:
166
+ deployments = kwargs["deployments"]
167
+ kwargs.pop("deployments")
168
+ kwargs.pop("model_list")
169
+ nested_kwargs = kwargs.pop("kwargs", {})
170
+ futures = {}
171
+ with ThreadPoolExecutor(max_workers=len(deployments)) as executor:
172
+ for deployment in deployments:
173
+ for key in kwargs.keys():
174
+ if (
175
+ key not in deployment
176
+ ): # don't override deployment values e.g. model name, api base, etc.
177
+ deployment[key] = kwargs[key]
178
+ kwargs = {**deployment, **nested_kwargs}
179
+ futures[deployment["model"]] = executor.submit(
180
+ litellm.completion, **kwargs
181
+ )
182
+
183
+ while futures:
184
+ # wait for the first returned future
185
+ print_verbose("\n\n waiting for next result\n\n")
186
+ done, _ = wait(futures.values(), return_when=FIRST_COMPLETED)
187
+ print_verbose(f"done list\n{done}")
188
+ for future in done:
189
+ try:
190
+ result = future.result()
191
+ return result
192
+ except Exception:
193
+ # if model 1 fails, continue with response from model 2, model3
194
+ print_verbose(
195
+ "\n\ngot an exception, ignoring, removing from futures"
196
+ )
197
+ print_verbose(futures)
198
+ new_futures = {}
199
+ for key, value in futures.items():
200
+ if future == value:
201
+ print_verbose(f"removing key{key}")
202
+ continue
203
+ else:
204
+ new_futures[key] = value
205
+ futures = new_futures
206
+ print_verbose(f"new futures{futures}")
207
+ continue
208
+
209
+ print_verbose("\n\ndone looping through futures\n\n")
210
+ print_verbose(futures)
211
+
212
+ return None # If no response is received from any model
213
+
214
+
215
+ def batch_completion_models_all_responses(*args, **kwargs):
216
+ """
217
+ Send a request to multiple language models concurrently and return a list of responses
218
+ from all models that respond.
219
+
220
+ Args:
221
+ *args: Variable-length positional arguments passed to the completion function.
222
+ **kwargs: Additional keyword arguments:
223
+ - models (str or list of str): The language models to send requests to.
224
+ - Other keyword arguments to be passed to the completion function.
225
+
226
+ Returns:
227
+ list: A list of responses from the language models that responded.
228
+
229
+ Note:
230
+ This function utilizes a ThreadPoolExecutor to parallelize requests to multiple models.
231
+ It sends requests concurrently and collects responses from all models that respond.
232
+ """
233
+ import concurrent.futures
234
+
235
+ # ANSI escape codes for colored output
236
+
237
+ if "model" in kwargs:
238
+ kwargs.pop("model")
239
+ if "models" in kwargs:
240
+ models = kwargs["models"]
241
+ kwargs.pop("models")
242
+ else:
243
+ raise Exception("'models' param not in kwargs")
244
+
245
+ responses = []
246
+
247
+ with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
248
+ # submit every model first, then gather, so the calls actually run concurrently
+ futures = [executor.submit(litellm.completion, *args, model=model, **kwargs) for model in models]
249
+ for future in futures:
250
+ result = future.result()
251
+ if result is not None:
+ responses.append(result)
252
+
253
+ return responses
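The non-vLLM branch of `batch_completion` above reduces to a fan-out/collect pattern: submit one completion call per message list to a thread pool, then gather results in order, keeping exceptions in place of failed calls. A standalone sketch of that pattern, with a stand-in `fake_completion` so it runs without any provider credentials:

```python
from concurrent.futures import ThreadPoolExecutor

def fake_completion(messages):
    """Stand-in for litellm.completion so the sketch runs offline."""
    if not messages:
        raise ValueError("empty message list")
    return {"echo": messages[-1]["content"]}

batch_messages = [
    [{"role": "user", "content": "Hi"}],
    [],  # produces an exception entry, mirroring batch_completion's behaviour
]

futures = []
with ThreadPoolExecutor(max_workers=4) as executor:
    for message_list in batch_messages:
        futures.append(executor.submit(fake_completion, message_list))

# Results are collected in submission order; exceptions are returned, not raised.
results = []
for future in futures:
    try:
        results.append(future.result())
    except Exception as exc:
        results.append(exc)

print(results)
```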
litellm/batches/batch_utils.py ADDED
@@ -0,0 +1,182 @@
1
+ import json
2
+ from typing import Any, List, Literal, Tuple
3
+
4
+ import litellm
5
+ from litellm._logging import verbose_logger
6
+ from litellm.types.llms.openai import Batch
7
+ from litellm.types.utils import CallTypes, Usage
8
+
9
+
10
+ async def _handle_completed_batch(
11
+ batch: Batch,
12
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
13
+ ) -> Tuple[float, Usage, List[str]]:
14
+ """Helper function to process a completed batch and handle logging"""
15
+ # Get batch results
16
+ file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
17
+ batch, custom_llm_provider
18
+ )
19
+
20
+ # Calculate costs and usage
21
+ batch_cost = await _batch_cost_calculator(
22
+ custom_llm_provider=custom_llm_provider,
23
+ file_content_dictionary=file_content_dictionary,
24
+ )
25
+ batch_usage = _get_batch_job_total_usage_from_file_content(
26
+ file_content_dictionary=file_content_dictionary,
27
+ custom_llm_provider=custom_llm_provider,
28
+ )
29
+
30
+ batch_models = _get_batch_models_from_file_content(file_content_dictionary)
31
+
32
+ return batch_cost, batch_usage, batch_models
33
+
34
+
35
+ def _get_batch_models_from_file_content(
36
+ file_content_dictionary: List[dict],
37
+ ) -> List[str]:
38
+ """
39
+ Get the models from the file content
40
+ """
41
+ batch_models = []
42
+ for _item in file_content_dictionary:
43
+ if _batch_response_was_successful(_item):
44
+ _response_body = _get_response_from_batch_job_output_file(_item)
45
+ _model = _response_body.get("model")
46
+ if _model:
47
+ batch_models.append(_model)
48
+ return batch_models
49
+
50
+
51
+ async def _batch_cost_calculator(
52
+ file_content_dictionary: List[dict],
53
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
54
+ ) -> float:
55
+ """
56
+ Calculate the cost of a batch based on the output file id
57
+ """
58
+ if custom_llm_provider == "vertex_ai":
59
+ raise ValueError("Vertex AI does not support file content retrieval")
60
+ total_cost = _get_batch_job_cost_from_file_content(
61
+ file_content_dictionary=file_content_dictionary,
62
+ custom_llm_provider=custom_llm_provider,
63
+ )
64
+ verbose_logger.debug("total_cost=%s", total_cost)
65
+ return total_cost
66
+
67
+
68
+ async def _get_batch_output_file_content_as_dictionary(
69
+ batch: Batch,
70
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
71
+ ) -> List[dict]:
72
+ """
73
+ Get the batch output file content as a list of dictionaries
74
+ """
75
+ from litellm.files.main import afile_content
76
+
77
+ if custom_llm_provider == "vertex_ai":
78
+ raise ValueError("Vertex AI does not support file content retrieval")
79
+
80
+ if batch.output_file_id is None:
81
+ raise ValueError("Output file id is None cannot retrieve file content")
82
+
83
+ _file_content = await afile_content(
84
+ file_id=batch.output_file_id,
85
+ custom_llm_provider=custom_llm_provider,
86
+ )
87
+ return _get_file_content_as_dictionary(_file_content.content)
88
+
89
+
90
+ def _get_file_content_as_dictionary(file_content: bytes) -> List[dict]:
91
+ """
92
+ Get the file content as a list of dictionaries from JSON Lines format
93
+ """
94
+ try:
95
+ _file_content_str = file_content.decode("utf-8")
96
+ # Split by newlines and parse each line as a separate JSON object
97
+ json_objects = []
98
+ for line in _file_content_str.strip().split("\n"):
99
+ if line: # Skip empty lines
100
+ json_objects.append(json.loads(line))
101
+ verbose_logger.debug("json_objects=%s", json.dumps(json_objects, indent=4))
102
+ return json_objects
103
+ except Exception as e:
104
+ raise e
105
+
106
+
107
+ def _get_batch_job_cost_from_file_content(
108
+ file_content_dictionary: List[dict],
109
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
110
+ ) -> float:
111
+ """
112
+ Get the cost of a batch job from the file content
113
+ """
114
+ try:
115
+ total_cost: float = 0.0
116
+ # parse the file content as json
117
+ verbose_logger.debug(
118
+ "file_content_dictionary=%s", json.dumps(file_content_dictionary, indent=4)
119
+ )
120
+ for _item in file_content_dictionary:
121
+ if _batch_response_was_successful(_item):
122
+ _response_body = _get_response_from_batch_job_output_file(_item)
123
+ total_cost += litellm.completion_cost(
124
+ completion_response=_response_body,
125
+ custom_llm_provider=custom_llm_provider,
126
+ call_type=CallTypes.aretrieve_batch.value,
127
+ )
128
+ verbose_logger.debug("total_cost=%s", total_cost)
129
+ return total_cost
130
+ except Exception as e:
131
+ verbose_logger.error("error in _get_batch_job_cost_from_file_content", e)
132
+ raise e
133
+
134
+
135
+ def _get_batch_job_total_usage_from_file_content(
136
+ file_content_dictionary: List[dict],
137
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
138
+ ) -> Usage:
139
+ """
140
+ Get the tokens of a batch job from the file content
141
+ """
142
+ total_tokens: int = 0
143
+ prompt_tokens: int = 0
144
+ completion_tokens: int = 0
145
+ for _item in file_content_dictionary:
146
+ if _batch_response_was_successful(_item):
147
+ _response_body = _get_response_from_batch_job_output_file(_item)
148
+ usage: Usage = _get_batch_job_usage_from_response_body(_response_body)
149
+ total_tokens += usage.total_tokens
150
+ prompt_tokens += usage.prompt_tokens
151
+ completion_tokens += usage.completion_tokens
152
+ return Usage(
153
+ total_tokens=total_tokens,
154
+ prompt_tokens=prompt_tokens,
155
+ completion_tokens=completion_tokens,
156
+ )
157
+
158
+
159
+ def _get_batch_job_usage_from_response_body(response_body: dict) -> Usage:
160
+ """
161
+ Get the tokens of a batch job from the response body
162
+ """
163
+ _usage_dict = response_body.get("usage", None) or {}
164
+ usage: Usage = Usage(**_usage_dict)
165
+ return usage
166
+
167
+
168
+ def _get_response_from_batch_job_output_file(batch_job_output_file: dict) -> Any:
169
+ """
170
+ Get the response from the batch job output file
171
+ """
172
+ _response: dict = batch_job_output_file.get("response", None) or {}
173
+ _response_body = _response.get("body", None) or {}
174
+ return _response_body
175
+
176
+
177
+ def _batch_response_was_successful(batch_job_output_file: dict) -> bool:
178
+ """
179
+ Check if the batch job response status == 200
180
+ """
181
+ _response: dict = batch_job_output_file.get("response", None) or {}
182
+ return _response.get("status_code", None) == 200
litellm/batches/main.py ADDED
@@ -0,0 +1,796 @@
1
+ """
2
+ Main File for Batches API implementation
3
+
4
+ https://platform.openai.com/docs/api-reference/batch
5
+
6
+ - create_batch()
7
+ - retrieve_batch()
8
+ - cancel_batch()
9
+ - list_batch()
10
+
11
+ """
12
+
13
+ import asyncio
14
+ import contextvars
15
+ import os
16
+ from functools import partial
17
+ from typing import Any, Coroutine, Dict, Literal, Optional, Union
18
+
19
+ import httpx
20
+
21
+ import litellm
22
+ from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
23
+ from litellm.llms.azure.batches.handler import AzureBatchesAPI
24
+ from litellm.llms.openai.openai import OpenAIBatchesAPI
25
+ from litellm.llms.vertex_ai.batches.handler import VertexAIBatchPrediction
26
+ from litellm.secret_managers.main import get_secret_str
27
+ from litellm.types.llms.openai import (
28
+ Batch,
29
+ CancelBatchRequest,
30
+ CreateBatchRequest,
31
+ RetrieveBatchRequest,
32
+ )
33
+ from litellm.types.router import GenericLiteLLMParams
34
+ from litellm.types.utils import LiteLLMBatch
35
+ from litellm.utils import client, get_litellm_params, supports_httpx_timeout
36
+
37
+ ####### ENVIRONMENT VARIABLES ###################
38
+ openai_batches_instance = OpenAIBatchesAPI()
39
+ azure_batches_instance = AzureBatchesAPI()
40
+ vertex_ai_batches_instance = VertexAIBatchPrediction(gcs_bucket_name="")
41
+ #################################################
42
+
43
+
44
+ @client
45
+ async def acreate_batch(
46
+ completion_window: Literal["24h"],
47
+ endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],
48
+ input_file_id: str,
49
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
50
+ metadata: Optional[Dict[str, str]] = None,
51
+ extra_headers: Optional[Dict[str, str]] = None,
52
+ extra_body: Optional[Dict[str, str]] = None,
53
+ **kwargs,
54
+ ) -> Batch:
55
+ """
56
+ Async: Creates and executes a batch from an uploaded file of requests
57
+
58
+ LiteLLM Equivalent of POST: https://api.openai.com/v1/batches
59
+ """
60
+ try:
61
+ loop = asyncio.get_event_loop()
62
+ kwargs["acreate_batch"] = True
63
+
64
+ # Use a partial function to pass your keyword arguments
65
+ func = partial(
66
+ create_batch,
67
+ completion_window,
68
+ endpoint,
69
+ input_file_id,
70
+ custom_llm_provider,
71
+ metadata,
72
+ extra_headers,
73
+ extra_body,
74
+ **kwargs,
75
+ )
76
+
77
+ # Add the context to the function
78
+ ctx = contextvars.copy_context()
79
+ func_with_context = partial(ctx.run, func)
80
+ init_response = await loop.run_in_executor(None, func_with_context)
81
+
82
+ if asyncio.iscoroutine(init_response):
83
+ response = await init_response
84
+ else:
85
+ response = init_response
86
+
87
+ return response
88
+ except Exception as e:
89
+ raise e
90
+
91
+
92
+ @client
93
+ def create_batch(
94
+ completion_window: Literal["24h"],
95
+ endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],
96
+ input_file_id: str,
97
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
98
+ metadata: Optional[Dict[str, str]] = None,
99
+ extra_headers: Optional[Dict[str, str]] = None,
100
+ extra_body: Optional[Dict[str, str]] = None,
101
+ **kwargs,
102
+ ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
103
+ """
104
+ Creates and executes a batch from an uploaded file of requests
105
+
106
+ LiteLLM Equivalent of POST: https://api.openai.com/v1/batches
107
+ """
108
+ try:
109
+ optional_params = GenericLiteLLMParams(**kwargs)
110
+ litellm_call_id = kwargs.get("litellm_call_id", None)
111
+ proxy_server_request = kwargs.get("proxy_server_request", None)
112
+ model_info = kwargs.get("model_info", None)
113
+ _is_async = kwargs.pop("acreate_batch", False) is True
114
+ litellm_params = get_litellm_params(**kwargs)
115
+ litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
116
+ ### TIMEOUT LOGIC ###
117
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
118
+ litellm_logging_obj.update_environment_variables(
119
+ model=None,
120
+ user=None,
121
+ optional_params=optional_params.model_dump(),
122
+ litellm_params={
123
+ "litellm_call_id": litellm_call_id,
124
+ "proxy_server_request": proxy_server_request,
125
+ "model_info": model_info,
126
+ "metadata": metadata,
127
+ "preset_cache_key": None,
128
+ "stream_response": {},
129
+ **optional_params.model_dump(exclude_unset=True),
130
+ },
131
+ custom_llm_provider=custom_llm_provider,
132
+ )
133
+
134
+ if (
135
+ timeout is not None
136
+ and isinstance(timeout, httpx.Timeout)
137
+ and supports_httpx_timeout(custom_llm_provider) is False
138
+ ):
139
+ read_timeout = timeout.read or 600
140
+ timeout = read_timeout # default 10 min timeout
141
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
142
+ timeout = float(timeout) # type: ignore
143
+ elif timeout is None:
144
+ timeout = 600.0
145
+
146
+ _create_batch_request = CreateBatchRequest(
147
+ completion_window=completion_window,
148
+ endpoint=endpoint,
149
+ input_file_id=input_file_id,
150
+ metadata=metadata,
151
+ extra_headers=extra_headers,
152
+ extra_body=extra_body,
153
+ )
154
+ api_base: Optional[str] = None
155
+ if custom_llm_provider == "openai":
156
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
157
+ api_base = (
158
+ optional_params.api_base
159
+ or litellm.api_base
160
+ or os.getenv("OPENAI_BASE_URL")
161
+ or os.getenv("OPENAI_API_BASE")
162
+ or "https://api.openai.com/v1"
163
+ )
164
+ organization = (
165
+ optional_params.organization
166
+ or litellm.organization
167
+ or os.getenv("OPENAI_ORGANIZATION", None)
168
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
169
+ )
170
+ # set API KEY
171
+ api_key = (
172
+ optional_params.api_key
173
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
174
+ or litellm.openai_key
175
+ or os.getenv("OPENAI_API_KEY")
176
+ )
177
+
178
+ response = openai_batches_instance.create_batch(
179
+ api_base=api_base,
180
+ api_key=api_key,
181
+ organization=organization,
182
+ create_batch_data=_create_batch_request,
183
+ timeout=timeout,
184
+ max_retries=optional_params.max_retries,
185
+ _is_async=_is_async,
186
+ )
187
+ elif custom_llm_provider == "azure":
188
+ api_base = (
189
+ optional_params.api_base
190
+ or litellm.api_base
191
+ or get_secret_str("AZURE_API_BASE")
192
+ )
193
+ api_version = (
194
+ optional_params.api_version
195
+ or litellm.api_version
196
+ or get_secret_str("AZURE_API_VERSION")
197
+ )
198
+
199
+ api_key = (
200
+ optional_params.api_key
201
+ or litellm.api_key
202
+ or litellm.azure_key
203
+ or get_secret_str("AZURE_OPENAI_API_KEY")
204
+ or get_secret_str("AZURE_API_KEY")
205
+ )
206
+
207
+ extra_body = optional_params.get("extra_body", {})
208
+ if extra_body is not None:
209
+ extra_body.pop("azure_ad_token", None)
210
+ else:
211
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
212
+
213
+ response = azure_batches_instance.create_batch(
214
+ _is_async=_is_async,
215
+ api_base=api_base,
216
+ api_key=api_key,
217
+ api_version=api_version,
218
+ timeout=timeout,
219
+ max_retries=optional_params.max_retries,
220
+ create_batch_data=_create_batch_request,
221
+ litellm_params=litellm_params,
222
+ )
223
+ elif custom_llm_provider == "vertex_ai":
224
+ api_base = optional_params.api_base or ""
225
+ vertex_ai_project = (
226
+ optional_params.vertex_project
227
+ or litellm.vertex_project
228
+ or get_secret_str("VERTEXAI_PROJECT")
229
+ )
230
+ vertex_ai_location = (
231
+ optional_params.vertex_location
232
+ or litellm.vertex_location
233
+ or get_secret_str("VERTEXAI_LOCATION")
234
+ )
235
+ vertex_credentials = optional_params.vertex_credentials or get_secret_str(
236
+ "VERTEXAI_CREDENTIALS"
237
+ )
238
+
239
+ response = vertex_ai_batches_instance.create_batch(
240
+ _is_async=_is_async,
241
+ api_base=api_base,
242
+ vertex_project=vertex_ai_project,
243
+ vertex_location=vertex_ai_location,
244
+ vertex_credentials=vertex_credentials,
245
+ timeout=timeout,
246
+ max_retries=optional_params.max_retries,
247
+ create_batch_data=_create_batch_request,
248
+ )
249
+ else:
250
+ raise litellm.exceptions.BadRequestError(
251
+ message="LiteLLM doesn't support custom_llm_provider={} for 'create_batch'".format(
252
+ custom_llm_provider
253
+ ),
254
+ model="n/a",
255
+ llm_provider=custom_llm_provider,
256
+ response=httpx.Response(
257
+ status_code=400,
258
+ content="Unsupported provider",
259
+ request=httpx.Request(method="create_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
260
+ ),
261
+ )
262
+ return response
263
+ except Exception as e:
264
+ raise e
265
+
266
+
267
+ @client
268
+ async def aretrieve_batch(
269
+ batch_id: str,
270
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
271
+ metadata: Optional[Dict[str, str]] = None,
272
+ extra_headers: Optional[Dict[str, str]] = None,
273
+ extra_body: Optional[Dict[str, str]] = None,
274
+ **kwargs,
275
+ ) -> LiteLLMBatch:
276
+ """
277
+ Async: Retrieves a batch.
278
+
279
+ LiteLLM Equivalent of GET https://api.openai.com/v1/batches/{batch_id}
280
+ """
281
+ try:
282
+ loop = asyncio.get_event_loop()
283
+ kwargs["aretrieve_batch"] = True
284
+
285
+ # Use a partial function to pass your keyword arguments
286
+ func = partial(
287
+ retrieve_batch,
288
+ batch_id,
289
+ custom_llm_provider,
290
+ metadata,
291
+ extra_headers,
292
+ extra_body,
293
+ **kwargs,
294
+ )
295
+ # Add the context to the function
296
+ ctx = contextvars.copy_context()
297
+ func_with_context = partial(ctx.run, func)
298
+ init_response = await loop.run_in_executor(None, func_with_context)
299
+ if asyncio.iscoroutine(init_response):
300
+ response = await init_response
301
+ else:
302
+ response = init_response # type: ignore
303
+
304
+ return response
305
+ except Exception as e:
306
+ raise e
307
+
308
+
309
+ @client
310
+ def retrieve_batch(
311
+ batch_id: str,
312
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
313
+ metadata: Optional[Dict[str, str]] = None,
314
+ extra_headers: Optional[Dict[str, str]] = None,
315
+ extra_body: Optional[Dict[str, str]] = None,
316
+ **kwargs,
317
+ ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]:
318
+ """
319
+ Retrieves a batch.
320
+
321
+ LiteLLM Equivalent of GET https://api.openai.com/v1/batches/{batch_id}
322
+ """
323
+ try:
324
+ optional_params = GenericLiteLLMParams(**kwargs)
325
+ litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj", None)
326
+ ### TIMEOUT LOGIC ###
327
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
328
+ litellm_params = get_litellm_params(
329
+ custom_llm_provider=custom_llm_provider,
330
+ **kwargs,
331
+ )
332
+ litellm_logging_obj.update_environment_variables(
333
+ model=None,
334
+ user=None,
335
+ optional_params=optional_params.model_dump(),
336
+ litellm_params=litellm_params,
337
+ custom_llm_provider=custom_llm_provider,
338
+ )
339
+
340
+ if (
341
+ timeout is not None
342
+ and isinstance(timeout, httpx.Timeout)
343
+ and supports_httpx_timeout(custom_llm_provider) is False
344
+ ):
345
+ read_timeout = timeout.read or 600
346
+ timeout = read_timeout # default 10 min timeout
347
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
348
+ timeout = float(timeout) # type: ignore
349
+ elif timeout is None:
350
+ timeout = 600.0
351
+
352
+ _retrieve_batch_request = RetrieveBatchRequest(
353
+ batch_id=batch_id,
354
+ extra_headers=extra_headers,
355
+ extra_body=extra_body,
356
+ )
357
+
358
+ _is_async = kwargs.pop("aretrieve_batch", False) is True
359
+ api_base: Optional[str] = None
360
+ if custom_llm_provider == "openai":
361
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
362
+ api_base = (
363
+ optional_params.api_base
364
+ or litellm.api_base
365
+ or os.getenv("OPENAI_BASE_URL")
366
+ or os.getenv("OPENAI_API_BASE")
367
+ or "https://api.openai.com/v1"
368
+ )
369
+ organization = (
370
+ optional_params.organization
371
+ or litellm.organization
372
+ or os.getenv("OPENAI_ORGANIZATION", None)
373
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
374
+ )
375
+ # set API KEY
376
+ api_key = (
377
+ optional_params.api_key
378
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
379
+ or litellm.openai_key
380
+ or os.getenv("OPENAI_API_KEY")
381
+ )
382
+
383
+ response = openai_batches_instance.retrieve_batch(
384
+ _is_async=_is_async,
385
+ retrieve_batch_data=_retrieve_batch_request,
386
+ api_base=api_base,
387
+ api_key=api_key,
388
+ organization=organization,
389
+ timeout=timeout,
390
+ max_retries=optional_params.max_retries,
391
+ )
392
+ elif custom_llm_provider == "azure":
393
+ api_base = (
394
+ optional_params.api_base
395
+ or litellm.api_base
396
+ or get_secret_str("AZURE_API_BASE")
397
+ )
398
+ api_version = (
399
+ optional_params.api_version
400
+ or litellm.api_version
401
+ or get_secret_str("AZURE_API_VERSION")
402
+ )
403
+
404
+ api_key = (
405
+ optional_params.api_key
406
+ or litellm.api_key
407
+ or litellm.azure_key
408
+ or get_secret_str("AZURE_OPENAI_API_KEY")
409
+ or get_secret_str("AZURE_API_KEY")
410
+ )
411
+
412
+ extra_body = optional_params.get("extra_body", {})
413
+ if extra_body is not None:
414
+ extra_body.pop("azure_ad_token", None)
415
+ else:
416
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
417
+
418
+ response = azure_batches_instance.retrieve_batch(
419
+ _is_async=_is_async,
420
+ api_base=api_base,
421
+ api_key=api_key,
422
+ api_version=api_version,
423
+ timeout=timeout,
424
+ max_retries=optional_params.max_retries,
425
+ retrieve_batch_data=_retrieve_batch_request,
426
+ litellm_params=litellm_params,
427
+ )
428
+ elif custom_llm_provider == "vertex_ai":
429
+ api_base = optional_params.api_base or ""
430
+ vertex_ai_project = (
431
+ optional_params.vertex_project
432
+ or litellm.vertex_project
433
+ or get_secret_str("VERTEXAI_PROJECT")
434
+ )
435
+ vertex_ai_location = (
436
+ optional_params.vertex_location
437
+ or litellm.vertex_location
438
+ or get_secret_str("VERTEXAI_LOCATION")
439
+ )
440
+ vertex_credentials = optional_params.vertex_credentials or get_secret_str(
441
+ "VERTEXAI_CREDENTIALS"
442
+ )
443
+
444
+ response = vertex_ai_batches_instance.retrieve_batch(
445
+ _is_async=_is_async,
446
+ batch_id=batch_id,
447
+ api_base=api_base,
448
+ vertex_project=vertex_ai_project,
449
+ vertex_location=vertex_ai_location,
450
+ vertex_credentials=vertex_credentials,
451
+ timeout=timeout,
452
+ max_retries=optional_params.max_retries,
453
+ )
454
+ else:
455
+ raise litellm.exceptions.BadRequestError(
456
+ message="LiteLLM doesn't support {} for 'retrieve_batch'. Only 'openai', 'azure' and 'vertex_ai' are supported.".format(
457
+ custom_llm_provider
458
+ ),
459
+ model="n/a",
460
+ llm_provider=custom_llm_provider,
461
+ response=httpx.Response(
462
+ status_code=400,
463
+ content="Unsupported provider",
464
+ request=httpx.Request(method="retrieve_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
465
+ ),
466
+ )
467
+ return response
468
+ except Exception as e:
469
+ raise e
470
+
471
+
472
+ async def alist_batches(
473
+ after: Optional[str] = None,
474
+ limit: Optional[int] = None,
475
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
476
+ metadata: Optional[Dict[str, str]] = None,
477
+ extra_headers: Optional[Dict[str, str]] = None,
478
+ extra_body: Optional[Dict[str, str]] = None,
479
+ **kwargs,
480
+ ):
481
+ """
482
+ Async: List your organization's batches.
483
+ """
484
+ try:
485
+ loop = asyncio.get_event_loop()
486
+ kwargs["alist_batches"] = True
487
+
488
+ # Use a partial function to pass your keyword arguments
489
+ func = partial(
490
+ list_batches,
491
+ after,
492
+ limit,
493
+ custom_llm_provider,
494
+ extra_headers,
495
+ extra_body,
496
+ **kwargs,
497
+ )
498
+
499
+ # Add the context to the function
500
+ ctx = contextvars.copy_context()
501
+ func_with_context = partial(ctx.run, func)
502
+ init_response = await loop.run_in_executor(None, func_with_context)
503
+ if asyncio.iscoroutine(init_response):
504
+ response = await init_response
505
+ else:
506
+ response = init_response # type: ignore
507
+
508
+ return response
509
+ except Exception as e:
510
+ raise e
511
+
512
+
513
+ def list_batches(
514
+ after: Optional[str] = None,
515
+ limit: Optional[int] = None,
516
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
517
+ extra_headers: Optional[Dict[str, str]] = None,
518
+ extra_body: Optional[Dict[str, str]] = None,
519
+ **kwargs,
520
+ ):
521
+ """
522
+ Lists batches
523
+
524
+ List your organization's batches.
525
+ """
526
+ try:
527
+ # set API KEY
528
+ optional_params = GenericLiteLLMParams(**kwargs)
529
+ litellm_params = get_litellm_params(
530
+ custom_llm_provider=custom_llm_provider,
531
+ **kwargs,
532
+ )
533
+ api_key = (
534
+ optional_params.api_key
535
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
536
+ or litellm.openai_key
537
+ or os.getenv("OPENAI_API_KEY")
538
+ )
539
+ ### TIMEOUT LOGIC ###
540
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
541
+ # set timeout for 10 minutes by default
542
+
543
+ if (
544
+ timeout is not None
545
+ and isinstance(timeout, httpx.Timeout)
546
+ and supports_httpx_timeout(custom_llm_provider) is False
547
+ ):
548
+ read_timeout = timeout.read or 600
549
+ timeout = read_timeout # default 10 min timeout
550
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
551
+ timeout = float(timeout) # type: ignore
552
+ elif timeout is None:
553
+ timeout = 600.0
554
+
555
+ _is_async = kwargs.pop("alist_batches", False) is True
556
+ if custom_llm_provider == "openai":
557
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
558
+ api_base = (
559
+ optional_params.api_base
560
+ or litellm.api_base
561
+ or os.getenv("OPENAI_BASE_URL")
562
+ or os.getenv("OPENAI_API_BASE")
563
+ or "https://api.openai.com/v1"
564
+ )
565
+ organization = (
566
+ optional_params.organization
567
+ or litellm.organization
568
+ or os.getenv("OPENAI_ORGANIZATION", None)
569
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
570
+ )
571
+
572
+ response = openai_batches_instance.list_batches(
573
+ _is_async=_is_async,
574
+ after=after,
575
+ limit=limit,
576
+ api_base=api_base,
577
+ api_key=api_key,
578
+ organization=organization,
579
+ timeout=timeout,
580
+ max_retries=optional_params.max_retries,
581
+ )
582
+ elif custom_llm_provider == "azure":
583
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
584
+ api_version = (
585
+ optional_params.api_version
586
+ or litellm.api_version
587
+ or get_secret_str("AZURE_API_VERSION")
588
+ )
589
+
590
+ api_key = (
591
+ optional_params.api_key
592
+ or litellm.api_key
593
+ or litellm.azure_key
594
+ or get_secret_str("AZURE_OPENAI_API_KEY")
595
+ or get_secret_str("AZURE_API_KEY")
596
+ )
597
+
598
+ extra_body = optional_params.get("extra_body", {})
599
+ if extra_body is not None:
600
+ extra_body.pop("azure_ad_token", None)
601
+ else:
602
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
603
+
604
+ response = azure_batches_instance.list_batches(
605
+ _is_async=_is_async,
606
+ api_base=api_base,
607
+ api_key=api_key,
608
+ api_version=api_version,
609
+ timeout=timeout,
610
+ max_retries=optional_params.max_retries,
611
+ litellm_params=litellm_params,
612
+ )
613
+ else:
614
+ raise litellm.exceptions.BadRequestError(
615
+ message="LiteLLM doesn't support {} for 'list_batches'. Only 'openai' and 'azure' are supported.".format(
616
+ custom_llm_provider
617
+ ),
618
+ model="n/a",
619
+ llm_provider=custom_llm_provider,
620
+ response=httpx.Response(
621
+ status_code=400,
622
+ content="Unsupported provider",
623
+ request=httpx.Request(method="list_batches", url="https://github.com/BerriAI/litellm"), # type: ignore
624
+ ),
625
+ )
626
+ return response
627
+ except Exception as e:
628
+ raise e
629
+
630
+
631
+ async def acancel_batch(
632
+ batch_id: str,
633
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
634
+ metadata: Optional[Dict[str, str]] = None,
635
+ extra_headers: Optional[Dict[str, str]] = None,
636
+ extra_body: Optional[Dict[str, str]] = None,
637
+ **kwargs,
638
+ ) -> Batch:
639
+ """
640
+ Async: Cancels a batch.
641
+
642
+ LiteLLM Equivalent of POST https://api.openai.com/v1/batches/{batch_id}/cancel
643
+ """
644
+ try:
645
+ loop = asyncio.get_event_loop()
646
+ kwargs["acancel_batch"] = True
647
+
648
+ # Use a partial function to pass your keyword arguments
649
+ func = partial(
650
+ cancel_batch,
651
+ batch_id,
652
+ custom_llm_provider,
653
+ metadata,
654
+ extra_headers,
655
+ extra_body,
656
+ **kwargs,
657
+ )
658
+ # Add the context to the function
659
+ ctx = contextvars.copy_context()
660
+ func_with_context = partial(ctx.run, func)
661
+ init_response = await loop.run_in_executor(None, func_with_context)
662
+ if asyncio.iscoroutine(init_response):
663
+ response = await init_response
664
+ else:
665
+ response = init_response
666
+
667
+ return response
668
+ except Exception as e:
669
+ raise e
670
+
671
+
672
+ def cancel_batch(
673
+ batch_id: str,
674
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
675
+ metadata: Optional[Dict[str, str]] = None,
676
+ extra_headers: Optional[Dict[str, str]] = None,
677
+ extra_body: Optional[Dict[str, str]] = None,
678
+ **kwargs,
679
+ ) -> Union[Batch, Coroutine[Any, Any, Batch]]:
680
+ """
681
+ Cancels a batch.
682
+
683
+ LiteLLM Equivalent of POST https://api.openai.com/v1/batches/{batch_id}/cancel
684
+ """
685
+ try:
686
+ optional_params = GenericLiteLLMParams(**kwargs)
687
+ litellm_params = get_litellm_params(
688
+ custom_llm_provider=custom_llm_provider,
689
+ **kwargs,
690
+ )
691
+ ### TIMEOUT LOGIC ###
692
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
693
+ # set timeout for 10 minutes by default
694
+
695
+ if (
696
+ timeout is not None
697
+ and isinstance(timeout, httpx.Timeout)
698
+ and supports_httpx_timeout(custom_llm_provider) is False
699
+ ):
700
+ read_timeout = timeout.read or 600
701
+ timeout = read_timeout # default 10 min timeout
702
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
703
+ timeout = float(timeout) # type: ignore
704
+ elif timeout is None:
705
+ timeout = 600.0
706
+
707
+ _cancel_batch_request = CancelBatchRequest(
708
+ batch_id=batch_id,
709
+ extra_headers=extra_headers,
710
+ extra_body=extra_body,
711
+ )
712
+
713
+ _is_async = kwargs.pop("acancel_batch", False) is True
714
+ api_base: Optional[str] = None
715
+ if custom_llm_provider == "openai":
716
+ api_base = (
717
+ optional_params.api_base
718
+ or litellm.api_base
719
+ or os.getenv("OPENAI_BASE_URL")
720
+ or os.getenv("OPENAI_API_BASE")
721
+ or "https://api.openai.com/v1"
722
+ )
723
+ organization = (
724
+ optional_params.organization
725
+ or litellm.organization
726
+ or os.getenv("OPENAI_ORGANIZATION", None)
727
+ or None
728
+ )
729
+ api_key = (
730
+ optional_params.api_key
731
+ or litellm.api_key
732
+ or litellm.openai_key
733
+ or os.getenv("OPENAI_API_KEY")
734
+ )
735
+
736
+ response = openai_batches_instance.cancel_batch(
737
+ _is_async=_is_async,
738
+ cancel_batch_data=_cancel_batch_request,
739
+ api_base=api_base,
740
+ api_key=api_key,
741
+ organization=organization,
742
+ timeout=timeout,
743
+ max_retries=optional_params.max_retries,
744
+ )
745
+ elif custom_llm_provider == "azure":
746
+ api_base = (
747
+ optional_params.api_base
748
+ or litellm.api_base
749
+ or get_secret_str("AZURE_API_BASE")
750
+ )
751
+ api_version = (
752
+ optional_params.api_version
753
+ or litellm.api_version
754
+ or get_secret_str("AZURE_API_VERSION")
755
+ )
756
+
757
+ api_key = (
758
+ optional_params.api_key
759
+ or litellm.api_key
760
+ or litellm.azure_key
761
+ or get_secret_str("AZURE_OPENAI_API_KEY")
762
+ or get_secret_str("AZURE_API_KEY")
763
+ )
764
+
765
+ extra_body = optional_params.get("extra_body", {})
766
+ if extra_body is not None:
767
+ extra_body.pop("azure_ad_token", None)
768
+ else:
769
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
770
+
771
+ response = azure_batches_instance.cancel_batch(
772
+ _is_async=_is_async,
773
+ api_base=api_base,
774
+ api_key=api_key,
775
+ api_version=api_version,
776
+ timeout=timeout,
777
+ max_retries=optional_params.max_retries,
778
+ cancel_batch_data=_cancel_batch_request,
779
+ litellm_params=litellm_params,
780
+ )
781
+ else:
782
+ raise litellm.exceptions.BadRequestError(
783
+ message="LiteLLM doesn't support {} for 'cancel_batch'. Only 'openai' and 'azure' are supported.".format(
784
+ custom_llm_provider
785
+ ),
786
+ model="n/a",
787
+ llm_provider=custom_llm_provider,
788
+ response=httpx.Response(
789
+ status_code=400,
790
+ content="Unsupported provider",
791
+ request=httpx.Request(method="cancel_batch", url="https://github.com/BerriAI/litellm"), # type: ignore
792
+ ),
793
+ )
794
+ return response
795
+ except Exception as e:
796
+ raise e
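
For reference, a minimal usage sketch of the batch API above. It assumes an OpenAI key is configured, that a batch input `.jsonl` file was already uploaded (the file id and metadata below are hypothetical), and that `acreate_batch` / `aretrieve_batch` are re-exported at the package root, as LiteLLM does for its other top-level APIs:

```python
import asyncio

import litellm  # assumes the batch helpers are re-exported at the package root


async def run_batch() -> None:
    # Start a batch job against the chat completions endpoint.
    batch = await litellm.acreate_batch(
        completion_window="24h",
        endpoint="/v1/chat/completions",
        input_file_id="file-abc123",  # hypothetical id from a prior file upload
        custom_llm_provider="openai",
        metadata={"purpose": "nightly-eval"},  # hypothetical metadata
    )

    # Fetch the job once; in practice you would poll until batch.status is terminal.
    retrieved = await litellm.aretrieve_batch(
        batch_id=batch.id, custom_llm_provider="openai"
    )
    print(retrieved.status)


asyncio.run(run_batch())
```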
litellm/budget_manager.py ADDED
@@ -0,0 +1,230 @@
1
+ # +-----------------------------------------------+
2
+ # | |
3
+ # | NOT PROXY BUDGET MANAGER |
4
+ # | proxy budget manager is in proxy_server.py |
5
+ # | |
6
+ # +-----------------------------------------------+
7
+ #
8
+ # Thank you users! We ❤️ you! - Krrish & Ishaan
9
+
10
+ import json
11
+ import os
12
+ import threading
13
+ import time
14
+ from typing import Literal, Optional
15
+
16
+ import litellm
17
+ from litellm.constants import (
18
+ DAYS_IN_A_MONTH,
19
+ DAYS_IN_A_WEEK,
20
+ DAYS_IN_A_YEAR,
21
+ HOURS_IN_A_DAY,
22
+ )
23
+ from litellm.utils import ModelResponse
24
+
25
+
26
+ class BudgetManager:
27
+ def __init__(
28
+ self,
29
+ project_name: str,
30
+ client_type: str = "local",
31
+ api_base: Optional[str] = None,
32
+ headers: Optional[dict] = None,
33
+ ):
34
+ self.client_type = client_type
35
+ self.project_name = project_name
36
+ self.api_base = api_base or "https://api.litellm.ai"
37
+ self.headers = headers or {"Content-Type": "application/json"}
38
+ ## load the data or init the initial dictionaries
39
+ self.load_data()
40
+
41
+ def print_verbose(self, print_statement):
42
+ try:
43
+ if litellm.set_verbose:
44
+ import logging
45
+
46
+ logging.info(print_statement)
47
+ except Exception:
48
+ pass
49
+
50
+ def load_data(self):
51
+ if self.client_type == "local":
52
+ # Check if user dict file exists
53
+ if os.path.isfile("user_cost.json"):
54
+ # Load the user dict
55
+ with open("user_cost.json", "r") as json_file:
56
+ self.user_dict = json.load(json_file)
57
+ else:
58
+ self.print_verbose("User Dictionary not found!")
59
+ self.user_dict = {}
60
+ self.print_verbose(f"user dict from local: {self.user_dict}")
61
+ elif self.client_type == "hosted":
62
+ # Load the user_dict from hosted db
63
+ url = self.api_base + "/get_budget"
64
+ data = {"project_name": self.project_name}
65
+ response = litellm.module_level_client.post(
66
+ url, headers=self.headers, json=data
67
+ )
68
+ response = response.json()
69
+ if response["status"] == "error":
70
+ self.user_dict = (
71
+ {}
72
+ ) # assume this means the user dict hasn't been stored yet
73
+ else:
74
+ self.user_dict = response["data"]
75
+
76
+ def create_budget(
77
+ self,
78
+ total_budget: float,
79
+ user: str,
80
+ duration: Optional[Literal["daily", "weekly", "monthly", "yearly"]] = None,
81
+ created_at: float = time.time(),
82
+ ):
83
+ self.user_dict[user] = {"total_budget": total_budget}
84
+ if duration is None:
85
+ return self.user_dict[user]
86
+
87
+ if duration == "daily":
88
+ duration_in_days = 1
89
+ elif duration == "weekly":
90
+ duration_in_days = DAYS_IN_A_WEEK
91
+ elif duration == "monthly":
92
+ duration_in_days = DAYS_IN_A_MONTH
93
+ elif duration == "yearly":
94
+ duration_in_days = DAYS_IN_A_YEAR
95
+ else:
96
+ raise ValueError(
97
+ """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
98
+ )
99
+ self.user_dict[user] = {
100
+ "total_budget": total_budget,
101
+ "duration": duration_in_days,
102
+ "created_at": created_at,
103
+ "last_updated_at": created_at,
104
+ }
105
+ self._save_data_thread() # [Non-Blocking] Update persistent storage without blocking execution
106
+ return self.user_dict[user]
107
+
108
+ def projected_cost(self, model: str, messages: list, user: str):
109
+ text = "".join(message["content"] for message in messages)
110
+ prompt_tokens = litellm.token_counter(model=model, text=text)
111
+ prompt_cost, _ = litellm.cost_per_token(
112
+ model=model, prompt_tokens=prompt_tokens, completion_tokens=0
113
+ )
114
+ current_cost = self.user_dict[user].get("current_cost", 0)
115
+ projected_cost = prompt_cost + current_cost
116
+ return projected_cost
117
+
118
+ def get_total_budget(self, user: str):
119
+ return self.user_dict[user]["total_budget"]
120
+
121
+ def update_cost(
122
+ self,
123
+ user: str,
124
+ completion_obj: Optional[ModelResponse] = None,
125
+ model: Optional[str] = None,
126
+ input_text: Optional[str] = None,
127
+ output_text: Optional[str] = None,
128
+ ):
129
+ if model and input_text and output_text:
130
+ prompt_tokens = litellm.token_counter(
131
+ model=model, messages=[{"role": "user", "content": input_text}]
132
+ )
133
+ completion_tokens = litellm.token_counter(
134
+ model=model, messages=[{"role": "user", "content": output_text}]
135
+ )
136
+ (
137
+ prompt_tokens_cost_usd_dollar,
138
+ completion_tokens_cost_usd_dollar,
139
+ ) = litellm.cost_per_token(
140
+ model=model,
141
+ prompt_tokens=prompt_tokens,
142
+ completion_tokens=completion_tokens,
143
+ )
144
+ cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
145
+ elif completion_obj:
146
+ cost = litellm.completion_cost(completion_response=completion_obj)
147
+ model = completion_obj[
148
+ "model"
149
+ ] # if this throws an error try, model = completion_obj['model']
150
+ else:
151
+ raise ValueError(
152
+ "Either a chat completion object or the text response needs to be passed in. Learn more - https://docs.litellm.ai/docs/budget_manager"
153
+ )
154
+
155
+ self.user_dict[user]["current_cost"] = cost + self.user_dict[user].get(
156
+ "current_cost", 0
157
+ )
158
+ if "model_cost" in self.user_dict[user]:
159
+ self.user_dict[user]["model_cost"][model] = cost + self.user_dict[user][
160
+ "model_cost"
161
+ ].get(model, 0)
162
+ else:
163
+ self.user_dict[user]["model_cost"] = {model: cost}
164
+
165
+ self._save_data_thread() # [Non-Blocking] Update persistent storage without blocking execution
166
+ return {"user": self.user_dict[user]}
167
+
168
+ def get_current_cost(self, user):
169
+ return self.user_dict[user].get("current_cost", 0)
170
+
171
+ def get_model_cost(self, user):
172
+ return self.user_dict[user].get("model_cost", 0)
173
+
174
+ def is_valid_user(self, user: str) -> bool:
175
+ return user in self.user_dict
176
+
177
+ def get_users(self):
178
+ return list(self.user_dict.keys())
179
+
180
+ def reset_cost(self, user):
181
+ self.user_dict[user]["current_cost"] = 0
182
+ self.user_dict[user]["model_cost"] = {}
183
+ return {"user": self.user_dict[user]}
184
+
185
+ def reset_on_duration(self, user: str):
186
+ # Get current and creation time
187
+ last_updated_at = self.user_dict[user]["last_updated_at"]
188
+ current_time = time.time()
189
+
190
+ # Convert duration from days to seconds
191
+ duration_in_seconds = (
192
+ self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
193
+ )
194
+
195
+ # Check if duration has elapsed
196
+ if current_time - last_updated_at >= duration_in_seconds:
197
+ # Reset cost if duration has elapsed and update the creation time
198
+ self.reset_cost(user)
199
+ self.user_dict[user]["last_updated_at"] = current_time
200
+ self._save_data_thread() # Save the data
201
+
202
+ def update_budget_all_users(self):
203
+ for user in self.get_users():
204
+ if "duration" in self.user_dict[user]:
205
+ self.reset_on_duration(user)
206
+
207
+ def _save_data_thread(self):
208
+ thread = threading.Thread(
209
+ target=self.save_data
210
+ ) # [Non-Blocking]: saves data without blocking execution
211
+ thread.start()
212
+
213
+ def save_data(self):
214
+ if self.client_type == "local":
215
+ import json
216
+
217
+ # save the user dict
218
+ with open("user_cost.json", "w") as json_file:
219
+ json.dump(
220
+ self.user_dict, json_file, indent=4
221
+ ) # Indent for pretty formatting
222
+ return {"status": "success"}
223
+ elif self.client_type == "hosted":
224
+ url = self.api_base + "/set_budget"
225
+ data = {"project_name": self.project_name, "user_dict": self.user_dict}
226
+ response = litellm.module_level_client.post(
227
+ url, headers=self.headers, json=data
228
+ )
229
+ response = response.json()
230
+ return response
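
For reference, a short sketch of how `BudgetManager` is meant to be used (assumes an OpenAI key is configured; the project and user names are hypothetical):

```python
import litellm
from litellm import BudgetManager, completion  # assumes BudgetManager is re-exported at the package root

budget_manager = BudgetManager(project_name="demo_project")

user = "user_1234"
if not budget_manager.is_valid_user(user):
    # Give this user a $10 budget that resets monthly.
    budget_manager.create_budget(total_budget=10.0, user=user, duration="monthly")

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)

# Record the spend for this call and compare against the budget.
budget_manager.update_cost(user=user, completion_obj=response)
print(budget_manager.get_current_cost(user), "/", budget_manager.get_total_budget(user))
```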
litellm/caching/Readme.md ADDED
@@ -0,0 +1,40 @@
1
+ # Caching on LiteLLM
2
+
3
+ LiteLLM supports multiple caching mechanisms. This allows users to choose the most suitable caching solution for their use case.
4
+
5
+ The following caching mechanisms are supported:
6
+
7
+ 1. **RedisCache**
8
+ 2. **RedisSemanticCache**
9
+ 3. **QdrantSemanticCache**
10
+ 4. **InMemoryCache**
11
+ 5. **DiskCache**
12
+ 6. **S3Cache**
13
+ 7. **DualCache** (updates both Redis and an in-memory cache simultaneously)
14
+
15
+ ## Folder Structure
16
+
17
+ ```
18
+ litellm/caching/
19
+ ├── base_cache.py
20
+ ├── caching.py
21
+ ├── caching_handler.py
22
+ ├── disk_cache.py
23
+ ├── dual_cache.py
24
+ ├── in_memory_cache.py
25
+ ├── qdrant_semantic_cache.py
26
+ ├── redis_cache.py
27
+ ├── redis_semantic_cache.py
28
+ ├── s3_cache.py
29
+ ```
30
+
31
+ ## Documentation
32
+ - [Caching on LiteLLM Gateway](https://docs.litellm.ai/docs/proxy/caching)
33
+ - [Caching on LiteLLM Python](https://docs.litellm.ai/docs/caching/all_caches)
34
+
35
+
36
+
37
+
38
+
39
+
40
+
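
For reference, a minimal sketch of enabling the local in-memory cache described in this Readme (assumes an OpenAI key is configured; other backends are selected by passing the corresponding `type` and connection kwargs):

```python
import litellm
from litellm import completion
from litellm.caching import Cache, LiteLLMCacheType

# Register a process-wide cache; LiteLLMCacheType.REDIS etc. select other backends.
litellm.cache = Cache(type=LiteLLMCacheType.LOCAL)

messages = [{"role": "user", "content": "What is 2 + 2?"}]

# The second identical call should be served from the cache.
first = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
second = completion(model="gpt-3.5-turbo", messages=messages, caching=True)
```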
litellm/caching/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .caching import Cache, LiteLLMCacheType
2
+ from .disk_cache import DiskCache
3
+ from .dual_cache import DualCache
4
+ from .in_memory_cache import InMemoryCache
5
+ from .qdrant_semantic_cache import QdrantSemanticCache
6
+ from .redis_cache import RedisCache
7
+ from .redis_cluster_cache import RedisClusterCache
8
+ from .redis_semantic_cache import RedisSemanticCache
9
+ from .s3_cache import S3Cache
litellm/caching/_internal_lru_cache.py ADDED
@@ -0,0 +1,30 @@
1
+ from functools import lru_cache
2
+ from typing import Callable, Optional, TypeVar
3
+
4
+ T = TypeVar("T")
5
+
6
+
7
+ def lru_cache_wrapper(
8
+ maxsize: Optional[int] = None,
9
+ ) -> Callable[[Callable[..., T]], Callable[..., T]]:
10
+ """
11
+ Wrapper for lru_cache that caches success and exceptions
12
+ """
13
+
14
+ def decorator(f: Callable[..., T]) -> Callable[..., T]:
15
+ @lru_cache(maxsize=maxsize)
16
+ def wrapper(*args, **kwargs):
17
+ try:
18
+ return ("success", f(*args, **kwargs))
19
+ except Exception as e:
20
+ return ("error", e)
21
+
22
+ def wrapped(*args, **kwargs):
23
+ result = wrapper(*args, **kwargs)
24
+ if result[0] == "error":
25
+ raise result[1]
26
+ return result[1]
27
+
28
+ return wrapped
29
+
30
+ return decorator
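
For reference, a small sketch of what `lru_cache_wrapper` adds over `functools.lru_cache`: the exception raised by a failing call is cached too, so the wrapped function is not re-invoked for the same arguments (the function below is hypothetical):

```python
from litellm.caching._internal_lru_cache import lru_cache_wrapper


@lru_cache_wrapper(maxsize=16)
def parse_positive(value: str) -> int:
    parsed = int(value)  # raises ValueError for non-numeric input
    if parsed <= 0:
        raise ValueError("expected a positive integer")
    return parsed


print(parse_positive("42"))  # computed once
print(parse_positive("42"))  # served from the cache
try:
    parse_positive("oops")   # the ValueError itself is cached and re-raised on repeat calls
except ValueError as err:
    print(err)
```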
litellm/caching/base_cache.py ADDED
@@ -0,0 +1,55 @@
1
+ """
2
+ Base Cache implementation. All cache implementations should inherit from this class.
3
+
4
+ Has 4 methods:
5
+ - set_cache
6
+ - get_cache
7
+ - async_set_cache
8
+ - async_get_cache
9
+ """
10
+
11
+ from abc import ABC, abstractmethod
12
+ from typing import TYPE_CHECKING, Any, Optional, Union
13
+
14
+ if TYPE_CHECKING:
15
+ from opentelemetry.trace import Span as _Span
16
+
17
+ Span = Union[_Span, Any]
18
+ else:
19
+ Span = Any
20
+
21
+
22
+ class BaseCache(ABC):
23
+ def __init__(self, default_ttl: int = 60):
24
+ self.default_ttl = default_ttl
25
+
26
+ def get_ttl(self, **kwargs) -> Optional[int]:
27
+ kwargs_ttl: Optional[int] = kwargs.get("ttl")
28
+ if kwargs_ttl is not None:
29
+ try:
30
+ return int(kwargs_ttl)
31
+ except ValueError:
32
+ return self.default_ttl
33
+ return self.default_ttl
34
+
35
+ def set_cache(self, key, value, **kwargs):
36
+ raise NotImplementedError
37
+
38
+ async def async_set_cache(self, key, value, **kwargs):
39
+ raise NotImplementedError
40
+
41
+ @abstractmethod
42
+ async def async_set_cache_pipeline(self, cache_list, **kwargs):
43
+ pass
44
+
45
+ def get_cache(self, key, **kwargs):
46
+ raise NotImplementedError
47
+
48
+ async def async_get_cache(self, key, **kwargs):
49
+ raise NotImplementedError
50
+
51
+ async def batch_cache_write(self, key, value, **kwargs):
52
+ raise NotImplementedError
53
+
54
+ async def disconnect(self):
55
+ raise NotImplementedError
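
For reference, a toy sketch of a backend implementing this interface; it ignores TTLs and exists only to show which methods a real backend such as `RedisCache` or `DiskCache` fills in:

```python
from typing import Any, Dict, Optional

from litellm.caching.base_cache import BaseCache


class TrivialDictCache(BaseCache):
    """Toy in-process cache used purely to illustrate the BaseCache interface."""

    def __init__(self, default_ttl: int = 60):
        super().__init__(default_ttl=default_ttl)
        self._store: Dict[str, Any] = {}

    def set_cache(self, key, value, **kwargs):
        self._store[key] = value

    def get_cache(self, key, **kwargs) -> Optional[Any]:
        return self._store.get(key)

    async def async_set_cache(self, key, value, **kwargs):
        self.set_cache(key, value, **kwargs)

    async def async_get_cache(self, key, **kwargs) -> Optional[Any]:
        return self.get_cache(key, **kwargs)

    async def async_set_cache_pipeline(self, cache_list, **kwargs):
        # Bulk write: cache_list is a list of (key, value) tuples.
        for key, value in cache_list:
            self.set_cache(key, value, **kwargs)
```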
litellm/caching/caching.py ADDED
@@ -0,0 +1,818 @@
1
+ # +-----------------------------------------------+
2
+ # | |
3
+ # | Give Feedback / Get Help |
4
+ # | https://github.com/BerriAI/litellm/issues/new |
5
+ # | |
6
+ # +-----------------------------------------------+
7
+ #
8
+ # Thank you users! We ❤️ you! - Krrish & Ishaan
9
+
10
+ import ast
11
+ import hashlib
12
+ import json
13
+ import time
14
+ import traceback
15
+ from enum import Enum
16
+ from typing import Any, Dict, List, Optional, Tuple, Union
17
+
18
+ from pydantic import BaseModel
19
+
20
+ import litellm
21
+ from litellm._logging import verbose_logger
22
+ from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
23
+ from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
24
+ from litellm.types.caching import *
25
+ from litellm.types.utils import EmbeddingResponse, all_litellm_params
26
+
27
+ from .base_cache import BaseCache
28
+ from .disk_cache import DiskCache
29
+ from .dual_cache import DualCache # noqa
30
+ from .in_memory_cache import InMemoryCache
31
+ from .qdrant_semantic_cache import QdrantSemanticCache
32
+ from .redis_cache import RedisCache
33
+ from .redis_cluster_cache import RedisClusterCache
34
+ from .redis_semantic_cache import RedisSemanticCache
35
+ from .s3_cache import S3Cache
36
+
37
+
38
+ def print_verbose(print_statement):
39
+ try:
40
+ verbose_logger.debug(print_statement)
41
+ if litellm.set_verbose:
42
+ print(print_statement) # noqa
43
+ except Exception:
44
+ pass
45
+
46
+
47
+ class CacheMode(str, Enum):
48
+ default_on = "default_on"
49
+ default_off = "default_off"
50
+
51
+
52
+ #### LiteLLM.Completion / Embedding Cache ####
53
+ class Cache:
54
+ def __init__(
55
+ self,
56
+ type: Optional[LiteLLMCacheType] = LiteLLMCacheType.LOCAL,
57
+ mode: Optional[
58
+ CacheMode
59
+ ] = CacheMode.default_on, # when default_on cache is always on, when default_off cache is opt in
60
+ host: Optional[str] = None,
61
+ port: Optional[str] = None,
62
+ password: Optional[str] = None,
63
+ namespace: Optional[str] = None,
64
+ ttl: Optional[float] = None,
65
+ default_in_memory_ttl: Optional[float] = None,
66
+ default_in_redis_ttl: Optional[float] = None,
67
+ similarity_threshold: Optional[float] = None,
68
+ supported_call_types: Optional[List[CachingSupportedCallTypes]] = [
69
+ "completion",
70
+ "acompletion",
71
+ "embedding",
72
+ "aembedding",
73
+ "atranscription",
74
+ "transcription",
75
+ "atext_completion",
76
+ "text_completion",
77
+ "arerank",
78
+ "rerank",
79
+ ],
80
+ # s3 Bucket, boto3 configuration
81
+ s3_bucket_name: Optional[str] = None,
82
+ s3_region_name: Optional[str] = None,
83
+ s3_api_version: Optional[str] = None,
84
+ s3_use_ssl: Optional[bool] = True,
85
+ s3_verify: Optional[Union[bool, str]] = None,
86
+ s3_endpoint_url: Optional[str] = None,
87
+ s3_aws_access_key_id: Optional[str] = None,
88
+ s3_aws_secret_access_key: Optional[str] = None,
89
+ s3_aws_session_token: Optional[str] = None,
90
+ s3_config: Optional[Any] = None,
91
+ s3_path: Optional[str] = None,
92
+ redis_semantic_cache_embedding_model: str = "text-embedding-ada-002",
93
+ redis_semantic_cache_index_name: Optional[str] = None,
94
+ redis_flush_size: Optional[int] = None,
95
+ redis_startup_nodes: Optional[List] = None,
96
+ disk_cache_dir: Optional[str] = None,
97
+ qdrant_api_base: Optional[str] = None,
98
+ qdrant_api_key: Optional[str] = None,
99
+ qdrant_collection_name: Optional[str] = None,
100
+ qdrant_quantization_config: Optional[str] = None,
101
+ qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
102
+ **kwargs,
103
+ ):
104
+ """
105
+ Initializes the cache based on the given type.
106
+
107
+ Args:
108
+ type (str, optional): The type of cache to initialize. Can be "local", "redis", "redis-semantic", "qdrant-semantic", "s3" or "disk". Defaults to "local".
109
+
110
+ # Redis Cache Args
111
+ host (str, optional): The host address for the Redis cache. Required if type is "redis".
112
+ port (int, optional): The port number for the Redis cache. Required if type is "redis".
113
+ password (str, optional): The password for the Redis cache. Required if type is "redis".
114
+ namespace (str, optional): The namespace for the Redis cache. Required if type is "redis".
115
+ ttl (float, optional): The ttl for the Redis cache
116
+ redis_flush_size (int, optional): The number of keys to flush at a time. Defaults to 1000. Only used if batch redis set caching is used.
117
+ redis_startup_nodes (list, optional): The list of startup nodes for the Redis cache. Defaults to None.
118
+
119
+ # Qdrant Cache Args
120
+ qdrant_api_base (str, optional): The url for your qdrant cluster. Required if type is "qdrant-semantic".
121
+ qdrant_api_key (str, optional): The api_key for the local or cloud qdrant cluster.
122
+ qdrant_collection_name (str, optional): The name for your qdrant collection. Required if type is "qdrant-semantic".
123
+ similarity_threshold (float, optional): The similarity threshold for semantic-caching, Required if type is "redis-semantic" or "qdrant-semantic".
124
+
125
+ # Disk Cache Args
126
+ disk_cache_dir (str, optional): The directory for the disk cache. Defaults to None.
127
+
128
+ # S3 Cache Args
129
+ s3_bucket_name (str, optional): The bucket name for the s3 cache. Defaults to None.
130
+ s3_region_name (str, optional): The region name for the s3 cache. Defaults to None.
131
+ s3_api_version (str, optional): The api version for the s3 cache. Defaults to None.
132
+ s3_use_ssl (bool, optional): The use ssl for the s3 cache. Defaults to True.
133
+ s3_verify (bool, optional): The verify for the s3 cache. Defaults to None.
134
+ s3_endpoint_url (str, optional): The endpoint url for the s3 cache. Defaults to None.
135
+ s3_aws_access_key_id (str, optional): The aws access key id for the s3 cache. Defaults to None.
136
+ s3_aws_secret_access_key (str, optional): The aws secret access key for the s3 cache. Defaults to None.
137
+ s3_aws_session_token (str, optional): The aws session token for the s3 cache. Defaults to None.
138
+ s3_config (dict, optional): The config for the s3 cache. Defaults to None.
139
+
140
+ # Common Cache Args
141
+ supported_call_types (list, optional): List of call types to cache for. Defaults to cache == on for all call types.
142
+ **kwargs: Additional keyword arguments for redis.Redis() cache
143
+
144
+ Raises:
145
+ ValueError: If an invalid cache type is provided.
146
+
147
+ Returns:
148
+ None. Cache is set as a litellm param
149
+ """
150
+ if type == LiteLLMCacheType.REDIS:
151
+ if redis_startup_nodes:
152
+ self.cache: BaseCache = RedisClusterCache(
153
+ host=host,
154
+ port=port,
155
+ password=password,
156
+ redis_flush_size=redis_flush_size,
157
+ startup_nodes=redis_startup_nodes,
158
+ **kwargs,
159
+ )
160
+ else:
161
+ self.cache = RedisCache(
162
+ host=host,
163
+ port=port,
164
+ password=password,
165
+ redis_flush_size=redis_flush_size,
166
+ **kwargs,
167
+ )
168
+ elif type == LiteLLMCacheType.REDIS_SEMANTIC:
169
+ self.cache = RedisSemanticCache(
170
+ host=host,
171
+ port=port,
172
+ password=password,
173
+ similarity_threshold=similarity_threshold,
174
+ embedding_model=redis_semantic_cache_embedding_model,
175
+ index_name=redis_semantic_cache_index_name,
176
+ **kwargs,
177
+ )
178
+ elif type == LiteLLMCacheType.QDRANT_SEMANTIC:
179
+ self.cache = QdrantSemanticCache(
180
+ qdrant_api_base=qdrant_api_base,
181
+ qdrant_api_key=qdrant_api_key,
182
+ collection_name=qdrant_collection_name,
183
+ similarity_threshold=similarity_threshold,
184
+ quantization_config=qdrant_quantization_config,
185
+ embedding_model=qdrant_semantic_cache_embedding_model,
186
+ )
187
+ elif type == LiteLLMCacheType.LOCAL:
188
+ self.cache = InMemoryCache()
189
+ elif type == LiteLLMCacheType.S3:
190
+ self.cache = S3Cache(
191
+ s3_bucket_name=s3_bucket_name,
192
+ s3_region_name=s3_region_name,
193
+ s3_api_version=s3_api_version,
194
+ s3_use_ssl=s3_use_ssl,
195
+ s3_verify=s3_verify,
196
+ s3_endpoint_url=s3_endpoint_url,
197
+ s3_aws_access_key_id=s3_aws_access_key_id,
198
+ s3_aws_secret_access_key=s3_aws_secret_access_key,
199
+ s3_aws_session_token=s3_aws_session_token,
200
+ s3_config=s3_config,
201
+ s3_path=s3_path,
202
+ **kwargs,
203
+ )
204
+ elif type == LiteLLMCacheType.DISK:
205
+ self.cache = DiskCache(disk_cache_dir=disk_cache_dir)
206
+ if "cache" not in litellm.input_callback:
207
+ litellm.input_callback.append("cache")
208
+ if "cache" not in litellm.success_callback:
209
+ litellm.logging_callback_manager.add_litellm_success_callback("cache")
210
+ if "cache" not in litellm._async_success_callback:
211
+ litellm.logging_callback_manager.add_litellm_async_success_callback("cache")
212
+ self.supported_call_types = supported_call_types # default to ["completion", "acompletion", "embedding", "aembedding"]
213
+ self.type = type
214
+ self.namespace = namespace
215
+ self.redis_flush_size = redis_flush_size
216
+ self.ttl = ttl
217
+ self.mode: CacheMode = mode or CacheMode.default_on
218
+
219
+ if self.type == LiteLLMCacheType.LOCAL and default_in_memory_ttl is not None:
220
+ self.ttl = default_in_memory_ttl
221
+
222
+ if (
223
+ self.type == LiteLLMCacheType.REDIS
224
+ or self.type == LiteLLMCacheType.REDIS_SEMANTIC
225
+ ) and default_in_redis_ttl is not None:
226
+ self.ttl = default_in_redis_ttl
227
+
228
+ if self.namespace is not None and isinstance(self.cache, RedisCache):
229
+ self.cache.namespace = self.namespace
230
+
231
+ def get_cache_key(self, **kwargs) -> str:
232
+ """
233
+ Get the cache key for the given arguments.
234
+
235
+ Args:
236
+ **kwargs: kwargs to litellm.completion() or embedding()
237
+
238
+ Returns:
239
+ str: The cache key generated from the arguments, or None if no cache key could be generated.
240
+ """
241
+ cache_key = ""
242
+ # verbose_logger.debug("\nGetting Cache key. Kwargs: %s", kwargs)
243
+
244
+ preset_cache_key = self._get_preset_cache_key_from_kwargs(**kwargs)
245
+ if preset_cache_key is not None:
246
+ verbose_logger.debug("\nReturning preset cache key: %s", preset_cache_key)
247
+ return preset_cache_key
248
+
249
+ combined_kwargs = ModelParamHelper._get_all_llm_api_params()
250
+ litellm_param_kwargs = all_litellm_params
251
+ for param in kwargs:
252
+ if param in combined_kwargs:
253
+ param_value: Optional[str] = self._get_param_value(param, kwargs)
254
+ if param_value is not None:
255
+ cache_key += f"{str(param)}: {str(param_value)}"
256
+ elif (
257
+ param not in litellm_param_kwargs
258
+ ): # check if user passed in optional param - e.g. top_k
259
+ if (
260
+ litellm.enable_caching_on_provider_specific_optional_params is True
261
+ ): # feature flagged for now
262
+ if kwargs[param] is None:
263
+ continue # ignore None params
264
+ param_value = kwargs[param]
265
+ cache_key += f"{str(param)}: {str(param_value)}"
266
+
267
+ verbose_logger.debug("\nCreated cache key: %s", cache_key)
268
+ hashed_cache_key = Cache._get_hashed_cache_key(cache_key)
269
+ hashed_cache_key = self._add_namespace_to_cache_key(hashed_cache_key, **kwargs)
270
+ self._set_preset_cache_key_in_kwargs(
271
+ preset_cache_key=hashed_cache_key, **kwargs
272
+ )
273
+ return hashed_cache_key
274
+
275
+ def _get_param_value(
276
+ self,
277
+ param: str,
278
+ kwargs: dict,
279
+ ) -> Optional[str]:
280
+ """
281
+ Get the value for the given param from kwargs
282
+ """
283
+ if param == "model":
284
+ return self._get_model_param_value(kwargs)
285
+ elif param == "file":
286
+ return self._get_file_param_value(kwargs)
287
+ return kwargs[param]
288
+
289
+ def _get_model_param_value(self, kwargs: dict) -> str:
290
+ """
291
+ Handles getting the value for the 'model' param from kwargs
292
+
293
+ 1. If caching groups are set, then return the caching group as the model https://docs.litellm.ai/docs/routing#caching-across-model-groups
294
+ 2. Else if a model_group is set, then return the model_group as the model. This is used for all requests sent through the litellm.Router()
295
+ 3. Else use the `model` passed in kwargs
296
+ """
297
+ metadata: Dict = kwargs.get("metadata", {}) or {}
298
+ litellm_params: Dict = kwargs.get("litellm_params", {}) or {}
299
+ metadata_in_litellm_params: Dict = litellm_params.get("metadata", {}) or {}
300
+ model_group: Optional[str] = metadata.get(
301
+ "model_group"
302
+ ) or metadata_in_litellm_params.get("model_group")
303
+ caching_group = self._get_caching_group(metadata, model_group)
304
+ return caching_group or model_group or kwargs["model"]
305
+
306
+ def _get_caching_group(
307
+ self, metadata: dict, model_group: Optional[str]
308
+ ) -> Optional[str]:
309
+ caching_groups: Optional[List] = metadata.get("caching_groups", [])
310
+ if caching_groups:
311
+ for group in caching_groups:
312
+ if model_group in group:
313
+ return str(group)
314
+ return None
315
+
316
+ def _get_file_param_value(self, kwargs: dict) -> str:
317
+ """
318
+ Handles getting the value for the 'file' param from kwargs. Used for `transcription` requests
319
+ """
320
+ file = kwargs.get("file")
321
+ metadata = kwargs.get("metadata", {})
322
+ litellm_params = kwargs.get("litellm_params", {})
323
+ return (
324
+ metadata.get("file_checksum")
325
+ or getattr(file, "name", None)
326
+ or metadata.get("file_name")
327
+ or litellm_params.get("file_name")
328
+ )
329
+
330
+ def _get_preset_cache_key_from_kwargs(self, **kwargs) -> Optional[str]:
331
+ """
332
+ Get the preset cache key from kwargs["litellm_params"]
333
+
334
+ We use _get_preset_cache_keys for two reasons
335
+
336
+ 1. optional params like max_tokens, get transformed for bedrock -> max_new_tokens
337
+ 2. avoid doing duplicate / repeated work
338
+ """
339
+ if kwargs:
340
+ if "litellm_params" in kwargs:
341
+ return kwargs["litellm_params"].get("preset_cache_key", None)
342
+ return None
343
+
344
+ def _set_preset_cache_key_in_kwargs(self, preset_cache_key: str, **kwargs) -> None:
345
+ """
346
+ Set the calculated cache key in kwargs
347
+
348
+ This is used to avoid doing duplicate / repeated work
349
+
350
+ Placed in kwargs["litellm_params"]
351
+ """
352
+ if kwargs:
353
+ if "litellm_params" in kwargs:
354
+ kwargs["litellm_params"]["preset_cache_key"] = preset_cache_key
355
+
356
+ @staticmethod
357
+ def _get_hashed_cache_key(cache_key: str) -> str:
358
+ """
359
+ Get the hashed cache key for the given cache key.
360
+
361
+ Use hashlib to create a sha256 hash of the cache key
362
+
363
+ Args:
364
+ cache_key (str): The cache key to hash.
365
+
366
+ Returns:
367
+ str: The hashed cache key.
368
+ """
369
+ hash_object = hashlib.sha256(cache_key.encode())
370
+ # Hexadecimal representation of the hash
371
+ hash_hex = hash_object.hexdigest()
372
+ verbose_logger.debug("Hashed cache key (SHA-256): %s", hash_hex)
373
+ return hash_hex
374
+
375
+ def _add_namespace_to_cache_key(self, hash_hex: str, **kwargs) -> str:
376
+ """
377
+ If a redis namespace is provided, add it to the cache key
378
+
379
+ Args:
380
+ hash_hex (str): The hashed cache key.
381
+ **kwargs: Additional keyword arguments.
382
+
383
+ Returns:
384
+ str: The final hashed cache key with the redis namespace.
385
+ """
386
+ dynamic_cache_control: DynamicCacheControl = kwargs.get("cache", {})
387
+ namespace = (
388
+ dynamic_cache_control.get("namespace")
389
+ or kwargs.get("metadata", {}).get("redis_namespace")
390
+ or self.namespace
391
+ )
392
+ if namespace:
393
+ hash_hex = f"{namespace}:{hash_hex}"
394
+ verbose_logger.debug("Final hashed key: %s", hash_hex)
395
+ return hash_hex
396
+
397
+ def generate_streaming_content(self, content):
398
+ chunk_size = 5 # Adjust the chunk size as needed
399
+ for i in range(0, len(content), chunk_size):
400
+ yield {
401
+ "choices": [
402
+ {
403
+ "delta": {
404
+ "role": "assistant",
405
+ "content": content[i : i + chunk_size],
406
+ }
407
+ }
408
+ ]
409
+ }
410
+ time.sleep(CACHED_STREAMING_CHUNK_DELAY)
411
+
412
+ def _get_cache_logic(
413
+ self,
414
+ cached_result: Optional[Any],
415
+ max_age: Optional[float],
416
+ ):
417
+ """
418
+ Common get cache logic across sync + async implementations
419
+ """
420
+ # Check if a timestamp was stored with the cached response
421
+ if (
422
+ cached_result is not None
423
+ and isinstance(cached_result, dict)
424
+ and "timestamp" in cached_result
425
+ ):
426
+ timestamp = cached_result["timestamp"]
427
+ current_time = time.time()
428
+
429
+ # Calculate age of the cached response
430
+ response_age = current_time - timestamp
431
+
432
+ # Check if the cached response is older than the max-age
433
+ if max_age is not None and response_age > max_age:
434
+ return None # Cached response is too old
435
+
436
+ # If the response is fresh, or there's no max-age requirement, return the cached response
437
+ # cached_response may be a serialized string (e.g. b"{...}"); convert it back to a dict
438
+ cached_response = cached_result.get("response")
439
+ try:
440
+ if isinstance(cached_response, dict):
441
+ pass
442
+ else:
443
+ cached_response = json.loads(
444
+ cached_response # type: ignore
445
+ ) # Convert string to dictionary
446
+ except Exception:
447
+ cached_response = ast.literal_eval(cached_response) # type: ignore
448
+ return cached_response
449
+ return cached_result
450
+
451
+ def get_cache(self, **kwargs):
452
+ """
453
+ Retrieves the cached result for the given arguments.
454
+
455
+ Args:
456
+ *args: args to litellm.completion() or embedding()
457
+ **kwargs: kwargs to litellm.completion() or embedding()
458
+
459
+ Returns:
460
+ The cached result if it exists, otherwise None.
461
+ """
462
+ try: # never block execution
463
+ if self.should_use_cache(**kwargs) is not True:
464
+ return
465
+ messages = kwargs.get("messages", [])
466
+ if "cache_key" in kwargs:
467
+ cache_key = kwargs["cache_key"]
468
+ else:
469
+ cache_key = self.get_cache_key(**kwargs)
470
+ if cache_key is not None:
471
+ cache_control_args: DynamicCacheControl = kwargs.get("cache", {})
472
+ max_age = (
473
+ cache_control_args.get("s-maxage")
474
+ or cache_control_args.get("s-max-age")
475
+ or float("inf")
476
+ )
477
+ cached_result = self.cache.get_cache(cache_key, messages=messages)
479
+ return self._get_cache_logic(
480
+ cached_result=cached_result, max_age=max_age
481
+ )
482
+ except Exception:
483
+ print_verbose(f"An exception occurred: {traceback.format_exc()}")
484
+ return None
485
+
486
+ async def async_get_cache(self, **kwargs):
487
+ """
488
+ Async get cache implementation.
489
+
490
+ Used for embedding calls in async wrapper
491
+ """
492
+
493
+ try: # never block execution
494
+ if self.should_use_cache(**kwargs) is not True:
495
+ return
496
+
497
+ kwargs.get("messages", [])
498
+ if "cache_key" in kwargs:
499
+ cache_key = kwargs["cache_key"]
500
+ else:
501
+ cache_key = self.get_cache_key(**kwargs)
502
+ if cache_key is not None:
503
+ cache_control_args = kwargs.get("cache", {})
504
+ max_age = cache_control_args.get(
505
+ "s-max-age", cache_control_args.get("s-maxage", float("inf"))
506
+ )
507
+ cached_result = await self.cache.async_get_cache(cache_key, **kwargs)
508
+ return self._get_cache_logic(
509
+ cached_result=cached_result, max_age=max_age
510
+ )
511
+ except Exception:
512
+ print_verbose(f"An exception occurred: {traceback.format_exc()}")
513
+ return None
514
+
515
+ def _add_cache_logic(self, result, **kwargs):
516
+ """
517
+ Common implementation across sync + async add_cache functions
518
+ """
519
+ try:
520
+ if "cache_key" in kwargs:
521
+ cache_key = kwargs["cache_key"]
522
+ else:
523
+ cache_key = self.get_cache_key(**kwargs)
524
+ if cache_key is not None:
525
+ if isinstance(result, BaseModel):
526
+ result = result.model_dump_json()
527
+
528
+ ## DEFAULT TTL ##
529
+ if self.ttl is not None:
530
+ kwargs["ttl"] = self.ttl
531
+ ## Get Cache-Controls ##
532
+ _cache_kwargs = kwargs.get("cache", None)
533
+ if isinstance(_cache_kwargs, dict):
534
+ for k, v in _cache_kwargs.items():
535
+ if k == "ttl":
536
+ kwargs["ttl"] = v
537
+
538
+ cached_data = {"timestamp": time.time(), "response": result}
539
+ return cache_key, cached_data, kwargs
540
+ else:
541
+ raise Exception("cache key is None")
542
+ except Exception as e:
543
+ raise e
544
+
545
+ def add_cache(self, result, **kwargs):
546
+ """
547
+ Adds a result to the cache.
548
+
549
+ Args:
550
+ *args: args to litellm.completion() or embedding()
551
+ **kwargs: kwargs to litellm.completion() or embedding()
552
+
553
+ Returns:
554
+ None
555
+ """
556
+ try:
557
+ if self.should_use_cache(**kwargs) is not True:
558
+ return
559
+ cache_key, cached_data, kwargs = self._add_cache_logic(
560
+ result=result, **kwargs
561
+ )
562
+ self.cache.set_cache(cache_key, cached_data, **kwargs)
563
+ except Exception as e:
564
+ verbose_logger.exception(f"LiteLLM Cache: Exception add_cache: {str(e)}")
565
+
566
+ async def async_add_cache(self, result, **kwargs):
567
+ """
568
+ Async implementation of add_cache
569
+ """
570
+ try:
571
+ if self.should_use_cache(**kwargs) is not True:
572
+ return
573
+ if self.type == "redis" and self.redis_flush_size is not None:
574
+ # high traffic - fill in results in memory and then flush
575
+ await self.batch_cache_write(result, **kwargs)
576
+ else:
577
+ cache_key, cached_data, kwargs = self._add_cache_logic(
578
+ result=result, **kwargs
579
+ )
580
+
581
+ await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
582
+ except Exception as e:
583
+ verbose_logger.exception(f"LiteLLM Cache: Exception add_cache: {str(e)}")
584
+
585
+ def add_embedding_response_to_cache(
586
+ self,
587
+ result: EmbeddingResponse,
588
+ input: str,
589
+ kwargs: dict,
590
+ idx_in_result_data: int = 0,
591
+ ) -> Tuple[str, dict, dict]:
592
+ preset_cache_key = self.get_cache_key(**{**kwargs, "input": input})
593
+ kwargs["cache_key"] = preset_cache_key
594
+ embedding_response = result.data[idx_in_result_data]
595
+ cache_key, cached_data, kwargs = self._add_cache_logic(
596
+ result=embedding_response,
597
+ **kwargs,
598
+ )
599
+ return cache_key, cached_data, kwargs
600
+
601
+ async def async_add_cache_pipeline(self, result, **kwargs):
602
+ """
603
+ Async implementation of add_cache for Embedding calls
604
+
605
+ Does a bulk write, to prevent using too many clients
606
+ """
607
+ try:
608
+ if self.should_use_cache(**kwargs) is not True:
609
+ return
610
+
611
+ # set default ttl if not set
612
+ if self.ttl is not None:
613
+ kwargs["ttl"] = self.ttl
614
+
615
+ cache_list = []
616
+ if isinstance(kwargs["input"], list):
617
+ for idx, i in enumerate(kwargs["input"]):
618
+ (
619
+ cache_key,
620
+ cached_data,
621
+ kwargs,
622
+ ) = self.add_embedding_response_to_cache(result, i, kwargs, idx)
623
+ cache_list.append((cache_key, cached_data))
624
+ elif isinstance(kwargs["input"], str):
625
+ cache_key, cached_data, kwargs = self.add_embedding_response_to_cache(
626
+ result, kwargs["input"], kwargs
627
+ )
628
+ cache_list.append((cache_key, cached_data))
629
+
630
+ await self.cache.async_set_cache_pipeline(cache_list=cache_list, **kwargs)
631
+ # if async_set_cache_pipeline:
632
+ # await async_set_cache_pipeline(cache_list=cache_list, **kwargs)
633
+ # else:
634
+ # tasks = []
635
+ # for val in cache_list:
636
+ # tasks.append(self.cache.async_set_cache(val[0], val[1], **kwargs))
637
+ # await asyncio.gather(*tasks)
638
+ except Exception as e:
639
+ verbose_logger.exception(f"LiteLLM Cache: Exception add_cache: {str(e)}")
640
+
641
+ def should_use_cache(self, **kwargs):
642
+ """
643
+ Returns true if we should use the cache for LLM API calls
644
+
645
+ If cache is default_on then this is True
646
+ If cache is default_off then this is only true when user has opted in to use cache
647
+ """
648
+ if self.mode == CacheMode.default_on:
649
+ return True
650
+
651
+ # when mode == default_off -> Cache is opt in only
652
+ _cache = kwargs.get("cache", None)
653
+ verbose_logger.debug("should_use_cache: kwargs: %s; _cache: %s", kwargs, _cache)
654
+ if _cache and isinstance(_cache, dict):
655
+ if _cache.get("use-cache", False) is True:
656
+ return True
657
+ return False
658
+
659
+ async def batch_cache_write(self, result, **kwargs):
660
+ cache_key, cached_data, kwargs = self._add_cache_logic(result=result, **kwargs)
661
+ await self.cache.batch_cache_write(cache_key, cached_data, **kwargs)
662
+
663
+ async def ping(self):
664
+ cache_ping = getattr(self.cache, "ping", None)
665
+ if cache_ping:
666
+ return await cache_ping()
667
+ return None
668
+
669
+ async def delete_cache_keys(self, keys):
670
+ cache_delete_cache_keys = getattr(self.cache, "delete_cache_keys", None)
671
+ if cache_delete_cache_keys:
672
+ return await cache_delete_cache_keys(keys)
673
+ return None
674
+
675
+ async def disconnect(self):
676
+ if hasattr(self.cache, "disconnect"):
677
+ await self.cache.disconnect()
678
+
679
+ def _supports_async(self) -> bool:
680
+ """
681
+ Internal method to check if the cache type supports async get/set operations
682
+
683
+ Only the S3 cache does NOT support async operations.
684
+
685
+ """
686
+ if self.type and self.type == LiteLLMCacheType.S3:
687
+ return False
688
+ return True
689
+
690
+
691
+ def enable_cache(
692
+ type: Optional[LiteLLMCacheType] = LiteLLMCacheType.LOCAL,
693
+ host: Optional[str] = None,
694
+ port: Optional[str] = None,
695
+ password: Optional[str] = None,
696
+ supported_call_types: Optional[List[CachingSupportedCallTypes]] = [
697
+ "completion",
698
+ "acompletion",
699
+ "embedding",
700
+ "aembedding",
701
+ "atranscription",
702
+ "transcription",
703
+ "atext_completion",
704
+ "text_completion",
705
+ "arerank",
706
+ "rerank",
707
+ ],
708
+ **kwargs,
709
+ ):
710
+ """
711
+ Enable cache with the specified configuration.
712
+
713
+ Args:
714
+ type (Optional[Literal["local", "redis", "s3", "disk"]]): The type of cache to enable. Defaults to "local".
715
+ host (Optional[str]): The host address of the cache server. Defaults to None.
716
+ port (Optional[str]): The port number of the cache server. Defaults to None.
717
+ password (Optional[str]): The password for the cache server. Defaults to None.
718
+ supported_call_types (Optional[List[Literal["completion", "acompletion", "embedding", "aembedding"]]]):
719
+ The call types the cache applies to. Defaults to all supported call types (completion, embedding, transcription, text_completion, rerank and their async variants).
720
+ **kwargs: Additional keyword arguments.
721
+
722
+ Returns:
723
+ None
724
+
725
+ Raises:
726
+ None
727
+ """
728
+ print_verbose("LiteLLM: Enabling Cache")
729
+ if "cache" not in litellm.input_callback:
730
+ litellm.input_callback.append("cache")
731
+ if "cache" not in litellm.success_callback:
732
+ litellm.logging_callback_manager.add_litellm_success_callback("cache")
733
+ if "cache" not in litellm._async_success_callback:
734
+ litellm.logging_callback_manager.add_litellm_async_success_callback("cache")
735
+
736
+ if litellm.cache is None:
737
+ litellm.cache = Cache(
738
+ type=type,
739
+ host=host,
740
+ port=port,
741
+ password=password,
742
+ supported_call_types=supported_call_types,
743
+ **kwargs,
744
+ )
745
+ print_verbose(f"LiteLLM: Cache enabled, litellm.cache={litellm.cache}")
746
+ print_verbose(f"LiteLLM Cache: {vars(litellm.cache)}")
747
+
748
+
749
+ def update_cache(
750
+ type: Optional[LiteLLMCacheType] = LiteLLMCacheType.LOCAL,
751
+ host: Optional[str] = None,
752
+ port: Optional[str] = None,
753
+ password: Optional[str] = None,
754
+ supported_call_types: Optional[List[CachingSupportedCallTypes]] = [
755
+ "completion",
756
+ "acompletion",
757
+ "embedding",
758
+ "aembedding",
759
+ "atranscription",
760
+ "transcription",
761
+ "atext_completion",
762
+ "text_completion",
763
+ "arerank",
764
+ "rerank",
765
+ ],
766
+ **kwargs,
767
+ ):
768
+ """
769
+ Update the cache for LiteLLM.
770
+
771
+ Args:
772
+ type (Optional[Literal["local", "redis", "s3", "disk"]]): The type of cache. Defaults to "local".
773
+ host (Optional[str]): The host of the cache. Defaults to None.
774
+ port (Optional[str]): The port of the cache. Defaults to None.
775
+ password (Optional[str]): The password for the cache. Defaults to None.
776
+ supported_call_types (Optional[List[Literal["completion", "acompletion", "embedding", "aembedding"]]]):
777
+ The call types the cache applies to. Defaults to all supported call types (completion, embedding, transcription, text_completion, rerank and their async variants).
778
+ **kwargs: Additional keyword arguments for the cache.
779
+
780
+ Returns:
781
+ None
782
+
783
+ """
784
+ print_verbose("LiteLLM: Updating Cache")
785
+ litellm.cache = Cache(
786
+ type=type,
787
+ host=host,
788
+ port=port,
789
+ password=password,
790
+ supported_call_types=supported_call_types,
791
+ **kwargs,
792
+ )
793
+ print_verbose(f"LiteLLM: Cache Updated, litellm.cache={litellm.cache}")
794
+ print_verbose(f"LiteLLM Cache: {vars(litellm.cache)}")
795
+
796
+
797
+ def disable_cache():
798
+ """
799
+ Disable the cache used by LiteLLM.
800
+
801
+ This function disables the cache used by the LiteLLM module. It removes the cache-related callbacks from the input_callback, success_callback, and _async_success_callback lists. It also sets the litellm.cache attribute to None.
802
+
803
+ Parameters:
804
+ None
805
+
806
+ Returns:
807
+ None
808
+ """
809
+ from contextlib import suppress
810
+
811
+ print_verbose("LiteLLM: Disabling Cache")
812
+ with suppress(ValueError):
813
+ litellm.input_callback.remove("cache")
814
+ litellm.success_callback.remove("cache")
815
+ litellm._async_success_callback.remove("cache")
816
+
817
+ litellm.cache = None
818
+ print_verbose(f"LiteLLM: Cache disabled, litellm.cache={litellm.cache}")
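A minimal usage sketch of the helpers above (illustrative only; it assumes enable_cache / disable_cache are re-exported at the package level as litellm.enable_cache / litellm.disable_cache, and that a provider API key such as OPENAI_API_KEY is configured; the model name is an example):

import litellm

litellm.enable_cache()  # registers the "cache" callbacks; defaults to the local in-memory cache

messages = [{"role": "user", "content": "Hello"}]
first = litellm.completion(model="gpt-4o-mini", messages=messages)   # cache miss -> provider call
second = litellm.completion(model="gpt-4o-mini", messages=messages)  # identical request -> served from cache
third = litellm.completion(
    model="gpt-4o-mini",
    messages=messages,
    cache={"no-cache": True},  # per-call opt-out, honored by the get_cache logic above
)

litellm.disable_cache()  # removes the callbacks and sets litellm.cache = None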
litellm/caching/caching_handler.py ADDED
@@ -0,0 +1,938 @@
1
+ """
2
+ This contains LLMCachingHandler
3
+
4
+ This exposes two methods:
5
+ - async_get_cache
6
+ - async_set_cache
7
+
8
+ This file is a wrapper around caching.py
9
+
10
+ This class handles caching logic specific to LLM API requests (completion / embedding / text_completion / transcription, etc.)
11
+
12
+ It uses the configured cache backend (RedisCache, S3Cache, RedisSemanticCache, QdrantSemanticCache, InMemoryCache, DiskCache) based on what the user has set up
13
+
14
+ Each method calls the corresponding method from caching.py
15
+ """
16
+
17
+ import asyncio
18
+ import datetime
19
+ import inspect
20
+ import threading
21
+ from typing import (
22
+ TYPE_CHECKING,
23
+ Any,
24
+ AsyncGenerator,
25
+ Callable,
26
+ Dict,
27
+ Generator,
28
+ List,
29
+ Optional,
30
+ Tuple,
31
+ Union,
32
+ )
33
+
34
+ from pydantic import BaseModel
35
+
36
+ import litellm
37
+ from litellm._logging import print_verbose, verbose_logger
38
+ from litellm.caching.caching import S3Cache
39
+ from litellm.litellm_core_utils.logging_utils import (
40
+ _assemble_complete_response_from_streaming_chunks,
41
+ )
42
+ from litellm.types.rerank import RerankResponse
43
+ from litellm.types.utils import (
44
+ CallTypes,
45
+ Embedding,
46
+ EmbeddingResponse,
47
+ ModelResponse,
48
+ TextCompletionResponse,
49
+ TranscriptionResponse,
50
+ Usage,
51
+ )
52
+
53
+ if TYPE_CHECKING:
54
+ from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
55
+ from litellm.utils import CustomStreamWrapper
56
+ else:
57
+ LiteLLMLoggingObj = Any
58
+ CustomStreamWrapper = Any
59
+
60
+
61
+ class CachingHandlerResponse(BaseModel):
62
+ """
63
+ This is the response object for the caching handler. We need to separate embedding cached responses and (completion / text_completion / transcription) cached responses
64
+
65
+ For embeddings there can be a cache hit for some of the inputs in the list and a cache miss for others
66
+ """
67
+
68
+ cached_result: Optional[Any] = None
69
+ final_embedding_cached_response: Optional[EmbeddingResponse] = None
70
+ embedding_all_elements_cache_hit: bool = False # this is set to True when all elements in the list have a cache hit in the embedding cache, if true return the final_embedding_cached_response no need to make an API call
71
+
72
+
73
+ class LLMCachingHandler:
74
+ def __init__(
75
+ self,
76
+ original_function: Callable,
77
+ request_kwargs: Dict[str, Any],
78
+ start_time: datetime.datetime,
79
+ ):
80
+ self.async_streaming_chunks: List[ModelResponse] = []
81
+ self.sync_streaming_chunks: List[ModelResponse] = []
82
+ self.request_kwargs = request_kwargs
83
+ self.original_function = original_function
84
+ self.start_time = start_time
85
+ pass
86
+
87
+ async def _async_get_cache(
88
+ self,
89
+ model: str,
90
+ original_function: Callable,
91
+ logging_obj: LiteLLMLoggingObj,
92
+ start_time: datetime.datetime,
93
+ call_type: str,
94
+ kwargs: Dict[str, Any],
95
+ args: Optional[Tuple[Any, ...]] = None,
96
+ ) -> CachingHandlerResponse:
97
+ """
98
+ Internal method to get from the cache.
99
+ Handles different call types (embeddings, chat/completions, text_completion, transcription)
100
+ and accordingly returns the cached response
101
+
102
+ Args:
103
+ model: str:
104
+ original_function: Callable:
105
+ logging_obj: LiteLLMLoggingObj:
106
+ start_time: datetime.datetime:
107
+ call_type: str:
108
+ kwargs: Dict[str, Any]:
109
+ args: Optional[Tuple[Any, ...]] = None:
110
+
111
+
112
+ Returns:
113
+ CachingHandlerResponse:
114
+ Raises:
115
+ None
116
+ """
117
+ from litellm.utils import CustomStreamWrapper
118
+
119
+ args = args or ()
120
+
121
+ final_embedding_cached_response: Optional[EmbeddingResponse] = None
122
+ embedding_all_elements_cache_hit: bool = False
123
+ cached_result: Optional[Any] = None
124
+ if (
125
+ (kwargs.get("caching", None) is None and litellm.cache is not None)
126
+ or kwargs.get("caching", False) is True
127
+ ) and (
128
+ kwargs.get("cache", {}).get("no-cache", False) is not True
129
+ ): # allow users to control returning cached responses from the completion function
130
+ if litellm.cache is not None and self._is_call_type_supported_by_cache(
131
+ original_function=original_function
132
+ ):
133
+ verbose_logger.debug("Checking Async Cache")
134
+ cached_result = await self._retrieve_from_cache(
135
+ call_type=call_type,
136
+ kwargs=kwargs,
137
+ args=args,
138
+ )
139
+
140
+ if cached_result is not None and not isinstance(cached_result, list):
141
+ verbose_logger.debug("Cache Hit!")
142
+ cache_hit = True
143
+ end_time = datetime.datetime.now()
144
+ model, _, _, _ = litellm.get_llm_provider(
145
+ model=model,
146
+ custom_llm_provider=kwargs.get("custom_llm_provider", None),
147
+ api_base=kwargs.get("api_base", None),
148
+ api_key=kwargs.get("api_key", None),
149
+ )
150
+ self._update_litellm_logging_obj_environment(
151
+ logging_obj=logging_obj,
152
+ model=model,
153
+ kwargs=kwargs,
154
+ cached_result=cached_result,
155
+ is_async=True,
156
+ )
157
+
158
+ call_type = original_function.__name__
159
+
160
+ cached_result = self._convert_cached_result_to_model_response(
161
+ cached_result=cached_result,
162
+ call_type=call_type,
163
+ kwargs=kwargs,
164
+ logging_obj=logging_obj,
165
+ model=model,
166
+ custom_llm_provider=kwargs.get("custom_llm_provider", None),
167
+ args=args,
168
+ )
169
+ if kwargs.get("stream", False) is False:
170
+ # LOG SUCCESS
171
+ self._async_log_cache_hit_on_callbacks(
172
+ logging_obj=logging_obj,
173
+ cached_result=cached_result,
174
+ start_time=start_time,
175
+ end_time=end_time,
176
+ cache_hit=cache_hit,
177
+ )
178
+ cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
179
+ **kwargs
180
+ )
181
+ if (
182
+ isinstance(cached_result, BaseModel)
183
+ or isinstance(cached_result, CustomStreamWrapper)
184
+ ) and hasattr(cached_result, "_hidden_params"):
185
+ cached_result._hidden_params["cache_key"] = cache_key # type: ignore
186
+ return CachingHandlerResponse(cached_result=cached_result)
187
+ elif (
188
+ call_type == CallTypes.aembedding.value
189
+ and cached_result is not None
190
+ and isinstance(cached_result, list)
191
+ and litellm.cache is not None
192
+ and not isinstance(
193
+ litellm.cache.cache, S3Cache
194
+ ) # s3 doesn't support bulk writing. Exclude.
195
+ ):
196
+ (
197
+ final_embedding_cached_response,
198
+ embedding_all_elements_cache_hit,
199
+ ) = self._process_async_embedding_cached_response(
200
+ final_embedding_cached_response=final_embedding_cached_response,
201
+ cached_result=cached_result,
202
+ kwargs=kwargs,
203
+ logging_obj=logging_obj,
204
+ start_time=start_time,
205
+ model=model,
206
+ )
207
+ return CachingHandlerResponse(
208
+ final_embedding_cached_response=final_embedding_cached_response,
209
+ embedding_all_elements_cache_hit=embedding_all_elements_cache_hit,
210
+ )
211
+ verbose_logger.debug(f"CACHE RESULT: {cached_result}")
212
+ return CachingHandlerResponse(
213
+ cached_result=cached_result,
214
+ final_embedding_cached_response=final_embedding_cached_response,
215
+ )
216
+
217
+ def _sync_get_cache(
218
+ self,
219
+ model: str,
220
+ original_function: Callable,
221
+ logging_obj: LiteLLMLoggingObj,
222
+ start_time: datetime.datetime,
223
+ call_type: str,
224
+ kwargs: Dict[str, Any],
225
+ args: Optional[Tuple[Any, ...]] = None,
226
+ ) -> CachingHandlerResponse:
227
+ from litellm.utils import CustomStreamWrapper
228
+
229
+ args = args or ()
230
+ new_kwargs = kwargs.copy()
231
+ new_kwargs.update(
232
+ convert_args_to_kwargs(
233
+ self.original_function,
234
+ args,
235
+ )
236
+ )
237
+ cached_result: Optional[Any] = None
238
+ if litellm.cache is not None and self._is_call_type_supported_by_cache(
239
+ original_function=original_function
240
+ ):
241
+ print_verbose("Checking Sync Cache")
242
+ cached_result = litellm.cache.get_cache(**new_kwargs)
243
+ if cached_result is not None:
244
+ if "detail" in cached_result:
245
+ # implies an error occurred
246
+ pass
247
+ else:
248
+ call_type = original_function.__name__
249
+ cached_result = self._convert_cached_result_to_model_response(
250
+ cached_result=cached_result,
251
+ call_type=call_type,
252
+ kwargs=kwargs,
253
+ logging_obj=logging_obj,
254
+ model=model,
255
+ custom_llm_provider=kwargs.get("custom_llm_provider", None),
256
+ args=args,
257
+ )
258
+
259
+ # LOG SUCCESS
260
+ cache_hit = True
261
+ end_time = datetime.datetime.now()
262
+ (
263
+ model,
264
+ custom_llm_provider,
265
+ dynamic_api_key,
266
+ api_base,
267
+ ) = litellm.get_llm_provider(
268
+ model=model or "",
269
+ custom_llm_provider=kwargs.get("custom_llm_provider", None),
270
+ api_base=kwargs.get("api_base", None),
271
+ api_key=kwargs.get("api_key", None),
272
+ )
273
+ self._update_litellm_logging_obj_environment(
274
+ logging_obj=logging_obj,
275
+ model=model,
276
+ kwargs=kwargs,
277
+ cached_result=cached_result,
278
+ is_async=False,
279
+ )
280
+
281
+ threading.Thread(
282
+ target=logging_obj.success_handler,
283
+ args=(cached_result, start_time, end_time, cache_hit),
284
+ ).start()
285
+ cache_key = litellm.cache._get_preset_cache_key_from_kwargs(
286
+ **kwargs
287
+ )
288
+ if (
289
+ isinstance(cached_result, BaseModel)
290
+ or isinstance(cached_result, CustomStreamWrapper)
291
+ ) and hasattr(cached_result, "_hidden_params"):
292
+ cached_result._hidden_params["cache_key"] = cache_key # type: ignore
293
+ return CachingHandlerResponse(cached_result=cached_result)
294
+ return CachingHandlerResponse(cached_result=cached_result)
295
+
296
+ def _process_async_embedding_cached_response(
297
+ self,
298
+ final_embedding_cached_response: Optional[EmbeddingResponse],
299
+ cached_result: List[Optional[Dict[str, Any]]],
300
+ kwargs: Dict[str, Any],
301
+ logging_obj: LiteLLMLoggingObj,
302
+ start_time: datetime.datetime,
303
+ model: str,
304
+ ) -> Tuple[Optional[EmbeddingResponse], bool]:
305
+ """
306
+ Returns the final embedding cached response and a boolean indicating if all elements in the list have a cache hit
307
+
308
+ For embedding responses, there can be a cache hit for some of the inputs in the list and a cache miss for others
309
+ This function processes the cached embedding responses and returns the final embedding cached response and a boolean indicating if all elements in the list have a cache hit
310
+
311
+ Args:
312
+ final_embedding_cached_response: Optional[EmbeddingResponse]:
313
+ cached_result: List[Optional[Dict[str, Any]]]:
314
+ kwargs: Dict[str, Any]:
315
+ logging_obj: LiteLLMLoggingObj:
316
+ start_time: datetime.datetime:
317
+ model: str:
318
+
319
+ Returns:
320
+ Tuple[Optional[EmbeddingResponse], bool]:
321
+ Returns the final embedding cached response and a boolean indicating if all elements in the list have a cache hit
322
+
323
+
324
+ """
325
+ embedding_all_elements_cache_hit: bool = False
326
+ remaining_list = []
327
+ non_null_list = []
328
+ for idx, cr in enumerate(cached_result):
329
+ if cr is None:
330
+ remaining_list.append(kwargs["input"][idx])
331
+ else:
332
+ non_null_list.append((idx, cr))
333
+ original_kwargs_input = kwargs["input"]
334
+ kwargs["input"] = remaining_list
335
+ if len(non_null_list) > 0:
336
+ print_verbose(f"EMBEDDING CACHE HIT! - {len(non_null_list)}")
337
+ final_embedding_cached_response = EmbeddingResponse(
338
+ model=kwargs.get("model"),
339
+ data=[None] * len(original_kwargs_input),
340
+ )
341
+ final_embedding_cached_response._hidden_params["cache_hit"] = True
342
+
343
+ prompt_tokens = 0
344
+ for val in non_null_list:
345
+ idx, cr = val # (idx, cr) tuple
346
+ if cr is not None:
347
+ final_embedding_cached_response.data[idx] = Embedding(
348
+ embedding=cr["embedding"],
349
+ index=idx,
350
+ object="embedding",
351
+ )
352
+ if isinstance(original_kwargs_input[idx], str):
353
+ from litellm.utils import token_counter
354
+
355
+ prompt_tokens += token_counter(
356
+ text=original_kwargs_input[idx], count_response_tokens=True
357
+ )
358
+ ## USAGE
359
+ usage = Usage(
360
+ prompt_tokens=prompt_tokens,
361
+ completion_tokens=0,
362
+ total_tokens=prompt_tokens,
363
+ )
364
+ final_embedding_cached_response.usage = usage
365
+ if len(remaining_list) == 0:
366
+ # LOG SUCCESS
367
+ cache_hit = True
368
+ embedding_all_elements_cache_hit = True
369
+ end_time = datetime.datetime.now()
370
+ (
371
+ model,
372
+ custom_llm_provider,
373
+ dynamic_api_key,
374
+ api_base,
375
+ ) = litellm.get_llm_provider(
376
+ model=model,
377
+ custom_llm_provider=kwargs.get("custom_llm_provider", None),
378
+ api_base=kwargs.get("api_base", None),
379
+ api_key=kwargs.get("api_key", None),
380
+ )
381
+
382
+ self._update_litellm_logging_obj_environment(
383
+ logging_obj=logging_obj,
384
+ model=model,
385
+ kwargs=kwargs,
386
+ cached_result=final_embedding_cached_response,
387
+ is_async=True,
388
+ is_embedding=True,
389
+ )
390
+ self._async_log_cache_hit_on_callbacks(
391
+ logging_obj=logging_obj,
392
+ cached_result=final_embedding_cached_response,
393
+ start_time=start_time,
394
+ end_time=end_time,
395
+ cache_hit=cache_hit,
396
+ )
397
+ return final_embedding_cached_response, embedding_all_elements_cache_hit
398
+ return final_embedding_cached_response, embedding_all_elements_cache_hit
399
+
400
+ def combine_usage(self, usage1: Usage, usage2: Usage) -> Usage:
401
+ return Usage(
402
+ prompt_tokens=usage1.prompt_tokens + usage2.prompt_tokens,
403
+ completion_tokens=usage1.completion_tokens + usage2.completion_tokens,
404
+ total_tokens=usage1.total_tokens + usage2.total_tokens,
405
+ )
406
+
407
+ def _combine_cached_embedding_response_with_api_result(
408
+ self,
409
+ _caching_handler_response: CachingHandlerResponse,
410
+ embedding_response: EmbeddingResponse,
411
+ start_time: datetime.datetime,
412
+ end_time: datetime.datetime,
413
+ ) -> EmbeddingResponse:
414
+ """
415
+ Combines the cached embedding response with the API EmbeddingResponse
416
+
417
+ For caching there can be a cache hit for some of the inputs in the list and a cache miss for others
418
+ This function combines the cached embedding response with the API EmbeddingResponse
419
+
420
+ Args:
421
+ caching_handler_response: CachingHandlerResponse:
422
+ embedding_response: EmbeddingResponse:
423
+
424
+ Returns:
425
+ EmbeddingResponse:
426
+ """
427
+ if _caching_handler_response.final_embedding_cached_response is None:
428
+ return embedding_response
429
+
430
+ idx = 0
431
+ final_data_list = []
432
+ for item in _caching_handler_response.final_embedding_cached_response.data:
433
+ if item is None and embedding_response.data is not None:
434
+ final_data_list.append(embedding_response.data[idx])
435
+ idx += 1
436
+ else:
437
+ final_data_list.append(item)
438
+
439
+ _caching_handler_response.final_embedding_cached_response.data = final_data_list
440
+ _caching_handler_response.final_embedding_cached_response._hidden_params[
441
+ "cache_hit"
442
+ ] = True
443
+ _caching_handler_response.final_embedding_cached_response._response_ms = (
444
+ end_time - start_time
445
+ ).total_seconds() * 1000
446
+
447
+ ## USAGE
448
+ if (
449
+ _caching_handler_response.final_embedding_cached_response.usage is not None
450
+ and embedding_response.usage is not None
451
+ ):
452
+ _caching_handler_response.final_embedding_cached_response.usage = self.combine_usage(
453
+ usage1=_caching_handler_response.final_embedding_cached_response.usage,
454
+ usage2=embedding_response.usage,
455
+ )
456
+
457
+ return _caching_handler_response.final_embedding_cached_response
458
+
459
+ def _async_log_cache_hit_on_callbacks(
460
+ self,
461
+ logging_obj: LiteLLMLoggingObj,
462
+ cached_result: Any,
463
+ start_time: datetime.datetime,
464
+ end_time: datetime.datetime,
465
+ cache_hit: bool,
466
+ ):
467
+ """
468
+ Helper function to log the success of a cached result on callbacks
469
+
470
+ Args:
471
+ logging_obj (LiteLLMLoggingObj): The logging object.
472
+ cached_result: The cached result.
473
+ start_time (datetime): The start time of the operation.
474
+ end_time (datetime): The end time of the operation.
475
+ cache_hit (bool): Whether it was a cache hit.
476
+ """
477
+ asyncio.create_task(
478
+ logging_obj.async_success_handler(
479
+ cached_result, start_time, end_time, cache_hit
480
+ )
481
+ )
482
+ threading.Thread(
483
+ target=logging_obj.success_handler,
484
+ args=(cached_result, start_time, end_time, cache_hit),
485
+ ).start()
486
+
487
+ async def _retrieve_from_cache(
488
+ self, call_type: str, kwargs: Dict[str, Any], args: Tuple[Any, ...]
489
+ ) -> Optional[Any]:
490
+ """
491
+ Internal method to
492
+ - get cache key
493
+ - check what type of cache is used - Redis, RedisSemantic, Qdrant, S3
494
+ - async get cache value
495
+ - return the cached value
496
+
497
+ Args:
498
+ call_type: str:
499
+ kwargs: Dict[str, Any]:
500
+ args: Optional[Tuple[Any, ...]] = None:
501
+
502
+ Returns:
503
+ Optional[Any]:
504
+ Raises:
505
+ None
506
+ """
507
+ if litellm.cache is None:
508
+ return None
509
+
510
+ new_kwargs = kwargs.copy()
511
+ new_kwargs.update(
512
+ convert_args_to_kwargs(
513
+ self.original_function,
514
+ args,
515
+ )
516
+ )
517
+ cached_result: Optional[Any] = None
518
+ if call_type == CallTypes.aembedding.value and isinstance(
519
+ new_kwargs["input"], list
520
+ ):
521
+ tasks = []
522
+ for idx, i in enumerate(new_kwargs["input"]):
523
+ preset_cache_key = litellm.cache.get_cache_key(
524
+ **{**new_kwargs, "input": i}
525
+ )
526
+ tasks.append(litellm.cache.async_get_cache(cache_key=preset_cache_key))
527
+ cached_result = await asyncio.gather(*tasks)
528
+ ## check if cached result is None ##
529
+ if cached_result is not None and isinstance(cached_result, list):
530
+ # set cached_result to None if all elements are None
531
+ if all(result is None for result in cached_result):
532
+ cached_result = None
533
+ else:
534
+ if litellm.cache._supports_async() is True:
535
+ cached_result = await litellm.cache.async_get_cache(**new_kwargs)
536
+ else: # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync]
537
+ cached_result = litellm.cache.get_cache(**new_kwargs)
538
+ return cached_result
539
+
540
+ def _convert_cached_result_to_model_response(
541
+ self,
542
+ cached_result: Any,
543
+ call_type: str,
544
+ kwargs: Dict[str, Any],
545
+ logging_obj: LiteLLMLoggingObj,
546
+ model: str,
547
+ args: Tuple[Any, ...],
548
+ custom_llm_provider: Optional[str] = None,
549
+ ) -> Optional[
550
+ Union[
551
+ ModelResponse,
552
+ TextCompletionResponse,
553
+ EmbeddingResponse,
554
+ RerankResponse,
555
+ TranscriptionResponse,
556
+ CustomStreamWrapper,
557
+ ]
558
+ ]:
559
+ """
560
+ Internal method to process the cached result
561
+
562
+ Checks the call type and converts the cached result to the appropriate model response object
563
+ example if call type is text_completion -> returns TextCompletionResponse object
564
+
565
+ Args:
566
+ cached_result: Any:
567
+ call_type: str:
568
+ kwargs: Dict[str, Any]:
569
+ logging_obj: LiteLLMLoggingObj:
570
+ model: str:
571
+ custom_llm_provider: Optional[str] = None:
572
+ args: Optional[Tuple[Any, ...]] = None:
573
+
574
+ Returns:
575
+ Optional[Any]:
576
+ """
577
+ from litellm.utils import convert_to_model_response_object
578
+
579
+ if (
580
+ call_type == CallTypes.acompletion.value
581
+ or call_type == CallTypes.completion.value
582
+ ) and isinstance(cached_result, dict):
583
+ if kwargs.get("stream", False) is True:
584
+ cached_result = self._convert_cached_stream_response(
585
+ cached_result=cached_result,
586
+ call_type=call_type,
587
+ logging_obj=logging_obj,
588
+ model=model,
589
+ )
590
+ else:
591
+ cached_result = convert_to_model_response_object(
592
+ response_object=cached_result,
593
+ model_response_object=ModelResponse(),
594
+ )
595
+ if (
596
+ call_type == CallTypes.atext_completion.value
597
+ or call_type == CallTypes.text_completion.value
598
+ ) and isinstance(cached_result, dict):
599
+ if kwargs.get("stream", False) is True:
600
+ cached_result = self._convert_cached_stream_response(
601
+ cached_result=cached_result,
602
+ call_type=call_type,
603
+ logging_obj=logging_obj,
604
+ model=model,
605
+ )
606
+ else:
607
+ cached_result = TextCompletionResponse(**cached_result)
608
+ elif (
609
+ call_type == CallTypes.aembedding.value
610
+ or call_type == CallTypes.embedding.value
611
+ ) and isinstance(cached_result, dict):
612
+ cached_result = convert_to_model_response_object(
613
+ response_object=cached_result,
614
+ model_response_object=EmbeddingResponse(),
615
+ response_type="embedding",
616
+ )
617
+
618
+ elif (
619
+ call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value
620
+ ) and isinstance(cached_result, dict):
621
+ cached_result = convert_to_model_response_object(
622
+ response_object=cached_result,
623
+ model_response_object=None,
624
+ response_type="rerank",
625
+ )
626
+ elif (
627
+ call_type == CallTypes.atranscription.value
628
+ or call_type == CallTypes.transcription.value
629
+ ) and isinstance(cached_result, dict):
630
+ hidden_params = {
631
+ "model": "whisper-1",
632
+ "custom_llm_provider": custom_llm_provider,
633
+ "cache_hit": True,
634
+ }
635
+ cached_result = convert_to_model_response_object(
636
+ response_object=cached_result,
637
+ model_response_object=TranscriptionResponse(),
638
+ response_type="audio_transcription",
639
+ hidden_params=hidden_params,
640
+ )
641
+
642
+ if (
643
+ hasattr(cached_result, "_hidden_params")
644
+ and cached_result._hidden_params is not None
645
+ and isinstance(cached_result._hidden_params, dict)
646
+ ):
647
+ cached_result._hidden_params["cache_hit"] = True
648
+ return cached_result
649
+
650
+ def _convert_cached_stream_response(
651
+ self,
652
+ cached_result: Any,
653
+ call_type: str,
654
+ logging_obj: LiteLLMLoggingObj,
655
+ model: str,
656
+ ) -> CustomStreamWrapper:
657
+ from litellm.utils import (
658
+ CustomStreamWrapper,
659
+ convert_to_streaming_response,
660
+ convert_to_streaming_response_async,
661
+ )
662
+
663
+ _stream_cached_result: Union[AsyncGenerator, Generator]
664
+ if (
665
+ call_type == CallTypes.acompletion.value
666
+ or call_type == CallTypes.atext_completion.value
667
+ ):
668
+ _stream_cached_result = convert_to_streaming_response_async(
669
+ response_object=cached_result,
670
+ )
671
+ else:
672
+ _stream_cached_result = convert_to_streaming_response(
673
+ response_object=cached_result,
674
+ )
675
+ return CustomStreamWrapper(
676
+ completion_stream=_stream_cached_result,
677
+ model=model,
678
+ custom_llm_provider="cached_response",
679
+ logging_obj=logging_obj,
680
+ )
681
+
682
+ async def async_set_cache(
683
+ self,
684
+ result: Any,
685
+ original_function: Callable,
686
+ kwargs: Dict[str, Any],
687
+ args: Optional[Tuple[Any, ...]] = None,
688
+ ):
689
+ """
690
+ Internal method to check the type of the result & cache used and adds the result to the cache accordingly
691
+
692
+ Args:
693
+ result: Any:
694
+ original_function: Callable:
695
+ kwargs: Dict[str, Any]:
696
+ args: Optional[Tuple[Any, ...]] = None:
697
+
698
+ Returns:
699
+ None
700
+ Raises:
701
+ None
702
+ """
703
+ if litellm.cache is None:
704
+ return
705
+
706
+ new_kwargs = kwargs.copy()
707
+ new_kwargs.update(
708
+ convert_args_to_kwargs(
709
+ original_function,
710
+ args,
711
+ )
712
+ )
713
+ # [OPTIONAL] ADD TO CACHE
714
+ if self._should_store_result_in_cache(
715
+ original_function=original_function, kwargs=new_kwargs
716
+ ):
717
+ if (
718
+ isinstance(result, litellm.ModelResponse)
719
+ or isinstance(result, litellm.EmbeddingResponse)
720
+ or isinstance(result, TranscriptionResponse)
721
+ or isinstance(result, RerankResponse)
722
+ ):
723
+ if (
724
+ isinstance(result, EmbeddingResponse)
725
+ and litellm.cache is not None
726
+ and not isinstance(
727
+ litellm.cache.cache, S3Cache
728
+ ) # s3 doesn't support bulk writing. Exclude.
729
+ ):
730
+ asyncio.create_task(
731
+ litellm.cache.async_add_cache_pipeline(result, **new_kwargs)
732
+ )
733
+ elif isinstance(litellm.cache.cache, S3Cache):
734
+ threading.Thread(
735
+ target=litellm.cache.add_cache,
736
+ args=(result,),
737
+ kwargs=new_kwargs,
738
+ ).start()
739
+ else:
740
+ asyncio.create_task(
741
+ litellm.cache.async_add_cache(
742
+ result.model_dump_json(), **new_kwargs
743
+ )
744
+ )
745
+ else:
746
+ asyncio.create_task(litellm.cache.async_add_cache(result, **new_kwargs))
747
+
748
+ def sync_set_cache(
749
+ self,
750
+ result: Any,
751
+ kwargs: Dict[str, Any],
752
+ args: Optional[Tuple[Any, ...]] = None,
753
+ ):
754
+ """
755
+ Sync internal method to add the result to the cache
756
+ """
757
+
758
+ new_kwargs = kwargs.copy()
759
+ new_kwargs.update(
760
+ convert_args_to_kwargs(
761
+ self.original_function,
762
+ args,
763
+ )
764
+ )
765
+ if litellm.cache is None:
766
+ return
767
+
768
+ if self._should_store_result_in_cache(
769
+ original_function=self.original_function, kwargs=new_kwargs
770
+ ):
771
+ litellm.cache.add_cache(result, **new_kwargs)
772
+
773
+ return
774
+
775
+ def _should_store_result_in_cache(
776
+ self, original_function: Callable, kwargs: Dict[str, Any]
777
+ ) -> bool:
778
+ """
779
+ Helper function to determine if the result should be stored in the cache.
780
+
781
+ Returns:
782
+ bool: True if the result should be stored in the cache, False otherwise.
783
+ """
784
+ return (
785
+ (litellm.cache is not None)
786
+ and litellm.cache.supported_call_types is not None
787
+ and (str(original_function.__name__) in litellm.cache.supported_call_types)
788
+ and (kwargs.get("cache", {}).get("no-store", False) is not True)
789
+ )
790
+
791
+ def _is_call_type_supported_by_cache(
792
+ self,
793
+ original_function: Callable,
794
+ ) -> bool:
795
+ """
796
+ Helper function to determine if the call type is supported by the cache.
797
+
798
+ call types are acompletion, aembedding, atext_completion, atranscription, arerank
799
+
800
+ Defined on `litellm.types.utils.CallTypes`
801
+
802
+ Returns:
803
+ bool: True if the call type is supported by the cache, False otherwise.
804
+ """
805
+ if (
806
+ litellm.cache is not None
807
+ and litellm.cache.supported_call_types is not None
808
+ and str(original_function.__name__) in litellm.cache.supported_call_types
809
+ ):
810
+ return True
811
+ return False
812
+
813
+ async def _add_streaming_response_to_cache(self, processed_chunk: ModelResponse):
814
+ """
815
+ Internal method to add the streaming response to the cache
816
+
817
+
818
+ - If 'streaming_chunk' has a 'finish_reason' then assemble a litellm.ModelResponse object
819
+ - Else append the chunk to self.async_streaming_chunks
820
+
821
+ """
822
+
823
+ complete_streaming_response: Optional[
824
+ Union[ModelResponse, TextCompletionResponse]
825
+ ] = _assemble_complete_response_from_streaming_chunks(
826
+ result=processed_chunk,
827
+ start_time=self.start_time,
828
+ end_time=datetime.datetime.now(),
829
+ request_kwargs=self.request_kwargs,
830
+ streaming_chunks=self.async_streaming_chunks,
831
+ is_async=True,
832
+ )
833
+ # if a complete_streaming_response is assembled, add it to the cache
834
+ if complete_streaming_response is not None:
835
+ await self.async_set_cache(
836
+ result=complete_streaming_response,
837
+ original_function=self.original_function,
838
+ kwargs=self.request_kwargs,
839
+ )
840
+
841
+ def _sync_add_streaming_response_to_cache(self, processed_chunk: ModelResponse):
842
+ """
843
+ Sync internal method to add the streaming response to the cache
844
+ """
845
+ complete_streaming_response: Optional[
846
+ Union[ModelResponse, TextCompletionResponse]
847
+ ] = _assemble_complete_response_from_streaming_chunks(
848
+ result=processed_chunk,
849
+ start_time=self.start_time,
850
+ end_time=datetime.datetime.now(),
851
+ request_kwargs=self.request_kwargs,
852
+ streaming_chunks=self.sync_streaming_chunks,
853
+ is_async=False,
854
+ )
855
+
856
+ # if a complete_streaming_response is assembled, add it to the cache
857
+ if complete_streaming_response is not None:
858
+ self.sync_set_cache(
859
+ result=complete_streaming_response,
860
+ kwargs=self.request_kwargs,
861
+ )
862
+
863
+ def _update_litellm_logging_obj_environment(
864
+ self,
865
+ logging_obj: LiteLLMLoggingObj,
866
+ model: str,
867
+ kwargs: Dict[str, Any],
868
+ cached_result: Any,
869
+ is_async: bool,
870
+ is_embedding: bool = False,
871
+ ):
872
+ """
873
+ Helper function to update the LiteLLMLoggingObj environment variables.
874
+
875
+ Args:
876
+ logging_obj (LiteLLMLoggingObj): The logging object to update.
877
+ model (str): The model being used.
878
+ kwargs (Dict[str, Any]): The keyword arguments from the original function call.
879
+ cached_result (Any): The cached result to log.
880
+ is_async (bool): Whether the call is asynchronous or not.
881
+ is_embedding (bool): Whether the call is for embeddings or not.
882
+
883
+ Returns:
884
+ None
885
+ """
886
+ litellm_params = {
887
+ "logger_fn": kwargs.get("logger_fn", None),
888
+ "acompletion": is_async,
889
+ "api_base": kwargs.get("api_base", ""),
890
+ "metadata": kwargs.get("metadata", {}),
891
+ "model_info": kwargs.get("model_info", {}),
892
+ "proxy_server_request": kwargs.get("proxy_server_request", None),
893
+ "stream_response": kwargs.get("stream_response", {}),
894
+ }
895
+
896
+ if litellm.cache is not None:
897
+ litellm_params[
898
+ "preset_cache_key"
899
+ ] = litellm.cache._get_preset_cache_key_from_kwargs(**kwargs)
900
+ else:
901
+ litellm_params["preset_cache_key"] = None
902
+
903
+ logging_obj.update_environment_variables(
904
+ model=model,
905
+ user=kwargs.get("user", None),
906
+ optional_params={},
907
+ litellm_params=litellm_params,
908
+ input=(
909
+ kwargs.get("messages", "")
910
+ if not is_embedding
911
+ else kwargs.get("input", "")
912
+ ),
913
+ api_key=kwargs.get("api_key", None),
914
+ original_response=str(cached_result),
915
+ additional_args=None,
916
+ stream=kwargs.get("stream", False),
917
+ )
918
+
919
+
920
+ def convert_args_to_kwargs(
921
+ original_function: Callable,
922
+ args: Optional[Tuple[Any, ...]] = None,
923
+ ) -> Dict[str, Any]:
924
+ # Get the signature of the original function
925
+ signature = inspect.signature(original_function)
926
+
927
+ # Get parameter names in the order they appear in the original function
928
+ param_names = list(signature.parameters.keys())
929
+
930
+ # Create a mapping of positional arguments to parameter names
931
+ args_to_kwargs = {}
932
+ if args:
933
+ for index, arg in enumerate(args):
934
+ if index < len(param_names):
935
+ param_name = param_names[index]
936
+ args_to_kwargs[param_name] = arg
937
+
938
+ return args_to_kwargs
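A small illustration of convert_args_to_kwargs, which backs the sync/async cache setters above by mapping positional arguments onto the wrapped function's parameter names (fake_completion is a hypothetical function used only for this sketch):

def fake_completion(model, messages, stream=False):  # hypothetical signature
    ...

mapped = convert_args_to_kwargs(
    fake_completion,
    ("gpt-4o-mini", [{"role": "user", "content": "hi"}]),
)
# mapped == {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hi"}]}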
litellm/caching/disk_cache.py ADDED
@@ -0,0 +1,88 @@
1
+ import json
2
+ from typing import TYPE_CHECKING, Any, Optional, Union
3
+
4
+ from .base_cache import BaseCache
5
+
6
+ if TYPE_CHECKING:
7
+ from opentelemetry.trace import Span as _Span
8
+
9
+ Span = Union[_Span, Any]
10
+ else:
11
+ Span = Any
12
+
13
+
14
+ class DiskCache(BaseCache):
15
+ def __init__(self, disk_cache_dir: Optional[str] = None):
16
+ import diskcache as dc
17
+
18
+ # if users don't provide one, use the default litellm cache directory
19
+ if disk_cache_dir is None:
20
+ self.disk_cache = dc.Cache(".litellm_cache")
21
+ else:
22
+ self.disk_cache = dc.Cache(disk_cache_dir)
23
+
24
+ def set_cache(self, key, value, **kwargs):
25
+ if "ttl" in kwargs:
26
+ self.disk_cache.set(key, value, expire=kwargs["ttl"])
27
+ else:
28
+ self.disk_cache.set(key, value)
29
+
30
+ async def async_set_cache(self, key, value, **kwargs):
31
+ self.set_cache(key=key, value=value, **kwargs)
32
+
33
+ async def async_set_cache_pipeline(self, cache_list, **kwargs):
34
+ for cache_key, cache_value in cache_list:
35
+ if "ttl" in kwargs:
36
+ self.set_cache(key=cache_key, value=cache_value, ttl=kwargs["ttl"])
37
+ else:
38
+ self.set_cache(key=cache_key, value=cache_value)
39
+
40
+ def get_cache(self, key, **kwargs):
41
+ original_cached_response = self.disk_cache.get(key)
42
+ if original_cached_response:
43
+ try:
44
+ cached_response = json.loads(original_cached_response) # type: ignore
45
+ except Exception:
46
+ cached_response = original_cached_response
47
+ return cached_response
48
+ return None
49
+
50
+ def batch_get_cache(self, keys: list, **kwargs):
51
+ return_val = []
52
+ for k in keys:
53
+ val = self.get_cache(key=k, **kwargs)
54
+ return_val.append(val)
55
+ return return_val
56
+
57
+ def increment_cache(self, key, value: int, **kwargs) -> int:
58
+ # get the value
59
+ init_value = self.get_cache(key=key) or 0
60
+ value = init_value + value # type: ignore
61
+ self.set_cache(key, value, **kwargs)
62
+ return value
63
+
64
+ async def async_get_cache(self, key, **kwargs):
65
+ return self.get_cache(key=key, **kwargs)
66
+
67
+ async def async_batch_get_cache(self, keys: list, **kwargs):
68
+ return_val = []
69
+ for k in keys:
70
+ val = self.get_cache(key=k, **kwargs)
71
+ return_val.append(val)
72
+ return return_val
73
+
74
+ async def async_increment(self, key, value: int, **kwargs) -> int:
75
+ # get the value
76
+ init_value = await self.async_get_cache(key=key) or 0
77
+ value = init_value + value # type: ignore
78
+ await self.async_set_cache(key, value, **kwargs)
79
+ return value
80
+
81
+ def flush_cache(self):
82
+ self.disk_cache.clear()
83
+
84
+ async def disconnect(self):
85
+ pass
86
+
87
+ def delete_cache(self, key):
88
+ self.disk_cache.pop(key)
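A minimal sketch of DiskCache used directly (requires the diskcache package to be installed; the cache directory path is illustrative):

from litellm.caching.disk_cache import DiskCache

cache = DiskCache(disk_cache_dir="/tmp/litellm_disk_cache")
cache.set_cache("greeting", {"text": "hello"}, ttl=60)  # entry expires after 60 seconds
print(cache.get_cache("greeting"))                      # -> {'text': 'hello'}
print(cache.increment_cache("hits", 1))                 # -> 1 (simple integer counter)
cache.flush_cache()                                     # clears everything on disk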
litellm/caching/dual_cache.py ADDED
@@ -0,0 +1,434 @@
1
+ """
2
+ Dual Cache implementation - Class to update both Redis and an in-memory cache simultaneously.
3
+
4
+ Has 4 primary methods:
5
+ - set_cache
6
+ - get_cache
7
+ - async_set_cache
8
+ - async_get_cache
9
+ """
10
+
11
+ import asyncio
12
+ import time
13
+ import traceback
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ from typing import TYPE_CHECKING, Any, List, Optional, Union
16
+
17
+ import litellm
18
+ from litellm._logging import print_verbose, verbose_logger
19
+
20
+ from .base_cache import BaseCache
21
+ from .in_memory_cache import InMemoryCache
22
+ from .redis_cache import RedisCache
23
+
24
+ if TYPE_CHECKING:
25
+ from opentelemetry.trace import Span as _Span
26
+
27
+ Span = Union[_Span, Any]
28
+ else:
29
+ Span = Any
30
+
31
+ from collections import OrderedDict
32
+
33
+
34
+ class LimitedSizeOrderedDict(OrderedDict):
35
+ def __init__(self, *args, max_size=100, **kwargs):
36
+ super().__init__(*args, **kwargs)
37
+ self.max_size = max_size
38
+
39
+ def __setitem__(self, key, value):
40
+ # If inserting a new key exceeds max size, remove the oldest item
41
+ if len(self) >= self.max_size:
42
+ self.popitem(last=False)
43
+ super().__setitem__(key, value)
44
+
45
+
46
+ class DualCache(BaseCache):
47
+ """
48
+ DualCache is a cache implementation that updates both Redis and an in-memory cache simultaneously.
49
+ When data is updated or inserted, it is written to both the in-memory cache + Redis.
50
+ This ensures that even if Redis hasn't been updated yet, the in-memory cache reflects the most recent data.
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ in_memory_cache: Optional[InMemoryCache] = None,
56
+ redis_cache: Optional[RedisCache] = None,
57
+ default_in_memory_ttl: Optional[float] = None,
58
+ default_redis_ttl: Optional[float] = None,
59
+ default_redis_batch_cache_expiry: Optional[float] = None,
60
+ default_max_redis_batch_cache_size: int = 100,
61
+ ) -> None:
62
+ super().__init__()
63
+ # If in_memory_cache is not provided, use the default InMemoryCache
64
+ self.in_memory_cache = in_memory_cache or InMemoryCache()
65
+ # If redis_cache is not provided, use the default RedisCache
66
+ self.redis_cache = redis_cache
67
+ self.last_redis_batch_access_time = LimitedSizeOrderedDict(
68
+ max_size=default_max_redis_batch_cache_size
69
+ )
70
+ self.redis_batch_cache_expiry = (
71
+ default_redis_batch_cache_expiry
72
+ or litellm.default_redis_batch_cache_expiry
73
+ or 10
74
+ )
75
+ self.default_in_memory_ttl = (
76
+ default_in_memory_ttl or litellm.default_in_memory_ttl
77
+ )
78
+ self.default_redis_ttl = default_redis_ttl or litellm.default_redis_ttl
79
+
80
+ def update_cache_ttl(
81
+ self, default_in_memory_ttl: Optional[float], default_redis_ttl: Optional[float]
82
+ ):
83
+ if default_in_memory_ttl is not None:
84
+ self.default_in_memory_ttl = default_in_memory_ttl
85
+
86
+ if default_redis_ttl is not None:
87
+ self.default_redis_ttl = default_redis_ttl
88
+
89
+ def set_cache(self, key, value, local_only: bool = False, **kwargs):
90
+ # Update both Redis and in-memory cache
91
+ try:
92
+ if self.in_memory_cache is not None:
93
+ if "ttl" not in kwargs and self.default_in_memory_ttl is not None:
94
+ kwargs["ttl"] = self.default_in_memory_ttl
95
+
96
+ self.in_memory_cache.set_cache(key, value, **kwargs)
97
+
98
+ if self.redis_cache is not None and local_only is False:
99
+ self.redis_cache.set_cache(key, value, **kwargs)
100
+ except Exception as e:
101
+ print_verbose(e)
102
+
103
+ def increment_cache(
104
+ self, key, value: int, local_only: bool = False, **kwargs
105
+ ) -> int:
106
+ """
107
+ Key - the key in cache
108
+
109
+ Value - int - the value you want to increment by
110
+
111
+ Returns - int - the incremented value
112
+ """
113
+ try:
114
+ result: int = value
115
+ if self.in_memory_cache is not None:
116
+ result = self.in_memory_cache.increment_cache(key, value, **kwargs)
117
+
118
+ if self.redis_cache is not None and local_only is False:
119
+ result = self.redis_cache.increment_cache(key, value, **kwargs)
120
+
121
+ return result
122
+ except Exception as e:
123
+ verbose_logger.error(f"LiteLLM Cache: Exception async add_cache: {str(e)}")
124
+ raise e
125
+
126
+ def get_cache(
127
+ self,
128
+ key,
129
+ parent_otel_span: Optional[Span] = None,
130
+ local_only: bool = False,
131
+ **kwargs,
132
+ ):
133
+ # Try to fetch from in-memory cache first
134
+ try:
135
+ result = None
136
+ if self.in_memory_cache is not None:
137
+ in_memory_result = self.in_memory_cache.get_cache(key, **kwargs)
138
+
139
+ if in_memory_result is not None:
140
+ result = in_memory_result
141
+
142
+ if result is None and self.redis_cache is not None and local_only is False:
143
+ # If not found in in-memory cache, try fetching from Redis
144
+ redis_result = self.redis_cache.get_cache(
145
+ key, parent_otel_span=parent_otel_span
146
+ )
147
+
148
+ if redis_result is not None:
149
+ # Update in-memory cache with the value from Redis
150
+ self.in_memory_cache.set_cache(key, redis_result, **kwargs)
151
+
152
+ result = redis_result
153
+
154
+ print_verbose(f"get cache: cache result: {result}")
155
+ return result
156
+ except Exception:
157
+ verbose_logger.error(traceback.format_exc())
158
+
159
+ def batch_get_cache(
160
+ self,
161
+ keys: list,
162
+ parent_otel_span: Optional[Span] = None,
163
+ local_only: bool = False,
164
+ **kwargs,
165
+ ):
166
+ received_args = locals()
167
+ received_args.pop("self")
168
+
169
+ def run_in_new_loop():
170
+ """Run the coroutine in a new event loop within this thread."""
171
+ new_loop = asyncio.new_event_loop()
172
+ try:
173
+ asyncio.set_event_loop(new_loop)
174
+ return new_loop.run_until_complete(
175
+ self.async_batch_get_cache(**received_args)
176
+ )
177
+ finally:
178
+ new_loop.close()
179
+ asyncio.set_event_loop(None)
180
+
181
+ try:
182
+ # First, try to get the current event loop
183
+ _ = asyncio.get_running_loop()
184
+ # If we're already in an event loop, run in a separate thread
185
+ # to avoid nested event loop issues
186
+ with ThreadPoolExecutor(max_workers=1) as executor:
187
+ future = executor.submit(run_in_new_loop)
188
+ return future.result()
189
+
190
+ except RuntimeError:
191
+ # No running event loop, we can safely run in this thread
192
+ return run_in_new_loop()
193
+
194
+ async def async_get_cache(
195
+ self,
196
+ key,
197
+ parent_otel_span: Optional[Span] = None,
198
+ local_only: bool = False,
199
+ **kwargs,
200
+ ):
201
+ # Try to fetch from in-memory cache first
202
+ try:
203
+ print_verbose(
204
+ f"async get cache: cache key: {key}; local_only: {local_only}"
205
+ )
206
+ result = None
207
+ if self.in_memory_cache is not None:
208
+ in_memory_result = await self.in_memory_cache.async_get_cache(
209
+ key, **kwargs
210
+ )
211
+
212
+ print_verbose(f"in_memory_result: {in_memory_result}")
213
+ if in_memory_result is not None:
214
+ result = in_memory_result
215
+
216
+ if result is None and self.redis_cache is not None and local_only is False:
217
+ # If not found in in-memory cache, try fetching from Redis
218
+ redis_result = await self.redis_cache.async_get_cache(
219
+ key, parent_otel_span=parent_otel_span
220
+ )
221
+
222
+ if redis_result is not None:
223
+ # Update in-memory cache with the value from Redis
224
+ await self.in_memory_cache.async_set_cache(
225
+ key, redis_result, **kwargs
226
+ )
227
+
228
+ result = redis_result
229
+
230
+ print_verbose(f"get cache: cache result: {result}")
231
+ return result
232
+ except Exception:
233
+ verbose_logger.error(traceback.format_exc())
234
+
235
+ def get_redis_batch_keys(
236
+ self,
237
+ current_time: float,
238
+ keys: List[str],
239
+ result: List[Any],
240
+ ) -> List[str]:
241
+ sublist_keys = []
242
+ for key, value in zip(keys, result):
243
+ if value is None:
244
+ if (
245
+ key not in self.last_redis_batch_access_time
246
+ or current_time - self.last_redis_batch_access_time[key]
247
+ >= self.redis_batch_cache_expiry
248
+ ):
249
+ sublist_keys.append(key)
250
+ return sublist_keys
251
+
252
+ async def async_batch_get_cache(
253
+ self,
254
+ keys: list,
255
+ parent_otel_span: Optional[Span] = None,
256
+ local_only: bool = False,
257
+ **kwargs,
258
+ ):
259
+ try:
260
+ result = [None for _ in range(len(keys))]
261
+ if self.in_memory_cache is not None:
262
+ in_memory_result = await self.in_memory_cache.async_batch_get_cache(
263
+ keys, **kwargs
264
+ )
265
+
266
+ if in_memory_result is not None:
267
+ result = in_memory_result
268
+
269
+ if None in result and self.redis_cache is not None and local_only is False:
270
+ """
271
+ - for the none values in the result
272
+ - check the redis cache
273
+ """
274
+ current_time = time.time()
275
+ sublist_keys = self.get_redis_batch_keys(current_time, keys, result)
276
+
277
+ # Only hit Redis for keys whose last batch access was more than redis_batch_cache_expiry seconds ago
278
+ if len(sublist_keys) > 0:
279
+ # If not found in in-memory cache, try fetching from Redis
280
+ redis_result = await self.redis_cache.async_batch_get_cache(
281
+ sublist_keys, parent_otel_span=parent_otel_span
282
+ )
283
+
284
+ if redis_result is not None:
285
+ # Update in-memory cache with the value from Redis
286
+ for key, value in redis_result.items():
287
+ if value is not None:
288
+ await self.in_memory_cache.async_set_cache(
289
+ key, redis_result[key], **kwargs
290
+ )
291
+ # Update the last access time for each key fetched from Redis
292
+ self.last_redis_batch_access_time[key] = current_time
293
+
294
+ for key, value in redis_result.items():
295
+ index = keys.index(key)
296
+ result[index] = value
297
+
298
+ return result
299
+ except Exception:
300
+ verbose_logger.error(traceback.format_exc())
301
+
302
+ async def async_set_cache(self, key, value, local_only: bool = False, **kwargs):
303
+ print_verbose(
304
+ f"async set cache: cache key: {key}; local_only: {local_only}; value: {value}"
305
+ )
306
+ try:
307
+ if self.in_memory_cache is not None:
308
+ await self.in_memory_cache.async_set_cache(key, value, **kwargs)
309
+
310
+ if self.redis_cache is not None and local_only is False:
311
+ await self.redis_cache.async_set_cache(key, value, **kwargs)
312
+ except Exception as e:
313
+ verbose_logger.exception(
314
+ f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
315
+ )
316
+
317
+ # async_batch_set_cache
318
+ async def async_set_cache_pipeline(
319
+ self, cache_list: list, local_only: bool = False, **kwargs
320
+ ):
321
+ """
322
+ Batch write values to the cache
323
+ """
324
+ print_verbose(
325
+ f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
326
+ )
327
+ try:
328
+ if self.in_memory_cache is not None:
329
+ await self.in_memory_cache.async_set_cache_pipeline(
330
+ cache_list=cache_list, **kwargs
331
+ )
332
+
333
+ if self.redis_cache is not None and local_only is False:
334
+ await self.redis_cache.async_set_cache_pipeline(
335
+ cache_list=cache_list, ttl=kwargs.pop("ttl", None), **kwargs
336
+ )
337
+ except Exception as e:
338
+ verbose_logger.exception(
339
+ f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
340
+ )
341
+
342
+ async def async_increment_cache(
343
+ self,
344
+ key,
345
+ value: float,
346
+ parent_otel_span: Optional[Span] = None,
347
+ local_only: bool = False,
348
+ **kwargs,
349
+ ) -> float:
350
+ """
351
+ Key - the key in cache
352
+
353
+ Value - float - the value you want to increment by
354
+
355
+ Returns - float - the incremented value
356
+ """
357
+ try:
358
+ result: float = value
359
+ if self.in_memory_cache is not None:
360
+ result = await self.in_memory_cache.async_increment(
361
+ key, value, **kwargs
362
+ )
363
+
364
+ if self.redis_cache is not None and local_only is False:
365
+ result = await self.redis_cache.async_increment(
366
+ key,
367
+ value,
368
+ parent_otel_span=parent_otel_span,
369
+ ttl=kwargs.get("ttl", None),
370
+ )
371
+
372
+ return result
373
+ except Exception as e:
374
+ raise e # don't log if exception is raised
375
+
376
+ async def async_set_cache_sadd(
377
+ self, key, value: List, local_only: bool = False, **kwargs
378
+ ) -> None:
379
+ """
380
+ Add value to a set
381
+
382
+ Key - the key in cache
383
+
384
+ Value - List - the values you want to add to the set
385
+
386
+ Returns - None
387
+ """
388
+ try:
389
+ if self.in_memory_cache is not None:
390
+ _ = await self.in_memory_cache.async_set_cache_sadd(
391
+ key, value, ttl=kwargs.get("ttl", None)
392
+ )
393
+
394
+ if self.redis_cache is not None and local_only is False:
395
+ _ = await self.redis_cache.async_set_cache_sadd(
396
+ key, value, ttl=kwargs.get("ttl", None)
397
+ )
398
+
399
+ return None
400
+ except Exception as e:
401
+ raise e # don't log, if exception is raised
402
+
403
+ def flush_cache(self):
404
+ if self.in_memory_cache is not None:
405
+ self.in_memory_cache.flush_cache()
406
+ if self.redis_cache is not None:
407
+ self.redis_cache.flush_cache()
408
+
409
+ def delete_cache(self, key):
410
+ """
411
+ Delete a key from the cache
412
+ """
413
+ if self.in_memory_cache is not None:
414
+ self.in_memory_cache.delete_cache(key)
415
+ if self.redis_cache is not None:
416
+ self.redis_cache.delete_cache(key)
417
+
418
+ async def async_delete_cache(self, key: str):
419
+ """
420
+ Delete a key from the cache
421
+ """
422
+ if self.in_memory_cache is not None:
423
+ self.in_memory_cache.delete_cache(key)
424
+ if self.redis_cache is not None:
425
+ await self.redis_cache.async_delete_cache(key)
426
+
427
+ async def async_get_ttl(self, key: str) -> Optional[int]:
428
+ """
429
+ Get the remaining TTL of a key in in-memory cache or redis
430
+ """
431
+ ttl = await self.in_memory_cache.async_get_ttl(key)
432
+ if ttl is None and self.redis_cache is not None:
433
+ ttl = await self.redis_cache.async_get_ttl(key)
434
+ return ttl
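A minimal usage sketch of the dual (in-memory + Redis) cache above. This is a hedged example: the `DualCache` name and the `in_memory_cache=` keyword are assumptions based on the attribute names used in this file, and no Redis layer is configured here, so batch misses simply stay `None` in the returned list.

    import asyncio

    from litellm.caching.dual_cache import DualCache
    from litellm.caching.in_memory_cache import InMemoryCache

    async def main():
        # local-only dual cache: without a redis_cache, only the in-memory layer is consulted
        cache = DualCache(in_memory_cache=InMemoryCache())
        await cache.async_set_cache("user:42", {"plan": "pro"}, ttl=30)
        hits = await cache.async_batch_get_cache(["user:42", "user:99"])
        print(hits)  # [{'plan': 'pro'}, None] - the miss for "user:99" is left as None

    asyncio.run(main())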
litellm/caching/in_memory_cache.py ADDED
@@ -0,0 +1,203 @@
1
+ """
2
+ In-Memory Cache implementation
3
+
4
+ Has 4 methods:
5
+ - set_cache
6
+ - get_cache
7
+ - async_set_cache
8
+ - async_get_cache
9
+ """
10
+
11
+ import json
12
+ import sys
13
+ import time
14
+ from typing import Any, List, Optional
15
+
16
+ from pydantic import BaseModel
17
+
18
+ from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
19
+
20
+ from .base_cache import BaseCache
21
+
22
+
23
+ class InMemoryCache(BaseCache):
24
+ def __init__(
25
+ self,
26
+ max_size_in_memory: Optional[int] = 200,
27
+ default_ttl: Optional[
28
+ int
29
+ ] = 600, # default ttl is 10 minutes. At maximum litellm rate limiting logic requires objects to be in memory for 1 minute
30
+ max_size_per_item: Optional[int] = 1024, # 1MB = 1024KB
31
+ ):
32
+ """
33
+ max_size_in_memory [int]: Maximum number of items in cache, bounded to prevent memory leaks. Defaults to 200 items.
34
+ """
35
+ self.max_size_in_memory = (
36
+ max_size_in_memory or 200
37
+ ) # set an upper bound of 200 items in-memory
38
+ self.default_ttl = default_ttl or 600
39
+ self.max_size_per_item = (
40
+ max_size_per_item or MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
41
+ ) # 1MB = 1024KB
42
+
43
+ # in-memory cache
44
+ self.cache_dict: dict = {}
45
+ self.ttl_dict: dict = {}
46
+
47
+ def check_value_size(self, value: Any):
48
+ """
49
+ Check if value size exceeds max_size_per_item (1MB)
50
+ Returns True if value size is acceptable, False otherwise
51
+ """
52
+ try:
53
+ # Fast path for common primitive types that are typically small
54
+ if (
55
+ isinstance(value, (bool, int, float, str))
56
+ and len(str(value))
57
+ < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
58
+ ): # Conservative estimate
59
+ return True
60
+
61
+ # Direct size check for bytes objects
62
+ if isinstance(value, bytes):
63
+ return sys.getsizeof(value) / 1024 <= self.max_size_per_item
64
+
65
+ # Handle special types without full conversion when possible
66
+ if hasattr(value, "__sizeof__"): # Use __sizeof__ if available
67
+ size = value.__sizeof__() / 1024
68
+ return size <= self.max_size_per_item
69
+
70
+ # Fallback for complex types
71
+ if isinstance(value, BaseModel) and hasattr(
72
+ value, "model_dump"
73
+ ): # Pydantic v2
74
+ value = value.model_dump()
75
+ elif hasattr(value, "isoformat"): # datetime objects
76
+ return True # datetime strings are always small
77
+
78
+ # Only convert to JSON if absolutely necessary
79
+ if not isinstance(value, (str, bytes)):
80
+ value = json.dumps(value, default=str)
81
+
82
+ return sys.getsizeof(value) / 1024 <= self.max_size_per_item
83
+
84
+ except Exception:
85
+ return False
86
+
87
+ def evict_cache(self):
88
+ """
89
+ Eviction policy:
90
+ - check if any items in ttl_dict are expired -> remove them from ttl_dict and cache_dict
91
+
92
+
93
+ This guarantees the following:
94
+ - 1. When item ttl is not set: at minimum each item will remain in memory for 5 minutes
95
+ - 2. When ttl is set: the item will remain in memory for at least that amount of time
96
+ - 3. the size of in-memory cache is bounded
97
+
98
+ """
99
+ for key in list(self.ttl_dict.keys()):
100
+ if time.time() > self.ttl_dict[key]:
101
+ self.cache_dict.pop(key, None)
102
+ self.ttl_dict.pop(key, None)
103
+
104
+ # de-reference the removed item
105
+ # https://www.geeksforgeeks.org/diagnosing-and-fixing-memory-leaks-in-python/
106
+ # One of the most common causes of memory leaks in Python is the retention of objects that are no longer being used.
107
+ # This can occur when an object is referenced by another object, but the reference is never removed.
108
+
109
+ def set_cache(self, key, value, **kwargs):
110
+ if len(self.cache_dict) >= self.max_size_in_memory:
111
+ # only evict when cache is full
112
+ self.evict_cache()
113
+ if not self.check_value_size(value):
114
+ return
115
+
116
+ self.cache_dict[key] = value
117
+ if "ttl" in kwargs and kwargs["ttl"] is not None:
118
+ self.ttl_dict[key] = time.time() + kwargs["ttl"]
119
+ else:
120
+ self.ttl_dict[key] = time.time() + self.default_ttl
121
+
122
+ async def async_set_cache(self, key, value, **kwargs):
123
+ self.set_cache(key=key, value=value, **kwargs)
124
+
125
+ async def async_set_cache_pipeline(self, cache_list, ttl=None, **kwargs):
126
+ for cache_key, cache_value in cache_list:
127
+ if ttl is not None:
128
+ self.set_cache(key=cache_key, value=cache_value, ttl=ttl)
129
+ else:
130
+ self.set_cache(key=cache_key, value=cache_value)
131
+
132
+ async def async_set_cache_sadd(self, key, value: List, ttl: Optional[float]):
133
+ """
134
+ Add value to set
135
+ """
136
+ # get the value
137
+ init_value = self.get_cache(key=key) or set()
138
+ for val in value:
139
+ init_value.add(val)
140
+ self.set_cache(key, init_value, ttl=ttl)
141
+ return value
142
+
143
+ def get_cache(self, key, **kwargs):
144
+ if key in self.cache_dict:
145
+ if key in self.ttl_dict:
146
+ if time.time() > self.ttl_dict[key]:
147
+ self.cache_dict.pop(key, None)
148
+ return None
149
+ original_cached_response = self.cache_dict[key]
150
+ try:
151
+ cached_response = json.loads(original_cached_response)
152
+ except Exception:
153
+ cached_response = original_cached_response
154
+ return cached_response
155
+ return None
156
+
157
+ def batch_get_cache(self, keys: list, **kwargs):
158
+ return_val = []
159
+ for k in keys:
160
+ val = self.get_cache(key=k, **kwargs)
161
+ return_val.append(val)
162
+ return return_val
163
+
164
+ def increment_cache(self, key, value: int, **kwargs) -> int:
165
+ # get the value
166
+ init_value = self.get_cache(key=key) or 0
167
+ value = init_value + value
168
+ self.set_cache(key, value, **kwargs)
169
+ return value
170
+
171
+ async def async_get_cache(self, key, **kwargs):
172
+ return self.get_cache(key=key, **kwargs)
173
+
174
+ async def async_batch_get_cache(self, keys: list, **kwargs):
175
+ return_val = []
176
+ for k in keys:
177
+ val = self.get_cache(key=k, **kwargs)
178
+ return_val.append(val)
179
+ return return_val
180
+
181
+ async def async_increment(self, key, value: float, **kwargs) -> float:
182
+ # get the value
183
+ init_value = await self.async_get_cache(key=key) or 0
184
+ value = init_value + value
185
+ await self.async_set_cache(key, value, **kwargs)
186
+ return value
187
+
188
+ def flush_cache(self):
189
+ self.cache_dict.clear()
190
+ self.ttl_dict.clear()
191
+
192
+ async def disconnect(self):
193
+ pass
194
+
195
+ def delete_cache(self, key):
196
+ self.cache_dict.pop(key, None)
197
+ self.ttl_dict.pop(key, None)
198
+
199
+ async def async_get_ttl(self, key: str) -> Optional[int]:
200
+ """
201
+ Get the remaining TTL of a key in in-memory cache
202
+ """
203
+ return self.ttl_dict.get(key, None)
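A short sketch exercising the `InMemoryCache` defined above. TTLs are enforced lazily on read, and bulk eviction only runs once the cache holds `max_size_in_memory` items.

    import time

    from litellm.caching.in_memory_cache import InMemoryCache

    cache = InMemoryCache(max_size_in_memory=200, default_ttl=600)
    cache.set_cache("a", {"hits": 1}, ttl=1)  # explicit 1-second TTL
    cache.set_cache("b", "value-b")           # falls back to default_ttl (600s)

    print(cache.get_cache("a"))               # {'hits': 1}
    time.sleep(1.1)
    print(cache.get_cache("a"))               # None - the expired entry is dropped on read
    print(cache.batch_get_cache(["a", "b"]))  # [None, 'value-b']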
litellm/caching/llm_caching_handler.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ Add the event loop to the cache key, to prevent event loop closed errors.
3
+ """
4
+
5
+ import asyncio
6
+
7
+ from .in_memory_cache import InMemoryCache
8
+
9
+
10
+ class LLMClientCache(InMemoryCache):
11
+ def update_cache_key_with_event_loop(self, key):
12
+ """
13
+ Add the event loop to the cache key, to prevent event loop closed errors.
14
+ If none, use the key as is.
15
+ """
16
+ try:
17
+ event_loop = asyncio.get_event_loop()
18
+ stringified_event_loop = str(id(event_loop))
19
+ return f"{key}-{stringified_event_loop}"
20
+ except Exception: # handle no current event loop
21
+ return key
22
+
23
+ def set_cache(self, key, value, **kwargs):
24
+ key = self.update_cache_key_with_event_loop(key)
25
+ return super().set_cache(key, value, **kwargs)
26
+
27
+ async def async_set_cache(self, key, value, **kwargs):
28
+ key = self.update_cache_key_with_event_loop(key)
29
+ return await super().async_set_cache(key, value, **kwargs)
30
+
31
+ def get_cache(self, key, **kwargs):
32
+ key = self.update_cache_key_with_event_loop(key)
33
+
34
+ return super().get_cache(key, **kwargs)
35
+
36
+ async def async_get_cache(self, key, **kwargs):
37
+ key = self.update_cache_key_with_event_loop(key)
38
+
39
+ return await super().async_get_cache(key, **kwargs)
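A sketch of the event-loop scoping implemented above: every key is suffixed with the id of the current event loop, so a client object cached on one loop is never handed back on a different loop. The key name used here is illustrative.

    import asyncio

    from litellm.caching.llm_caching_handler import LLMClientCache

    cache = LLMClientCache()

    async def round_trip():
        # set + get happen on the same loop, so the suffixed keys match
        await cache.async_set_cache("openai-client", "client-object")
        return await cache.async_get_cache("openai-client")

    async def loop_scoped_key():
        # shows the physical key actually used while a loop is running
        return cache.update_cache_key_with_event_loop("openai-client")

    print(asyncio.run(round_trip()))       # "client-object"
    print(asyncio.run(loop_scoped_key()))  # e.g. "openai-client-140201..." (suffix = id of that run's loop)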
litellm/caching/qdrant_semantic_cache.py ADDED
@@ -0,0 +1,442 @@
1
+ """
2
+ Qdrant Semantic Cache implementation
3
+
4
+ Has 4 methods:
5
+ - set_cache
6
+ - get_cache
7
+ - async_set_cache
8
+ - async_get_cache
9
+ """
10
+
11
+ import ast
12
+ import asyncio
13
+ import json
14
+ from typing import Any, cast
15
+
16
+ import litellm
17
+ from litellm._logging import print_verbose
18
+ from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
19
+ from litellm.types.utils import EmbeddingResponse
20
+
21
+ from .base_cache import BaseCache
22
+
23
+
24
+ class QdrantSemanticCache(BaseCache):
25
+ def __init__( # noqa: PLR0915
26
+ self,
27
+ qdrant_api_base=None,
28
+ qdrant_api_key=None,
29
+ collection_name=None,
30
+ similarity_threshold=None,
31
+ quantization_config=None,
32
+ embedding_model="text-embedding-ada-002",
33
+ host_type=None,
34
+ ):
35
+ import os
36
+
37
+ from litellm.llms.custom_httpx.http_handler import (
38
+ _get_httpx_client,
39
+ get_async_httpx_client,
40
+ httpxSpecialProvider,
41
+ )
42
+ from litellm.secret_managers.main import get_secret_str
43
+
44
+ if collection_name is None:
45
+ raise Exception("collection_name must be provided, passed None")
46
+
47
+ self.collection_name = collection_name
48
+ print_verbose(
49
+ f"qdrant semantic-cache initializing COLLECTION - {self.collection_name}"
50
+ )
51
+
52
+ if similarity_threshold is None:
53
+ raise Exception("similarity_threshold must be provided, passed None")
54
+ self.similarity_threshold = similarity_threshold
55
+ self.embedding_model = embedding_model
56
+ headers = {}
57
+
58
+ # check if defined as os.environ/ variable
59
+ if qdrant_api_base:
60
+ if isinstance(qdrant_api_base, str) and qdrant_api_base.startswith(
61
+ "os.environ/"
62
+ ):
63
+ qdrant_api_base = get_secret_str(qdrant_api_base)
64
+ if qdrant_api_key:
65
+ if isinstance(qdrant_api_key, str) and qdrant_api_key.startswith(
66
+ "os.environ/"
67
+ ):
68
+ qdrant_api_key = get_secret_str(qdrant_api_key)
69
+
70
+ qdrant_api_base = (
71
+ qdrant_api_base or os.getenv("QDRANT_URL") or os.getenv("QDRANT_API_BASE")
72
+ )
73
+ qdrant_api_key = qdrant_api_key or os.getenv("QDRANT_API_KEY")
74
+ headers = {"Content-Type": "application/json"}
75
+ if qdrant_api_key:
76
+ headers["api-key"] = qdrant_api_key
77
+
78
+ if qdrant_api_base is None:
79
+ raise ValueError("Qdrant url must be provided")
80
+
81
+ self.qdrant_api_base = qdrant_api_base
82
+ self.qdrant_api_key = qdrant_api_key
83
+ print_verbose(f"qdrant semantic-cache qdrant_api_base: {self.qdrant_api_base}")
84
+
85
+ self.headers = headers
86
+
87
+ self.sync_client = _get_httpx_client()
88
+ self.async_client = get_async_httpx_client(
89
+ llm_provider=httpxSpecialProvider.Caching
90
+ )
91
+
92
+ if quantization_config is None:
93
+ print_verbose(
94
+ "Quantization config is not provided. Default binary quantization will be used."
95
+ )
96
+ collection_exists = self.sync_client.get(
97
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}/exists",
98
+ headers=self.headers,
99
+ )
100
+ if collection_exists.status_code != 200:
101
+ raise ValueError(
102
+ f"Error from qdrant checking if /collections exist {collection_exists.text}"
103
+ )
104
+
105
+ if collection_exists.json()["result"]["exists"]:
106
+ collection_details = self.sync_client.get(
107
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
108
+ headers=self.headers,
109
+ )
110
+ self.collection_info = collection_details.json()
111
+ print_verbose(
112
+ f"Collection already exists.\nCollection details:{self.collection_info}"
113
+ )
114
+ else:
115
+ if quantization_config is None or quantization_config == "binary":
116
+ quantization_params = {
117
+ "binary": {
118
+ "always_ram": False,
119
+ }
120
+ }
121
+ elif quantization_config == "scalar":
122
+ quantization_params = {
123
+ "scalar": {
124
+ "type": "int8",
125
+ "quantile": QDRANT_SCALAR_QUANTILE,
126
+ "always_ram": False,
127
+ }
128
+ }
129
+ elif quantization_config == "product":
130
+ quantization_params = {
131
+ "product": {"compression": "x16", "always_ram": False}
132
+ }
133
+ else:
134
+ raise Exception(
135
+ "Quantization config must be one of 'scalar', 'binary' or 'product'"
136
+ )
137
+
138
+ new_collection_status = self.sync_client.put(
139
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
140
+ json={
141
+ "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
142
+ "quantization_config": quantization_params,
143
+ },
144
+ headers=self.headers,
145
+ )
146
+ if new_collection_status.json()["result"]:
147
+ collection_details = self.sync_client.get(
148
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
149
+ headers=self.headers,
150
+ )
151
+ self.collection_info = collection_details.json()
152
+ print_verbose(
153
+ f"New collection created.\nCollection details:{self.collection_info}"
154
+ )
155
+ else:
156
+ raise Exception("Error while creating new collection")
157
+
158
+ def _get_cache_logic(self, cached_response: Any):
159
+ if cached_response is None:
160
+ return cached_response
161
+ try:
162
+ cached_response = json.loads(
163
+ cached_response
164
+ ) # Convert string to dictionary
165
+ except Exception:
166
+ cached_response = ast.literal_eval(cached_response)
167
+ return cached_response
168
+
169
+ def set_cache(self, key, value, **kwargs):
170
+ print_verbose(f"qdrant semantic-cache set_cache, kwargs: {kwargs}")
171
+ import uuid
172
+
173
+ # get the prompt
174
+ messages = kwargs["messages"]
175
+ prompt = ""
176
+ for message in messages:
177
+ prompt += message["content"]
178
+
179
+ # create an embedding for prompt
180
+ embedding_response = cast(
181
+ EmbeddingResponse,
182
+ litellm.embedding(
183
+ model=self.embedding_model,
184
+ input=prompt,
185
+ cache={"no-store": True, "no-cache": True},
186
+ ),
187
+ )
188
+
189
+ # get the embedding
190
+ embedding = embedding_response["data"][0]["embedding"]
191
+
192
+ value = str(value)
193
+ assert isinstance(value, str)
194
+
195
+ data = {
196
+ "points": [
197
+ {
198
+ "id": str(uuid.uuid4()),
199
+ "vector": embedding,
200
+ "payload": {
201
+ "text": prompt,
202
+ "response": value,
203
+ },
204
+ },
205
+ ]
206
+ }
207
+ self.sync_client.put(
208
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}/points",
209
+ headers=self.headers,
210
+ json=data,
211
+ )
212
+ return
213
+
214
+ def get_cache(self, key, **kwargs):
215
+ print_verbose(f"sync qdrant semantic-cache get_cache, kwargs: {kwargs}")
216
+
217
+ # get the messages
218
+ messages = kwargs["messages"]
219
+ prompt = ""
220
+ for message in messages:
221
+ prompt += message["content"]
222
+
223
+ # convert to embedding
224
+ embedding_response = cast(
225
+ EmbeddingResponse,
226
+ litellm.embedding(
227
+ model=self.embedding_model,
228
+ input=prompt,
229
+ cache={"no-store": True, "no-cache": True},
230
+ ),
231
+ )
232
+
233
+ # get the embedding
234
+ embedding = embedding_response["data"][0]["embedding"]
235
+
236
+ data = {
237
+ "vector": embedding,
238
+ "params": {
239
+ "quantization": {
240
+ "ignore": False,
241
+ "rescore": True,
242
+ "oversampling": 3.0,
243
+ }
244
+ },
245
+ "limit": 1,
246
+ "with_payload": True,
247
+ }
248
+
249
+ search_response = self.sync_client.post(
250
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}/points/search",
251
+ headers=self.headers,
252
+ json=data,
253
+ )
254
+ results = search_response.json()["result"]
255
+
256
+ if results is None:
257
+ return None
258
+ if isinstance(results, list):
259
+ if len(results) == 0:
260
+ return None
261
+
262
+ similarity = results[0]["score"]
263
+ cached_prompt = results[0]["payload"]["text"]
264
+
265
+ # check similarity, if more than self.similarity_threshold, return results
266
+ print_verbose(
267
+ f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
268
+ )
269
+ if similarity >= self.similarity_threshold:
270
+ # cache hit !
271
+ cached_value = results[0]["payload"]["response"]
272
+ print_verbose(
273
+ f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
274
+ )
275
+ return self._get_cache_logic(cached_response=cached_value)
276
+ else:
277
+ # cache miss !
278
+ return None
279
+ pass
280
+
281
+ async def async_set_cache(self, key, value, **kwargs):
282
+ import uuid
283
+
284
+ from litellm.proxy.proxy_server import llm_model_list, llm_router
285
+
286
+ print_verbose(f"async qdrant semantic-cache set_cache, kwargs: {kwargs}")
287
+
288
+ # get the prompt
289
+ messages = kwargs["messages"]
290
+ prompt = ""
291
+ for message in messages:
292
+ prompt += message["content"]
293
+ # create an embedding for prompt
294
+ router_model_names = (
295
+ [m["model_name"] for m in llm_model_list]
296
+ if llm_model_list is not None
297
+ else []
298
+ )
299
+ if llm_router is not None and self.embedding_model in router_model_names:
300
+ user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
301
+ embedding_response = await llm_router.aembedding(
302
+ model=self.embedding_model,
303
+ input=prompt,
304
+ cache={"no-store": True, "no-cache": True},
305
+ metadata={
306
+ "user_api_key": user_api_key,
307
+ "semantic-cache-embedding": True,
308
+ "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
309
+ },
310
+ )
311
+ else:
312
+ # convert to embedding
313
+ embedding_response = await litellm.aembedding(
314
+ model=self.embedding_model,
315
+ input=prompt,
316
+ cache={"no-store": True, "no-cache": True},
317
+ )
318
+
319
+ # get the embedding
320
+ embedding = embedding_response["data"][0]["embedding"]
321
+
322
+ value = str(value)
323
+ assert isinstance(value, str)
324
+
325
+ data = {
326
+ "points": [
327
+ {
328
+ "id": str(uuid.uuid4()),
329
+ "vector": embedding,
330
+ "payload": {
331
+ "text": prompt,
332
+ "response": value,
333
+ },
334
+ },
335
+ ]
336
+ }
337
+
338
+ await self.async_client.put(
339
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}/points",
340
+ headers=self.headers,
341
+ json=data,
342
+ )
343
+ return
344
+
345
+ async def async_get_cache(self, key, **kwargs):
346
+ print_verbose(f"async qdrant semantic-cache get_cache, kwargs: {kwargs}")
347
+ from litellm.proxy.proxy_server import llm_model_list, llm_router
348
+
349
+ # get the messages
350
+ messages = kwargs["messages"]
351
+ prompt = ""
352
+ for message in messages:
353
+ prompt += message["content"]
354
+
355
+ router_model_names = (
356
+ [m["model_name"] for m in llm_model_list]
357
+ if llm_model_list is not None
358
+ else []
359
+ )
360
+ if llm_router is not None and self.embedding_model in router_model_names:
361
+ user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
362
+ embedding_response = await llm_router.aembedding(
363
+ model=self.embedding_model,
364
+ input=prompt,
365
+ cache={"no-store": True, "no-cache": True},
366
+ metadata={
367
+ "user_api_key": user_api_key,
368
+ "semantic-cache-embedding": True,
369
+ "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
370
+ },
371
+ )
372
+ else:
373
+ # convert to embedding
374
+ embedding_response = await litellm.aembedding(
375
+ model=self.embedding_model,
376
+ input=prompt,
377
+ cache={"no-store": True, "no-cache": True},
378
+ )
379
+
380
+ # get the embedding
381
+ embedding = embedding_response["data"][0]["embedding"]
382
+
383
+ data = {
384
+ "vector": embedding,
385
+ "params": {
386
+ "quantization": {
387
+ "ignore": False,
388
+ "rescore": True,
389
+ "oversampling": 3.0,
390
+ }
391
+ },
392
+ "limit": 1,
393
+ "with_payload": True,
394
+ }
395
+
396
+ search_response = await self.async_client.post(
397
+ url=f"{self.qdrant_api_base}/collections/{self.collection_name}/points/search",
398
+ headers=self.headers,
399
+ json=data,
400
+ )
401
+
402
+ results = search_response.json()["result"]
403
+
404
+ if results is None:
405
+ kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
406
+ return None
407
+ if isinstance(results, list):
408
+ if len(results) == 0:
409
+ kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
410
+ return None
411
+
412
+ similarity = results[0]["score"]
413
+ cached_prompt = results[0]["payload"]["text"]
414
+
415
+ # check similarity, if more than self.similarity_threshold, return results
416
+ print_verbose(
417
+ f"semantic cache: similarity threshold: {self.similarity_threshold}, similarity: {similarity}, prompt: {prompt}, closest_cached_prompt: {cached_prompt}"
418
+ )
419
+
420
+ # update kwargs["metadata"] with similarity, don't rewrite the original metadata
421
+ kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
422
+
423
+ if similarity >= self.similarity_threshold:
424
+ # cache hit !
425
+ cached_value = results[0]["payload"]["response"]
426
+ print_verbose(
427
+ f"got a cache hit, similarity: {similarity}, Current prompt: {prompt}, cached_prompt: {cached_prompt}"
428
+ )
429
+ return self._get_cache_logic(cached_response=cached_value)
430
+ else:
431
+ # cache miss !
432
+ return None
433
+ pass
434
+
435
+ async def _collection_info(self):
436
+ return self.collection_info
437
+
438
+ async def async_set_cache_pipeline(self, cache_list, **kwargs):
439
+ tasks = []
440
+ for val in cache_list:
441
+ tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
442
+ await asyncio.gather(*tasks)
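The hit/miss decision above reduces to a cosine-similarity threshold applied to the top-1 result of Qdrant's `/points/search` endpoint. Below is a standalone sketch of that gate; the response shape mirrors the payload written by `set_cache` above, and the threshold value is only an example.

    SIMILARITY_THRESHOLD = 0.8  # example value, configured per cache instance

    def resolve_semantic_hit(search_result: dict, threshold: float = SIMILARITY_THRESHOLD):
        """Return the cached response on a hit, else None (a miss)."""
        points = search_result.get("result") or []
        if not points:
            return None
        top = points[0]
        if top["score"] >= threshold:
            return top["payload"]["response"]  # cache hit: reuse the stored completion
        return None                            # cache miss: similarity below the threshold

    # example top-1 shape returned by POST /collections/<name>/points/search
    example = {"result": [{"score": 0.93, "payload": {"text": "hi there", "response": "'hello!'"}}]}
    print(resolve_semantic_hit(example))  # "'hello!'"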
litellm/caching/redis_cache.py ADDED
@@ -0,0 +1,1162 @@
1
+ """
2
+ Redis Cache implementation
3
+
4
+ Has 4 primary methods:
5
+ - set_cache
6
+ - get_cache
7
+ - async_set_cache
8
+ - async_get_cache
9
+ """
10
+
11
+ import ast
12
+ import asyncio
13
+ import inspect
14
+ import json
15
+ import time
16
+ from datetime import timedelta
17
+ from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
18
+
19
+ import litellm
20
+ from litellm._logging import print_verbose, verbose_logger
21
+ from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
22
+ from litellm.types.caching import RedisPipelineIncrementOperation
23
+ from litellm.types.services import ServiceTypes
24
+
25
+ from .base_cache import BaseCache
26
+
27
+ if TYPE_CHECKING:
28
+ from opentelemetry.trace import Span as _Span
29
+ from redis.asyncio import Redis, RedisCluster
30
+ from redis.asyncio.client import Pipeline
31
+ from redis.asyncio.cluster import ClusterPipeline
32
+
33
+ pipeline = Pipeline
34
+ cluster_pipeline = ClusterPipeline
35
+ async_redis_client = Redis
36
+ async_redis_cluster_client = RedisCluster
37
+ Span = Union[_Span, Any]
38
+ else:
39
+ pipeline = Any
40
+ cluster_pipeline = Any
41
+ async_redis_client = Any
42
+ async_redis_cluster_client = Any
43
+ Span = Any
44
+
45
+
46
+ class RedisCache(BaseCache):
47
+ # if users don't provide one, use the default litellm cache
48
+
49
+ def __init__(
50
+ self,
51
+ host=None,
52
+ port=None,
53
+ password=None,
54
+ redis_flush_size: Optional[int] = 100,
55
+ namespace: Optional[str] = None,
56
+ startup_nodes: Optional[List] = None, # for redis-cluster
57
+ socket_timeout: Optional[float] = 5.0, # default 5 second timeout
58
+ **kwargs,
59
+ ):
60
+ from litellm._service_logger import ServiceLogging
61
+
62
+ from .._redis import get_redis_client, get_redis_connection_pool
63
+
64
+ redis_kwargs = {}
65
+ if host is not None:
66
+ redis_kwargs["host"] = host
67
+ if port is not None:
68
+ redis_kwargs["port"] = port
69
+ if password is not None:
70
+ redis_kwargs["password"] = password
71
+ if startup_nodes is not None:
72
+ redis_kwargs["startup_nodes"] = startup_nodes
73
+ if socket_timeout is not None:
74
+ redis_kwargs["socket_timeout"] = socket_timeout
75
+
76
+ ### HEALTH MONITORING OBJECT ###
77
+ if kwargs.get("service_logger_obj", None) is not None and isinstance(
78
+ kwargs["service_logger_obj"], ServiceLogging
79
+ ):
80
+ self.service_logger_obj = kwargs.pop("service_logger_obj")
81
+ else:
82
+ self.service_logger_obj = ServiceLogging()
83
+
84
+ redis_kwargs.update(kwargs)
85
+ self.redis_client = get_redis_client(**redis_kwargs)
86
+ self.redis_async_client: Optional[async_redis_client] = None
87
+ self.redis_kwargs = redis_kwargs
88
+ self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)
89
+
90
+ # redis namespaces
91
+ self.namespace = namespace
92
+ # for high traffic, we store the redis results in memory and then batch write to redis
93
+ self.redis_batch_writing_buffer: list = []
94
+ if redis_flush_size is None:
95
+ self.redis_flush_size: int = 100
96
+ else:
97
+ self.redis_flush_size = redis_flush_size
98
+ self.redis_version = "Unknown"
99
+ try:
100
+ if not inspect.iscoroutinefunction(self.redis_client):
101
+ self.redis_version = self.redis_client.info()["redis_version"] # type: ignore
102
+ except Exception:
103
+ pass
104
+
105
+ ### ASYNC HEALTH PING ###
106
+ try:
107
+ # asyncio.get_running_loop().create_task(self.ping())
108
+ _ = asyncio.get_running_loop().create_task(self.ping())
109
+ except Exception as e:
110
+ if "no running event loop" in str(e):
111
+ verbose_logger.debug(
112
+ "Ignoring async redis ping. No running event loop."
113
+ )
114
+ else:
115
+ verbose_logger.error(
116
+ "Error connecting to Async Redis client - {}".format(str(e)),
117
+ extra={"error": str(e)},
118
+ )
119
+
120
+ ### SYNC HEALTH PING ###
121
+ try:
122
+ if hasattr(self.redis_client, "ping"):
123
+ self.redis_client.ping() # type: ignore
124
+ except Exception as e:
125
+ verbose_logger.error(
126
+ "Error connecting to Sync Redis client", extra={"error": str(e)}
127
+ )
128
+
129
+ if litellm.default_redis_ttl is not None:
130
+ super().__init__(default_ttl=int(litellm.default_redis_ttl))
131
+ else:
132
+ super().__init__() # defaults to 60s
133
+
134
+ def init_async_client(
135
+ self,
136
+ ) -> Union[async_redis_client, async_redis_cluster_client]:
137
+ from .._redis import get_redis_async_client
138
+
139
+ if self.redis_async_client is None:
140
+ self.redis_async_client = get_redis_async_client(
141
+ connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
142
+ )
143
+ return self.redis_async_client
144
+
145
+ def check_and_fix_namespace(self, key: str) -> str:
146
+ """
147
+ Make sure each key starts with the given namespace
148
+ """
149
+ if self.namespace is not None and not key.startswith(self.namespace):
150
+ key = self.namespace + ":" + key
151
+
152
+ return key
153
+
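# --- illustrative annotation (not part of the committed file) ----------------
# check_and_fix_namespace above prefixes each key with the configured namespace
# exactly once. With namespace="litellm":
#     "user:42"          -> "litellm:user:42"
#     "litellm:user:42"  -> "litellm:user:42"   (already prefixed, left unchanged)
# ------------------------------------------------------------------------------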
154
+ def set_cache(self, key, value, **kwargs):
155
+ ttl = self.get_ttl(**kwargs)
156
+ print_verbose(
157
+ f"Set Redis Cache: key: {key}\nValue {value}\nttl={ttl}, redis_version={self.redis_version}"
158
+ )
159
+ key = self.check_and_fix_namespace(key=key)
160
+ try:
161
+ start_time = time.time()
162
+ self.redis_client.set(name=key, value=str(value), ex=ttl)
163
+ end_time = time.time()
164
+ _duration = end_time - start_time
165
+ self.service_logger_obj.service_success_hook(
166
+ service=ServiceTypes.REDIS,
167
+ duration=_duration,
168
+ call_type="set_cache",
169
+ start_time=start_time,
170
+ end_time=end_time,
171
+ )
172
+ except Exception as e:
173
+ # NON blocking - notify users Redis is throwing an exception
174
+ print_verbose(
175
+ f"litellm.caching.caching: set() - Got exception from REDIS : {str(e)}"
176
+ )
177
+
178
+ def increment_cache(
179
+ self, key, value: int, ttl: Optional[float] = None, **kwargs
180
+ ) -> int:
181
+ _redis_client = self.redis_client
182
+ start_time = time.time()
183
+ set_ttl = self.get_ttl(ttl=ttl)
184
+ try:
185
+ start_time = time.time()
186
+ result: int = _redis_client.incr(name=key, amount=value) # type: ignore
187
+ end_time = time.time()
188
+ _duration = end_time - start_time
189
+ self.service_logger_obj.service_success_hook(
190
+ service=ServiceTypes.REDIS,
191
+ duration=_duration,
192
+ call_type="increment_cache",
193
+ start_time=start_time,
194
+ end_time=end_time,
195
+ )
196
+
197
+ if set_ttl is not None:
198
+ # check if key already has ttl, if not -> set ttl
199
+ start_time = time.time()
200
+ current_ttl = _redis_client.ttl(key)
201
+ end_time = time.time()
202
+ _duration = end_time - start_time
203
+ self.service_logger_obj.service_success_hook(
204
+ service=ServiceTypes.REDIS,
205
+ duration=_duration,
206
+ call_type="increment_cache_ttl",
207
+ start_time=start_time,
208
+ end_time=end_time,
209
+ )
210
+ if current_ttl == -1:
211
+ # Key has no expiration
212
+ start_time = time.time()
213
+ _redis_client.expire(key, set_ttl) # type: ignore
214
+ end_time = time.time()
215
+ _duration = end_time - start_time
216
+ self.service_logger_obj.service_success_hook(
217
+ service=ServiceTypes.REDIS,
218
+ duration=_duration,
219
+ call_type="increment_cache_expire",
220
+ start_time=start_time,
221
+ end_time=end_time,
222
+ )
223
+ return result
224
+ except Exception as e:
225
+ ## LOGGING ##
226
+ end_time = time.time()
227
+ _duration = end_time - start_time
228
+ verbose_logger.error(
229
+ "LiteLLM Redis Caching: increment_cache() - Got exception from REDIS %s, Writing value=%s",
230
+ str(e),
231
+ value,
232
+ )
233
+ raise e
234
+
235
+ async def async_scan_iter(self, pattern: str, count: int = 100) -> list:
236
+ start_time = time.time()
237
+ try:
238
+ keys = []
239
+ _redis_client = self.init_async_client()
240
+ if not hasattr(_redis_client, "scan_iter"):
241
+ verbose_logger.debug(
242
+ "Redis client does not support scan_iter, potentially using Redis Cluster. Returning empty list."
243
+ )
244
+ return []
245
+
246
+ async for key in _redis_client.scan_iter(match=pattern + "*", count=count): # type: ignore
247
+ keys.append(key)
248
+ if len(keys) >= count:
249
+ break
250
+
251
+ ## LOGGING ##
252
+ end_time = time.time()
253
+ _duration = end_time - start_time
254
+ asyncio.create_task(
255
+ self.service_logger_obj.async_service_success_hook(
256
+ service=ServiceTypes.REDIS,
257
+ duration=_duration,
258
+ call_type="async_scan_iter",
259
+ start_time=start_time,
260
+ end_time=end_time,
261
+ )
262
+ ) # DO NOT SLOW DOWN CALL B/C OF THIS
263
+ return keys
264
+ except Exception as e:
265
+ # NON blocking - notify users Redis is throwing an exception
266
+ ## LOGGING ##
267
+ end_time = time.time()
268
+ _duration = end_time - start_time
269
+ asyncio.create_task(
270
+ self.service_logger_obj.async_service_failure_hook(
271
+ service=ServiceTypes.REDIS,
272
+ duration=_duration,
273
+ error=e,
274
+ call_type="async_scan_iter",
275
+ start_time=start_time,
276
+ end_time=end_time,
277
+ )
278
+ )
279
+ raise e
280
+
281
+ async def async_set_cache(self, key, value, **kwargs):
282
+ from redis.asyncio import Redis
283
+
284
+ start_time = time.time()
285
+ try:
286
+ _redis_client: Redis = self.init_async_client() # type: ignore
287
+ except Exception as e:
288
+ end_time = time.time()
289
+ _duration = end_time - start_time
290
+ asyncio.create_task(
291
+ self.service_logger_obj.async_service_failure_hook(
292
+ service=ServiceTypes.REDIS,
293
+ duration=_duration,
294
+ error=e,
295
+ start_time=start_time,
296
+ end_time=end_time,
297
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
298
+ call_type="async_set_cache",
299
+ )
300
+ )
301
+ verbose_logger.error(
302
+ "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
303
+ str(e),
304
+ value,
305
+ )
306
+ raise e
307
+
308
+ key = self.check_and_fix_namespace(key=key)
309
+ ttl = self.get_ttl(**kwargs)
310
+ nx = kwargs.get("nx", False)
311
+ print_verbose(f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
312
+
313
+ try:
314
+ if not hasattr(_redis_client, "set"):
315
+ raise Exception("Redis client cannot set cache. Attribute not found.")
316
+ result = await _redis_client.set(
317
+ name=key,
318
+ value=json.dumps(value),
319
+ nx=nx,
320
+ ex=ttl,
321
+ )
322
+ print_verbose(
323
+ f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}"
324
+ )
325
+ end_time = time.time()
326
+ _duration = end_time - start_time
327
+ asyncio.create_task(
328
+ self.service_logger_obj.async_service_success_hook(
329
+ service=ServiceTypes.REDIS,
330
+ duration=_duration,
331
+ call_type="async_set_cache",
332
+ start_time=start_time,
333
+ end_time=end_time,
334
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
335
+ event_metadata={"key": key},
336
+ )
337
+ )
338
+ return result
339
+ except Exception as e:
340
+ end_time = time.time()
341
+ _duration = end_time - start_time
342
+ asyncio.create_task(
343
+ self.service_logger_obj.async_service_failure_hook(
344
+ service=ServiceTypes.REDIS,
345
+ duration=_duration,
346
+ error=e,
347
+ call_type="async_set_cache",
348
+ start_time=start_time,
349
+ end_time=end_time,
350
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
351
+ event_metadata={"key": key},
352
+ )
353
+ )
354
+ verbose_logger.error(
355
+ "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
356
+ str(e),
357
+ value,
358
+ )
359
+
360
+ async def _pipeline_helper(
361
+ self,
362
+ pipe: Union[pipeline, cluster_pipeline],
363
+ cache_list: List[Tuple[Any, Any]],
364
+ ttl: Optional[float],
365
+ ) -> List:
366
+ """
367
+ Helper function for executing a pipeline of set operations on Redis
368
+ """
369
+ ttl = self.get_ttl(ttl=ttl)
370
+ # Iterate through each key-value pair in the cache_list and set them in the pipeline.
371
+ for cache_key, cache_value in cache_list:
372
+ cache_key = self.check_and_fix_namespace(key=cache_key)
373
+ print_verbose(
374
+ f"Set ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {cache_value}\nttl={ttl}"
375
+ )
376
+ json_cache_value = json.dumps(cache_value)
377
+ # Set the value with a TTL if it's provided.
378
+ _td: Optional[timedelta] = None
379
+ if ttl is not None:
380
+ _td = timedelta(seconds=ttl)
381
+ pipe.set( # type: ignore
382
+ name=cache_key,
383
+ value=json_cache_value,
384
+ ex=_td,
385
+ )
386
+ # Execute the pipeline and return the results.
387
+ results = await pipe.execute()
388
+ return results
389
+
390
+ async def async_set_cache_pipeline(
391
+ self, cache_list: List[Tuple[Any, Any]], ttl: Optional[float] = None, **kwargs
392
+ ):
393
+ """
394
+ Use Redis Pipelines for bulk write operations
395
+ """
396
+ # don't waste a network request if there's nothing to set
397
+ if len(cache_list) == 0:
398
+ return
399
+
400
+ _redis_client = self.init_async_client()
401
+ start_time = time.time()
402
+
403
+ print_verbose(
404
+ f"Set Async Redis Cache: key list: {cache_list}\nttl={ttl}, redis_version={self.redis_version}"
405
+ )
406
+ cache_value: Any = None
407
+ try:
408
+ async with _redis_client.pipeline(transaction=False) as pipe:
409
+ results = await self._pipeline_helper(pipe, cache_list, ttl)
410
+
411
+ print_verbose(f"pipeline results: {results}")
412
+ # Optionally, you could process 'results' to make sure that all set operations were successful.
413
+ ## LOGGING ##
414
+ end_time = time.time()
415
+ _duration = end_time - start_time
416
+ asyncio.create_task(
417
+ self.service_logger_obj.async_service_success_hook(
418
+ service=ServiceTypes.REDIS,
419
+ duration=_duration,
420
+ call_type="async_set_cache_pipeline",
421
+ start_time=start_time,
422
+ end_time=end_time,
423
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
424
+ )
425
+ )
426
+ return None
427
+ except Exception as e:
428
+ ## LOGGING ##
429
+ end_time = time.time()
430
+ _duration = end_time - start_time
431
+ asyncio.create_task(
432
+ self.service_logger_obj.async_service_failure_hook(
433
+ service=ServiceTypes.REDIS,
434
+ duration=_duration,
435
+ error=e,
436
+ call_type="async_set_cache_pipeline",
437
+ start_time=start_time,
438
+ end_time=end_time,
439
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
440
+ )
441
+ )
442
+
443
+ verbose_logger.error(
444
+ "LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
445
+ str(e),
446
+ cache_value,
447
+ )
448
+
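# --- illustrative sketch (annotation, not part of the committed file) --------
# async_set_cache_pipeline above batches all SET commands into a single Redis
# round trip. A hedged standalone equivalent using redis.asyncio (assumes a
# reachable Redis on localhost:6379):
#
#     import asyncio, json
#     from redis.asyncio import Redis
#
#     async def bulk_set(pairs, ttl_seconds=60):
#         client = Redis(host="localhost", port=6379)
#         async with client.pipeline(transaction=False) as pipe:
#             for key, value in pairs:
#                 pipe.set(name=key, value=json.dumps(value), ex=ttl_seconds)
#             return await pipe.execute()  # one network round trip for N keys
#
#     asyncio.run(bulk_set([("k1", {"a": 1}), ("k2", {"b": 2})]))
# ------------------------------------------------------------------------------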
449
+ async def _set_cache_sadd_helper(
450
+ self,
451
+ redis_client: async_redis_client,
452
+ key: str,
453
+ value: List,
454
+ ttl: Optional[float],
455
+ ) -> None:
456
+ """Helper function for async_set_cache_sadd. Separated for testing."""
457
+ ttl = self.get_ttl(ttl=ttl)
458
+ try:
459
+ await redis_client.sadd(key, *value) # type: ignore
460
+ if ttl is not None:
461
+ _td = timedelta(seconds=ttl)
462
+ await redis_client.expire(key, _td)
463
+ except Exception:
464
+ raise
465
+
466
+ async def async_set_cache_sadd(
467
+ self, key, value: List, ttl: Optional[float], **kwargs
468
+ ):
469
+ from redis.asyncio import Redis
470
+
471
+ start_time = time.time()
472
+ try:
473
+ _redis_client: Redis = self.init_async_client() # type: ignore
474
+ except Exception as e:
475
+ end_time = time.time()
476
+ _duration = end_time - start_time
477
+ asyncio.create_task(
478
+ self.service_logger_obj.async_service_failure_hook(
479
+ service=ServiceTypes.REDIS,
480
+ duration=_duration,
481
+ error=e,
482
+ start_time=start_time,
483
+ end_time=end_time,
484
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
485
+ call_type="async_set_cache_sadd",
486
+ )
487
+ )
488
+ # NON blocking - notify users Redis is throwing an exception
489
+ verbose_logger.error(
490
+ "LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
491
+ str(e),
492
+ value,
493
+ )
494
+ raise e
495
+
496
+ key = self.check_and_fix_namespace(key=key)
497
+ print_verbose(f"Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}")
498
+ try:
499
+ await self._set_cache_sadd_helper(
500
+ redis_client=_redis_client, key=key, value=value, ttl=ttl
501
+ )
502
+ print_verbose(
503
+ f"Successfully Set ASYNC Redis Cache SADD: key: {key}\nValue {value}\nttl={ttl}"
504
+ )
505
+ end_time = time.time()
506
+ _duration = end_time - start_time
507
+ asyncio.create_task(
508
+ self.service_logger_obj.async_service_success_hook(
509
+ service=ServiceTypes.REDIS,
510
+ duration=_duration,
511
+ call_type="async_set_cache_sadd",
512
+ start_time=start_time,
513
+ end_time=end_time,
514
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
515
+ )
516
+ )
517
+ except Exception as e:
518
+ end_time = time.time()
519
+ _duration = end_time - start_time
520
+ asyncio.create_task(
521
+ self.service_logger_obj.async_service_failure_hook(
522
+ service=ServiceTypes.REDIS,
523
+ duration=_duration,
524
+ error=e,
525
+ call_type="async_set_cache_sadd",
526
+ start_time=start_time,
527
+ end_time=end_time,
528
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
529
+ )
530
+ )
531
+ # NON blocking - notify users Redis is throwing an exception
532
+ verbose_logger.error(
533
+ "LiteLLM Redis Caching: async set_cache_sadd() - Got exception from REDIS %s, Writing value=%s",
534
+ str(e),
535
+ value,
536
+ )
537
+
538
+ async def batch_cache_write(self, key, value, **kwargs):
539
+ print_verbose(
540
+ f"in batch cache writing for redis buffer size={len(self.redis_batch_writing_buffer)}",
541
+ )
542
+ key = self.check_and_fix_namespace(key=key)
543
+ self.redis_batch_writing_buffer.append((key, value))
544
+ if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
545
+ await self.flush_cache_buffer() # logging done in here
546
+
547
+ async def async_increment(
548
+ self,
549
+ key,
550
+ value: float,
551
+ ttl: Optional[int] = None,
552
+ parent_otel_span: Optional[Span] = None,
553
+ ) -> float:
554
+ from redis.asyncio import Redis
555
+
556
+ _redis_client: Redis = self.init_async_client() # type: ignore
557
+ start_time = time.time()
558
+ _used_ttl = self.get_ttl(ttl=ttl)
559
+ key = self.check_and_fix_namespace(key=key)
560
+ try:
561
+ result = await _redis_client.incrbyfloat(name=key, amount=value)
562
+ if _used_ttl is not None:
563
+ # check if key already has ttl, if not -> set ttl
564
+ current_ttl = await _redis_client.ttl(key)
565
+ if current_ttl == -1:
566
+ # Key has no expiration
567
+ await _redis_client.expire(key, _used_ttl)
568
+
569
+ ## LOGGING ##
570
+ end_time = time.time()
571
+ _duration = end_time - start_time
572
+
573
+ asyncio.create_task(
574
+ self.service_logger_obj.async_service_success_hook(
575
+ service=ServiceTypes.REDIS,
576
+ duration=_duration,
577
+ call_type="async_increment",
578
+ start_time=start_time,
579
+ end_time=end_time,
580
+ parent_otel_span=parent_otel_span,
581
+ )
582
+ )
583
+ return result
584
+ except Exception as e:
585
+ ## LOGGING ##
586
+ end_time = time.time()
587
+ _duration = end_time - start_time
588
+ asyncio.create_task(
589
+ self.service_logger_obj.async_service_failure_hook(
590
+ service=ServiceTypes.REDIS,
591
+ duration=_duration,
592
+ error=e,
593
+ call_type="async_increment",
594
+ start_time=start_time,
595
+ end_time=end_time,
596
+ parent_otel_span=parent_otel_span,
597
+ )
598
+ )
599
+ verbose_logger.error(
600
+ "LiteLLM Redis Caching: async async_increment() - Got exception from REDIS %s, Writing value=%s",
601
+ str(e),
602
+ value,
603
+ )
604
+ raise e
605
+
606
+ async def flush_cache_buffer(self):
607
+ print_verbose(
608
+ f"flushing to redis....reached size of buffer {len(self.redis_batch_writing_buffer)}"
609
+ )
610
+ await self.async_set_cache_pipeline(self.redis_batch_writing_buffer)
611
+ self.redis_batch_writing_buffer = []
612
+
613
+ def _get_cache_logic(self, cached_response: Any):
614
+ """
615
+ Common 'get_cache_logic' across sync + async redis client implementations
616
+ """
617
+ if cached_response is None:
618
+ return cached_response
619
+ # cached_response is bytes (e.g. b"{...}") - decode it, then parse it back into a dict/ModelResponse
620
+ cached_response = cached_response.decode("utf-8") # Convert bytes to string
621
+ try:
622
+ cached_response = json.loads(
623
+ cached_response
624
+ ) # Convert string to dictionary
625
+ except Exception:
626
+ cached_response = ast.literal_eval(cached_response)
627
+ return cached_response
628
+
629
+ def get_cache(self, key, parent_otel_span: Optional[Span] = None, **kwargs):
630
+ try:
631
+ key = self.check_and_fix_namespace(key=key)
632
+ print_verbose(f"Get Redis Cache: key: {key}")
633
+ start_time = time.time()
634
+ cached_response = self.redis_client.get(key)
635
+ end_time = time.time()
636
+ _duration = end_time - start_time
637
+ self.service_logger_obj.service_success_hook(
638
+ service=ServiceTypes.REDIS,
639
+ duration=_duration,
640
+ call_type="get_cache",
641
+ start_time=start_time,
642
+ end_time=end_time,
643
+ parent_otel_span=parent_otel_span,
644
+ )
645
+ print_verbose(
646
+ f"Got Redis Cache: key: {key}, cached_response {cached_response}"
647
+ )
648
+ return self._get_cache_logic(cached_response=cached_response)
649
+ except Exception as e:
650
+ # NON blocking - notify users Redis is throwing an exception
651
+ verbose_logger.error(
652
+ "litellm.caching.caching: get() - Got exception from REDIS: ", e
653
+ )
654
+
655
+ def _run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
656
+ """
657
+ Wrapper to call `mget` on the redis client
658
+
659
+ We use a wrapper so RedisCluster can override this method
660
+ """
661
+ return self.redis_client.mget(keys=keys) # type: ignore
662
+
663
+ async def _async_run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
664
+ """
665
+ Wrapper to call `mget` on the redis client
666
+
667
+ We use a wrapper so RedisCluster can override this method
668
+ """
669
+ async_redis_client = self.init_async_client()
670
+ return await async_redis_client.mget(keys=keys) # type: ignore
671
+
672
+ def batch_get_cache(
673
+ self,
674
+ key_list: Union[List[str], List[Optional[str]]],
675
+ parent_otel_span: Optional[Span] = None,
676
+ ) -> dict:
677
+ """
678
+ Use Redis for bulk read operations
679
+
680
+ Args:
681
+ key_list: List of keys to get from Redis
682
+ parent_otel_span: Optional parent OpenTelemetry span
683
+
684
+ Returns:
685
+ dict: A dictionary mapping keys to their cached values
686
+ """
687
+ key_value_dict = {}
688
+ _key_list = [key for key in key_list if key is not None]
689
+
690
+ try:
691
+ _keys = []
692
+ for cache_key in _key_list:
693
+ cache_key = self.check_and_fix_namespace(key=cache_key or "")
694
+ _keys.append(cache_key)
695
+ start_time = time.time()
696
+ results: List = self._run_redis_mget_operation(keys=_keys)
697
+ end_time = time.time()
698
+ _duration = end_time - start_time
699
+ self.service_logger_obj.service_success_hook(
700
+ service=ServiceTypes.REDIS,
701
+ duration=_duration,
702
+ call_type="batch_get_cache",
703
+ start_time=start_time,
704
+ end_time=end_time,
705
+ parent_otel_span=parent_otel_span,
706
+ )
707
+
708
+ # Associate the results back with their keys.
709
+ # 'results' is a list of values corresponding to the order of keys in '_key_list'.
710
+ key_value_dict = dict(zip(_key_list, results))
711
+
712
+ decoded_results = {}
713
+ for k, v in key_value_dict.items():
714
+ if isinstance(k, bytes):
715
+ k = k.decode("utf-8")
716
+ v = self._get_cache_logic(v)
717
+ decoded_results[k] = v
718
+
719
+ return decoded_results
720
+ except Exception as e:
721
+ verbose_logger.error(f"Error occurred in batch get cache - {str(e)}")
722
+ return key_value_dict
723
+
724
+ async def async_get_cache(
725
+ self, key, parent_otel_span: Optional[Span] = None, **kwargs
726
+ ):
727
+ from redis.asyncio import Redis
728
+
729
+ _redis_client: Redis = self.init_async_client() # type: ignore
730
+ key = self.check_and_fix_namespace(key=key)
731
+ start_time = time.time()
732
+
733
+ try:
734
+ print_verbose(f"Get Async Redis Cache: key: {key}")
735
+ cached_response = await _redis_client.get(key)
736
+ print_verbose(
737
+ f"Got Async Redis Cache: key: {key}, cached_response {cached_response}"
738
+ )
739
+ response = self._get_cache_logic(cached_response=cached_response)
740
+
741
+ end_time = time.time()
742
+ _duration = end_time - start_time
743
+ asyncio.create_task(
744
+ self.service_logger_obj.async_service_success_hook(
745
+ service=ServiceTypes.REDIS,
746
+ duration=_duration,
747
+ call_type="async_get_cache",
748
+ start_time=start_time,
749
+ end_time=end_time,
750
+ parent_otel_span=parent_otel_span,
751
+ event_metadata={"key": key},
752
+ )
753
+ )
754
+ return response
755
+ except Exception as e:
756
+ end_time = time.time()
757
+ _duration = end_time - start_time
758
+ asyncio.create_task(
759
+ self.service_logger_obj.async_service_failure_hook(
760
+ service=ServiceTypes.REDIS,
761
+ duration=_duration,
762
+ error=e,
763
+ call_type="async_get_cache",
764
+ start_time=start_time,
765
+ end_time=end_time,
766
+ parent_otel_span=parent_otel_span,
767
+ event_metadata={"key": key},
768
+ )
769
+ )
770
+ print_verbose(
771
+ f"litellm.caching.caching: async get() - Got exception from REDIS: {str(e)}"
772
+ )
773
+
774
+ async def async_batch_get_cache(
775
+ self,
776
+ key_list: Union[List[str], List[Optional[str]]],
777
+ parent_otel_span: Optional[Span] = None,
778
+ ) -> dict:
779
+ """
780
+ Use Redis for bulk read operations
781
+
782
+ Args:
783
+ key_list: List of keys to get from Redis
784
+ parent_otel_span: Optional parent OpenTelemetry span
785
+
786
+ Returns:
787
+ dict: A dictionary mapping keys to their cached values
788
+
789
+ `.mget` does not support None keys. This will filter out None keys.
790
+ """
791
+ # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `mget`
792
+ key_value_dict = {}
793
+ start_time = time.time()
794
+ _key_list = [key for key in key_list if key is not None]
795
+ try:
796
+ _keys = []
797
+ for cache_key in _key_list:
798
+ cache_key = self.check_and_fix_namespace(key=cache_key)
799
+ _keys.append(cache_key)
800
+ results = await self._async_run_redis_mget_operation(keys=_keys)
801
+ ## LOGGING ##
802
+ end_time = time.time()
803
+ _duration = end_time - start_time
804
+ asyncio.create_task(
805
+ self.service_logger_obj.async_service_success_hook(
806
+ service=ServiceTypes.REDIS,
807
+ duration=_duration,
808
+ call_type="async_batch_get_cache",
809
+ start_time=start_time,
810
+ end_time=end_time,
811
+ parent_otel_span=parent_otel_span,
812
+ )
813
+ )
814
+
815
+ # Associate the results back with their keys.
816
+ # 'results' is a list of values corresponding to the order of keys in 'key_list'.
817
+ key_value_dict = dict(zip(_key_list, results))
818
+
819
+ decoded_results = {}
820
+ for k, v in key_value_dict.items():
821
+ if isinstance(k, bytes):
822
+ k = k.decode("utf-8")
823
+ v = self._get_cache_logic(v)
824
+ decoded_results[k] = v
825
+
826
+ return decoded_results
827
+ except Exception as e:
828
+ ## LOGGING ##
829
+ end_time = time.time()
830
+ _duration = end_time - start_time
831
+ asyncio.create_task(
832
+ self.service_logger_obj.async_service_failure_hook(
833
+ service=ServiceTypes.REDIS,
834
+ duration=_duration,
835
+ error=e,
836
+ call_type="async_batch_get_cache",
837
+ start_time=start_time,
838
+ end_time=end_time,
839
+ parent_otel_span=parent_otel_span,
840
+ )
841
+ )
842
+ verbose_logger.error(f"Error occurred in async batch get cache - {str(e)}")
843
+ return key_value_dict
844
+
845
+ def sync_ping(self) -> bool:
846
+ """
847
+ Tests if the sync redis client is correctly setup.
848
+ """
849
+ print_verbose("Pinging Sync Redis Cache")
850
+ start_time = time.time()
851
+ try:
852
+ response: bool = self.redis_client.ping() # type: ignore
853
+ print_verbose(f"Redis Cache PING: {response}")
854
+ ## LOGGING ##
855
+ end_time = time.time()
856
+ _duration = end_time - start_time
857
+ self.service_logger_obj.service_success_hook(
858
+ service=ServiceTypes.REDIS,
859
+ duration=_duration,
860
+ call_type="sync_ping",
861
+ start_time=start_time,
862
+ end_time=end_time,
863
+ )
864
+ return response
865
+ except Exception as e:
866
+ # NON blocking - notify users Redis is throwing an exception
867
+ ## LOGGING ##
868
+ end_time = time.time()
869
+ _duration = end_time - start_time
870
+ self.service_logger_obj.service_failure_hook(
871
+ service=ServiceTypes.REDIS,
872
+ duration=_duration,
873
+ error=e,
874
+ call_type="sync_ping",
875
+ )
876
+ verbose_logger.error(
877
+ f"LiteLLM Redis Cache PING: - Got exception from REDIS : {str(e)}"
878
+ )
879
+ raise e
880
+
881
+ async def ping(self) -> bool:
882
+ # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `ping`
883
+ _redis_client: Any = self.init_async_client()
884
+ start_time = time.time()
885
+ print_verbose("Pinging Async Redis Cache")
886
+ try:
887
+ response = await _redis_client.ping()
888
+ ## LOGGING ##
889
+ end_time = time.time()
890
+ _duration = end_time - start_time
891
+ asyncio.create_task(
892
+ self.service_logger_obj.async_service_success_hook(
893
+ service=ServiceTypes.REDIS,
894
+ duration=_duration,
895
+ call_type="async_ping",
896
+ )
897
+ )
898
+ return response
899
+ except Exception as e:
900
+ # NON blocking - notify users Redis is throwing an exception
901
+ ## LOGGING ##
902
+ end_time = time.time()
903
+ _duration = end_time - start_time
904
+ asyncio.create_task(
905
+ self.service_logger_obj.async_service_failure_hook(
906
+ service=ServiceTypes.REDIS,
907
+ duration=_duration,
908
+ error=e,
909
+ call_type="async_ping",
910
+ )
911
+ )
912
+ verbose_logger.error(
913
+ f"LiteLLM Redis Cache PING: - Got exception from REDIS : {str(e)}"
914
+ )
915
+ raise e
916
+
917
+ async def delete_cache_keys(self, keys):
918
+ # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `delete`
919
+ _redis_client: Any = self.init_async_client()
920
+ # keys is a list, unpack it so it gets passed as individual elements to delete
921
+ await _redis_client.delete(*keys)
922
+
923
+ def client_list(self) -> List:
924
+ client_list: List = self.redis_client.client_list() # type: ignore
925
+ return client_list
926
+
927
+ def info(self):
928
+ info = self.redis_client.info()
929
+ return info
930
+
931
+ def flush_cache(self):
932
+ self.redis_client.flushall()
933
+
934
+ def flushall(self):
935
+ self.redis_client.flushall()
936
+
937
+ async def disconnect(self):
938
+ await self.async_redis_conn_pool.disconnect(inuse_connections=True)
939
+
940
+ async def async_delete_cache(self, key: str):
941
+ # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `delete`
942
+ _redis_client: Any = self.init_async_client()
943
+ # keys is str
944
+ return await _redis_client.delete(key)
945
+
946
+ def delete_cache(self, key):
947
+ self.redis_client.delete(key)
948
+
949
+ async def _pipeline_increment_helper(
950
+ self,
951
+ pipe: pipeline,
952
+ increment_list: List[RedisPipelineIncrementOperation],
953
+ ) -> Optional[List[float]]:
954
+ """Helper function for pipeline increment operations"""
955
+ # Iterate through each increment operation and add commands to pipeline
956
+ for increment_op in increment_list:
957
+ cache_key = self.check_and_fix_namespace(key=increment_op["key"])
958
+ print_verbose(
959
+ f"Increment ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {increment_op['increment_value']}\nttl={increment_op['ttl']}"
960
+ )
961
+ pipe.incrbyfloat(cache_key, increment_op["increment_value"])
962
+ if increment_op["ttl"] is not None:
963
+ _td = timedelta(seconds=increment_op["ttl"])
964
+ pipe.expire(cache_key, _td)
965
+ # Execute the pipeline and return results
966
+ results = await pipe.execute()
967
+ print_verbose(f"Increment ASYNC Redis Cache PIPELINE: results: {results}")
968
+ return results
969
+
970
+ async def async_increment_pipeline(
971
+ self, increment_list: List[RedisPipelineIncrementOperation], **kwargs
972
+ ) -> Optional[List[float]]:
973
+ """
974
+ Use Redis Pipelines for bulk increment operations
975
+ Args:
976
+ increment_list: List of RedisPipelineIncrementOperation dicts containing:
977
+ - key: str
978
+ - increment_value: float
979
+ - ttl_seconds: int
980
+ """
981
+ # don't waste a network request if there's nothing to increment
982
+ if len(increment_list) == 0:
983
+ return None
984
+
985
+ from redis.asyncio import Redis
986
+
987
+ _redis_client: Redis = self.init_async_client() # type: ignore
988
+ start_time = time.time()
989
+
990
+ print_verbose(
991
+ f"Increment Async Redis Cache Pipeline: increment list: {increment_list}"
992
+ )
993
+
994
+ try:
995
+ async with _redis_client.pipeline(transaction=False) as pipe:
996
+ results = await self._pipeline_increment_helper(pipe, increment_list)
997
+
998
+ print_verbose(f"pipeline increment results: {results}")
999
+
1000
+ ## LOGGING ##
1001
+ end_time = time.time()
1002
+ _duration = end_time - start_time
1003
+ asyncio.create_task(
1004
+ self.service_logger_obj.async_service_success_hook(
1005
+ service=ServiceTypes.REDIS,
1006
+ duration=_duration,
1007
+ call_type="async_increment_pipeline",
1008
+ start_time=start_time,
1009
+ end_time=end_time,
1010
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
1011
+ )
1012
+ )
1013
+ return results
1014
+ except Exception as e:
1015
+ ## LOGGING ##
1016
+ end_time = time.time()
1017
+ _duration = end_time - start_time
1018
+ asyncio.create_task(
1019
+ self.service_logger_obj.async_service_failure_hook(
1020
+ service=ServiceTypes.REDIS,
1021
+ duration=_duration,
1022
+ error=e,
1023
+ call_type="async_increment_pipeline",
1024
+ start_time=start_time,
1025
+ end_time=end_time,
1026
+ parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs),
1027
+ )
1028
+ )
1029
+ verbose_logger.error(
1030
+ "LiteLLM Redis Caching: async increment_pipeline() - Got exception from REDIS %s",
1031
+ str(e),
1032
+ )
1033
+ raise e
1034
+
1035
+ async def async_get_ttl(self, key: str) -> Optional[int]:
1036
+ """
1037
+ Get the remaining TTL of a key in Redis
1038
+
1039
+ Args:
1040
+ key (str): The key to get TTL for
1041
+
1042
+ Returns:
1043
+ Optional[int]: The remaining TTL in seconds, or None if key doesn't exist
1044
+
1045
+ Redis ref: https://redis.io/docs/latest/commands/ttl/
1046
+ """
1047
+ try:
1048
+ # typed as Any, redis python lib has incomplete type stubs for RedisCluster and does not include `ttl`
1049
+ _redis_client: Any = self.init_async_client()
1050
+ ttl = await _redis_client.ttl(key)
1051
+ if ttl <= -1: # -1 means the key exists but has no expiry, -2 means the key does not exist
1052
+ return None
1053
+ return ttl
1054
+ except Exception as e:
1055
+ verbose_logger.debug(f"Redis TTL Error: {e}")
1056
+ return None
1057
+
1058
+ async def async_rpush(
1059
+ self,
1060
+ key: str,
1061
+ values: List[Any],
1062
+ parent_otel_span: Optional[Span] = None,
1063
+ **kwargs,
1064
+ ) -> int:
1065
+ """
1066
+ Append one or multiple values to a list stored at key
1067
+
1068
+ Args:
1069
+ key: The Redis key of the list
1070
+ values: One or more values to append to the list
1071
+ parent_otel_span: Optional parent OpenTelemetry span
1072
+
1073
+ Returns:
1074
+ int: The length of the list after the push operation
1075
+ """
1076
+ _redis_client: Any = self.init_async_client()
1077
+ start_time = time.time()
1078
+ try:
1079
+ response = await _redis_client.rpush(key, *values)
1080
+ ## LOGGING ##
1081
+ end_time = time.time()
1082
+ _duration = end_time - start_time
1083
+ asyncio.create_task(
1084
+ self.service_logger_obj.async_service_success_hook(
1085
+ service=ServiceTypes.REDIS,
1086
+ duration=_duration,
1087
+ call_type="async_rpush",
1088
+ )
1089
+ )
1090
+ return response
1091
+ except Exception as e:
1092
+ # NON blocking - notify users Redis is throwing an exception
1093
+ ## LOGGING ##
1094
+ end_time = time.time()
1095
+ _duration = end_time - start_time
1096
+ asyncio.create_task(
1097
+ self.service_logger_obj.async_service_failure_hook(
1098
+ service=ServiceTypes.REDIS,
1099
+ duration=_duration,
1100
+ error=e,
1101
+ call_type="async_rpush",
1102
+ )
1103
+ )
1104
+ verbose_logger.error(
1105
+ f"LiteLLM Redis Cache RPUSH: - Got exception from REDIS : {str(e)}"
1106
+ )
1107
+ raise e
1108
+
1109
+ async def async_lpop(
1110
+ self,
1111
+ key: str,
1112
+ count: Optional[int] = None,
1113
+ parent_otel_span: Optional[Span] = None,
1114
+ **kwargs,
1115
+ ) -> Union[Any, List[Any]]:
1116
+ _redis_client: Any = self.init_async_client()
1117
+ start_time = time.time()
1118
+ print_verbose(f"LPOP from Redis list: key: {key}, count: {count}")
1119
+ try:
1120
+ result = await _redis_client.lpop(key, count)
1121
+ ## LOGGING ##
1122
+ end_time = time.time()
1123
+ _duration = end_time - start_time
1124
+ asyncio.create_task(
1125
+ self.service_logger_obj.async_service_success_hook(
1126
+ service=ServiceTypes.REDIS,
1127
+ duration=_duration,
1128
+ call_type="async_lpop",
1129
+ )
1130
+ )
1131
+
1132
+ # Handle result parsing if needed
1133
+ if isinstance(result, bytes):
1134
+ try:
1135
+ return result.decode("utf-8")
1136
+ except Exception:
1137
+ return result
1138
+ elif isinstance(result, list) and all(
1139
+ isinstance(item, bytes) for item in result
1140
+ ):
1141
+ try:
1142
+ return [item.decode("utf-8") for item in result]
1143
+ except Exception:
1144
+ return result
1145
+ return result
1146
+ except Exception as e:
1147
+ # NON blocking - notify users Redis is throwing an exception
1148
+ ## LOGGING ##
1149
+ end_time = time.time()
1150
+ _duration = end_time - start_time
1151
+ asyncio.create_task(
1152
+ self.service_logger_obj.async_service_failure_hook(
1153
+ service=ServiceTypes.REDIS,
1154
+ duration=_duration,
1155
+ error=e,
1156
+ call_type="async_lpop",
1157
+ )
1158
+ )
1159
+ verbose_logger.error(
1160
+ f"LiteLLM Redis Cache LPOP: - Got exception from REDIS : {str(e)}"
1161
+ )
1162
+ raise e
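A hedged usage sketch (not part of this diff) for the RedisCache helpers added above. It assumes a local Redis at `localhost:6379` and uses only the constructor arguments and method names visible in this file; adjust the connection settings for a real deployment.

```python
import asyncio

from litellm.caching.redis_cache import RedisCache


async def main():
    cache = RedisCache(host="localhost", port=6379)  # assumed connection kwargs

    # queue-style helpers added in this file
    await cache.async_rpush(key="my-queue", values=["job-1", "job-2"])
    job = await cache.async_lpop(key="my-queue")
    print("popped:", job)

    # TTL helper: returns None when the key is missing or has no expiry
    print("ttl:", await cache.async_get_ttl(key="my-queue"))

    # health check against the async client
    print("ping:", await cache.ping())


asyncio.run(main())
```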
litellm/caching/redis_cluster_cache.py ADDED
@@ -0,0 +1,59 @@
1
+ """
2
+ Redis Cluster Cache implementation
3
+
4
+ Key differences:
5
+ - The Redis client NEEDS to be re-used across requests; re-creating it per request adds ~3000ms of latency
6
+ """
7
+
8
+ from typing import TYPE_CHECKING, Any, List, Optional, Union
9
+
10
+ from litellm.caching.redis_cache import RedisCache
11
+
12
+ if TYPE_CHECKING:
13
+ from opentelemetry.trace import Span as _Span
14
+ from redis.asyncio import Redis, RedisCluster
15
+ from redis.asyncio.client import Pipeline
16
+
17
+ pipeline = Pipeline
18
+ async_redis_client = Redis
19
+ Span = Union[_Span, Any]
20
+ else:
21
+ pipeline = Any
22
+ async_redis_client = Any
23
+ Span = Any
24
+
25
+
26
+ class RedisClusterCache(RedisCache):
27
+ def __init__(self, *args, **kwargs):
28
+ super().__init__(*args, **kwargs)
29
+ self.redis_async_redis_cluster_client: Optional[RedisCluster] = None
30
+ self.redis_sync_redis_cluster_client: Optional[RedisCluster] = None
31
+
32
+ def init_async_client(self):
33
+ from redis.asyncio import RedisCluster
34
+
35
+ from .._redis import get_redis_async_client
36
+
37
+ if self.redis_async_redis_cluster_client:
38
+ return self.redis_async_redis_cluster_client
39
+
40
+ _redis_client = get_redis_async_client(
41
+ connection_pool=self.async_redis_conn_pool, **self.redis_kwargs
42
+ )
43
+ if isinstance(_redis_client, RedisCluster):
44
+ self.redis_async_redis_cluster_client = _redis_client
45
+
46
+ return _redis_client
47
+
48
+ def _run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
49
+ """
50
+ Overrides `_run_redis_mget_operation` in redis_cache.py
51
+ """
52
+ return self.redis_client.mget_nonatomic(keys=keys) # type: ignore
53
+
54
+ async def _async_run_redis_mget_operation(self, keys: List[str]) -> List[Any]:
55
+ """
56
+ Overrides `_async_run_redis_mget_operation` in redis_cache.py
57
+ """
58
+ async_redis_cluster_client = self.init_async_client()
59
+ return await async_redis_cluster_client.mget_nonatomic(keys=keys) # type: ignore
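For context, a short sketch of why the cluster subclass overrides the mget helpers: in cluster mode a plain `MGET` can fail with a `CROSSSLOT` error when keys hash to different slots, so `mget_nonatomic` (which groups keys per hash slot and issues one `MGET` per node) is used instead. The snippet below is an illustration with a hypothetical cluster endpoint, not code from this diff.

```python
import asyncio

from redis.asyncio import RedisCluster


async def main():
    # hypothetical cluster entry point; any node of the cluster works
    rc = RedisCluster(host="localhost", port=7000)

    await rc.set("litellm:a", "1")
    await rc.set("litellm:b", "2")

    # mget_nonatomic tolerates keys living in different hash slots
    print(await rc.mget_nonatomic(["litellm:a", "litellm:b"]))


asyncio.run(main())
```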
litellm/caching/redis_semantic_cache.py ADDED
@@ -0,0 +1,450 @@
1
+ """
2
+ Redis Semantic Cache implementation for LiteLLM
3
+
4
+ The RedisSemanticCache provides semantic caching functionality using Redis as a backend.
5
+ This cache stores responses based on the semantic similarity of prompts rather than
6
+ exact matching, allowing for more flexible caching of LLM responses.
7
+
8
+ This implementation uses RedisVL's SemanticCache to find semantically similar prompts
9
+ and their cached responses.
10
+ """
11
+
12
+ import ast
13
+ import asyncio
14
+ import json
15
+ import os
16
+ from typing import Any, Dict, List, Optional, Tuple, cast
17
+
18
+ import litellm
19
+ from litellm._logging import print_verbose
20
+ from litellm.litellm_core_utils.prompt_templates.common_utils import (
21
+ get_str_from_messages,
22
+ )
23
+ from litellm.types.utils import EmbeddingResponse
24
+
25
+ from .base_cache import BaseCache
26
+
27
+
28
+ class RedisSemanticCache(BaseCache):
29
+ """
30
+ Redis-backed semantic cache for LLM responses.
31
+
32
+ This cache uses vector similarity to find semantically similar prompts that have been
33
+ previously sent to the LLM, allowing for cache hits even when prompts are not identical
34
+ but carry similar meaning.
35
+ """
36
+
37
+ DEFAULT_REDIS_INDEX_NAME: str = "litellm_semantic_cache_index"
38
+
39
+ def __init__(
40
+ self,
41
+ host: Optional[str] = None,
42
+ port: Optional[str] = None,
43
+ password: Optional[str] = None,
44
+ redis_url: Optional[str] = None,
45
+ similarity_threshold: Optional[float] = None,
46
+ embedding_model: str = "text-embedding-ada-002",
47
+ index_name: Optional[str] = None,
48
+ **kwargs,
49
+ ):
50
+ """
51
+ Initialize the Redis Semantic Cache.
52
+
53
+ Args:
54
+ host: Redis host address
55
+ port: Redis port
56
+ password: Redis password
57
+ redis_url: Full Redis URL (alternative to separate host/port/password)
58
+ similarity_threshold: Threshold for semantic similarity (0.0 to 1.0)
59
+ where 1.0 requires exact matches and 0.0 accepts any match
60
+ embedding_model: Model to use for generating embeddings
61
+ index_name: Name for the Redis index
62
+ ttl: Default time-to-live for cache entries in seconds
63
+ **kwargs: Additional arguments passed to the Redis client
64
+
65
+ Raises:
66
+ Exception: If similarity_threshold is not provided or required Redis
67
+ connection information is missing
68
+ """
69
+ from redisvl.extensions.llmcache import SemanticCache
70
+ from redisvl.utils.vectorize import CustomTextVectorizer
71
+
72
+ if index_name is None:
73
+ index_name = self.DEFAULT_REDIS_INDEX_NAME
74
+
75
+ print_verbose(f"Redis semantic-cache initializing index - {index_name}")
76
+
77
+ # Validate similarity threshold
78
+ if similarity_threshold is None:
79
+ raise ValueError("similarity_threshold must be provided, passed None")
80
+
81
+ # Store configuration
82
+ self.similarity_threshold = similarity_threshold
83
+
84
+ # Convert similarity threshold [0,1] to distance threshold [0,2]
85
+ # For cosine distance: 0 = most similar, 2 = least similar
86
+ # While similarity: 1 = most similar, 0 = least similar
87
+ self.distance_threshold = 1 - similarity_threshold
88
+ self.embedding_model = embedding_model
89
+
90
+ # Set up Redis connection
91
+ if redis_url is None:
92
+ try:
93
+ # Attempt to use provided parameters or fallback to environment variables
94
+ host = host or os.environ["REDIS_HOST"]
95
+ port = port or os.environ["REDIS_PORT"]
96
+ password = password or os.environ["REDIS_PASSWORD"]
97
+ except KeyError as e:
98
+ # Raise a more informative exception if any of the required keys are missing
99
+ missing_var = e.args[0]
100
+ raise ValueError(
101
+ f"Missing required Redis configuration: {missing_var}. "
102
+ f"Provide {missing_var} or redis_url."
103
+ ) from e
104
+
105
+ redis_url = f"redis://:{password}@{host}:{port}"
106
+
107
+ print_verbose(f"Redis semantic-cache redis_url: {redis_url}")
108
+
109
+ # Initialize the Redis vectorizer and cache
110
+ cache_vectorizer = CustomTextVectorizer(self._get_embedding)
111
+
112
+ self.llmcache = SemanticCache(
113
+ name=index_name,
114
+ redis_url=redis_url,
115
+ vectorizer=cache_vectorizer,
116
+ distance_threshold=self.distance_threshold,
117
+ overwrite=False,
118
+ )
119
+
120
+ def _get_ttl(self, **kwargs) -> Optional[int]:
121
+ """
122
+ Get the TTL (time-to-live) value for cache entries.
123
+
124
+ Args:
125
+ **kwargs: Keyword arguments that may contain a custom TTL
126
+
127
+ Returns:
128
+ Optional[int]: The TTL value in seconds, or None if no TTL should be applied
129
+ """
130
+ ttl = kwargs.get("ttl")
131
+ if ttl is not None:
132
+ ttl = int(ttl)
133
+ return ttl
134
+
135
+ def _get_embedding(self, prompt: str) -> List[float]:
136
+ """
137
+ Generate an embedding vector for the given prompt using the configured embedding model.
138
+
139
+ Args:
140
+ prompt: The text to generate an embedding for
141
+
142
+ Returns:
143
+ List[float]: The embedding vector
144
+ """
145
+ # Create an embedding from prompt
146
+ embedding_response = cast(
147
+ EmbeddingResponse,
148
+ litellm.embedding(
149
+ model=self.embedding_model,
150
+ input=prompt,
151
+ cache={"no-store": True, "no-cache": True},
152
+ ),
153
+ )
154
+ embedding = embedding_response["data"][0]["embedding"]
155
+ return embedding
156
+
157
+ def _get_cache_logic(self, cached_response: Any) -> Any:
158
+ """
159
+ Process the cached response to prepare it for use.
160
+
161
+ Args:
162
+ cached_response: The raw cached response
163
+
164
+ Returns:
165
+ The processed cache response, or None if input was None
166
+ """
167
+ if cached_response is None:
168
+ return cached_response
169
+
170
+ # Convert bytes to string if needed
171
+ if isinstance(cached_response, bytes):
172
+ cached_response = cached_response.decode("utf-8")
173
+
174
+ # Convert string representation to Python object
175
+ try:
176
+ cached_response = json.loads(cached_response)
177
+ except json.JSONDecodeError:
178
+ try:
179
+ cached_response = ast.literal_eval(cached_response)
180
+ except (ValueError, SyntaxError) as e:
181
+ print_verbose(f"Error parsing cached response: {str(e)}")
182
+ return None
183
+
184
+ return cached_response
185
+
186
+ def set_cache(self, key: str, value: Any, **kwargs) -> None:
187
+ """
188
+ Store a value in the semantic cache.
189
+
190
+ Args:
191
+ key: The cache key (not directly used in semantic caching)
192
+ value: The response value to cache
193
+ **kwargs: Additional arguments including 'messages' for the prompt
194
+ and optional 'ttl' for time-to-live
195
+ """
196
+ print_verbose(f"Redis semantic-cache set_cache, kwargs: {kwargs}")
197
+
198
+ value_str: Optional[str] = None
199
+ try:
200
+ # Extract the prompt from messages
201
+ messages = kwargs.get("messages", [])
202
+ if not messages:
203
+ print_verbose("No messages provided for semantic caching")
204
+ return
205
+
206
+ prompt = get_str_from_messages(messages)
207
+ value_str = str(value)
208
+
209
+ # Get TTL and store in Redis semantic cache
210
+ ttl = self._get_ttl(**kwargs)
211
+ if ttl is not None:
212
+ self.llmcache.store(prompt, value_str, ttl=int(ttl))
213
+ else:
214
+ self.llmcache.store(prompt, value_str)
215
+ except Exception as e:
216
+ print_verbose(
217
+ f"Error setting {value_str or value} in the Redis semantic cache: {str(e)}"
218
+ )
219
+
220
+ def get_cache(self, key: str, **kwargs) -> Any:
221
+ """
222
+ Retrieve a semantically similar cached response.
223
+
224
+ Args:
225
+ key: The cache key (not directly used in semantic caching)
226
+ **kwargs: Additional arguments including 'messages' for the prompt
227
+
228
+ Returns:
229
+ The cached response if a semantically similar prompt is found, else None
230
+ """
231
+ print_verbose(f"Redis semantic-cache get_cache, kwargs: {kwargs}")
232
+
233
+ try:
234
+ # Extract the prompt from messages
235
+ messages = kwargs.get("messages", [])
236
+ if not messages:
237
+ print_verbose("No messages provided for semantic cache lookup")
238
+ return None
239
+
240
+ prompt = get_str_from_messages(messages)
241
+ # Check the cache for semantically similar prompts
242
+ results = self.llmcache.check(prompt=prompt)
243
+
244
+ # Return None if no similar prompts found
245
+ if not results:
246
+ return None
247
+
248
+ # Process the best matching result
249
+ cache_hit = results[0]
250
+ vector_distance = float(cache_hit["vector_distance"])
251
+
252
+ # Convert vector distance back to similarity score
253
+ # For cosine distance: 0 = most similar, 2 = least similar
254
+ # While similarity: 1 = most similar, 0 = least similar
255
+ similarity = 1 - vector_distance
256
+
257
+ cached_prompt = cache_hit["prompt"]
258
+ cached_response = cache_hit["response"]
259
+
260
+ print_verbose(
261
+ f"Cache hit: similarity threshold: {self.similarity_threshold}, "
262
+ f"actual similarity: {similarity}, "
263
+ f"current prompt: {prompt}, "
264
+ f"cached prompt: {cached_prompt}"
265
+ )
266
+
267
+ return self._get_cache_logic(cached_response=cached_response)
268
+ except Exception as e:
269
+ print_verbose(f"Error retrieving from Redis semantic cache: {str(e)}")
270
+
271
+ async def _get_async_embedding(self, prompt: str, **kwargs) -> List[float]:
272
+ """
273
+ Asynchronously generate an embedding for the given prompt.
274
+
275
+ Args:
276
+ prompt: The text to generate an embedding for
277
+ **kwargs: Additional arguments that may contain metadata
278
+
279
+ Returns:
280
+ List[float]: The embedding vector
281
+ """
282
+ from litellm.proxy.proxy_server import llm_model_list, llm_router
283
+
284
+ # Route the embedding request through the proxy if appropriate
285
+ router_model_names = (
286
+ [m["model_name"] for m in llm_model_list]
287
+ if llm_model_list is not None
288
+ else []
289
+ )
290
+
291
+ try:
292
+ if llm_router is not None and self.embedding_model in router_model_names:
293
+ # Use the router for embedding generation
294
+ user_api_key = kwargs.get("metadata", {}).get("user_api_key", "")
295
+ embedding_response = await llm_router.aembedding(
296
+ model=self.embedding_model,
297
+ input=prompt,
298
+ cache={"no-store": True, "no-cache": True},
299
+ metadata={
300
+ "user_api_key": user_api_key,
301
+ "semantic-cache-embedding": True,
302
+ "trace_id": kwargs.get("metadata", {}).get("trace_id", None),
303
+ },
304
+ )
305
+ else:
306
+ # Generate embedding directly
307
+ embedding_response = await litellm.aembedding(
308
+ model=self.embedding_model,
309
+ input=prompt,
310
+ cache={"no-store": True, "no-cache": True},
311
+ )
312
+
313
+ # Extract and return the embedding vector
314
+ return embedding_response["data"][0]["embedding"]
315
+ except Exception as e:
316
+ print_verbose(f"Error generating async embedding: {str(e)}")
317
+ raise ValueError(f"Failed to generate embedding: {str(e)}") from e
318
+
319
+ async def async_set_cache(self, key: str, value: Any, **kwargs) -> None:
320
+ """
321
+ Asynchronously store a value in the semantic cache.
322
+
323
+ Args:
324
+ key: The cache key (not directly used in semantic caching)
325
+ value: The response value to cache
326
+ **kwargs: Additional arguments including 'messages' for the prompt
327
+ and optional 'ttl' for time-to-live
328
+ """
329
+ print_verbose(f"Async Redis semantic-cache set_cache, kwargs: {kwargs}")
330
+
331
+ try:
332
+ # Extract the prompt from messages
333
+ messages = kwargs.get("messages", [])
334
+ if not messages:
335
+ print_verbose("No messages provided for semantic caching")
336
+ return
337
+
338
+ prompt = get_str_from_messages(messages)
339
+ value_str = str(value)
340
+
341
+ # Generate embedding for the value (response) to cache
342
+ prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
343
+
344
+ # Get TTL and store in Redis semantic cache
345
+ ttl = self._get_ttl(**kwargs)
346
+ if ttl is not None:
347
+ await self.llmcache.astore(
348
+ prompt,
349
+ value_str,
350
+ vector=prompt_embedding, # Pass through custom embedding
351
+ ttl=ttl,
352
+ )
353
+ else:
354
+ await self.llmcache.astore(
355
+ prompt,
356
+ value_str,
357
+ vector=prompt_embedding, # Pass through custom embedding
358
+ )
359
+ except Exception as e:
360
+ print_verbose(f"Error in async_set_cache: {str(e)}")
361
+
362
+ async def async_get_cache(self, key: str, **kwargs) -> Any:
363
+ """
364
+ Asynchronously retrieve a semantically similar cached response.
365
+
366
+ Args:
367
+ key: The cache key (not directly used in semantic caching)
368
+ **kwargs: Additional arguments including 'messages' for the prompt
369
+
370
+ Returns:
371
+ The cached response if a semantically similar prompt is found, else None
372
+ """
373
+ print_verbose(f"Async Redis semantic-cache get_cache, kwargs: {kwargs}")
374
+
375
+ try:
376
+ # Extract the prompt from messages
377
+ messages = kwargs.get("messages", [])
378
+ if not messages:
379
+ print_verbose("No messages provided for semantic cache lookup")
380
+ kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
381
+ return None
382
+
383
+ prompt = get_str_from_messages(messages)
384
+
385
+ # Generate embedding for the prompt
386
+ prompt_embedding = await self._get_async_embedding(prompt, **kwargs)
387
+
388
+ # Check the cache for semantically similar prompts
389
+ results = await self.llmcache.acheck(prompt=prompt, vector=prompt_embedding)
390
+
391
+ # handle results / cache hit
392
+ if not results:
393
+ kwargs.setdefault("metadata", {})[
394
+ "semantic-similarity"
395
+ ] = 0.0 # record zero similarity when no semantically similar prompt is found
396
+ return None
397
+
398
+ cache_hit = results[0]
399
+ vector_distance = float(cache_hit["vector_distance"])
400
+
401
+ # Convert vector distance back to similarity
402
+ # For cosine distance: 0 = most similar, 2 = least similar
403
+ # While similarity: 1 = most similar, 0 = least similar
404
+ similarity = 1 - vector_distance
405
+
406
+ cached_prompt = cache_hit["prompt"]
407
+ cached_response = cache_hit["response"]
408
+
409
+ # update kwargs["metadata"] with similarity, don't rewrite the original metadata
410
+ kwargs.setdefault("metadata", {})["semantic-similarity"] = similarity
411
+
412
+ print_verbose(
413
+ f"Cache hit: similarity threshold: {self.similarity_threshold}, "
414
+ f"actual similarity: {similarity}, "
415
+ f"current prompt: {prompt}, "
416
+ f"cached prompt: {cached_prompt}"
417
+ )
418
+
419
+ return self._get_cache_logic(cached_response=cached_response)
420
+ except Exception as e:
421
+ print_verbose(f"Error in async_get_cache: {str(e)}")
422
+ kwargs.setdefault("metadata", {})["semantic-similarity"] = 0.0
423
+
424
+ async def _index_info(self) -> Dict[str, Any]:
425
+ """
426
+ Get information about the Redis index.
427
+
428
+ Returns:
429
+ Dict[str, Any]: Information about the Redis index
430
+ """
431
+ aindex = await self.llmcache._get_async_index()
432
+ return await aindex.info()
433
+
434
+ async def async_set_cache_pipeline(
435
+ self, cache_list: List[Tuple[str, Any]], **kwargs
436
+ ) -> None:
437
+ """
438
+ Asynchronously store multiple values in the semantic cache.
439
+
440
+ Args:
441
+ cache_list: List of (key, value) tuples to cache
442
+ **kwargs: Additional arguments
443
+ """
444
+ try:
445
+ tasks = []
446
+ for val in cache_list:
447
+ tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
448
+ await asyncio.gather(*tasks)
449
+ except Exception as e:
450
+ print_verbose(f"Error in async_set_cache_pipeline: {str(e)}")
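A hedged usage sketch for the semantic cache above (requires `redisvl` and a reachable Redis instance; the connection string below is a placeholder). Note how `similarity_threshold` is converted to a cosine distance cutoff of `1 - similarity_threshold`, so a threshold of 0.8 accepts hits within a distance of 0.2.

```python
from litellm.caching.redis_semantic_cache import RedisSemanticCache

semantic_cache = RedisSemanticCache(
    redis_url="redis://:password@localhost:6379",  # placeholder connection string
    similarity_threshold=0.8,  # internally: distance_threshold = 1 - 0.8 = 0.2
    embedding_model="text-embedding-ada-002",
)

messages = [{"role": "user", "content": "What is the capital of France?"}]

# store a response keyed by the meaning of the prompt, not its exact text
semantic_cache.set_cache(key="unused", value={"answer": "Paris"}, messages=messages)

# a semantically similar rephrasing can still produce a cache hit
rephrased = [{"role": "user", "content": "Which city is France's capital?"}]
print(semantic_cache.get_cache(key="unused", messages=rephrased))
```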
litellm/caching/s3_cache.py ADDED
@@ -0,0 +1,159 @@
1
+ """
2
+ S3 Cache implementation
3
+ WARNING: DO NOT USE THIS IN PRODUCTION - This is not ASYNC
4
+
5
+ Has 4 methods:
6
+ - set_cache
7
+ - get_cache
8
+ - async_set_cache
9
+ - async_get_cache
10
+ """
11
+
12
+ import ast
13
+ import asyncio
14
+ import json
15
+ from typing import Optional
16
+
17
+ from litellm._logging import print_verbose, verbose_logger
18
+
19
+ from .base_cache import BaseCache
20
+
21
+
22
+ class S3Cache(BaseCache):
23
+ def __init__(
24
+ self,
25
+ s3_bucket_name,
26
+ s3_region_name=None,
27
+ s3_api_version=None,
28
+ s3_use_ssl: Optional[bool] = True,
29
+ s3_verify=None,
30
+ s3_endpoint_url=None,
31
+ s3_aws_access_key_id=None,
32
+ s3_aws_secret_access_key=None,
33
+ s3_aws_session_token=None,
34
+ s3_config=None,
35
+ s3_path=None,
36
+ **kwargs,
37
+ ):
38
+ import boto3
39
+
40
+ self.bucket_name = s3_bucket_name
41
+ self.key_prefix = s3_path.rstrip("/") + "/" if s3_path else ""
42
+ # Create an S3 client with custom endpoint URL
43
+
44
+ self.s3_client = boto3.client(
45
+ "s3",
46
+ region_name=s3_region_name,
47
+ endpoint_url=s3_endpoint_url,
48
+ api_version=s3_api_version,
49
+ use_ssl=s3_use_ssl,
50
+ verify=s3_verify,
51
+ aws_access_key_id=s3_aws_access_key_id,
52
+ aws_secret_access_key=s3_aws_secret_access_key,
53
+ aws_session_token=s3_aws_session_token,
54
+ config=s3_config,
55
+ **kwargs,
56
+ )
57
+
58
+ def set_cache(self, key, value, **kwargs):
59
+ try:
60
+ print_verbose(f"LiteLLM SET Cache - S3. Key={key}. Value={value}")
61
+ ttl = kwargs.get("ttl", None)
62
+ # Convert value to JSON before storing in S3
63
+ serialized_value = json.dumps(value)
64
+ key = self.key_prefix + key
65
+
66
+ if ttl is not None:
67
+ cache_control = f"immutable, max-age={ttl}, s-maxage={ttl}"
68
+ import datetime
69
+
70
+ # Calculate expiration time
71
+ expiration_time = datetime.datetime.now() + datetime.timedelta(seconds=ttl)
72
+
73
+ # Upload the data to S3 with the calculated expiration time
74
+ self.s3_client.put_object(
75
+ Bucket=self.bucket_name,
76
+ Key=key,
77
+ Body=serialized_value,
78
+ Expires=expiration_time,
79
+ CacheControl=cache_control,
80
+ ContentType="application/json",
81
+ ContentLanguage="en",
82
+ ContentDisposition=f'inline; filename="{key}.json"',
83
+ )
84
+ else:
85
+ cache_control = "immutable, max-age=31536000, s-maxage=31536000"
86
+ # Upload the data to S3 without specifying Expires
87
+ self.s3_client.put_object(
88
+ Bucket=self.bucket_name,
89
+ Key=key,
90
+ Body=serialized_value,
91
+ CacheControl=cache_control,
92
+ ContentType="application/json",
93
+ ContentLanguage="en",
94
+ ContentDisposition=f'inline; filename="{key}.json"',
95
+ )
96
+ except Exception as e:
97
+ # NON blocking - notify users S3 is throwing an exception
98
+ print_verbose(f"S3 Caching: set_cache() - Got exception from S3: {e}")
99
+
100
+ async def async_set_cache(self, key, value, **kwargs):
101
+ self.set_cache(key=key, value=value, **kwargs)
102
+
103
+ def get_cache(self, key, **kwargs):
104
+ import botocore
105
+
106
+ try:
107
+ key = self.key_prefix + key
108
+
109
+ print_verbose(f"Get S3 Cache: key: {key}")
110
+ # Download the data from S3
111
+ cached_response = self.s3_client.get_object(
112
+ Bucket=self.bucket_name, Key=key
113
+ )
114
+
115
+ if cached_response is not None:
116
+ # cached_response["Body"] is raw bytes (b"{...}"); decode it and parse it back into a dict
117
+ cached_response = (
118
+ cached_response["Body"].read().decode("utf-8")
119
+ ) # Convert bytes to string
120
+ try:
121
+ cached_response = json.loads(
122
+ cached_response
123
+ ) # Convert string to dictionary
124
+ except Exception:
125
+ cached_response = ast.literal_eval(cached_response)
126
+ if not isinstance(cached_response, dict):
127
+ cached_response = dict(cached_response)
128
+ verbose_logger.debug(
129
+ f"Got S3 Cache: key: {key}, cached_response {cached_response}. Type Response {type(cached_response)}"
130
+ )
131
+
132
+ return cached_response
133
+ except botocore.exceptions.ClientError as e: # type: ignore
134
+ if e.response["Error"]["Code"] == "NoSuchKey":
135
+ verbose_logger.debug(
136
+ f"S3 Cache: The specified key '{key}' does not exist in the S3 bucket."
137
+ )
138
+ return None
139
+
140
+ except Exception as e:
141
+ # NON blocking - notify users S3 is throwing an exception
142
+ verbose_logger.error(
143
+ f"S3 Caching: get_cache() - Got exception from S3: {e}"
144
+ )
145
+
146
+ async def async_get_cache(self, key, **kwargs):
147
+ return self.get_cache(key=key, **kwargs)
148
+
149
+ def flush_cache(self):
150
+ pass
151
+
152
+ async def disconnect(self):
153
+ pass
154
+
155
+ async def async_set_cache_pipeline(self, cache_list, **kwargs):
156
+ tasks = []
157
+ for val in cache_list:
158
+ tasks.append(self.async_set_cache(val[0], val[1], **kwargs))
159
+ await asyncio.gather(*tasks)
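A minimal usage sketch for the S3 cache above, which is synchronous under the hood (per the module warning). The bucket name and prefix are hypothetical; boto3 credentials resolve through the usual AWS credential chain.

```python
from litellm.caching.s3_cache import S3Cache

s3_cache = S3Cache(
    s3_bucket_name="my-litellm-cache-bucket",  # hypothetical bucket
    s3_region_name="us-east-1",
    s3_path="litellm/cache",  # objects land under the "litellm/cache/" prefix
)

# values are JSON-serialized before upload; ttl sets the Expires/Cache-Control headers
s3_cache.set_cache("greeting", {"text": "hello"}, ttl=3600)
print(s3_cache.get_cache("greeting"))
```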
litellm/constants.py ADDED
@@ -0,0 +1,543 @@
1
+ from typing import List, Literal
2
+
3
+ ROUTER_MAX_FALLBACKS = 5
4
+ DEFAULT_BATCH_SIZE = 512
5
+ DEFAULT_FLUSH_INTERVAL_SECONDS = 5
6
+ DEFAULT_MAX_RETRIES = 2
7
+ DEFAULT_MAX_RECURSE_DEPTH = 10
8
+ DEFAULT_FAILURE_THRESHOLD_PERCENT = (
9
+ 0.5 # by default, cool down a deployment if 50% of its requests fail in a given minute
10
+ )
11
+ DEFAULT_MAX_TOKENS = 4096
12
+ DEFAULT_ALLOWED_FAILS = 3
13
+ DEFAULT_REDIS_SYNC_INTERVAL = 1
14
+ DEFAULT_COOLDOWN_TIME_SECONDS = 5
15
+ DEFAULT_REPLICATE_POLLING_RETRIES = 5
16
+ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
17
+ DEFAULT_IMAGE_TOKEN_COUNT = 250
18
+ DEFAULT_IMAGE_WIDTH = 300
19
+ DEFAULT_IMAGE_HEIGHT = 300
20
+ DEFAULT_MAX_TOKENS = 256 # used when providers need a default
21
+ MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB
22
+ SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
23
+
24
+ DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET = 1024
25
+ DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET = 2048
26
+ DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET = 4096
27
+
28
+ ########## Networking constants ##############################################################
29
+ _DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour
30
+
31
+ ########### v2 Architecture constants for managing writing updates to the database ###########
32
+ REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
33
+ REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
34
+ REDIS_DAILY_TEAM_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_team_spend_update_buffer"
35
+ REDIS_DAILY_TAG_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_tag_spend_update_buffer"
36
+ MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
37
+ MAX_SIZE_IN_MEMORY_QUEUE = 10000
38
+ MAX_IN_MEMORY_QUEUE_FLUSH_COUNT = 1000
39
+ ###############################################################################################
40
+ MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
41
+ 1024 # minimum number of tokens to cache a prompt by Anthropic
42
+ )
43
+ DEFAULT_TRIM_RATIO = 0.75 # default ratio of tokens to trim from the end of a prompt
44
+ HOURS_IN_A_DAY = 24
45
+ DAYS_IN_A_WEEK = 7
46
+ DAYS_IN_A_MONTH = 28
47
+ DAYS_IN_A_YEAR = 365
48
+ REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
49
+ #### TOKEN COUNTING ####
50
+ FUNCTION_DEFINITION_TOKEN_COUNT = 9
51
+ SYSTEM_MESSAGE_TOKEN_COUNT = 4
52
+ TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
53
+ DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
54
+ DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
55
+ MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
56
+ MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
57
+ MAX_TILE_WIDTH = 512
58
+ MAX_TILE_HEIGHT = 512
59
+ OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
60
+ MIN_NON_ZERO_TEMPERATURE = 0.0001
61
+ #### RELIABILITY ####
62
+ REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
63
+ DEFAULT_MAX_LRU_CACHE_SIZE = 16
64
+ INITIAL_RETRY_DELAY = 0.5
65
+ MAX_RETRY_DELAY = 8.0
66
+ JITTER = 0.75
67
+ DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache
68
+ DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler
69
+ AZURE_OPERATION_POLLING_TIMEOUT = 120
70
+ REDIS_SOCKET_TIMEOUT = 0.1
71
+ REDIS_CONNECTION_POOL_TIMEOUT = 5
72
+ NON_LLM_CONNECTION_TIMEOUT = 15 # timeout for adjacent services (e.g. jwt auth)
73
+ MAX_EXCEPTION_MESSAGE_LENGTH = 2000
74
+ BEDROCK_MAX_POLICY_SIZE = 75
75
+ REPLICATE_POLLING_DELAY_SECONDS = 0.5
76
+ DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
77
+ TOGETHER_AI_4_B = 4
78
+ TOGETHER_AI_8_B = 8
79
+ TOGETHER_AI_21_B = 21
80
+ TOGETHER_AI_41_B = 41
81
+ TOGETHER_AI_80_B = 80
82
+ TOGETHER_AI_110_B = 110
83
+ TOGETHER_AI_EMBEDDING_150_M = 150
84
+ TOGETHER_AI_EMBEDDING_350_M = 350
85
+ QDRANT_SCALAR_QUANTILE = 0.99
86
+ QDRANT_VECTOR_SIZE = 1536
87
+ CACHED_STREAMING_CHUNK_DELAY = 0.02
88
+ MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
89
+ DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
90
+ #### Networking settings ####
91
+ request_timeout: float = 6000 # time in seconds
92
+ STREAM_SSE_DONE_STRING: str = "[DONE]"
93
+ ### SPEND TRACKING ###
94
+ DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400 # price per second for a100 80GB
95
+ FIREWORKS_AI_56_B_MOE = 56
96
+ FIREWORKS_AI_176_B_MOE = 176
97
+ FIREWORKS_AI_4_B = 4
98
+ FIREWORKS_AI_16_B = 16
99
+ FIREWORKS_AI_80_B = 80
100
+
101
+ LITELLM_CHAT_PROVIDERS = [
102
+ "openai",
103
+ "openai_like",
104
+ "xai",
105
+ "custom_openai",
106
+ "text-completion-openai",
107
+ "cohere",
108
+ "cohere_chat",
109
+ "clarifai",
110
+ "anthropic",
111
+ "anthropic_text",
112
+ "replicate",
113
+ "huggingface",
114
+ "together_ai",
115
+ "openrouter",
116
+ "vertex_ai",
117
+ "vertex_ai_beta",
118
+ "gemini",
119
+ "ai21",
120
+ "baseten",
121
+ "azure",
122
+ "azure_text",
123
+ "azure_ai",
124
+ "sagemaker",
125
+ "sagemaker_chat",
126
+ "bedrock",
127
+ "vllm",
128
+ "nlp_cloud",
129
+ "petals",
130
+ "oobabooga",
131
+ "ollama",
132
+ "ollama_chat",
133
+ "deepinfra",
134
+ "perplexity",
135
+ "mistral",
136
+ "groq",
137
+ "nvidia_nim",
138
+ "cerebras",
139
+ "ai21_chat",
140
+ "volcengine",
141
+ "codestral",
142
+ "text-completion-codestral",
143
+ "deepseek",
144
+ "sambanova",
145
+ "maritalk",
146
+ "cloudflare",
147
+ "fireworks_ai",
148
+ "friendliai",
149
+ "watsonx",
150
+ "watsonx_text",
151
+ "triton",
152
+ "predibase",
153
+ "databricks",
154
+ "empower",
155
+ "github",
156
+ "custom",
157
+ "litellm_proxy",
158
+ "hosted_vllm",
159
+ "llamafile",
160
+ "lm_studio",
161
+ "galadriel",
162
+ ]
163
+
164
+
165
+ OPENAI_CHAT_COMPLETION_PARAMS = [
166
+ "functions",
167
+ "function_call",
168
+ "temperature",
170
+ "top_p",
171
+ "n",
172
+ "stream",
173
+ "stream_options",
174
+ "stop",
175
+ "max_completion_tokens",
176
+ "modalities",
177
+ "prediction",
178
+ "audio",
179
+ "max_tokens",
180
+ "presence_penalty",
181
+ "frequency_penalty",
182
+ "logit_bias",
183
+ "user",
184
+ "request_timeout",
185
+ "api_base",
186
+ "api_version",
187
+ "api_key",
188
+ "deployment_id",
189
+ "organization",
190
+ "base_url",
191
+ "default_headers",
192
+ "timeout",
193
+ "response_format",
194
+ "seed",
195
+ "tools",
196
+ "tool_choice",
197
+ "max_retries",
198
+ "parallel_tool_calls",
199
+ "logprobs",
200
+ "top_logprobs",
201
+ "reasoning_effort",
202
+ "extra_headers",
203
+ "thinking",
204
+ ]
205
+
206
+ openai_compatible_endpoints: List = [
207
+ "api.perplexity.ai",
208
+ "api.endpoints.anyscale.com/v1",
209
+ "api.deepinfra.com/v1/openai",
210
+ "api.mistral.ai/v1",
211
+ "codestral.mistral.ai/v1/chat/completions",
212
+ "codestral.mistral.ai/v1/fim/completions",
213
+ "api.groq.com/openai/v1",
214
+ "https://integrate.api.nvidia.com/v1",
215
+ "api.deepseek.com/v1",
216
+ "api.together.xyz/v1",
217
+ "app.empower.dev/api/v1",
218
+ "https://api.friendli.ai/serverless/v1",
219
+ "api.sambanova.ai/v1",
220
+ "api.x.ai/v1",
221
+ "api.galadriel.ai/v1",
222
+ ]
223
+
224
+
225
+ openai_compatible_providers: List = [
226
+ "anyscale",
227
+ "mistral",
228
+ "groq",
229
+ "nvidia_nim",
230
+ "cerebras",
231
+ "sambanova",
232
+ "ai21_chat",
233
+ "ai21",
234
+ "volcengine",
235
+ "codestral",
236
+ "deepseek",
237
+ "deepinfra",
238
+ "perplexity",
239
+ "xinference",
240
+ "xai",
241
+ "together_ai",
242
+ "fireworks_ai",
243
+ "empower",
244
+ "friendliai",
245
+ "azure_ai",
246
+ "github",
247
+ "litellm_proxy",
248
+ "hosted_vllm",
249
+ "llamafile",
250
+ "lm_studio",
251
+ "galadriel",
252
+ ]
253
+ openai_text_completion_compatible_providers: List = (
254
+ [ # providers that support `/v1/completions`
255
+ "together_ai",
256
+ "fireworks_ai",
257
+ "hosted_vllm",
258
+ "llamafile",
259
+ ]
260
+ )
261
+ _openai_like_providers: List = [
262
+ "predibase",
263
+ "databricks",
264
+ "watsonx",
265
+ ] # private helper. similar to openai but require some custom auth / endpoint handling, so can't use the openai sdk
266
+ # well supported replicate llms
267
+ replicate_models: List = [
268
+ # llama replicate supported LLMs
269
+ "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf",
270
+ "a16z-infra/llama-2-13b-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
271
+ "meta/codellama-13b:1c914d844307b0588599b8393480a3ba917b660c7e9dfae681542b5325f228db",
272
+ # Vicuna
273
+ "replicate/vicuna-13b:6282abe6a492de4145d7bb601023762212f9ddbbe78278bd6771c8b3b2f2a13b",
274
+ "joehoover/instructblip-vicuna13b:c4c54e3c8c97cd50c2d2fec9be3b6065563ccf7d43787fb99f84151b867178fe",
275
+ # Flan T-5
276
+ "daanelson/flan-t5-large:ce962b3f6792a57074a601d3979db5839697add2e4e02696b3ced4c022d4767f",
277
+ # Others
278
+ "replicate/dolly-v2-12b:ef0e1aefc61f8e096ebe4db6b2bacc297daf2ef6899f0f7e001ec445893500e5",
279
+ "replit/replit-code-v1-3b:b84f4c074b807211cd75e3e8b1589b6399052125b4c27106e43d47189e8415ad",
280
+ ]
281
+
282
+ clarifai_models: List = [
283
+ "clarifai/meta.Llama-3.Llama-3-8B-Instruct",
284
+ "clarifai/gcp.generate.gemma-1_1-7b-it",
285
+ "clarifai/mistralai.completion.mixtral-8x22B",
286
+ "clarifai/cohere.generate.command-r-plus",
287
+ "clarifai/databricks.drbx.dbrx-instruct",
288
+ "clarifai/mistralai.completion.mistral-large",
289
+ "clarifai/mistralai.completion.mistral-medium",
290
+ "clarifai/mistralai.completion.mistral-small",
291
+ "clarifai/mistralai.completion.mixtral-8x7B-Instruct-v0_1",
292
+ "clarifai/gcp.generate.gemma-2b-it",
293
+ "clarifai/gcp.generate.gemma-7b-it",
294
+ "clarifai/deci.decilm.deciLM-7B-instruct",
295
+ "clarifai/mistralai.completion.mistral-7B-Instruct",
296
+ "clarifai/gcp.generate.gemini-pro",
297
+ "clarifai/anthropic.completion.claude-v1",
298
+ "clarifai/anthropic.completion.claude-instant-1_2",
299
+ "clarifai/anthropic.completion.claude-instant",
300
+ "clarifai/anthropic.completion.claude-v2",
301
+ "clarifai/anthropic.completion.claude-2_1",
302
+ "clarifai/meta.Llama-2.codeLlama-70b-Python",
303
+ "clarifai/meta.Llama-2.codeLlama-70b-Instruct",
304
+ "clarifai/openai.completion.gpt-3_5-turbo-instruct",
305
+ "clarifai/meta.Llama-2.llama2-7b-chat",
306
+ "clarifai/meta.Llama-2.llama2-13b-chat",
307
+ "clarifai/meta.Llama-2.llama2-70b-chat",
308
+ "clarifai/openai.chat-completion.gpt-4-turbo",
309
+ "clarifai/microsoft.text-generation.phi-2",
310
+ "clarifai/meta.Llama-2.llama2-7b-chat-vllm",
311
+ "clarifai/upstage.solar.solar-10_7b-instruct",
312
+ "clarifai/openchat.openchat.openchat-3_5-1210",
313
+ "clarifai/togethercomputer.stripedHyena.stripedHyena-Nous-7B",
314
+ "clarifai/gcp.generate.text-bison",
315
+ "clarifai/meta.Llama-2.llamaGuard-7b",
316
+ "clarifai/fblgit.una-cybertron.una-cybertron-7b-v2",
317
+ "clarifai/openai.chat-completion.GPT-4",
318
+ "clarifai/openai.chat-completion.GPT-3_5-turbo",
319
+ "clarifai/ai21.complete.Jurassic2-Grande",
320
+ "clarifai/ai21.complete.Jurassic2-Grande-Instruct",
321
+ "clarifai/ai21.complete.Jurassic2-Jumbo-Instruct",
322
+ "clarifai/ai21.complete.Jurassic2-Jumbo",
323
+ "clarifai/ai21.complete.Jurassic2-Large",
324
+ "clarifai/cohere.generate.cohere-generate-command",
325
+ "clarifai/wizardlm.generate.wizardCoder-Python-34B",
326
+ "clarifai/wizardlm.generate.wizardLM-70B",
327
+ "clarifai/tiiuae.falcon.falcon-40b-instruct",
328
+ "clarifai/togethercomputer.RedPajama.RedPajama-INCITE-7B-Chat",
329
+ "clarifai/gcp.generate.code-gecko",
330
+ "clarifai/gcp.generate.code-bison",
331
+ "clarifai/mistralai.completion.mistral-7B-OpenOrca",
332
+ "clarifai/mistralai.completion.openHermes-2-mistral-7B",
333
+ "clarifai/wizardlm.generate.wizardLM-13B",
334
+ "clarifai/huggingface-research.zephyr.zephyr-7B-alpha",
335
+ "clarifai/wizardlm.generate.wizardCoder-15B",
336
+ "clarifai/microsoft.text-generation.phi-1_5",
337
+ "clarifai/databricks.Dolly-v2.dolly-v2-12b",
338
+ "clarifai/bigcode.code.StarCoder",
339
+ "clarifai/salesforce.xgen.xgen-7b-8k-instruct",
340
+ "clarifai/mosaicml.mpt.mpt-7b-instruct",
341
+ "clarifai/anthropic.completion.claude-3-opus",
342
+ "clarifai/anthropic.completion.claude-3-sonnet",
343
+ "clarifai/gcp.generate.gemini-1_5-pro",
344
+ "clarifai/gcp.generate.imagen-2",
345
+ "clarifai/salesforce.blip.general-english-image-caption-blip-2",
346
+ ]
347
+
348
+
349
+ huggingface_models: List = [
350
+ "meta-llama/Llama-2-7b-hf",
351
+ "meta-llama/Llama-2-7b-chat-hf",
352
+ "meta-llama/Llama-2-13b-hf",
353
+ "meta-llama/Llama-2-13b-chat-hf",
354
+ "meta-llama/Llama-2-70b-hf",
355
+ "meta-llama/Llama-2-70b-chat-hf",
356
+ "meta-llama/Llama-2-7b",
357
+ "meta-llama/Llama-2-7b-chat",
358
+ "meta-llama/Llama-2-13b",
359
+ "meta-llama/Llama-2-13b-chat",
360
+ "meta-llama/Llama-2-70b",
361
+ "meta-llama/Llama-2-70b-chat",
362
+ ] # these have been tested on extensively. But by default all text2text-generation and text-generation models are supported by liteLLM. - https://docs.litellm.ai/docs/providers
363
+ empower_models = [
364
+ "empower/empower-functions",
365
+ "empower/empower-functions-small",
366
+ ]
367
+
368
+ together_ai_models: List = [
369
+ # llama llms - chat
370
+ "togethercomputer/llama-2-70b-chat",
371
+ # llama llms - language / instruct
372
+ "togethercomputer/llama-2-70b",
373
+ "togethercomputer/LLaMA-2-7B-32K",
374
+ "togethercomputer/Llama-2-7B-32K-Instruct",
375
+ "togethercomputer/llama-2-7b",
376
+ # falcon llms
377
+ "togethercomputer/falcon-40b-instruct",
378
+ "togethercomputer/falcon-7b-instruct",
379
+ # alpaca
380
+ "togethercomputer/alpaca-7b",
381
+ # chat llms
382
+ "HuggingFaceH4/starchat-alpha",
383
+ # code llms
384
+ "togethercomputer/CodeLlama-34b",
385
+ "togethercomputer/CodeLlama-34b-Instruct",
386
+ "togethercomputer/CodeLlama-34b-Python",
387
+ "defog/sqlcoder",
388
+ "NumbersStation/nsql-llama-2-7B",
389
+ "WizardLM/WizardCoder-15B-V1.0",
390
+ "WizardLM/WizardCoder-Python-34B-V1.0",
391
+ # language llms
392
+ "NousResearch/Nous-Hermes-Llama2-13b",
393
+ "Austism/chronos-hermes-13b",
394
+ "upstage/SOLAR-0-70b-16bit",
395
+ "WizardLM/WizardLM-70B-V1.0",
396
+ ] # supports all together ai models, just pass in the model id e.g. completion(model="together_computer/replit_code_3b",...)
397
+
398
+
399
+ baseten_models: List = [
400
+ "qvv0xeq",
401
+ "q841o8w",
402
+ "31dxrj3",
403
+ ] # FALCON 7B # WizardLM # Mosaic ML
404
+
405
+ BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
406
+ "cohere",
407
+ "anthropic",
408
+ "mistral",
409
+ "amazon",
410
+ "meta",
411
+ "llama",
412
+ "ai21",
413
+ "nova",
414
+ "deepseek_r1",
415
+ ]
416
+
417
+ open_ai_embedding_models: List = ["text-embedding-ada-002"]
418
+ cohere_embedding_models: List = [
419
+ "embed-english-v3.0",
420
+ "embed-english-light-v3.0",
421
+ "embed-multilingual-v3.0",
422
+ "embed-english-v2.0",
423
+ "embed-english-light-v2.0",
424
+ "embed-multilingual-v2.0",
425
+ ]
426
+ bedrock_embedding_models: List = [
427
+ "amazon.titan-embed-text-v1",
428
+ "cohere.embed-english-v3",
429
+ "cohere.embed-multilingual-v3",
430
+ ]
431
+
432
+ known_tokenizer_config = {
433
+ "mistralai/Mistral-7B-Instruct-v0.1": {
434
+ "tokenizer": {
435
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
436
+ "bos_token": "<s>",
437
+ "eos_token": "</s>",
438
+ },
439
+ "status": "success",
440
+ },
441
+ "meta-llama/Meta-Llama-3-8B-Instruct": {
442
+ "tokenizer": {
443
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
444
+ "bos_token": "<|begin_of_text|>",
445
+ "eos_token": "",
446
+ },
447
+ "status": "success",
448
+ },
449
+ "deepseek-r1/deepseek-r1-7b-instruct": {
450
+ "tokenizer": {
451
+ "add_bos_token": True,
452
+ "add_eos_token": False,
453
+ "bos_token": {
454
+ "__type": "AddedToken",
455
+ "content": "<|begin▁of▁sentence|>",
456
+ "lstrip": False,
457
+ "normalized": True,
458
+ "rstrip": False,
459
+ "single_word": False,
460
+ },
461
+ "clean_up_tokenization_spaces": False,
462
+ "eos_token": {
463
+ "__type": "AddedToken",
464
+ "content": "<|end▁of▁sentence|>",
465
+ "lstrip": False,
466
+ "normalized": True,
467
+ "rstrip": False,
468
+ "single_word": False,
469
+ },
470
+ "legacy": True,
471
+ "model_max_length": 16384,
472
+ "pad_token": {
473
+ "__type": "AddedToken",
474
+ "content": "<|end▁of▁sentence|>",
475
+ "lstrip": False,
476
+ "normalized": True,
477
+ "rstrip": False,
478
+ "single_word": False,
479
+ },
480
+ "sp_model_kwargs": {},
481
+ "unk_token": None,
482
+ "tokenizer_class": "LlamaTokenizerFast",
483
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
484
+ },
485
+ "status": "success",
486
+ },
487
+ }
488
+
489
+
490
+ OPENAI_FINISH_REASONS = ["stop", "length", "function_call", "content_filter", "null"]
491
+ HUMANLOOP_PROMPT_CACHE_TTL_SECONDS = 60 # 1 minute
492
+ RESPONSE_FORMAT_TOOL_NAME = "json_tool_call" # default tool name used when converting response format to tool call
493
+
494
+ ########################### Logging Callback Constants ###########################
495
+ AZURE_STORAGE_MSFT_VERSION = "2019-07-07"
496
+ PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES = 5
497
+ MCP_TOOL_NAME_PREFIX = "mcp_tool"
498
+
499
+ ########################### LiteLLM Proxy Specific Constants ###########################
500
+ ########################################################################################
501
+ MAX_SPENDLOG_ROWS_TO_QUERY = (
502
+ 1_000_000 # if spendLogs has more than 1M rows, do not query the DB
503
+ )
504
+ DEFAULT_SOFT_BUDGET = (
505
+ 50.0 # by default all litellm proxy keys have a soft budget of 50.0
506
+ )
507
+ # makes it clear this is a rate limit error for a litellm virtual key
508
+ RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
509
+
510
+ # pass through route constants
511
+ BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES = [
512
+ "agents/",
513
+ "knowledgebases/",
514
+ "flows/",
515
+ "retrieveAndGenerate/",
516
+ "rerank/",
517
+ "generateQuery/",
518
+ "optimize-prompt/",
519
+ ]
520
+
521
+ BATCH_STATUS_POLL_INTERVAL_SECONDS = 3600 # 1 hour
522
+ BATCH_STATUS_POLL_MAX_ATTEMPTS = 24 # for 24 hours
523
+
524
+ HEALTH_CHECK_TIMEOUT_SECONDS = 60 # 60 seconds
525
+
526
+ UI_SESSION_TOKEN_TEAM_ID = "litellm-dashboard"
527
+ LITELLM_PROXY_ADMIN_NAME = "default_user_id"
528
+
529
+ ########################### DB CRON JOB NAMES ###########################
530
+ DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
531
+ PROMETHEUS_EMIT_BUDGET_METRICS_JOB_NAME = "prometheus_emit_budget_metrics_job"
532
+ DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute
533
+ PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
534
+ PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
535
+ PROXY_BATCH_WRITE_AT = 10 # in seconds
536
+ DEFAULT_HEALTH_CHECK_INTERVAL = 300 # 5 minutes
537
+ PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
538
+ DEFAULT_MODEL_CREATED_AT_TIME = 1677610602 # returns on `/models` endpoint
539
+ DEFAULT_SLACK_ALERTING_THRESHOLD = 300
540
+ MAX_TEAM_LIST_LIMIT = 20
541
+ DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
542
+ LENGTH_OF_LITELLM_GENERATED_KEY = 16
543
+ SECRET_MANAGER_REFRESH_INTERVAL = 86400
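The retry constants defined earlier in this file (INITIAL_RETRY_DELAY, MAX_RETRY_DELAY, JITTER) suggest an exponential backoff with jitter. The snippet below is only an illustration of how such values typically combine; it is not litellm's actual retry code.

```python
import random

INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0
JITTER = 0.75


def backoff_delay(attempt: int) -> float:
    """Delay in seconds before retry number `attempt` (0-indexed)."""
    base = min(INITIAL_RETRY_DELAY * (2**attempt), MAX_RETRY_DELAY)
    # scale by a random factor in [JITTER, 1.0] so clients don't retry in lockstep
    return base * random.uniform(JITTER, 1.0)


for attempt in range(5):
    print(attempt, round(backoff_delay(attempt), 3))
```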
litellm/cost.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "gpt-3.5-turbo-0613": 0.00015000000000000001,
3
+ "claude-2": 0.00016454,
4
+ "gpt-4-0613": 0.015408
5
+ }
litellm/cost_calculator.py ADDED
@@ -0,0 +1,1378 @@
1
+ # What is this?
2
+ ## File for 'response_cost' calculation in Logging
3
+ import time
4
+ from functools import lru_cache
5
+ from typing import Any, List, Literal, Optional, Tuple, Union, cast
6
+
7
+ from pydantic import BaseModel
8
+
9
+ import litellm
10
+ import litellm._logging
11
+ from litellm import verbose_logger
12
+ from litellm.constants import (
13
+ DEFAULT_MAX_LRU_CACHE_SIZE,
14
+ DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
15
+ )
16
+ from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
17
+ StandardBuiltInToolCostTracking,
18
+ )
19
+ from litellm.litellm_core_utils.llm_cost_calc.utils import (
20
+ _generic_cost_per_character,
21
+ generic_cost_per_token,
22
+ select_cost_metric_for_model,
23
+ )
24
+ from litellm.llms.anthropic.cost_calculation import (
25
+ cost_per_token as anthropic_cost_per_token,
26
+ )
27
+ from litellm.llms.azure.cost_calculation import (
28
+ cost_per_token as azure_openai_cost_per_token,
29
+ )
30
+ from litellm.llms.bedrock.image.cost_calculator import (
31
+ cost_calculator as bedrock_image_cost_calculator,
32
+ )
33
+ from litellm.llms.databricks.cost_calculator import (
34
+ cost_per_token as databricks_cost_per_token,
35
+ )
36
+ from litellm.llms.deepseek.cost_calculator import (
37
+ cost_per_token as deepseek_cost_per_token,
38
+ )
39
+ from litellm.llms.fireworks_ai.cost_calculator import (
40
+ cost_per_token as fireworks_ai_cost_per_token,
41
+ )
42
+ from litellm.llms.gemini.cost_calculator import cost_per_token as gemini_cost_per_token
43
+ from litellm.llms.openai.cost_calculation import (
44
+ cost_per_second as openai_cost_per_second,
45
+ )
46
+ from litellm.llms.openai.cost_calculation import cost_per_token as openai_cost_per_token
47
+ from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
48
+ from litellm.llms.vertex_ai.cost_calculator import (
49
+ cost_per_character as google_cost_per_character,
50
+ )
51
+ from litellm.llms.vertex_ai.cost_calculator import (
52
+ cost_per_token as google_cost_per_token,
53
+ )
54
+ from litellm.llms.vertex_ai.cost_calculator import cost_router as google_cost_router
55
+ from litellm.llms.vertex_ai.image_generation.cost_calculator import (
56
+ cost_calculator as vertex_ai_image_cost_calculator,
57
+ )
58
+ from litellm.responses.utils import ResponseAPILoggingUtils
59
+ from litellm.types.llms.openai import (
60
+ HttpxBinaryResponseContent,
61
+ ImageGenerationRequestQuality,
62
+ OpenAIModerationResponse,
63
+ OpenAIRealtimeStreamList,
64
+ OpenAIRealtimeStreamResponseBaseObject,
65
+ OpenAIRealtimeStreamSessionEvents,
66
+ ResponseAPIUsage,
67
+ ResponsesAPIResponse,
68
+ )
69
+ from litellm.types.rerank import RerankBilledUnits, RerankResponse
70
+ from litellm.types.utils import (
71
+ CallTypesLiteral,
72
+ LiteLLMRealtimeStreamLoggingObject,
73
+ LlmProviders,
74
+ LlmProvidersSet,
75
+ ModelInfo,
76
+ PassthroughCallTypes,
77
+ StandardBuiltInToolsParams,
78
+ Usage,
79
+ )
80
+ from litellm.utils import (
81
+ CallTypes,
82
+ CostPerToken,
83
+ EmbeddingResponse,
84
+ ImageResponse,
85
+ ModelResponse,
86
+ ProviderConfigManager,
87
+ TextCompletionResponse,
88
+ TranscriptionResponse,
89
+ _cached_get_model_info_helper,
90
+ token_counter,
91
+ )
92
+
93
+
94
+ def _cost_per_token_custom_pricing_helper(
95
+ prompt_tokens: float = 0,
96
+ completion_tokens: float = 0,
97
+ response_time_ms: Optional[float] = 0.0,
98
+ ### CUSTOM PRICING ###
99
+ custom_cost_per_token: Optional[CostPerToken] = None,
100
+ custom_cost_per_second: Optional[float] = None,
101
+ ) -> Optional[Tuple[float, float]]:
102
+ """Internal helper function for calculating cost, if custom pricing given"""
103
+ if custom_cost_per_token is None and custom_cost_per_second is None:
104
+ return None
105
+
106
+ if custom_cost_per_token is not None:
107
+ input_cost = custom_cost_per_token["input_cost_per_token"] * prompt_tokens
108
+ output_cost = custom_cost_per_token["output_cost_per_token"] * completion_tokens
109
+ return input_cost, output_cost
110
+ elif custom_cost_per_second is not None:
111
+ output_cost = custom_cost_per_second * response_time_ms / 1000 # type: ignore
112
+ return 0, output_cost
113
+
114
+ return None
115
+
116
+
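A minimal usage sketch of the helper above, assuming litellm is installed and using illustrative per-token rates; the dict mirrors the input_cost_per_token / output_cost_per_token keys the helper reads:

from litellm.cost_calculator import _cost_per_token_custom_pricing_helper

# illustrative rates only -- $0.50 / $1.50 per million tokens
custom_rates = {
    "input_cost_per_token": 0.5 / 1_000_000,
    "output_cost_per_token": 1.5 / 1_000_000,
}
costs = _cost_per_token_custom_pricing_helper(
    prompt_tokens=1_000,
    completion_tokens=200,
    custom_cost_per_token=custom_rates,  # type: ignore[arg-type]
)
print(costs)  # ~ (0.0005, 0.0003): input and output cost in USD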
117
+ def cost_per_token( # noqa: PLR0915
118
+ model: str = "",
119
+ prompt_tokens: int = 0,
120
+ completion_tokens: int = 0,
121
+ response_time_ms: Optional[float] = 0.0,
122
+ custom_llm_provider: Optional[str] = None,
123
+ region_name=None,
124
+ ### CHARACTER PRICING ###
125
+ prompt_characters: Optional[int] = None,
126
+ completion_characters: Optional[int] = None,
127
+ ### PROMPT CACHING PRICING ### - used for anthropic
128
+ cache_creation_input_tokens: Optional[int] = 0,
129
+ cache_read_input_tokens: Optional[int] = 0,
130
+ ### CUSTOM PRICING ###
131
+ custom_cost_per_token: Optional[CostPerToken] = None,
132
+ custom_cost_per_second: Optional[float] = None,
133
+ ### NUMBER OF QUERIES ###
134
+ number_of_queries: Optional[int] = None,
135
+ ### USAGE OBJECT ###
136
+ usage_object: Optional[Usage] = None, # just read the usage object if provided
137
+ ### BILLED UNITS ###
138
+ rerank_billed_units: Optional[RerankBilledUnits] = None,
139
+ ### CALL TYPE ###
140
+ call_type: CallTypesLiteral = "completion",
141
+ audio_transcription_file_duration: float = 0.0, # for audio transcription calls - the file time in seconds
142
+ ) -> Tuple[float, float]: # type: ignore
143
+ """
144
+ Calculates the cost per token for a given model, prompt tokens, and completion tokens.
145
+
146
+ Parameters:
147
+ model (str): The name of the model to use. Default is ""
148
+ prompt_tokens (int): The number of tokens in the prompt.
149
+ completion_tokens (int): The number of tokens in the completion.
150
+ response_time (float): The amount of time, in milliseconds, it took the call to complete.
151
+ prompt_characters (float): The number of characters in the prompt. Used for vertex ai cost calculation.
152
+ completion_characters (float): The number of characters in the completion response. Used for vertex ai cost calculation.
153
+ custom_llm_provider (str): The llm provider to whom the call was made (see init.py for full list)
154
+ custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
155
+ custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
156
+ call_type: Optional[str]: the call type
157
+
158
+ Returns:
159
+ tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
160
+ """
161
+ if model is None:
162
+ raise Exception("Invalid arg. Model cannot be none.")
163
+
164
+ ## RECONSTRUCT USAGE BLOCK ##
165
+ if usage_object is not None:
166
+ usage_block = usage_object
167
+ else:
168
+ usage_block = Usage(
169
+ prompt_tokens=prompt_tokens,
170
+ completion_tokens=completion_tokens,
171
+ total_tokens=prompt_tokens + completion_tokens,
172
+ cache_creation_input_tokens=cache_creation_input_tokens,
173
+ cache_read_input_tokens=cache_read_input_tokens,
174
+ )
175
+
176
+ ## CUSTOM PRICING ##
177
+ response_cost = _cost_per_token_custom_pricing_helper(
178
+ prompt_tokens=prompt_tokens,
179
+ completion_tokens=completion_tokens,
180
+ response_time_ms=response_time_ms,
181
+ custom_cost_per_second=custom_cost_per_second,
182
+ custom_cost_per_token=custom_cost_per_token,
183
+ )
184
+
185
+ if response_cost is not None:
186
+ return response_cost[0], response_cost[1]
187
+
188
+ # given
189
+ prompt_tokens_cost_usd_dollar: float = 0
190
+ completion_tokens_cost_usd_dollar: float = 0
191
+ model_cost_ref = litellm.model_cost
192
+ model_with_provider = model
193
+ if custom_llm_provider is not None:
194
+ model_with_provider = custom_llm_provider + "/" + model
195
+ if region_name is not None:
196
+ model_with_provider_and_region = (
197
+ f"{custom_llm_provider}/{region_name}/{model}"
198
+ )
199
+ if (
200
+ model_with_provider_and_region in model_cost_ref
201
+ ): # use region based pricing, if it's available
202
+ model_with_provider = model_with_provider_and_region
203
+ else:
204
+ _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
205
+ model_without_prefix = model
206
+ model_parts = model.split("/", 1)
207
+ if len(model_parts) > 1:
208
+ model_without_prefix = model_parts[1]
209
+ else:
210
+ model_without_prefix = model
211
+ """
212
+ Code block that formats model to lookup in litellm.model_cost
213
+ Option1. model = "bedrock/ap-northeast-1/anthropic.claude-instant-v1". This is the most accurate since it is region based. Should always be option 1
214
+ Option2. model = "openai/gpt-4" - model = provider/model
215
+ Option3. model = "anthropic.claude-3" - model = model
216
+ """
217
+ if (
218
+ model_with_provider in model_cost_ref
219
+ ): # Option 2. use model with provider, model = "openai/gpt-4"
220
+ model = model_with_provider
221
+ elif model in model_cost_ref: # Option 1. use model passed, model="gpt-4"
222
+ model = model
223
+ elif (
224
+ model_without_prefix in model_cost_ref
225
+ ): # Option 3. if user passed model="bedrock/anthropic.claude-3", use model="anthropic.claude-3"
226
+ model = model_without_prefix
227
+
228
+ # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
229
+ if call_type == "speech" or call_type == "aspeech":
230
+ speech_model_info = litellm.get_model_info(
231
+ model=model_without_prefix, custom_llm_provider=custom_llm_provider
232
+ )
233
+ cost_metric = select_cost_metric_for_model(speech_model_info)
234
+ prompt_cost: float = 0.0
235
+ completion_cost: float = 0.0
236
+ if cost_metric == "cost_per_character":
237
+ if prompt_characters is None:
238
+ raise ValueError(
239
+ "prompt_characters must be provided for tts calls. prompt_characters={}, model={}, custom_llm_provider={}, call_type={}".format(
240
+ prompt_characters,
241
+ model,
242
+ custom_llm_provider,
243
+ call_type,
244
+ )
245
+ )
246
+ _prompt_cost, _completion_cost = _generic_cost_per_character(
247
+ model=model_without_prefix,
248
+ custom_llm_provider=custom_llm_provider,
249
+ prompt_characters=prompt_characters,
250
+ completion_characters=0,
251
+ custom_prompt_cost=None,
252
+ custom_completion_cost=0,
253
+ )
254
+ if _prompt_cost is None or _completion_cost is None:
255
+ raise ValueError(
256
+ "cost for tts call is None. prompt_cost={}, completion_cost={}, model={}, custom_llm_provider={}, prompt_characters={}, completion_characters={}".format(
257
+ _prompt_cost,
258
+ _completion_cost,
259
+ model_without_prefix,
260
+ custom_llm_provider,
261
+ prompt_characters,
262
+ completion_characters,
263
+ )
264
+ )
265
+ prompt_cost = _prompt_cost
266
+ completion_cost = _completion_cost
267
+ elif cost_metric == "cost_per_token":
268
+ prompt_cost, completion_cost = generic_cost_per_token(
269
+ model=model_without_prefix,
270
+ usage=usage_block,
271
+ custom_llm_provider=custom_llm_provider,
272
+ )
273
+
274
+ return prompt_cost, completion_cost
275
+ elif call_type == "arerank" or call_type == "rerank":
276
+ return rerank_cost(
277
+ model=model,
278
+ custom_llm_provider=custom_llm_provider,
279
+ billed_units=rerank_billed_units,
280
+ )
281
+ elif (
282
+ call_type == "aretrieve_batch"
283
+ or call_type == "retrieve_batch"
284
+ or call_type == CallTypes.aretrieve_batch
285
+ or call_type == CallTypes.retrieve_batch
286
+ ):
287
+ return batch_cost_calculator(
288
+ usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
289
+ )
290
+ elif call_type == "atranscription" or call_type == "transcription":
291
+ return openai_cost_per_second(
292
+ model=model,
293
+ custom_llm_provider=custom_llm_provider,
294
+ duration=audio_transcription_file_duration,
295
+ )
296
+ elif custom_llm_provider == "vertex_ai":
297
+ cost_router = google_cost_router(
298
+ model=model_without_prefix,
299
+ custom_llm_provider=custom_llm_provider,
300
+ call_type=call_type,
301
+ )
302
+ if cost_router == "cost_per_character":
303
+ return google_cost_per_character(
304
+ model=model_without_prefix,
305
+ custom_llm_provider=custom_llm_provider,
306
+ prompt_characters=prompt_characters,
307
+ completion_characters=completion_characters,
308
+ usage=usage_block,
309
+ )
310
+ elif cost_router == "cost_per_token":
311
+ return google_cost_per_token(
312
+ model=model_without_prefix,
313
+ custom_llm_provider=custom_llm_provider,
314
+ usage=usage_block,
315
+ )
316
+ elif custom_llm_provider == "anthropic":
317
+ return anthropic_cost_per_token(model=model, usage=usage_block)
318
+ elif custom_llm_provider == "openai":
319
+ return openai_cost_per_token(model=model, usage=usage_block)
320
+ elif custom_llm_provider == "databricks":
321
+ return databricks_cost_per_token(model=model, usage=usage_block)
322
+ elif custom_llm_provider == "fireworks_ai":
323
+ return fireworks_ai_cost_per_token(model=model, usage=usage_block)
324
+ elif custom_llm_provider == "azure":
325
+ return azure_openai_cost_per_token(
326
+ model=model, usage=usage_block, response_time_ms=response_time_ms
327
+ )
328
+ elif custom_llm_provider == "gemini":
329
+ return gemini_cost_per_token(model=model, usage=usage_block)
330
+ elif custom_llm_provider == "deepseek":
331
+ return deepseek_cost_per_token(model=model, usage=usage_block)
332
+ else:
333
+ model_info = _cached_get_model_info_helper(
334
+ model=model, custom_llm_provider=custom_llm_provider
335
+ )
336
+
337
+ if model_info["input_cost_per_token"] > 0:
338
+ ## COST PER TOKEN ##
339
+ prompt_tokens_cost_usd_dollar = (
340
+ model_info["input_cost_per_token"] * prompt_tokens
341
+ )
342
+ elif (
343
+ model_info.get("input_cost_per_second", None) is not None
344
+ and response_time_ms is not None
345
+ ):
346
+ verbose_logger.debug(
347
+ "For model=%s - input_cost_per_second: %s; response time: %s",
348
+ model,
349
+ model_info.get("input_cost_per_second", None),
350
+ response_time_ms,
351
+ )
352
+ ## COST PER SECOND ##
353
+ prompt_tokens_cost_usd_dollar = (
354
+ model_info["input_cost_per_second"] * response_time_ms / 1000 # type: ignore
355
+ )
356
+
357
+ if model_info["output_cost_per_token"] > 0:
358
+ completion_tokens_cost_usd_dollar = (
359
+ model_info["output_cost_per_token"] * completion_tokens
360
+ )
361
+ elif (
362
+ model_info.get("output_cost_per_second", None) is not None
363
+ and response_time_ms is not None
364
+ ):
365
+ verbose_logger.debug(
366
+ "For model=%s - output_cost_per_second: %s; response time: %s",
367
+ model,
368
+ model_info.get("output_cost_per_second", None),
369
+ response_time_ms,
370
+ )
371
+ ## COST PER SECOND ##
372
+ completion_tokens_cost_usd_dollar = (
373
+ model_info["output_cost_per_second"] * response_time_ms / 1000 # type: ignore
374
+ )
375
+
376
+ verbose_logger.debug(
377
+ "Returned custom cost for model=%s - prompt_tokens_cost_usd_dollar: %s, completion_tokens_cost_usd_dollar: %s",
378
+ model,
379
+ prompt_tokens_cost_usd_dollar,
380
+ completion_tokens_cost_usd_dollar,
381
+ )
382
+ return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
383
+
384
+
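A usage sketch of cost_per_token, assuming the model name resolves in litellm.model_cost (the rates, and therefore the printed total, depend on the live cost map):

from litellm.cost_calculator import cost_per_token

prompt_cost, completion_cost = cost_per_token(
    model="gpt-3.5-turbo",  # assumed to be present in litellm.model_cost
    custom_llm_provider="openai",
    prompt_tokens=1_000,
    completion_tokens=200,
)
print(f"total: ${prompt_cost + completion_cost:.6f}")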
385
+ def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
386
+ # see https://replicate.com/pricing
387
+ # for all litellm currently supported LLMs, almost all requests go to a100_80gb
388
+ a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND # assume all calls sent to A100 80GB for now
389
+ if total_time == 0.0: # total time is in ms
390
+ start_time = completion_response.get("created", time.time())
391
+ end_time = getattr(completion_response, "ended", time.time())
392
+ total_time = end_time - start_time
393
+
394
+ return a100_80gb_price_per_second_public * total_time / 1000
395
+
396
+
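The Replicate path above bills on wall-clock time rather than tokens; a rough worked example with a placeholder per-second rate (the real value is DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND from litellm.constants):

# placeholder A100 80GB rate, for illustration only
assumed_price_per_second = 0.0014
total_time_ms = 8_500  # request ran for 8.5 seconds

# mirrors the return statement above: price * total_time / 1000
estimated_cost = assumed_price_per_second * total_time_ms / 1000
print(f"~${estimated_cost:.4f}")  # ~$0.0119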
397
+ def has_hidden_params(obj: Any) -> bool:
398
+ return hasattr(obj, "_hidden_params")
399
+
400
+
401
+ def _get_provider_for_cost_calc(
402
+ model: Optional[str],
403
+ custom_llm_provider: Optional[str] = None,
404
+ ) -> Optional[str]:
405
+ if custom_llm_provider is not None:
406
+ return custom_llm_provider
407
+ if model is None:
408
+ return None
409
+ try:
410
+ _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
411
+ except Exception as e:
412
+ verbose_logger.debug(
413
+ f"litellm.cost_calculator.py::_get_provider_for_cost_calc() - Error inferring custom_llm_provider - {str(e)}"
414
+ )
415
+ return None
416
+
417
+ return custom_llm_provider
418
+
419
+
420
+ def _select_model_name_for_cost_calc(
421
+ model: Optional[str],
422
+ completion_response: Optional[Any],
423
+ base_model: Optional[str] = None,
424
+ custom_pricing: Optional[bool] = None,
425
+ custom_llm_provider: Optional[str] = None,
426
+ router_model_id: Optional[str] = None,
427
+ ) -> Optional[str]:
428
+ """
429
+ 1. If custom pricing is true, return received model name
430
+ 2. If base_model is set (e.g. for azure models), return that
431
+ 3. If completion response has model set return that
432
+ 4. Check if model is passed in return that
433
+ """
434
+
435
+ return_model: Optional[str] = None
436
+ region_name: Optional[str] = None
437
+ custom_llm_provider = _get_provider_for_cost_calc(
438
+ model=model, custom_llm_provider=custom_llm_provider
439
+ )
440
+
441
+ completion_response_model: Optional[str] = None
442
+ if completion_response is not None:
443
+ if isinstance(completion_response, BaseModel):
444
+ completion_response_model = getattr(completion_response, "model", None)
445
+ elif isinstance(completion_response, dict):
446
+ completion_response_model = completion_response.get("model", None)
447
+ hidden_params: Optional[dict] = getattr(completion_response, "_hidden_params", None)
448
+
449
+ if custom_pricing is True:
450
+ if router_model_id is not None and router_model_id in litellm.model_cost:
451
+ return_model = router_model_id
452
+ else:
453
+ return_model = model
454
+
455
+ if base_model is not None:
456
+ return_model = base_model
457
+
458
+ if completion_response_model is None and hidden_params is not None:
459
+ if (
460
+ hidden_params.get("model", None) is not None
461
+ and len(hidden_params["model"]) > 0
462
+ ):
463
+ return_model = hidden_params.get("model", model)
464
+ if hidden_params is not None and hidden_params.get("region_name", None) is not None:
465
+ region_name = hidden_params.get("region_name", None)
466
+
467
+ if return_model is None and completion_response_model is not None:
468
+ return_model = completion_response_model
469
+
470
+ if return_model is None and model is not None:
471
+ return_model = model
472
+
473
+ if (
474
+ return_model is not None
475
+ and custom_llm_provider is not None
476
+ and not _model_contains_known_llm_provider(return_model)
477
+ ): # add provider prefix if not already present, to match model_cost
478
+ if region_name is not None:
479
+ return_model = f"{custom_llm_provider}/{region_name}/{return_model}"
480
+ else:
481
+ return_model = f"{custom_llm_provider}/{return_model}"
482
+
483
+ return return_model
484
+
485
+
486
+ @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
487
+ def _model_contains_known_llm_provider(model: str) -> bool:
488
+ """
489
+ Check if the model contains a known llm provider
490
+ """
491
+ _provider_prefix = model.split("/")[0]
492
+ return _provider_prefix in LlmProvidersSet
493
+
494
+
495
+ def _get_usage_object(
496
+ completion_response: Any,
497
+ ) -> Optional[Usage]:
498
+ usage_obj = cast(
499
+ Union[Usage, ResponseAPIUsage, dict, BaseModel],
500
+ (
501
+ completion_response.get("usage")
502
+ if isinstance(completion_response, dict)
503
+ else getattr(completion_response, "get", lambda x: None)("usage")
504
+ ),
505
+ )
506
+
507
+ if usage_obj is None:
508
+ return None
509
+ if isinstance(usage_obj, Usage):
510
+ return usage_obj
511
+ elif (
512
+ usage_obj is not None
513
+ and (isinstance(usage_obj, dict) or isinstance(usage_obj, ResponseAPIUsage))
514
+ and ResponseAPILoggingUtils._is_response_api_usage(usage_obj)
515
+ ):
516
+ return ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
517
+ usage_obj
518
+ )
519
+ elif isinstance(usage_obj, dict):
520
+ return Usage(**usage_obj)
521
+ elif isinstance(usage_obj, BaseModel):
522
+ return Usage(**usage_obj.model_dump())
523
+ else:
524
+ verbose_logger.debug(
525
+ f"Unknown usage object type: {type(usage_obj)}, usage_obj: {usage_obj}"
526
+ )
527
+ return None
528
+
529
+
530
+ def _is_known_usage_objects(usage_obj):
531
+ """Returns True if the usage obj is a known Usage type"""
532
+ return isinstance(usage_obj, litellm.Usage) or isinstance(
533
+ usage_obj, ResponseAPIUsage
534
+ )
535
+
536
+
537
+ def _infer_call_type(
538
+ call_type: Optional[CallTypesLiteral], completion_response: Any
539
+ ) -> Optional[CallTypesLiteral]:
540
+ if call_type is not None:
541
+ return call_type
542
+
543
+ if completion_response is None:
544
+ return None
545
+
546
+ if isinstance(completion_response, ModelResponse):
547
+ return "completion"
548
+ elif isinstance(completion_response, EmbeddingResponse):
549
+ return "embedding"
550
+ elif isinstance(completion_response, TranscriptionResponse):
551
+ return "transcription"
552
+ elif isinstance(completion_response, HttpxBinaryResponseContent):
553
+ return "speech"
554
+ elif isinstance(completion_response, RerankResponse):
555
+ return "rerank"
556
+ elif isinstance(completion_response, ImageResponse):
557
+ return "image_generation"
558
+ elif isinstance(completion_response, TextCompletionResponse):
559
+ return "text_completion"
560
+
561
+ return call_type
562
+
563
+
564
+ def completion_cost( # noqa: PLR0915
565
+ completion_response=None,
566
+ model: Optional[str] = None,
567
+ prompt="",
568
+ messages: List = [],
569
+ completion="",
570
+ total_time: Optional[float] = 0.0, # used for replicate, sagemaker
571
+ call_type: Optional[CallTypesLiteral] = None,
572
+ ### REGION ###
573
+ custom_llm_provider=None,
574
+ region_name=None, # used for bedrock pricing
575
+ ### IMAGE GEN ###
576
+ size: Optional[str] = None,
577
+ quality: Optional[str] = None,
578
+ n: Optional[int] = None, # number of images
579
+ ### CUSTOM PRICING ###
580
+ custom_cost_per_token: Optional[CostPerToken] = None,
581
+ custom_cost_per_second: Optional[float] = None,
582
+ optional_params: Optional[dict] = None,
583
+ custom_pricing: Optional[bool] = None,
584
+ base_model: Optional[str] = None,
585
+ standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
586
+ litellm_model_name: Optional[str] = None,
587
+ router_model_id: Optional[str] = None,
588
+ ) -> float:
589
+ """
590
+ Calculate the cost of a given completion call for GPT-3.5-turbo, llama2, or any litellm-supported llm.
591
+
592
+ Parameters:
593
+ completion_response (litellm.ModelResponses): [Required] The response received from a LiteLLM completion request.
594
+
595
+ [OPTIONAL PARAMS]
596
+ model (str): Optional. The name of the language model used in the completion calls
597
+ prompt (str): Optional. The input prompt passed to the llm
598
+ completion (str): Optional. The output completion text from the llm
599
+ total_time (float, int): Optional. (Only used for Replicate LLMs) The total time used for the request in seconds
600
+ custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
601
+ custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
602
+
603
+ Returns:
604
+ float: The cost in USD dollars for the completion based on the provided parameters.
605
+
606
+ Exceptions:
607
+ Raises exception if model not in the litellm model cost map. Register model, via custom pricing or PR - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
608
+
609
+
610
+ Note:
611
+ - If completion_response is provided, the function extracts token information and the model name from it.
612
+ - If completion_response is not provided, the function calculates token counts based on the model and input text.
613
+ - The cost is calculated based on the model, prompt tokens, and completion tokens.
614
+ - For certain models containing "togethercomputer" in the name, prices are based on the model size.
615
+ - For un-mapped Replicate models, the cost is calculated based on the total time used for the request.
616
+ """
617
+ try:
618
+ call_type = _infer_call_type(call_type, completion_response) or "completion"
619
+
620
+ if (
621
+ (call_type == "aimage_generation" or call_type == "image_generation")
622
+ and model is not None
623
+ and isinstance(model, str)
624
+ and len(model) == 0
625
+ and custom_llm_provider == "azure"
626
+ ):
627
+ model = "dall-e-2" # for dall-e-2, azure expects an empty model name
628
+ # Handle Inputs to completion_cost
629
+ prompt_tokens = 0
630
+ prompt_characters: Optional[int] = None
631
+ completion_tokens = 0
632
+ completion_characters: Optional[int] = None
633
+ cache_creation_input_tokens: Optional[int] = None
634
+ cache_read_input_tokens: Optional[int] = None
635
+ audio_transcription_file_duration: float = 0.0
636
+ cost_per_token_usage_object: Optional[Usage] = _get_usage_object(
637
+ completion_response=completion_response
638
+ )
639
+ rerank_billed_units: Optional[RerankBilledUnits] = None
640
+
641
+ selected_model = _select_model_name_for_cost_calc(
642
+ model=model,
643
+ completion_response=completion_response,
644
+ custom_llm_provider=custom_llm_provider,
645
+ custom_pricing=custom_pricing,
646
+ base_model=base_model,
647
+ router_model_id=router_model_id,
648
+ )
649
+
650
+ potential_model_names = [selected_model]
651
+ if model is not None:
652
+ potential_model_names.append(model)
653
+ for idx, model in enumerate(potential_model_names):
654
+ try:
655
+ verbose_logger.info(
656
+ f"selected model name for cost calculation: {model}"
657
+ )
658
+
659
+ if completion_response is not None and (
660
+ isinstance(completion_response, BaseModel)
661
+ or isinstance(completion_response, dict)
662
+ ): # tts returns a custom class
663
+ if isinstance(completion_response, dict):
664
+ usage_obj: Optional[
665
+ Union[dict, Usage]
666
+ ] = completion_response.get("usage", {})
667
+ else:
668
+ usage_obj = getattr(completion_response, "usage", {})
669
+ if isinstance(usage_obj, BaseModel) and not _is_known_usage_objects(
670
+ usage_obj=usage_obj
671
+ ):
672
+ setattr(
673
+ completion_response,
674
+ "usage",
675
+ litellm.Usage(**usage_obj.model_dump()),
676
+ )
677
+ if usage_obj is None:
678
+ _usage = {}
679
+ elif isinstance(usage_obj, BaseModel):
680
+ _usage = usage_obj.model_dump()
681
+ else:
682
+ _usage = usage_obj
683
+
684
+ if ResponseAPILoggingUtils._is_response_api_usage(_usage):
685
+ _usage = ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
686
+ _usage
687
+ ).model_dump()
688
+
689
+ # get input/output tokens from completion_response
690
+ prompt_tokens = _usage.get("prompt_tokens", 0)
691
+ completion_tokens = _usage.get("completion_tokens", 0)
692
+ cache_creation_input_tokens = _usage.get(
693
+ "cache_creation_input_tokens", 0
694
+ )
695
+ cache_read_input_tokens = _usage.get("cache_read_input_tokens", 0)
696
+ if (
697
+ "prompt_tokens_details" in _usage
698
+ and _usage["prompt_tokens_details"] != {}
699
+ and _usage["prompt_tokens_details"]
700
+ ):
701
+ prompt_tokens_details = _usage.get("prompt_tokens_details", {})
702
+ cache_read_input_tokens = prompt_tokens_details.get(
703
+ "cached_tokens", 0
704
+ )
705
+
706
+ total_time = getattr(completion_response, "_response_ms", 0)
707
+
708
+ hidden_params = getattr(completion_response, "_hidden_params", None)
709
+ if hidden_params is not None:
710
+ custom_llm_provider = hidden_params.get(
711
+ "custom_llm_provider", custom_llm_provider or None
712
+ )
713
+ region_name = hidden_params.get("region_name", region_name)
714
+ size = hidden_params.get("optional_params", {}).get(
715
+ "size", "1024-x-1024"
716
+ ) # openai default
717
+ quality = hidden_params.get("optional_params", {}).get(
718
+ "quality", "standard"
719
+ ) # openai default
720
+ n = hidden_params.get("optional_params", {}).get(
721
+ "n", 1
722
+ ) # openai default
723
+ else:
724
+ if model is None:
725
+ raise ValueError(
726
+ f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
727
+ )
728
+ if len(messages) > 0:
729
+ prompt_tokens = token_counter(model=model, messages=messages)
730
+ elif len(prompt) > 0:
731
+ prompt_tokens = token_counter(model=model, text=prompt)
732
+ completion_tokens = token_counter(model=model, text=completion)
733
+
734
+ if model is None:
735
+ raise ValueError(
736
+ f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
737
+ )
738
+ if custom_llm_provider is None:
739
+ try:
740
+ model, custom_llm_provider, _, _ = litellm.get_llm_provider(
741
+ model=model
742
+ ) # strip the llm provider from the model name -> for image gen cost calculation
743
+ except Exception as e:
744
+ verbose_logger.debug(
745
+ "litellm.cost_calculator.py::completion_cost() - Error inferring custom_llm_provider - {}".format(
746
+ str(e)
747
+ )
748
+ )
749
+ if (
750
+ call_type == CallTypes.image_generation.value
751
+ or call_type == CallTypes.aimage_generation.value
752
+ or call_type
753
+ == PassthroughCallTypes.passthrough_image_generation.value
754
+ ):
755
+ ### IMAGE GENERATION COST CALCULATION ###
756
+ if custom_llm_provider == "vertex_ai":
757
+ if isinstance(completion_response, ImageResponse):
758
+ return vertex_ai_image_cost_calculator(
759
+ model=model,
760
+ image_response=completion_response,
761
+ )
762
+ elif custom_llm_provider == "bedrock":
763
+ if isinstance(completion_response, ImageResponse):
764
+ return bedrock_image_cost_calculator(
765
+ model=model,
766
+ size=size,
767
+ image_response=completion_response,
768
+ optional_params=optional_params,
769
+ )
770
+ raise TypeError(
771
+ "completion_response must be of type ImageResponse for bedrock image cost calculation"
772
+ )
773
+ else:
774
+ return default_image_cost_calculator(
775
+ model=model,
776
+ quality=quality,
777
+ custom_llm_provider=custom_llm_provider,
778
+ n=n,
779
+ size=size,
780
+ optional_params=optional_params,
781
+ )
782
+ elif (
783
+ call_type == CallTypes.speech.value
784
+ or call_type == CallTypes.aspeech.value
785
+ ):
786
+ prompt_characters = litellm.utils._count_characters(text=prompt)
787
+ elif (
788
+ call_type == CallTypes.atranscription.value
789
+ or call_type == CallTypes.transcription.value
790
+ ):
791
+ audio_transcription_file_duration = getattr(
792
+ completion_response, "duration", 0.0
793
+ )
794
+ elif (
795
+ call_type == CallTypes.rerank.value
796
+ or call_type == CallTypes.arerank.value
797
+ ):
798
+ if completion_response is not None and isinstance(
799
+ completion_response, RerankResponse
800
+ ):
801
+ meta_obj = completion_response.meta
802
+ if meta_obj is not None:
803
+ billed_units = meta_obj.get("billed_units", {}) or {}
804
+ else:
805
+ billed_units = {}
806
+
807
+ rerank_billed_units = RerankBilledUnits(
808
+ search_units=billed_units.get("search_units"),
809
+ total_tokens=billed_units.get("total_tokens"),
810
+ )
811
+
812
+ search_units = (
813
+ billed_units.get("search_units") or 1
814
+ ) # cohere charges per request by default.
815
+ completion_tokens = search_units
816
+ elif call_type == CallTypes.arealtime.value and isinstance(
817
+ completion_response, LiteLLMRealtimeStreamLoggingObject
818
+ ):
819
+ if (
820
+ cost_per_token_usage_object is None
821
+ or custom_llm_provider is None
822
+ ):
823
+ raise ValueError(
824
+ "usage object and custom_llm_provider must be provided for realtime stream cost calculation. Got cost_per_token_usage_object={}, custom_llm_provider={}".format(
825
+ cost_per_token_usage_object,
826
+ custom_llm_provider,
827
+ )
828
+ )
829
+ return handle_realtime_stream_cost_calculation(
830
+ results=completion_response.results,
831
+ combined_usage_object=cost_per_token_usage_object,
832
+ custom_llm_provider=custom_llm_provider,
833
+ litellm_model_name=model,
834
+ )
835
+ # Calculate cost based on prompt_tokens, completion_tokens
836
+ if (
837
+ "togethercomputer" in model
838
+ or "together_ai" in model
839
+ or custom_llm_provider == "together_ai"
840
+ ):
841
+ # together ai prices based on size of llm
842
+ # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
843
+
844
+ model = get_model_params_and_category(
845
+ model, call_type=CallTypes(call_type)
846
+ )
847
+
848
+ # replicate llms are calculate based on time for request running
849
+ # see https://replicate.com/pricing
850
+ elif (
851
+ model in litellm.replicate_models or "replicate" in model
852
+ ) and model not in litellm.model_cost:
853
+ # for unmapped replicate model, default to replicate's time tracking logic
854
+ return get_replicate_completion_pricing(completion_response, total_time) # type: ignore
855
+
856
+ if model is None:
857
+ raise ValueError(
858
+ f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
859
+ )
860
+
861
+ if (
862
+ custom_llm_provider is not None
863
+ and custom_llm_provider == "vertex_ai"
864
+ ):
865
+ # Calculate the prompt characters + response characters
866
+ if len(messages) > 0:
867
+ prompt_string = litellm.utils.get_formatted_prompt(
868
+ data={"messages": messages}, call_type="completion"
869
+ )
870
+
871
+ prompt_characters = litellm.utils._count_characters(
872
+ text=prompt_string
873
+ )
874
+ if completion_response is not None and isinstance(
875
+ completion_response, ModelResponse
876
+ ):
877
+ completion_string = litellm.utils.get_response_string(
878
+ response_obj=completion_response
879
+ )
880
+ completion_characters = litellm.utils._count_characters(
881
+ text=completion_string
882
+ )
883
+
884
+ (
885
+ prompt_tokens_cost_usd_dollar,
886
+ completion_tokens_cost_usd_dollar,
887
+ ) = cost_per_token(
888
+ model=model,
889
+ prompt_tokens=prompt_tokens,
890
+ completion_tokens=completion_tokens,
891
+ custom_llm_provider=custom_llm_provider,
892
+ response_time_ms=total_time,
893
+ region_name=region_name,
894
+ custom_cost_per_second=custom_cost_per_second,
895
+ custom_cost_per_token=custom_cost_per_token,
896
+ prompt_characters=prompt_characters,
897
+ completion_characters=completion_characters,
898
+ cache_creation_input_tokens=cache_creation_input_tokens,
899
+ cache_read_input_tokens=cache_read_input_tokens,
900
+ usage_object=cost_per_token_usage_object,
901
+ call_type=cast(CallTypesLiteral, call_type),
902
+ audio_transcription_file_duration=audio_transcription_file_duration,
903
+ rerank_billed_units=rerank_billed_units,
904
+ )
905
+ _final_cost = (
906
+ prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
907
+ )
908
+ _final_cost += (
909
+ StandardBuiltInToolCostTracking.get_cost_for_built_in_tools(
910
+ model=model,
911
+ response_object=completion_response,
912
+ standard_built_in_tools_params=standard_built_in_tools_params,
913
+ custom_llm_provider=custom_llm_provider,
914
+ )
915
+ )
916
+ return _final_cost
917
+ except Exception as e:
918
+ verbose_logger.debug(
919
+ "litellm.cost_calculator.py::completion_cost() - Error calculating cost for model={} - {}".format(
920
+ model, str(e)
921
+ )
922
+ )
923
+ if idx == len(potential_model_names) - 1:
924
+ raise e
925
+ raise Exception(
926
+ "Unable to calculate cost for received potential model names - {}".format(
927
+ potential_model_names
928
+ )
929
+ )
930
+ except Exception as e:
931
+ raise e
932
+
933
+
934
+ def get_response_cost_from_hidden_params(
935
+ hidden_params: Union[dict, BaseModel],
936
+ ) -> Optional[float]:
937
+ if isinstance(hidden_params, BaseModel):
938
+ _hidden_params_dict = hidden_params.model_dump()
939
+ else:
940
+ _hidden_params_dict = hidden_params
941
+
942
+ additional_headers = _hidden_params_dict.get("additional_headers", {})
943
+ if (
944
+ additional_headers
945
+ and "llm_provider-x-litellm-response-cost" in additional_headers
946
+ ):
947
+ response_cost = additional_headers["llm_provider-x-litellm-response-cost"]
948
+ if response_cost is None:
949
+ return None
950
+ return float(additional_headers["llm_provider-x-litellm-response-cost"])
951
+ return None
952
+
953
+
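A small sketch of the header-based short-circuit above, assuming an upstream provider (or LiteLLM proxy) already attached its computed cost to the response headers; the header value here is illustrative:

from litellm.cost_calculator import get_response_cost_from_hidden_params

hidden_params = {
    "additional_headers": {
        # header name checked by the function above; the value is made up
        "llm_provider-x-litellm-response-cost": "0.00042",
    }
}
print(get_response_cost_from_hidden_params(hidden_params))  # 0.00042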
954
+ def response_cost_calculator(
955
+ response_object: Union[
956
+ ModelResponse,
957
+ EmbeddingResponse,
958
+ ImageResponse,
959
+ TranscriptionResponse,
960
+ TextCompletionResponse,
961
+ HttpxBinaryResponseContent,
962
+ RerankResponse,
963
+ ResponsesAPIResponse,
964
+ LiteLLMRealtimeStreamLoggingObject,
965
+ OpenAIModerationResponse,
966
+ ],
967
+ model: str,
968
+ custom_llm_provider: Optional[str],
969
+ call_type: Literal[
970
+ "embedding",
971
+ "aembedding",
972
+ "completion",
973
+ "acompletion",
974
+ "atext_completion",
975
+ "text_completion",
976
+ "image_generation",
977
+ "aimage_generation",
978
+ "moderation",
979
+ "amoderation",
980
+ "atranscription",
981
+ "transcription",
982
+ "aspeech",
983
+ "speech",
984
+ "rerank",
985
+ "arerank",
986
+ ],
987
+ optional_params: dict,
988
+ cache_hit: Optional[bool] = None,
989
+ base_model: Optional[str] = None,
990
+ custom_pricing: Optional[bool] = None,
991
+ prompt: str = "",
992
+ standard_built_in_tools_params: Optional[StandardBuiltInToolsParams] = None,
993
+ litellm_model_name: Optional[str] = None,
994
+ router_model_id: Optional[str] = None,
995
+ ) -> float:
996
+ """
997
+ Returns
998
+ - float: cost of response
999
+ """
1000
+ try:
1001
+ response_cost: float = 0.0
1002
+ if cache_hit is not None and cache_hit is True:
1003
+ response_cost = 0.0
1004
+ else:
1005
+ if isinstance(response_object, BaseModel):
1006
+ response_object._hidden_params["optional_params"] = optional_params
1007
+
1008
+ if hasattr(response_object, "_hidden_params"):
1009
+ provider_response_cost = get_response_cost_from_hidden_params(
1010
+ response_object._hidden_params
1011
+ )
1012
+ if provider_response_cost is not None:
1013
+ return provider_response_cost
1014
+
1015
+ response_cost = completion_cost(
1016
+ completion_response=response_object,
1017
+ model=model,
1018
+ call_type=call_type,
1019
+ custom_llm_provider=custom_llm_provider,
1020
+ optional_params=optional_params,
1021
+ custom_pricing=custom_pricing,
1022
+ base_model=base_model,
1023
+ prompt=prompt,
1024
+ standard_built_in_tools_params=standard_built_in_tools_params,
1025
+ litellm_model_name=litellm_model_name,
1026
+ router_model_id=router_model_id,
1027
+ )
1028
+ return response_cost
1029
+ except Exception as e:
1030
+ raise e
1031
+
1032
+
1033
+ def rerank_cost(
1034
+ model: str,
1035
+ custom_llm_provider: Optional[str],
1036
+ billed_units: Optional[RerankBilledUnits] = None,
1037
+ ) -> Tuple[float, float]:
1038
+ """
1039
+ Returns
1040
+ - Tuple[float, float]: cost of the rerank call as (prompt_cost, completion_cost); raises on error.
1041
+ """
1042
+ _, custom_llm_provider, _, _ = litellm.get_llm_provider(
1043
+ model=model, custom_llm_provider=custom_llm_provider
1044
+ )
1045
+
1046
+ try:
1047
+ config = ProviderConfigManager.get_provider_rerank_config(
1048
+ model=model,
1049
+ api_base=None,
1050
+ present_version_params=[],
1051
+ provider=LlmProviders(custom_llm_provider),
1052
+ )
1053
+
1054
+ try:
1055
+ model_info: Optional[ModelInfo] = litellm.get_model_info(
1056
+ model=model, custom_llm_provider=custom_llm_provider
1057
+ )
1058
+ except Exception:
1059
+ model_info = None
1060
+
1061
+ return config.calculate_rerank_cost(
1062
+ model=model,
1063
+ custom_llm_provider=custom_llm_provider,
1064
+ billed_units=billed_units,
1065
+ model_info=model_info,
1066
+ )
1067
+ except Exception as e:
1068
+ raise e
1069
+
1070
+
1071
+ def transcription_cost(
1072
+ model: str, custom_llm_provider: Optional[str], duration: float
1073
+ ) -> Tuple[float, float]:
1074
+ return openai_cost_per_second(
1075
+ model=model, custom_llm_provider=custom_llm_provider, duration=duration
1076
+ )
1077
+
1078
+
1079
+ def default_image_cost_calculator(
1080
+ model: str,
1081
+ custom_llm_provider: Optional[str] = None,
1082
+ quality: Optional[str] = None,
1083
+ n: Optional[int] = 1, # Default to 1 image
1084
+ size: Optional[str] = "1024-x-1024", # OpenAI default
1085
+ optional_params: Optional[dict] = None,
1086
+ ) -> float:
1087
+ """
1088
+ Default image cost calculator for image generation
1089
+
1090
+ Args:
1091
+ model (str): Model name
1092
+ image_response (ImageResponse): Response from image generation
1093
+ quality (Optional[str]): Image quality setting
1094
+ n (Optional[int]): Number of images generated
1095
+ size (Optional[str]): Image size (e.g. "1024x1024" or "1024-x-1024")
1096
+
1097
+ Returns:
1098
+ float: Cost in USD for the image generation
1099
+
1100
+ Raises:
1101
+ Exception: If model pricing not found in cost map
1102
+ """
1103
+ # Standardize size format to use "-x-"
1104
+ size_str: str = size or "1024-x-1024"
1105
+ size_str = (
1106
+ size_str.replace("x", "-x-")
1107
+ if "x" in size_str and "-x-" not in size_str
1108
+ else size_str
1109
+ )
1110
+
1111
+ # Parse dimensions
1112
+ height, width = map(int, size_str.split("-x-"))
1113
+
1114
+ # Build model names for cost lookup
1115
+ base_model_name = f"{size_str}/{model}"
1116
+ if custom_llm_provider and model.startswith(custom_llm_provider):
1117
+ base_model_name = (
1118
+ f"{custom_llm_provider}/{size_str}/{model.replace(custom_llm_provider, '')}"
1119
+ )
1120
+ model_name_with_quality = (
1121
+ f"{quality}/{base_model_name}" if quality else base_model_name
1122
+ )
1123
+
1124
+ # gpt-image-1 models use low, medium, high quality. If the user did not specify quality, use medium for the gpt-image-1 model family
1125
+ model_name_with_v2_quality = (
1126
+ f"{ImageGenerationRequestQuality.MEDIUM.value}/{base_model_name}"
1127
+ )
1128
+
1129
+ verbose_logger.debug(
1130
+ f"Looking up cost for models: {model_name_with_quality}, {base_model_name}"
1131
+ )
1132
+
1133
+ model_without_provider = f"{size_str}/{model.split('/')[-1]}"
1134
+ model_with_quality_without_provider = (
1135
+ f"{quality}/{model_without_provider}" if quality else model_without_provider
1136
+ )
1137
+
1138
+ # Try model with quality first, fall back to base model name
1139
+ cost_info: Optional[dict] = None
1140
+ models_to_check = [
1141
+ model_name_with_quality,
1142
+ base_model_name,
1143
+ model_name_with_v2_quality,
1144
+ model_with_quality_without_provider,
1145
+ model_without_provider,
1146
+ model,
1147
+ ]
1148
+ for model in models_to_check:
1149
+ if model in litellm.model_cost:
1150
+ cost_info = litellm.model_cost[model]
1151
+ break
1152
+ if cost_info is None:
1153
+ raise Exception(
1154
+ f"Model not found in cost map. Tried checking {models_to_check}"
1155
+ )
1156
+
1157
+ return cost_info["input_cost_per_pixel"] * height * width * n
1158
+
1159
+
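The last line above prices an image per pixel; a worked example with an assumed per-pixel rate (the real rate comes from the matched litellm.model_cost entry):

# assumed per-pixel rate, for illustration only
input_cost_per_pixel = 1e-8
height, width, n = 1024, 1024, 1

# mirrors the return statement above: rate * height * width * n
cost = input_cost_per_pixel * height * width * n
print(f"${cost:.4f}")  # 1,048,576 pixels -> ~$0.0105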
1160
+ def batch_cost_calculator(
1161
+ usage: Usage,
1162
+ model: str,
1163
+ custom_llm_provider: Optional[str] = None,
1164
+ ) -> Tuple[float, float]:
1165
+ """
1166
+ Calculate the cost of a batch job
1167
+ """
1168
+
1169
+ _, custom_llm_provider, _, _ = litellm.get_llm_provider(
1170
+ model=model, custom_llm_provider=custom_llm_provider
1171
+ )
1172
+
1173
+ verbose_logger.info(
1174
+ "Calculating batch cost per token. model=%s, custom_llm_provider=%s",
1175
+ model,
1176
+ custom_llm_provider,
1177
+ )
1178
+
1179
+ try:
1180
+ model_info: Optional[ModelInfo] = litellm.get_model_info(
1181
+ model=model, custom_llm_provider=custom_llm_provider
1182
+ )
1183
+ except Exception:
1184
+ model_info = None
1185
+
1186
+ if not model_info:
1187
+ return 0.0, 0.0
1188
+
1189
+ input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
1190
+ input_cost_per_token = model_info.get("input_cost_per_token")
1191
+ output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
1192
+ output_cost_per_token = model_info.get("output_cost_per_token")
1193
+ total_prompt_cost = 0.0
1194
+ total_completion_cost = 0.0
1195
+ if input_cost_per_token_batches:
1196
+ total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
1197
+ elif input_cost_per_token:
1198
+ total_prompt_cost = (
1199
+ usage.prompt_tokens * (input_cost_per_token) / 2
1200
+ ) # batch cost is usually half of the regular token cost
1201
+ if output_cost_per_token_batches:
1202
+ total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
1203
+ elif output_cost_per_token:
1204
+ total_completion_cost = (
1205
+ usage.completion_tokens * (output_cost_per_token) / 2
1206
+ ) # batch cost is usually half of the regular token cost
1207
+
1208
+ return total_prompt_cost, total_completion_cost
1209
+
1210
+
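When no explicit *_batches rate exists, the fallback above halves the regular per-token price; a quick worked example with assumed regular rates:

# assumed non-batch rates, for illustration only: $2 / $6 per million tokens
input_cost_per_token = 2.0 / 1_000_000
output_cost_per_token = 6.0 / 1_000_000
prompt_tokens, completion_tokens = 10_000, 2_000

# batch cost is usually half of the regular token cost
total_prompt_cost = prompt_tokens * input_cost_per_token / 2  # ~$0.01
total_completion_cost = completion_tokens * output_cost_per_token / 2  # ~$0.006
print(f"${total_prompt_cost + total_completion_cost:.3f}")  # $0.016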
1211
+ class RealtimeAPITokenUsageProcessor:
1212
+ @staticmethod
1213
+ def collect_usage_from_realtime_stream_results(
1214
+ results: OpenAIRealtimeStreamList,
1215
+ ) -> List[Usage]:
1216
+ """
1217
+ Collect usage from realtime stream results
1218
+ """
1219
+ response_done_events: List[OpenAIRealtimeStreamResponseBaseObject] = cast(
1220
+ List[OpenAIRealtimeStreamResponseBaseObject],
1221
+ [result for result in results if result["type"] == "response.done"],
1222
+ )
1223
+ usage_objects: List[Usage] = []
1224
+ for result in response_done_events:
1225
+ usage_object = (
1226
+ ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage(
1227
+ result["response"].get("usage", {})
1228
+ )
1229
+ )
1230
+ usage_objects.append(usage_object)
1231
+ return usage_objects
1232
+
1233
+ @staticmethod
1234
+ def combine_usage_objects(usage_objects: List[Usage]) -> Usage:
1235
+ """
1236
+ Combine multiple Usage objects into a single Usage object, checking model keys for nested values.
1237
+ """
1238
+ from litellm.types.utils import (
1239
+ CompletionTokensDetails,
1240
+ PromptTokensDetailsWrapper,
1241
+ Usage,
1242
+ )
1243
+
1244
+ combined = Usage()
1245
+
1246
+ # Sum basic token counts
1247
+ for usage in usage_objects:
1248
+ # Handle direct attributes by checking what exists in the model
1249
+ for attr in dir(usage):
1250
+ if not attr.startswith("_") and not callable(getattr(usage, attr)):
1251
+ current_val = getattr(combined, attr, 0)
1252
+ new_val = getattr(usage, attr, 0)
1253
+ if (
1254
+ new_val is not None
1255
+ and isinstance(new_val, (int, float))
1256
+ and isinstance(current_val, (int, float))
1257
+ ):
1258
+ setattr(combined, attr, current_val + new_val)
1259
+ # Handle nested prompt_tokens_details
1260
+ if hasattr(usage, "prompt_tokens_details") and usage.prompt_tokens_details:
1261
+ if (
1262
+ not hasattr(combined, "prompt_tokens_details")
1263
+ or not combined.prompt_tokens_details
1264
+ ):
1265
+ combined.prompt_tokens_details = PromptTokensDetailsWrapper()
1266
+
1267
+ # Check what keys exist in the model's prompt_tokens_details
1268
+ for attr in dir(usage.prompt_tokens_details):
1269
+ if not attr.startswith("_") and not callable(
1270
+ getattr(usage.prompt_tokens_details, attr)
1271
+ ):
1272
+ current_val = getattr(combined.prompt_tokens_details, attr, 0)
1273
+ new_val = getattr(usage.prompt_tokens_details, attr, 0)
1274
+ if new_val is not None:
1275
+ setattr(
1276
+ combined.prompt_tokens_details,
1277
+ attr,
1278
+ current_val + new_val,
1279
+ )
1280
+
1281
+ # Handle nested completion_tokens_details
1282
+ if (
1283
+ hasattr(usage, "completion_tokens_details")
1284
+ and usage.completion_tokens_details
1285
+ ):
1286
+ if (
1287
+ not hasattr(combined, "completion_tokens_details")
1288
+ or not combined.completion_tokens_details
1289
+ ):
1290
+ combined.completion_tokens_details = CompletionTokensDetails()
1291
+
1292
+ # Check what keys exist in the model's completion_tokens_details
1293
+ for attr in dir(usage.completion_tokens_details):
1294
+ if not attr.startswith("_") and not callable(
1295
+ getattr(usage.completion_tokens_details, attr)
1296
+ ):
1297
+ current_val = getattr(
1298
+ combined.completion_tokens_details, attr, 0
1299
+ )
1300
+ new_val = getattr(usage.completion_tokens_details, attr, 0)
1301
+ if new_val is not None:
1302
+ setattr(
1303
+ combined.completion_tokens_details,
1304
+ attr,
1305
+ current_val + new_val,
1306
+ )
1307
+
1308
+ return combined
1309
+
1310
+ @staticmethod
1311
+ def collect_and_combine_usage_from_realtime_stream_results(
1312
+ results: OpenAIRealtimeStreamList,
1313
+ ) -> Usage:
1314
+ """
1315
+ Collect and combine usage from realtime stream results
1316
+ """
1317
+ collected_usage_objects = (
1318
+ RealtimeAPITokenUsageProcessor.collect_usage_from_realtime_stream_results(
1319
+ results
1320
+ )
1321
+ )
1322
+ combined_usage_object = RealtimeAPITokenUsageProcessor.combine_usage_objects(
1323
+ collected_usage_objects
1324
+ )
1325
+ return combined_usage_object
1326
+
1327
+ @staticmethod
1328
+ def create_logging_realtime_object(
1329
+ usage: Usage, results: OpenAIRealtimeStreamList
1330
+ ) -> LiteLLMRealtimeStreamLoggingObject:
1331
+ return LiteLLMRealtimeStreamLoggingObject(
1332
+ usage=usage,
1333
+ results=results,
1334
+ )
1335
+
1336
+
1337
+ def handle_realtime_stream_cost_calculation(
1338
+ results: OpenAIRealtimeStreamList,
1339
+ combined_usage_object: Usage,
1340
+ custom_llm_provider: str,
1341
+ litellm_model_name: str,
1342
+ ) -> float:
1343
+ """
1344
+ Handles the cost calculation for realtime stream responses.
1345
+
1346
+ Pick the 'response.done' events. Calculate total cost across all 'response.done' events.
1347
+
1348
+ Args:
1349
+ results: A list of OpenAIRealtimeStreamBaseObject objects
1350
+ """
1351
+ received_model = None
1352
+ potential_model_names = []
1353
+ for result in results:
1354
+ if result["type"] == "session.created":
1355
+ received_model = cast(OpenAIRealtimeStreamSessionEvents, result)["session"][
1356
+ "model"
1357
+ ]
1358
+ potential_model_names.append(received_model)
1359
+
1360
+ potential_model_names.append(litellm_model_name)
1361
+ input_cost_per_token = 0.0
1362
+ output_cost_per_token = 0.0
1363
+
1364
+ for model_name in potential_model_names:
1365
+ try:
1366
+ _input_cost_per_token, _output_cost_per_token = generic_cost_per_token(
1367
+ model=model_name,
1368
+ usage=combined_usage_object,
1369
+ custom_llm_provider=custom_llm_provider,
1370
+ )
1371
+ except Exception:
1372
+ continue
1373
+ input_cost_per_token += _input_cost_per_token
1374
+ output_cost_per_token += _output_cost_per_token
1375
+ break # exit if we find a valid model
1376
+ total_cost = input_cost_per_token + output_cost_per_token
1377
+
1378
+ return total_cost
litellm/exceptions.py ADDED
@@ -0,0 +1,809 @@
1
+ # +-----------------------------------------------+
2
+ # | |
3
+ # | Give Feedback / Get Help |
4
+ # | https://github.com/BerriAI/litellm/issues/new |
5
+ # | |
6
+ # +-----------------------------------------------+
7
+ #
8
+ # Thank you users! We ❤️ you! - Krrish & Ishaan
9
+
10
+ ## LiteLLM versions of the OpenAI Exception Types
11
+
12
+ from typing import Optional
13
+
14
+ import httpx
15
+ import openai
16
+
17
+ from litellm.types.utils import LiteLLMCommonStrings
18
+
19
+
20
+ class AuthenticationError(openai.AuthenticationError): # type: ignore
21
+ def __init__(
22
+ self,
23
+ message,
24
+ llm_provider,
25
+ model,
26
+ response: Optional[httpx.Response] = None,
27
+ litellm_debug_info: Optional[str] = None,
28
+ max_retries: Optional[int] = None,
29
+ num_retries: Optional[int] = None,
30
+ ):
31
+ self.status_code = 401
32
+ self.message = "litellm.AuthenticationError: {}".format(message)
33
+ self.llm_provider = llm_provider
34
+ self.model = model
35
+ self.litellm_debug_info = litellm_debug_info
36
+ self.max_retries = max_retries
37
+ self.num_retries = num_retries
38
+ self.response = response or httpx.Response(
39
+ status_code=self.status_code,
40
+ request=httpx.Request(
41
+ method="GET", url="https://litellm.ai"
42
+ ), # mock request object
43
+ )
44
+ super().__init__(
45
+ self.message, response=self.response, body=None
46
+ ) # Call the base class constructor with the parameters it needs
47
+
48
+ def __str__(self):
49
+ _message = self.message
50
+ if self.num_retries:
51
+ _message += f" LiteLLM Retried: {self.num_retries} times"
52
+ if self.max_retries:
53
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
54
+ return _message
55
+
56
+ def __repr__(self):
57
+ _message = self.message
58
+ if self.num_retries:
59
+ _message += f" LiteLLM Retried: {self.num_retries} times"
60
+ if self.max_retries:
61
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
62
+ return _message
63
+
64
+
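These wrappers keep the OpenAI exception hierarchy and set status_code, llm_provider and model before delegating to the parent constructor, so callers can catch them by type; a minimal sketch, assuming the classes are re-exported at the top-level litellm namespace and that the call fails authentication:

import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        api_key="sk-invalid",  # deliberately bad key, for illustration
    )
except litellm.AuthenticationError as e:
    # attributes set by the wrapper class above
    print(e.status_code, e.llm_provider, e.model)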
65
+ # raise when invalid models passed, example gpt-8
66
+ class NotFoundError(openai.NotFoundError): # type: ignore
67
+ def __init__(
68
+ self,
69
+ message,
70
+ model,
71
+ llm_provider,
72
+ response: Optional[httpx.Response] = None,
73
+ litellm_debug_info: Optional[str] = None,
74
+ max_retries: Optional[int] = None,
75
+ num_retries: Optional[int] = None,
76
+ ):
77
+ self.status_code = 404
78
+ self.message = "litellm.NotFoundError: {}".format(message)
79
+ self.model = model
80
+ self.llm_provider = llm_provider
81
+ self.litellm_debug_info = litellm_debug_info
82
+ self.max_retries = max_retries
83
+ self.num_retries = num_retries
84
+ self.response = response or httpx.Response(
85
+ status_code=self.status_code,
86
+ request=httpx.Request(
87
+ method="GET", url="https://litellm.ai"
88
+ ), # mock request object
89
+ )
90
+ super().__init__(
91
+ self.message, response=self.response, body=None
92
+ ) # Call the base class constructor with the parameters it needs
93
+
94
+ def __str__(self):
95
+ _message = self.message
96
+ if self.num_retries:
97
+ _message += f" LiteLLM Retried: {self.num_retries} times"
98
+ if self.max_retries:
99
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
100
+ return _message
101
+
102
+ def __repr__(self):
103
+ _message = self.message
104
+ if self.num_retries:
105
+ _message += f" LiteLLM Retried: {self.num_retries} times"
106
+ if self.max_retries:
107
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
108
+ return _message
109
+
110
+
111
+ class BadRequestError(openai.BadRequestError): # type: ignore
112
+ def __init__(
113
+ self,
114
+ message,
115
+ model,
116
+ llm_provider,
117
+ response: Optional[httpx.Response] = None,
118
+ litellm_debug_info: Optional[str] = None,
119
+ max_retries: Optional[int] = None,
120
+ num_retries: Optional[int] = None,
121
+ body: Optional[dict] = None,
122
+ ):
123
+ self.status_code = 400
124
+ self.message = "litellm.BadRequestError: {}".format(message)
125
+ self.model = model
126
+ self.llm_provider = llm_provider
127
+ self.litellm_debug_info = litellm_debug_info
128
+ response = httpx.Response(
129
+ status_code=self.status_code,
130
+ request=httpx.Request(
131
+ method="GET", url="https://litellm.ai"
132
+ ), # mock request object
133
+ )
134
+ self.max_retries = max_retries
135
+ self.num_retries = num_retries
136
+ super().__init__(
137
+ self.message, response=response, body=body
138
+ ) # Call the base class constructor with the parameters it needs
139
+
140
+ def __str__(self):
141
+ _message = self.message
142
+ if self.num_retries:
143
+ _message += f" LiteLLM Retried: {self.num_retries} times"
144
+ if self.max_retries:
145
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
146
+ return _message
147
+
148
+ def __repr__(self):
149
+ _message = self.message
150
+ if self.num_retries:
151
+ _message += f" LiteLLM Retried: {self.num_retries} times"
152
+ if self.max_retries:
153
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
154
+ return _message
155
+
156
+
157
+ class UnprocessableEntityError(openai.UnprocessableEntityError): # type: ignore
158
+ def __init__(
159
+ self,
160
+ message,
161
+ model,
162
+ llm_provider,
163
+ response: httpx.Response,
164
+ litellm_debug_info: Optional[str] = None,
165
+ max_retries: Optional[int] = None,
166
+ num_retries: Optional[int] = None,
167
+ ):
168
+ self.status_code = 422
169
+ self.message = "litellm.UnprocessableEntityError: {}".format(message)
170
+ self.model = model
171
+ self.llm_provider = llm_provider
172
+ self.litellm_debug_info = litellm_debug_info
173
+ self.max_retries = max_retries
174
+ self.num_retries = num_retries
175
+ super().__init__(
176
+ self.message, response=response, body=None
177
+ ) # Call the base class constructor with the parameters it needs
178
+
179
+ def __str__(self):
180
+ _message = self.message
181
+ if self.num_retries:
182
+ _message += f" LiteLLM Retried: {self.num_retries} times"
183
+ if self.max_retries:
184
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
185
+ return _message
186
+
187
+ def __repr__(self):
188
+ _message = self.message
189
+ if self.num_retries:
190
+ _message += f" LiteLLM Retried: {self.num_retries} times"
191
+ if self.max_retries:
192
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
193
+ return _message
194
+
195
+
196
+ class Timeout(openai.APITimeoutError): # type: ignore
197
+ def __init__(
198
+ self,
199
+ message,
200
+ model,
201
+ llm_provider,
202
+ litellm_debug_info: Optional[str] = None,
203
+ max_retries: Optional[int] = None,
204
+ num_retries: Optional[int] = None,
205
+ headers: Optional[dict] = None,
206
+ exception_status_code: Optional[int] = None,
207
+ ):
208
+ request = httpx.Request(
209
+ method="POST",
210
+ url="https://api.openai.com/v1",
211
+ )
212
+ super().__init__(
213
+ request=request
214
+ ) # Call the base class constructor with the parameters it needs
215
+ self.status_code = exception_status_code or 408
216
+ self.message = "litellm.Timeout: {}".format(message)
217
+ self.model = model
218
+ self.llm_provider = llm_provider
219
+ self.litellm_debug_info = litellm_debug_info
220
+ self.max_retries = max_retries
221
+ self.num_retries = num_retries
222
+ self.headers = headers
223
+
224
+ # custom function to convert to str
225
+ def __str__(self):
226
+ _message = self.message
227
+ if self.num_retries:
228
+ _message += f" LiteLLM Retried: {self.num_retries} times"
229
+ if self.max_retries:
230
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
231
+ return _message
232
+
233
+ def __repr__(self):
234
+ _message = self.message
235
+ if self.num_retries:
236
+ _message += f" LiteLLM Retried: {self.num_retries} times"
237
+ if self.max_retries:
238
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
239
+ return _message
240
+
241
+
242
+ class PermissionDeniedError(openai.PermissionDeniedError): # type:ignore
243
+ def __init__(
244
+ self,
245
+ message,
246
+ llm_provider,
247
+ model,
248
+ response: httpx.Response,
249
+ litellm_debug_info: Optional[str] = None,
250
+ max_retries: Optional[int] = None,
251
+ num_retries: Optional[int] = None,
252
+ ):
253
+ self.status_code = 403
254
+ self.message = "litellm.PermissionDeniedError: {}".format(message)
255
+ self.llm_provider = llm_provider
256
+ self.model = model
257
+ self.litellm_debug_info = litellm_debug_info
258
+ self.max_retries = max_retries
259
+ self.num_retries = num_retries
260
+ super().__init__(
261
+ self.message, response=response, body=None
262
+ ) # Call the base class constructor with the parameters it needs
263
+
264
+ def __str__(self):
265
+ _message = self.message
266
+ if self.num_retries:
267
+ _message += f" LiteLLM Retried: {self.num_retries} times"
268
+ if self.max_retries:
269
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
270
+ return _message
271
+
272
+ def __repr__(self):
273
+ _message = self.message
274
+ if self.num_retries:
275
+ _message += f" LiteLLM Retried: {self.num_retries} times"
276
+ if self.max_retries:
277
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
278
+ return _message
279
+
280
+
281
+ class RateLimitError(openai.RateLimitError): # type: ignore
282
+ def __init__(
283
+ self,
284
+ message,
285
+ llm_provider,
286
+ model,
287
+ response: Optional[httpx.Response] = None,
288
+ litellm_debug_info: Optional[str] = None,
289
+ max_retries: Optional[int] = None,
290
+ num_retries: Optional[int] = None,
291
+ ):
292
+ self.status_code = 429
293
+ self.message = "litellm.RateLimitError: {}".format(message)
294
+ self.llm_provider = llm_provider
295
+ self.model = model
296
+ self.litellm_debug_info = litellm_debug_info
297
+ self.max_retries = max_retries
298
+ self.num_retries = num_retries
299
+ _response_headers = (
300
+ getattr(response, "headers", None) if response is not None else None
301
+ )
302
+ self.response = httpx.Response(
303
+ status_code=429,
304
+ headers=_response_headers,
305
+ request=httpx.Request(
306
+ method="POST",
307
+ url="https://cloud.google.com/vertex-ai/",
308
+ ),
309
+ )
310
+ super().__init__(
311
+ self.message, response=self.response, body=None
312
+ ) # Call the base class constructor with the parameters it needs
313
+ self.code = "429"
314
+ self.type = "throttling_error"
315
+
316
+ def __str__(self):
317
+ _message = self.message
318
+ if self.num_retries:
319
+ _message += f" LiteLLM Retried: {self.num_retries} times"
320
+ if self.max_retries:
321
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
322
+ return _message
323
+
324
+ def __repr__(self):
325
+ _message = self.message
326
+ if self.num_retries:
327
+ _message += f" LiteLLM Retried: {self.num_retries} times"
328
+ if self.max_retries:
329
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
330
+ return _message
331
+
332
+
333
+ # sub class of bad request error - meant to give more granularity for error handling context window exceeded errors
334
+ class ContextWindowExceededError(BadRequestError): # type: ignore
335
+ def __init__(
336
+ self,
337
+ message,
338
+ model,
339
+ llm_provider,
340
+ response: Optional[httpx.Response] = None,
341
+ litellm_debug_info: Optional[str] = None,
342
+ ):
343
+ self.status_code = 400
344
+ self.model = model
345
+ self.llm_provider = llm_provider
346
+ self.litellm_debug_info = litellm_debug_info
347
+ request = httpx.Request(method="POST", url="https://api.openai.com/v1")
348
+ self.response = httpx.Response(status_code=400, request=request)
349
+ super().__init__(
350
+ message=message,
351
+ model=self.model, # type: ignore
352
+ llm_provider=self.llm_provider, # type: ignore
353
+ response=self.response,
354
+ litellm_debug_info=self.litellm_debug_info,
355
+ ) # Call the base class constructor with the parameters it needs
356
+
357
+ # set after, to make it clear the raised error is a context window exceeded error
358
+ self.message = "litellm.ContextWindowExceededError: {}".format(self.message)
359
+
360
+ def __str__(self):
361
+ _message = self.message
362
+ if self.num_retries:
363
+ _message += f" LiteLLM Retried: {self.num_retries} times"
364
+ if self.max_retries:
365
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
366
+ return _message
367
+
368
+ def __repr__(self):
369
+ _message = self.message
370
+ if self.num_retries:
371
+ _message += f" LiteLLM Retried: {self.num_retries} times"
372
+ if self.max_retries:
373
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
374
+ return _message
375
+
376
+
377
+ # sub class of bad request error - meant to help us catch guardrails-related errors on proxy.
378
+ class RejectedRequestError(BadRequestError): # type: ignore
379
+ def __init__(
380
+ self,
381
+ message,
382
+ model,
383
+ llm_provider,
384
+ request_data: dict,
385
+ litellm_debug_info: Optional[str] = None,
386
+ ):
387
+ self.status_code = 400
388
+ self.message = "litellm.RejectedRequestError: {}".format(message)
389
+ self.model = model
390
+ self.llm_provider = llm_provider
391
+ self.litellm_debug_info = litellm_debug_info
392
+ self.request_data = request_data
393
+ request = httpx.Request(method="POST", url="https://api.openai.com/v1")
394
+ response = httpx.Response(status_code=400, request=request)
395
+ super().__init__(
396
+ message=self.message,
397
+ model=self.model, # type: ignore
398
+ llm_provider=self.llm_provider, # type: ignore
399
+ response=response,
400
+ litellm_debug_info=self.litellm_debug_info,
401
+ ) # Call the base class constructor with the parameters it needs
402
+
403
+ def __str__(self):
404
+ _message = self.message
405
+ if self.num_retries:
406
+ _message += f" LiteLLM Retried: {self.num_retries} times"
407
+ if self.max_retries:
408
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
409
+ return _message
410
+
411
+ def __repr__(self):
412
+ _message = self.message
413
+ if self.num_retries:
414
+ _message += f" LiteLLM Retried: {self.num_retries} times"
415
+ if self.max_retries:
416
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
417
+ return _message
418
+
419
+
420
+ class ContentPolicyViolationError(BadRequestError): # type: ignore
421
+ # Error code: 400 - {'error': {'code': 'content_policy_violation', 'message': 'Your request was rejected as a result of our safety system. Image descriptions generated from your prompt may contain text that is not allowed by our safety system. If you believe this was done in error, your request may succeed if retried, or by adjusting your prompt.', 'param': None, 'type': 'invalid_request_error'}}
422
+ def __init__(
423
+ self,
424
+ message,
425
+ model,
426
+ llm_provider,
427
+ response: Optional[httpx.Response] = None,
428
+ litellm_debug_info: Optional[str] = None,
429
+ ):
430
+ self.status_code = 400
431
+ self.message = "litellm.ContentPolicyViolationError: {}".format(message)
432
+ self.model = model
433
+ self.llm_provider = llm_provider
434
+ self.litellm_debug_info = litellm_debug_info
435
+ request = httpx.Request(method="POST", url="https://api.openai.com/v1")
436
+ self.response = httpx.Response(status_code=400, request=request)
437
+ super().__init__(
438
+ message=self.message,
439
+ model=self.model, # type: ignore
440
+ llm_provider=self.llm_provider, # type: ignore
441
+ response=self.response,
442
+ litellm_debug_info=self.litellm_debug_info,
443
+ ) # Call the base class constructor with the parameters it needs
444
+
445
+ def __str__(self):
446
+ _message = self.message
447
+ if self.num_retries:
448
+ _message += f" LiteLLM Retried: {self.num_retries} times"
449
+ if self.max_retries:
450
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
451
+ return _message
452
+
453
+ def __repr__(self):
454
+ _message = self.message
455
+ if self.num_retries:
456
+ _message += f" LiteLLM Retried: {self.num_retries} times"
457
+ if self.max_retries:
458
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
459
+ return _message
460
+
461
+
462
+ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
463
+ def __init__(
464
+ self,
465
+ message,
466
+ llm_provider,
467
+ model,
468
+ response: Optional[httpx.Response] = None,
469
+ litellm_debug_info: Optional[str] = None,
470
+ max_retries: Optional[int] = None,
471
+ num_retries: Optional[int] = None,
472
+ ):
473
+ self.status_code = 503
474
+ self.message = "litellm.ServiceUnavailableError: {}".format(message)
475
+ self.llm_provider = llm_provider
476
+ self.model = model
477
+ self.litellm_debug_info = litellm_debug_info
478
+ self.max_retries = max_retries
479
+ self.num_retries = num_retries
480
+ self.response = httpx.Response(
481
+ status_code=self.status_code,
482
+ request=httpx.Request(
483
+ method="POST",
484
+ url="https://cloud.google.com/vertex-ai/",
485
+ ),
486
+ )
487
+ super().__init__(
488
+ self.message, response=self.response, body=None
489
+ ) # Call the base class constructor with the parameters it needs
490
+
491
+ def __str__(self):
492
+ _message = self.message
493
+ if self.num_retries:
494
+ _message += f" LiteLLM Retried: {self.num_retries} times"
495
+ if self.max_retries:
496
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
497
+ return _message
498
+
499
+ def __repr__(self):
500
+ _message = self.message
501
+ if self.num_retries:
502
+ _message += f" LiteLLM Retried: {self.num_retries} times"
503
+ if self.max_retries:
504
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
505
+ return _message
506
+
507
+
508
+ class InternalServerError(openai.InternalServerError): # type: ignore
509
+ def __init__(
510
+ self,
511
+ message,
512
+ llm_provider,
513
+ model,
514
+ response: Optional[httpx.Response] = None,
515
+ litellm_debug_info: Optional[str] = None,
516
+ max_retries: Optional[int] = None,
517
+ num_retries: Optional[int] = None,
518
+ ):
519
+ self.status_code = 500
520
+ self.message = "litellm.InternalServerError: {}".format(message)
521
+ self.llm_provider = llm_provider
522
+ self.model = model
523
+ self.litellm_debug_info = litellm_debug_info
524
+ self.max_retries = max_retries
525
+ self.num_retries = num_retries
526
+ self.response = httpx.Response(
527
+ status_code=self.status_code,
528
+ request=httpx.Request(
529
+ method="POST",
530
+ url="https://cloud.google.com/vertex-ai/",
531
+ ),
532
+ )
533
+ super().__init__(
534
+ self.message, response=self.response, body=None
535
+ ) # Call the base class constructor with the parameters it needs
536
+
537
+ def __str__(self):
538
+ _message = self.message
539
+ if self.num_retries:
540
+ _message += f" LiteLLM Retried: {self.num_retries} times"
541
+ if self.max_retries:
542
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
543
+ return _message
544
+
545
+ def __repr__(self):
546
+ _message = self.message
547
+ if self.num_retries:
548
+ _message += f" LiteLLM Retried: {self.num_retries} times"
549
+ if self.max_retries:
550
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
551
+ return _message
552
+
553
+
554
+ # raise this when the API returns an invalid response object - https://github.com/openai/openai-python/blob/1be14ee34a0f8e42d3f9aa5451aa4cb161f1781f/openai/api_requestor.py#L401
555
+ class APIError(openai.APIError): # type: ignore
556
+ def __init__(
557
+ self,
558
+ status_code: int,
559
+ message,
560
+ llm_provider,
561
+ model,
562
+ request: Optional[httpx.Request] = None,
563
+ litellm_debug_info: Optional[str] = None,
564
+ max_retries: Optional[int] = None,
565
+ num_retries: Optional[int] = None,
566
+ ):
567
+ self.status_code = status_code
568
+ self.message = "litellm.APIError: {}".format(message)
569
+ self.llm_provider = llm_provider
570
+ self.model = model
571
+ self.litellm_debug_info = litellm_debug_info
572
+ self.max_retries = max_retries
573
+ self.num_retries = num_retries
574
+ if request is None:
575
+ request = httpx.Request(method="POST", url="https://api.openai.com/v1")
576
+ super().__init__(self.message, request=request, body=None) # type: ignore
577
+
578
+ def __str__(self):
579
+ _message = self.message
580
+ if self.num_retries:
581
+ _message += f" LiteLLM Retried: {self.num_retries} times"
582
+ if self.max_retries:
583
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
584
+ return _message
585
+
586
+ def __repr__(self):
587
+ _message = self.message
588
+ if self.num_retries:
589
+ _message += f" LiteLLM Retried: {self.num_retries} times"
590
+ if self.max_retries:
591
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
592
+ return _message
593
+
594
+
595
+ # raised if an invalid request (not get, delete, put, post) is made
596
+ class APIConnectionError(openai.APIConnectionError): # type: ignore
597
+ def __init__(
598
+ self,
599
+ message,
600
+ llm_provider,
601
+ model,
602
+ request: Optional[httpx.Request] = None,
603
+ litellm_debug_info: Optional[str] = None,
604
+ max_retries: Optional[int] = None,
605
+ num_retries: Optional[int] = None,
606
+ ):
607
+ self.message = "litellm.APIConnectionError: {}".format(message)
608
+ self.llm_provider = llm_provider
609
+ self.model = model
610
+ self.status_code = 500
611
+ self.litellm_debug_info = litellm_debug_info
612
+ self.request = httpx.Request(method="POST", url="https://api.openai.com/v1")
613
+ self.max_retries = max_retries
614
+ self.num_retries = num_retries
615
+ super().__init__(message=self.message, request=self.request)
616
+
617
+ def __str__(self):
618
+ _message = self.message
619
+ if self.num_retries:
620
+ _message += f" LiteLLM Retried: {self.num_retries} times"
621
+ if self.max_retries:
622
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
623
+ return _message
624
+
625
+ def __repr__(self):
626
+ _message = self.message
627
+ if self.num_retries:
628
+ _message += f" LiteLLM Retried: {self.num_retries} times"
629
+ if self.max_retries:
630
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
631
+ return _message
632
+
633
+
634
+ # raised if an invalid request (not get, delete, put, post) is made
635
+ class APIResponseValidationError(openai.APIResponseValidationError): # type: ignore
636
+ def __init__(
637
+ self,
638
+ message,
639
+ llm_provider,
640
+ model,
641
+ litellm_debug_info: Optional[str] = None,
642
+ max_retries: Optional[int] = None,
643
+ num_retries: Optional[int] = None,
644
+ ):
645
+ self.message = "litellm.APIResponseValidationError: {}".format(message)
646
+ self.llm_provider = llm_provider
647
+ self.model = model
648
+ request = httpx.Request(method="POST", url="https://api.openai.com/v1")
649
+ response = httpx.Response(status_code=500, request=request)
650
+ self.litellm_debug_info = litellm_debug_info
651
+ self.max_retries = max_retries
652
+ self.num_retries = num_retries
653
+ super().__init__(response=response, body=None, message=message)
654
+
655
+ def __str__(self):
656
+ _message = self.message
657
+ if self.num_retries:
658
+ _message += f" LiteLLM Retried: {self.num_retries} times"
659
+ if self.max_retries:
660
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
661
+ return _message
662
+
663
+ def __repr__(self):
664
+ _message = self.message
665
+ if self.num_retries:
666
+ _message += f" LiteLLM Retried: {self.num_retries} times"
667
+ if self.max_retries:
668
+ _message += f", LiteLLM Max Retries: {self.max_retries}"
669
+ return _message
670
+
671
+
672
+ class JSONSchemaValidationError(APIResponseValidationError):
673
+ def __init__(
674
+ self, model: str, llm_provider: str, raw_response: str, schema: str
675
+ ) -> None:
676
+ self.raw_response = raw_response
677
+ self.schema = schema
678
+ self.model = model
679
+ message = "litellm.JSONSchemaValidationError: model={}, returned an invalid response={}, for schema={}.\nAccess raw response with `e.raw_response`".format(
680
+ model, raw_response, schema
681
+ )
682
+ self.message = message
683
+ super().__init__(model=model, message=message, llm_provider=llm_provider)
684
+
685
+
686
+ class OpenAIError(openai.OpenAIError): # type: ignore
687
+ def __init__(self, original_exception=None):
688
+ super().__init__()
689
+ self.llm_provider = "openai"
690
+
691
+
692
+ class UnsupportedParamsError(BadRequestError):
693
+ def __init__(
694
+ self,
695
+ message,
696
+ llm_provider: Optional[str] = None,
697
+ model: Optional[str] = None,
698
+ status_code: int = 400,
699
+ response: Optional[httpx.Response] = None,
700
+ litellm_debug_info: Optional[str] = None,
701
+ max_retries: Optional[int] = None,
702
+ num_retries: Optional[int] = None,
703
+ ):
704
+ self.status_code = 400
705
+ self.message = "litellm.UnsupportedParamsError: {}".format(message)
706
+ self.model = model
707
+ self.llm_provider = llm_provider
708
+ self.litellm_debug_info = litellm_debug_info
709
+ response = response or httpx.Response(
710
+ status_code=self.status_code,
711
+ request=httpx.Request(
712
+ method="GET", url="https://litellm.ai"
713
+ ), # mock request object
714
+ )
715
+ self.max_retries = max_retries
716
+ self.num_retries = num_retries
717
+
718
+
719
+ LITELLM_EXCEPTION_TYPES = [
720
+ AuthenticationError,
721
+ NotFoundError,
722
+ BadRequestError,
723
+ UnprocessableEntityError,
724
+ UnsupportedParamsError,
725
+ Timeout,
726
+ PermissionDeniedError,
727
+ RateLimitError,
728
+ ContextWindowExceededError,
729
+ RejectedRequestError,
730
+ ContentPolicyViolationError,
731
+ InternalServerError,
732
+ ServiceUnavailableError,
733
+ APIError,
734
+ APIConnectionError,
735
+ APIResponseValidationError,
736
+ OpenAIError,
737
+ InternalServerError,
738
+ JSONSchemaValidationError,
739
+ ]
740
+
741
+
742
+ class BudgetExceededError(Exception):
743
+ def __init__(
744
+ self, current_cost: float, max_budget: float, message: Optional[str] = None
745
+ ):
746
+ self.current_cost = current_cost
747
+ self.max_budget = max_budget
748
+ message = (
749
+ message
750
+ or f"Budget has been exceeded! Current cost: {current_cost}, Max budget: {max_budget}"
751
+ )
752
+ self.message = message
753
+ super().__init__(message)
754
+
755
+
756
+ ## DEPRECATED ##
757
+ class InvalidRequestError(openai.BadRequestError): # type: ignore
758
+ def __init__(self, message, model, llm_provider):
759
+ self.status_code = 400
760
+ self.message = message
761
+ self.model = model
762
+ self.llm_provider = llm_provider
763
+ self.response = httpx.Response(
764
+ status_code=400,
765
+ request=httpx.Request(
766
+ method="GET", url="https://litellm.ai"
767
+ ), # mock request object
768
+ )
769
+ super().__init__(
770
+ message=self.message, response=self.response, body=None
771
+ ) # Call the base class constructor with the parameters it needs
772
+
773
+
774
+ class MockException(openai.APIError):
775
+ # used for testing
776
+ def __init__(
777
+ self,
778
+ status_code: int,
779
+ message,
780
+ llm_provider,
781
+ model,
782
+ request: Optional[httpx.Request] = None,
783
+ litellm_debug_info: Optional[str] = None,
784
+ max_retries: Optional[int] = None,
785
+ num_retries: Optional[int] = None,
786
+ ):
787
+ self.status_code = status_code
788
+ self.message = "litellm.MockException: {}".format(message)
789
+ self.llm_provider = llm_provider
790
+ self.model = model
791
+ self.litellm_debug_info = litellm_debug_info
792
+ self.max_retries = max_retries
793
+ self.num_retries = num_retries
794
+ if request is None:
795
+ request = httpx.Request(method="POST", url="https://api.openai.com/v1")
796
+ super().__init__(self.message, request=request, body=None) # type: ignore
797
+
798
+
799
+ class LiteLLMUnknownProvider(BadRequestError):
800
+ def __init__(self, model: str, custom_llm_provider: Optional[str] = None):
801
+ self.message = LiteLLMCommonStrings.llm_provider_not_provided.value.format(
802
+ model=model, custom_llm_provider=custom_llm_provider
803
+ )
804
+ super().__init__(
805
+ self.message, model=model, llm_provider=custom_llm_provider, response=None
806
+ )
807
+
808
+ def __str__(self):
809
+ return self.message
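Each class above wraps its OpenAI counterpart, so callers can handle provider failures with a single set of exception types regardless of backend. A minimal sketch of that pattern (the model name and the handling choices are illustrative assumptions, not part of this commit):

```python
import litellm
from litellm.exceptions import (
    APIConnectionError,
    ContextWindowExceededError,
    RateLimitError,
)


def ask(prompt: str) -> str:
    try:
        resp = litellm.completion(
            model="gpt-4o-mini",  # illustrative model choice
            messages=[{"role": "user", "content": prompt}],
        )
        return resp.choices[0].message.content or ""
    except ContextWindowExceededError:
        # BadRequestError subclass raised when the prompt exceeds the model's window
        return "Prompt too long for the selected model."
    except RateLimitError as e:
        # 429-style error; str(e) includes any LiteLLM retry counts appended above
        return f"Rate limited: {e}"
    except APIConnectionError as e:
        # network-level failure reaching the provider
        return f"Connection error: {e}"
```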
litellm/experimental_mcp_client/Readme.md ADDED
@@ -0,0 +1,6 @@
1
+ # LiteLLM MCP Client
2
+
3
+ LiteLLM MCP Client lets you load MCP tools, pass them to LiteLLM completion calls, and execute the tool calls the model returns.
4
+
5
+
6
+
litellm/experimental_mcp_client/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .tools import call_openai_tool, load_mcp_tools
2
+
3
+ __all__ = ["load_mcp_tools", "call_openai_tool"]
litellm/experimental_mcp_client/client.py ADDED
File without changes
litellm/experimental_mcp_client/tools.py ADDED
@@ -0,0 +1,111 @@
1
+ import json
2
+ from typing import Dict, List, Literal, Union
3
+
4
+ from mcp import ClientSession
5
+ from mcp.types import CallToolRequestParams as MCPCallToolRequestParams
6
+ from mcp.types import CallToolResult as MCPCallToolResult
7
+ from mcp.types import Tool as MCPTool
8
+ from openai.types.chat import ChatCompletionToolParam
9
+ from openai.types.shared_params.function_definition import FunctionDefinition
10
+
11
+ from litellm.types.utils import ChatCompletionMessageToolCall
12
+
13
+
14
+ ########################################################
15
+ # List MCP Tool functions
16
+ ########################################################
17
+ def transform_mcp_tool_to_openai_tool(mcp_tool: MCPTool) -> ChatCompletionToolParam:
18
+ """Convert an MCP tool to an OpenAI tool."""
19
+ return ChatCompletionToolParam(
20
+ type="function",
21
+ function=FunctionDefinition(
22
+ name=mcp_tool.name,
23
+ description=mcp_tool.description or "",
24
+ parameters=mcp_tool.inputSchema,
25
+ strict=False,
26
+ ),
27
+ )
28
+
29
+
30
+ async def load_mcp_tools(
31
+ session: ClientSession, format: Literal["mcp", "openai"] = "mcp"
32
+ ) -> Union[List[MCPTool], List[ChatCompletionToolParam]]:
33
+ """
34
+ Load all available MCP tools
35
+
36
+ Args:
37
+ session: The MCP session to use
38
+ format: The format to convert the tools to
39
+ By default, the tools are returned in MCP format.
40
+
41
+ If format is set to "openai", the tools are converted to OpenAI API compatible tools.
42
+ """
43
+ tools = await session.list_tools()
44
+ if format == "openai":
45
+ return [
46
+ transform_mcp_tool_to_openai_tool(mcp_tool=tool) for tool in tools.tools
47
+ ]
48
+ return tools.tools
49
+
50
+
51
+ ########################################################
52
+ # Call MCP Tool functions
53
+ ########################################################
54
+
55
+
56
+ async def call_mcp_tool(
57
+ session: ClientSession,
58
+ call_tool_request_params: MCPCallToolRequestParams,
59
+ ) -> MCPCallToolResult:
60
+ """Call an MCP tool."""
61
+ tool_result = await session.call_tool(
62
+ name=call_tool_request_params.name,
63
+ arguments=call_tool_request_params.arguments,
64
+ )
65
+ return tool_result
66
+
67
+
68
+ def _get_function_arguments(function: FunctionDefinition) -> dict:
69
+ """Helper to safely get and parse function arguments."""
70
+ arguments = function.get("arguments", {})
71
+ if isinstance(arguments, str):
72
+ try:
73
+ arguments = json.loads(arguments)
74
+ except json.JSONDecodeError:
75
+ arguments = {}
76
+ return arguments if isinstance(arguments, dict) else {}
77
+
78
+
79
+ def transform_openai_tool_call_request_to_mcp_tool_call_request(
80
+ openai_tool: Union[ChatCompletionMessageToolCall, Dict],
81
+ ) -> MCPCallToolRequestParams:
82
+ """Convert an OpenAI ChatCompletionMessageToolCall to an MCP CallToolRequestParams."""
83
+ function = openai_tool["function"]
84
+ return MCPCallToolRequestParams(
85
+ name=function["name"],
86
+ arguments=_get_function_arguments(function),
87
+ )
88
+
89
+
90
+ async def call_openai_tool(
91
+ session: ClientSession,
92
+ openai_tool: ChatCompletionMessageToolCall,
93
+ ) -> MCPCallToolResult:
94
+ """
95
+ Call an OpenAI tool using MCP client.
96
+
97
+ Args:
98
+ session: The MCP session to use
99
+ openai_tool: The OpenAI tool to call. You can get this from the `choices[0].message.tool_calls[0]` of the response from the OpenAI API.
100
+ Returns:
101
+ The result of the MCP tool call.
102
+ """
103
+ mcp_tool_call_request_params = (
104
+ transform_openai_tool_call_request_to_mcp_tool_call_request(
105
+ openai_tool=openai_tool,
106
+ )
107
+ )
108
+ return await call_mcp_tool(
109
+ session=session,
110
+ call_tool_request_params=mcp_tool_call_request_params,
111
+ )
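The docstrings above describe the intended round trip: list MCP tools in OpenAI format, hand them to a completion call, then execute whichever tool the model picked with `call_openai_tool`. A hedged sketch of that flow (the stdio server command, model, and prompt are placeholders; session setup follows the standard `mcp` Python SDK client pattern):

```python
import asyncio

import litellm
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

from litellm.experimental_mcp_client import call_openai_tool, load_mcp_tools


async def main() -> None:
    # hypothetical local MCP server started over stdio
    server = StdioServerParameters(command="python", args=["./my_mcp_server.py"])
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # MCP tools converted to OpenAI `tools=[...]` entries
            tools = await load_mcp_tools(session=session, format="openai")
            resp = await litellm.acompletion(
                model="gpt-4o",  # illustrative model choice
                messages=[{"role": "user", "content": "What's 7 * 6?"}],
                tools=tools,
            )
            tool_call = resp.choices[0].message.tool_calls[0]
            result = await call_openai_tool(session=session, openai_tool=tool_call)
            print(result.content)


asyncio.run(main())
```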
litellm/files/main.py ADDED
@@ -0,0 +1,891 @@
1
+ """
2
+ Main File for Files API implementation
3
+
4
+ https://platform.openai.com/docs/api-reference/files
5
+
6
+ """
7
+
8
+ import asyncio
9
+ import contextvars
10
+ import os
11
+ from functools import partial
12
+ from typing import Any, Coroutine, Dict, Literal, Optional, Union, cast
13
+
14
+ import httpx
15
+
16
+ import litellm
17
+ from litellm import get_secret_str
18
+ from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
19
+ from litellm.llms.azure.files.handler import AzureOpenAIFilesAPI
20
+ from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
21
+ from litellm.llms.openai.openai import FileDeleted, FileObject, OpenAIFilesAPI
22
+ from litellm.llms.vertex_ai.files.handler import VertexAIFilesHandler
23
+ from litellm.types.llms.openai import (
24
+ CreateFileRequest,
25
+ FileContentRequest,
26
+ FileTypes,
27
+ HttpxBinaryResponseContent,
28
+ OpenAIFileObject,
29
+ )
30
+ from litellm.types.router import *
31
+ from litellm.types.utils import LlmProviders
32
+ from litellm.utils import (
33
+ ProviderConfigManager,
34
+ client,
35
+ get_litellm_params,
36
+ supports_httpx_timeout,
37
+ )
38
+
39
+ base_llm_http_handler = BaseLLMHTTPHandler()
40
+
41
+ ####### ENVIRONMENT VARIABLES ###################
42
+ openai_files_instance = OpenAIFilesAPI()
43
+ azure_files_instance = AzureOpenAIFilesAPI()
44
+ vertex_ai_files_instance = VertexAIFilesHandler()
45
+ #################################################
46
+
47
+
48
+ @client
49
+ async def acreate_file(
50
+ file: FileTypes,
51
+ purpose: Literal["assistants", "batch", "fine-tune"],
52
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
53
+ extra_headers: Optional[Dict[str, str]] = None,
54
+ extra_body: Optional[Dict[str, str]] = None,
55
+ **kwargs,
56
+ ) -> OpenAIFileObject:
57
+ """
58
+ Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
59
+
60
+ LiteLLM Equivalent of POST https://api.openai.com/v1/files
61
+ """
62
+ try:
63
+ loop = asyncio.get_event_loop()
64
+ kwargs["acreate_file"] = True
65
+
66
+ call_args = {
67
+ "file": file,
68
+ "purpose": purpose,
69
+ "custom_llm_provider": custom_llm_provider,
70
+ "extra_headers": extra_headers,
71
+ "extra_body": extra_body,
72
+ **kwargs,
73
+ }
74
+
75
+ # Use a partial function to pass your keyword arguments
76
+ func = partial(create_file, **call_args)
77
+
78
+ # Add the context to the function
79
+ ctx = contextvars.copy_context()
80
+ func_with_context = partial(ctx.run, func)
81
+ init_response = await loop.run_in_executor(None, func_with_context)
82
+ if asyncio.iscoroutine(init_response):
83
+ response = await init_response
84
+ else:
85
+ response = init_response # type: ignore
86
+
87
+ return response
88
+ except Exception as e:
89
+ raise e
90
+
91
+
92
+ @client
93
+ def create_file(
94
+ file: FileTypes,
95
+ purpose: Literal["assistants", "batch", "fine-tune"],
96
+ custom_llm_provider: Optional[Literal["openai", "azure", "vertex_ai"]] = None,
97
+ extra_headers: Optional[Dict[str, str]] = None,
98
+ extra_body: Optional[Dict[str, str]] = None,
99
+ **kwargs,
100
+ ) -> Union[OpenAIFileObject, Coroutine[Any, Any, OpenAIFileObject]]:
101
+ """
102
+ Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
103
+
104
+ LiteLLM Equivalent of POST https://api.openai.com/v1/files
105
+
106
+ Specify either provider_list or custom_llm_provider.
107
+ """
108
+ try:
109
+ _is_async = kwargs.pop("acreate_file", False) is True
110
+ optional_params = GenericLiteLLMParams(**kwargs)
111
+ litellm_params_dict = get_litellm_params(**kwargs)
112
+ logging_obj = cast(
113
+ Optional[LiteLLMLoggingObj], kwargs.get("litellm_logging_obj")
114
+ )
115
+ if logging_obj is None:
116
+ raise ValueError("logging_obj is required")
117
+ client = kwargs.get("client")
118
+
119
+ ### TIMEOUT LOGIC ###
120
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
121
+ # set timeout for 10 minutes by default
122
+
123
+ if (
124
+ timeout is not None
125
+ and isinstance(timeout, httpx.Timeout)
126
+ and supports_httpx_timeout(cast(str, custom_llm_provider)) is False
127
+ ):
128
+ read_timeout = timeout.read or 600
129
+ timeout = read_timeout # default 10 min timeout
130
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
131
+ timeout = float(timeout) # type: ignore
132
+ elif timeout is None:
133
+ timeout = 600.0
134
+
135
+ _create_file_request = CreateFileRequest(
136
+ file=file,
137
+ purpose=purpose,
138
+ extra_headers=extra_headers,
139
+ extra_body=extra_body,
140
+ )
141
+
142
+ provider_config = ProviderConfigManager.get_provider_files_config(
143
+ model="",
144
+ provider=LlmProviders(custom_llm_provider),
145
+ )
146
+ if provider_config is not None:
147
+ response = base_llm_http_handler.create_file(
148
+ provider_config=provider_config,
149
+ litellm_params=litellm_params_dict,
150
+ create_file_data=_create_file_request,
151
+ headers=extra_headers or {},
152
+ api_base=optional_params.api_base,
153
+ api_key=optional_params.api_key,
154
+ logging_obj=logging_obj,
155
+ _is_async=_is_async,
156
+ client=client
157
+ if client is not None
158
+ and isinstance(client, (HTTPHandler, AsyncHTTPHandler))
159
+ else None,
160
+ timeout=timeout,
161
+ )
162
+ elif custom_llm_provider == "openai":
163
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
164
+ api_base = (
165
+ optional_params.api_base
166
+ or litellm.api_base
167
+ or os.getenv("OPENAI_BASE_URL")
168
+ or os.getenv("OPENAI_API_BASE")
169
+ or "https://api.openai.com/v1"
170
+ )
171
+ organization = (
172
+ optional_params.organization
173
+ or litellm.organization
174
+ or os.getenv("OPENAI_ORGANIZATION", None)
175
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
176
+ )
177
+ # set API KEY
178
+ api_key = (
179
+ optional_params.api_key
180
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
181
+ or litellm.openai_key
182
+ or os.getenv("OPENAI_API_KEY")
183
+ )
184
+
185
+ response = openai_files_instance.create_file(
186
+ _is_async=_is_async,
187
+ api_base=api_base,
188
+ api_key=api_key,
189
+ timeout=timeout,
190
+ max_retries=optional_params.max_retries,
191
+ organization=organization,
192
+ create_file_data=_create_file_request,
193
+ )
194
+ elif custom_llm_provider == "azure":
195
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
196
+ api_version = (
197
+ optional_params.api_version
198
+ or litellm.api_version
199
+ or get_secret_str("AZURE_API_VERSION")
200
+ ) # type: ignore
201
+
202
+ api_key = (
203
+ optional_params.api_key
204
+ or litellm.api_key
205
+ or litellm.azure_key
206
+ or get_secret_str("AZURE_OPENAI_API_KEY")
207
+ or get_secret_str("AZURE_API_KEY")
208
+ ) # type: ignore
209
+
210
+ extra_body = optional_params.get("extra_body", {})
211
+ if extra_body is not None:
212
+ extra_body.pop("azure_ad_token", None)
213
+ else:
214
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
215
+
216
+ response = azure_files_instance.create_file(
217
+ _is_async=_is_async,
218
+ api_base=api_base,
219
+ api_key=api_key,
220
+ api_version=api_version,
221
+ timeout=timeout,
222
+ max_retries=optional_params.max_retries,
223
+ create_file_data=_create_file_request,
224
+ litellm_params=litellm_params_dict,
225
+ )
226
+ elif custom_llm_provider == "vertex_ai":
227
+ api_base = optional_params.api_base or ""
228
+ vertex_ai_project = (
229
+ optional_params.vertex_project
230
+ or litellm.vertex_project
231
+ or get_secret_str("VERTEXAI_PROJECT")
232
+ )
233
+ vertex_ai_location = (
234
+ optional_params.vertex_location
235
+ or litellm.vertex_location
236
+ or get_secret_str("VERTEXAI_LOCATION")
237
+ )
238
+ vertex_credentials = optional_params.vertex_credentials or get_secret_str(
239
+ "VERTEXAI_CREDENTIALS"
240
+ )
241
+
242
+ response = vertex_ai_files_instance.create_file(
243
+ _is_async=_is_async,
244
+ api_base=api_base,
245
+ vertex_project=vertex_ai_project,
246
+ vertex_location=vertex_ai_location,
247
+ vertex_credentials=vertex_credentials,
248
+ timeout=timeout,
249
+ max_retries=optional_params.max_retries,
250
+ create_file_data=_create_file_request,
251
+ )
252
+ else:
253
+ raise litellm.exceptions.BadRequestError(
254
+ message="LiteLLM doesn't support {} for 'create_file'. Only ['openai', 'azure', 'vertex_ai'] are supported.".format(
255
+ custom_llm_provider
256
+ ),
257
+ model="n/a",
258
+ llm_provider=custom_llm_provider,
259
+ response=httpx.Response(
260
+ status_code=400,
261
+ content="Unsupported provider",
262
+ request=httpx.Request(method="create_file", url="https://github.com/BerriAI/litellm"), # type: ignore
263
+ ),
264
+ )
265
+ return response
266
+ except Exception as e:
267
+ raise e
268
+
269
+
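As the docstring notes, `create_file`/`acreate_file` mirror OpenAI's `POST /v1/files`. A minimal sketch of uploading a JSONL batch-input file (the path, purpose, and provider are illustrative assumptions):

```python
import asyncio

import litellm


async def upload_batch_input() -> None:
    with open("batch_input.jsonl", "rb") as f:  # hypothetical local file
        file_obj = await litellm.acreate_file(
            file=f,
            purpose="batch",
            custom_llm_provider="openai",  # routed to OpenAIFilesAPI above
        )
    print(file_obj.id, file_obj.status)


asyncio.run(upload_batch_input())
```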
270
+ async def afile_retrieve(
271
+ file_id: str,
272
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
273
+ extra_headers: Optional[Dict[str, str]] = None,
274
+ extra_body: Optional[Dict[str, str]] = None,
275
+ **kwargs,
276
+ ):
277
+ """
278
+ Async: Retrieve a file's metadata
279
+
280
+ LiteLLM Equivalent of GET https://api.openai.com/v1/files/{file_id}
281
+ """
282
+ try:
283
+ loop = asyncio.get_event_loop()
284
+ kwargs["is_async"] = True
285
+
286
+ # Use a partial function to pass your keyword arguments
287
+ func = partial(
288
+ file_retrieve,
289
+ file_id,
290
+ custom_llm_provider,
291
+ extra_headers,
292
+ extra_body,
293
+ **kwargs,
294
+ )
295
+
296
+ # Add the context to the function
297
+ ctx = contextvars.copy_context()
298
+ func_with_context = partial(ctx.run, func)
299
+ init_response = await loop.run_in_executor(None, func_with_context)
300
+ if asyncio.iscoroutine(init_response):
301
+ response = await init_response
302
+ else:
303
+ response = init_response
304
+
305
+ return response
306
+ except Exception as e:
307
+ raise e
308
+
309
+
310
+ def file_retrieve(
311
+ file_id: str,
312
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
313
+ extra_headers: Optional[Dict[str, str]] = None,
314
+ extra_body: Optional[Dict[str, str]] = None,
315
+ **kwargs,
316
+ ) -> FileObject:
317
+ """
318
+ Returns metadata for the specified file.
319
+
320
+ LiteLLM Equivalent of GET https://api.openai.com/v1/files/{file_id}
321
+ """
322
+ try:
323
+ optional_params = GenericLiteLLMParams(**kwargs)
324
+ ### TIMEOUT LOGIC ###
325
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
326
+ # set timeout for 10 minutes by default
327
+
328
+ if (
329
+ timeout is not None
330
+ and isinstance(timeout, httpx.Timeout)
331
+ and supports_httpx_timeout(custom_llm_provider) is False
332
+ ):
333
+ read_timeout = timeout.read or 600
334
+ timeout = read_timeout # default 10 min timeout
335
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
336
+ timeout = float(timeout) # type: ignore
337
+ elif timeout is None:
338
+ timeout = 600.0
339
+
340
+ _is_async = kwargs.pop("is_async", False) is True
341
+
342
+ if custom_llm_provider == "openai":
343
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
344
+ api_base = (
345
+ optional_params.api_base
346
+ or litellm.api_base
347
+ or os.getenv("OPENAI_BASE_URL")
348
+ or os.getenv("OPENAI_API_BASE")
349
+ or "https://api.openai.com/v1"
350
+ )
351
+ organization = (
352
+ optional_params.organization
353
+ or litellm.organization
354
+ or os.getenv("OPENAI_ORGANIZATION", None)
355
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
356
+ )
357
+ # set API KEY
358
+ api_key = (
359
+ optional_params.api_key
360
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
361
+ or litellm.openai_key
362
+ or os.getenv("OPENAI_API_KEY")
363
+ )
364
+
365
+ response = openai_files_instance.retrieve_file(
366
+ file_id=file_id,
367
+ _is_async=_is_async,
368
+ api_base=api_base,
369
+ api_key=api_key,
370
+ timeout=timeout,
371
+ max_retries=optional_params.max_retries,
372
+ organization=organization,
373
+ )
374
+ elif custom_llm_provider == "azure":
375
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
376
+ api_version = (
377
+ optional_params.api_version
378
+ or litellm.api_version
379
+ or get_secret_str("AZURE_API_VERSION")
380
+ ) # type: ignore
381
+
382
+ api_key = (
383
+ optional_params.api_key
384
+ or litellm.api_key
385
+ or litellm.azure_key
386
+ or get_secret_str("AZURE_OPENAI_API_KEY")
387
+ or get_secret_str("AZURE_API_KEY")
388
+ ) # type: ignore
389
+
390
+ extra_body = optional_params.get("extra_body", {})
391
+ if extra_body is not None:
392
+ extra_body.pop("azure_ad_token", None)
393
+ else:
394
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
395
+
396
+ response = azure_files_instance.retrieve_file(
397
+ _is_async=_is_async,
398
+ api_base=api_base,
399
+ api_key=api_key,
400
+ api_version=api_version,
401
+ timeout=timeout,
402
+ max_retries=optional_params.max_retries,
403
+ file_id=file_id,
404
+ )
405
+ else:
406
+ raise litellm.exceptions.BadRequestError(
407
+ message="LiteLLM doesn't support {} for 'file_retrieve'. Only 'openai' and 'azure' are supported.".format(
408
+ custom_llm_provider
409
+ ),
410
+ model="n/a",
411
+ llm_provider=custom_llm_provider,
412
+ response=httpx.Response(
413
+ status_code=400,
414
+ content="Unsupported provider",
415
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
416
+ ),
417
+ )
418
+ return cast(FileObject, response)
419
+ except Exception as e:
420
+ raise e
421
+
422
+
423
+ # Delete file
424
+ async def afile_delete(
425
+ file_id: str,
426
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
427
+ extra_headers: Optional[Dict[str, str]] = None,
428
+ extra_body: Optional[Dict[str, str]] = None,
429
+ **kwargs,
430
+ ) -> FileDeleted:
431
+ """
432
+ Async: Delete file
433
+
434
+ LiteLLM Equivalent of DELETE https://api.openai.com/v1/files
435
+ """
436
+ try:
437
+ loop = asyncio.get_event_loop()
438
+ kwargs["is_async"] = True
439
+
440
+ # Use a partial function to pass your keyword arguments
441
+ func = partial(
442
+ file_delete,
443
+ file_id,
444
+ custom_llm_provider,
445
+ extra_headers,
446
+ extra_body,
447
+ **kwargs,
448
+ )
449
+
450
+ # Add the context to the function
451
+ ctx = contextvars.copy_context()
452
+ func_with_context = partial(ctx.run, func)
453
+ init_response = await loop.run_in_executor(None, func_with_context)
454
+ if asyncio.iscoroutine(init_response):
455
+ response = await init_response
456
+ else:
457
+ response = init_response # type: ignore
458
+
459
+ return cast(FileDeleted, response) # type: ignore
460
+ except Exception as e:
461
+ raise e
462
+
463
+
464
+ def file_delete(
465
+ file_id: str,
466
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
467
+ extra_headers: Optional[Dict[str, str]] = None,
468
+ extra_body: Optional[Dict[str, str]] = None,
469
+ **kwargs,
470
+ ) -> FileDeleted:
471
+ """
472
+ Delete file
473
+
474
+ LiteLLM Equivalent of DELETE https://api.openai.com/v1/files
475
+ """
476
+ try:
477
+ optional_params = GenericLiteLLMParams(**kwargs)
478
+ litellm_params_dict = get_litellm_params(**kwargs)
479
+ ### TIMEOUT LOGIC ###
480
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
481
+ # set timeout for 10 minutes by default
482
+ client = kwargs.get("client")
483
+
484
+ if (
485
+ timeout is not None
486
+ and isinstance(timeout, httpx.Timeout)
487
+ and supports_httpx_timeout(custom_llm_provider) is False
488
+ ):
489
+ read_timeout = timeout.read or 600
490
+ timeout = read_timeout # default 10 min timeout
491
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
492
+ timeout = float(timeout) # type: ignore
493
+ elif timeout is None:
494
+ timeout = 600.0
495
+ _is_async = kwargs.pop("is_async", False) is True
496
+ if custom_llm_provider == "openai":
497
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
498
+ api_base = (
499
+ optional_params.api_base
500
+ or litellm.api_base
501
+ or os.getenv("OPENAI_BASE_URL")
502
+ or os.getenv("OPENAI_API_BASE")
503
+ or "https://api.openai.com/v1"
504
+ )
505
+ organization = (
506
+ optional_params.organization
507
+ or litellm.organization
508
+ or os.getenv("OPENAI_ORGANIZATION", None)
509
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
510
+ )
511
+ # set API KEY
512
+ api_key = (
513
+ optional_params.api_key
514
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
515
+ or litellm.openai_key
516
+ or os.getenv("OPENAI_API_KEY")
517
+ )
518
+ response = openai_files_instance.delete_file(
519
+ file_id=file_id,
520
+ _is_async=_is_async,
521
+ api_base=api_base,
522
+ api_key=api_key,
523
+ timeout=timeout,
524
+ max_retries=optional_params.max_retries,
525
+ organization=organization,
526
+ )
527
+ elif custom_llm_provider == "azure":
528
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
529
+ api_version = (
530
+ optional_params.api_version
531
+ or litellm.api_version
532
+ or get_secret_str("AZURE_API_VERSION")
533
+ ) # type: ignore
534
+
535
+ api_key = (
536
+ optional_params.api_key
537
+ or litellm.api_key
538
+ or litellm.azure_key
539
+ or get_secret_str("AZURE_OPENAI_API_KEY")
540
+ or get_secret_str("AZURE_API_KEY")
541
+ ) # type: ignore
542
+
543
+ extra_body = optional_params.get("extra_body", {})
544
+ if extra_body is not None:
545
+ extra_body.pop("azure_ad_token", None)
546
+ else:
547
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
548
+
549
+ response = azure_files_instance.delete_file(
550
+ _is_async=_is_async,
551
+ api_base=api_base,
552
+ api_key=api_key,
553
+ api_version=api_version,
554
+ timeout=timeout,
555
+ max_retries=optional_params.max_retries,
556
+ file_id=file_id,
557
+ client=client,
558
+ litellm_params=litellm_params_dict,
559
+ )
560
+ else:
561
+ raise litellm.exceptions.BadRequestError(
562
+ message="LiteLLM doesn't support {} for 'file_delete'. Only 'openai' and 'azure' are supported.".format(
563
+ custom_llm_provider
564
+ ),
565
+ model="n/a",
566
+ llm_provider=custom_llm_provider,
567
+ response=httpx.Response(
568
+ status_code=400,
569
+ content="Unsupported provider",
570
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
571
+ ),
572
+ )
573
+ return cast(FileDeleted, response)
574
+ except Exception as e:
575
+ raise e
576
+
577
+
578
+ # List files
579
+ async def afile_list(
580
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
581
+ purpose: Optional[str] = None,
582
+ extra_headers: Optional[Dict[str, str]] = None,
583
+ extra_body: Optional[Dict[str, str]] = None,
584
+ **kwargs,
585
+ ):
586
+ """
587
+ Async: List files
588
+
589
+ LiteLLM Equivalent of GET https://api.openai.com/v1/files
590
+ """
591
+ try:
592
+ loop = asyncio.get_event_loop()
593
+ kwargs["is_async"] = True
594
+
595
+ # Use a partial function to pass your keyword arguments
596
+ func = partial(
597
+ file_list,
598
+ custom_llm_provider,
599
+ purpose,
600
+ extra_headers,
601
+ extra_body,
602
+ **kwargs,
603
+ )
604
+
605
+ # Add the context to the function
606
+ ctx = contextvars.copy_context()
607
+ func_with_context = partial(ctx.run, func)
608
+ init_response = await loop.run_in_executor(None, func_with_context)
609
+ if asyncio.iscoroutine(init_response):
610
+ response = await init_response
611
+ else:
612
+ response = init_response # type: ignore
613
+
614
+ return response
615
+ except Exception as e:
616
+ raise e
617
+
618
+
619
+ def file_list(
620
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
621
+ purpose: Optional[str] = None,
622
+ extra_headers: Optional[Dict[str, str]] = None,
623
+ extra_body: Optional[Dict[str, str]] = None,
624
+ **kwargs,
625
+ ):
626
+ """
627
+ List files
628
+
629
+ LiteLLM Equivalent of GET https://api.openai.com/v1/files
630
+ """
631
+ try:
632
+ optional_params = GenericLiteLLMParams(**kwargs)
633
+ ### TIMEOUT LOGIC ###
634
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
635
+ # set timeout for 10 minutes by default
636
+
637
+ if (
638
+ timeout is not None
639
+ and isinstance(timeout, httpx.Timeout)
640
+ and supports_httpx_timeout(custom_llm_provider) is False
641
+ ):
642
+ read_timeout = timeout.read or 600
643
+ timeout = read_timeout # default 10 min timeout
644
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
645
+ timeout = float(timeout) # type: ignore
646
+ elif timeout is None:
647
+ timeout = 600.0
648
+
649
+ _is_async = kwargs.pop("is_async", False) is True
650
+ if custom_llm_provider == "openai":
651
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
652
+ api_base = (
653
+ optional_params.api_base
654
+ or litellm.api_base
655
+ or os.getenv("OPENAI_BASE_URL")
656
+ or os.getenv("OPENAI_API_BASE")
657
+ or "https://api.openai.com/v1"
658
+ )
659
+ organization = (
660
+ optional_params.organization
661
+ or litellm.organization
662
+ or os.getenv("OPENAI_ORGANIZATION", None)
663
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
664
+ )
665
+ # set API KEY
666
+ api_key = (
667
+ optional_params.api_key
668
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
669
+ or litellm.openai_key
670
+ or os.getenv("OPENAI_API_KEY")
671
+ )
672
+
673
+ response = openai_files_instance.list_files(
674
+ purpose=purpose,
675
+ _is_async=_is_async,
676
+ api_base=api_base,
677
+ api_key=api_key,
678
+ timeout=timeout,
679
+ max_retries=optional_params.max_retries,
680
+ organization=organization,
681
+ )
682
+ elif custom_llm_provider == "azure":
683
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
684
+ api_version = (
685
+ optional_params.api_version
686
+ or litellm.api_version
687
+ or get_secret_str("AZURE_API_VERSION")
688
+ ) # type: ignore
689
+
690
+ api_key = (
691
+ optional_params.api_key
692
+ or litellm.api_key
693
+ or litellm.azure_key
694
+ or get_secret_str("AZURE_OPENAI_API_KEY")
695
+ or get_secret_str("AZURE_API_KEY")
696
+ ) # type: ignore
697
+
698
+ extra_body = optional_params.get("extra_body", {})
699
+ if extra_body is not None:
700
+ extra_body.pop("azure_ad_token", None)
701
+ else:
702
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
703
+
704
+ response = azure_files_instance.list_files(
705
+ _is_async=_is_async,
706
+ api_base=api_base,
707
+ api_key=api_key,
708
+ api_version=api_version,
709
+ timeout=timeout,
710
+ max_retries=optional_params.max_retries,
711
+ purpose=purpose,
712
+ )
713
+ else:
714
+ raise litellm.exceptions.BadRequestError(
715
+ message="LiteLLM doesn't support {} for 'file_list'. Only 'openai' and 'azure' are supported.".format(
716
+ custom_llm_provider
717
+ ),
718
+ model="n/a",
719
+ llm_provider=custom_llm_provider,
720
+ response=httpx.Response(
721
+ status_code=400,
722
+ content="Unsupported provider",
723
+ request=httpx.Request(method="file_list", url="https://github.com/BerriAI/litellm"), # type: ignore
724
+ ),
725
+ )
726
+ return response
727
+ except Exception as e:
728
+ raise e
729
+
730
+
731
+ async def afile_content(
732
+ file_id: str,
733
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
734
+ extra_headers: Optional[Dict[str, str]] = None,
735
+ extra_body: Optional[Dict[str, str]] = None,
736
+ **kwargs,
737
+ ) -> HttpxBinaryResponseContent:
738
+ """
739
+ Async: Get file contents
740
+
741
+ LiteLLM Equivalent of GET https://api.openai.com/v1/files/{file_id}/content
742
+ """
743
+ try:
744
+ loop = asyncio.get_event_loop()
745
+ kwargs["afile_content"] = True
746
+
747
+ # Use a partial function to pass your keyword arguments
748
+ func = partial(
749
+ file_content,
750
+ file_id,
751
+ custom_llm_provider,
752
+ extra_headers,
753
+ extra_body,
754
+ **kwargs,
755
+ )
756
+
757
+ # Add the context to the function
758
+ ctx = contextvars.copy_context()
759
+ func_with_context = partial(ctx.run, func)
760
+ init_response = await loop.run_in_executor(None, func_with_context)
761
+ if asyncio.iscoroutine(init_response):
762
+ response = await init_response
763
+ else:
764
+ response = init_response # type: ignore
765
+
766
+ return response
767
+ except Exception as e:
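For completeness, a hedged sketch of pulling a file's bytes back down with `afile_content` (e.g. fetching batch output). The file id is a placeholder, and reading the payload via `.content` assumes the returned `HttpxBinaryResponseContent` mirrors the OpenAI SDK's binary-response helper:

```python
import asyncio

import litellm


async def download(file_id: str) -> bytes:
    file_content_response = await litellm.afile_content(
        file_id=file_id,
        custom_llm_provider="openai",
    )
    return file_content_response.content  # assumed bytes payload


print(len(asyncio.run(download("file-abc123"))))
```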
768
+ raise e
769
+
770
+
771
+ def file_content(
772
+ file_id: str,
773
+ custom_llm_provider: Literal["openai", "azure"] = "openai",
774
+ extra_headers: Optional[Dict[str, str]] = None,
775
+ extra_body: Optional[Dict[str, str]] = None,
776
+ **kwargs,
777
+ ) -> Union[HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]]:
778
+ """
779
+ Returns the contents of the specified file.
780
+
781
+ LiteLLM Equivalent of GET https://api.openai.com/v1/files/{file_id}/content
782
+ """
783
+ try:
784
+ optional_params = GenericLiteLLMParams(**kwargs)
785
+ litellm_params_dict = get_litellm_params(**kwargs)
786
+ ### TIMEOUT LOGIC ###
787
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
788
+ client = kwargs.get("client")
789
+ # set timeout for 10 minutes by default
790
+
791
+ if (
792
+ timeout is not None
793
+ and isinstance(timeout, httpx.Timeout)
794
+ and supports_httpx_timeout(custom_llm_provider) is False
795
+ ):
796
+ read_timeout = timeout.read or 600
797
+ timeout = read_timeout # default 10 min timeout
798
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
799
+ timeout = float(timeout) # type: ignore
800
+ elif timeout is None:
801
+ timeout = 600.0
802
+
803
+ _file_content_request = FileContentRequest(
804
+ file_id=file_id,
805
+ extra_headers=extra_headers,
806
+ extra_body=extra_body,
807
+ )
808
+
809
+ _is_async = kwargs.pop("afile_content", False) is True
810
+
811
+ if custom_llm_provider == "openai":
812
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
813
+ api_base = (
814
+ optional_params.api_base
815
+ or litellm.api_base
816
+ or os.getenv("OPENAI_BASE_URL")
817
+ or os.getenv("OPENAI_API_BASE")
818
+ or "https://api.openai.com/v1"
819
+ )
820
+ organization = (
821
+ optional_params.organization
822
+ or litellm.organization
823
+ or os.getenv("OPENAI_ORGANIZATION", None)
824
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
825
+ )
826
+ # set API KEY
827
+ api_key = (
828
+ optional_params.api_key
829
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
830
+ or litellm.openai_key
831
+ or os.getenv("OPENAI_API_KEY")
832
+ )
833
+
834
+ response = openai_files_instance.file_content(
835
+ _is_async=_is_async,
836
+ file_content_request=_file_content_request,
837
+ api_base=api_base,
838
+ api_key=api_key,
839
+ timeout=timeout,
840
+ max_retries=optional_params.max_retries,
841
+ organization=organization,
842
+ )
843
+ elif custom_llm_provider == "azure":
844
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
845
+ api_version = (
846
+ optional_params.api_version
847
+ or litellm.api_version
848
+ or get_secret_str("AZURE_API_VERSION")
849
+ ) # type: ignore
850
+
851
+ api_key = (
852
+ optional_params.api_key
853
+ or litellm.api_key
854
+ or litellm.azure_key
855
+ or get_secret_str("AZURE_OPENAI_API_KEY")
856
+ or get_secret_str("AZURE_API_KEY")
857
+ ) # type: ignore
858
+
859
+ extra_body = optional_params.get("extra_body", {})
860
+ if extra_body is not None:
861
+ extra_body.pop("azure_ad_token", None)
862
+ else:
863
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
864
+
865
+ response = azure_files_instance.file_content(
866
+ _is_async=_is_async,
867
+ api_base=api_base,
868
+ api_key=api_key,
869
+ api_version=api_version,
870
+ timeout=timeout,
871
+ max_retries=optional_params.max_retries,
872
+ file_content_request=_file_content_request,
873
+ client=client,
874
+ litellm_params=litellm_params_dict,
875
+ )
876
+ else:
877
+ raise litellm.exceptions.BadRequestError(
878
+ message="LiteLLM doesn't support {} for 'custom_llm_provider'. Supported providers are 'openai', 'azure', 'vertex_ai'.".format(
879
+ custom_llm_provider
880
+ ),
881
+ model="n/a",
882
+ llm_provider=custom_llm_provider,
883
+ response=httpx.Response(
884
+ status_code=400,
885
+ content="Unsupported provider",
886
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
887
+ ),
888
+ )
889
+ return response
890
+ except Exception as e:
891
+ raise e
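
A minimal usage sketch of the file-content helpers added above. This is an illustrative example, not part of the commit: it assumes `file_content` / `afile_content` are re-exported at the `litellm` package level (as the other file APIs are), that `OPENAI_API_KEY` is set in the environment, and that `file-abc123` is a placeholder file id.

```python
import asyncio

import litellm


def sync_example() -> None:
    # file_content() returns an HttpxBinaryResponseContent wrapper around the raw bytes
    content = litellm.file_content(
        file_id="file-abc123",  # placeholder id of a previously uploaded file
        custom_llm_provider="openai",
    )
    print(content.content[:100])


async def async_example() -> None:
    # afile_content() runs the same call without blocking the event loop
    content = await litellm.afile_content(
        file_id="file-abc123",
        custom_llm_provider="openai",
    )
    print(content.content[:100])


if __name__ == "__main__":
    sync_example()
    asyncio.run(async_example())
```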
litellm/fine_tuning/main.py ADDED
@@ -0,0 +1,761 @@
1
+ """
2
+ Main File for Fine Tuning API implementation
3
+
4
+ https://platform.openai.com/docs/api-reference/fine-tuning
5
+
6
+ - fine_tuning.jobs.create()
7
+ - fine_tuning.jobs.list()
8
+ - client.fine_tuning.jobs.list_events()
9
+ """
10
+
11
+ import asyncio
12
+ import contextvars
13
+ import os
14
+ from functools import partial
15
+ from typing import Any, Coroutine, Dict, Literal, Optional, Union
16
+
17
+ import httpx
18
+
19
+ import litellm
20
+ from litellm._logging import verbose_logger
21
+ from litellm.llms.azure.fine_tuning.handler import AzureOpenAIFineTuningAPI
22
+ from litellm.llms.openai.fine_tuning.handler import OpenAIFineTuningAPI
23
+ from litellm.llms.vertex_ai.fine_tuning.handler import VertexFineTuningAPI
24
+ from litellm.secret_managers.main import get_secret_str
25
+ from litellm.types.llms.openai import (
26
+ FineTuningJob,
27
+ FineTuningJobCreate,
28
+ Hyperparameters,
29
+ )
30
+ from litellm.types.router import *
31
+ from litellm.utils import client, supports_httpx_timeout
32
+
33
+ ####### ENVIRONMENT VARIABLES ###################
34
+ openai_fine_tuning_apis_instance = OpenAIFineTuningAPI()
35
+ azure_fine_tuning_apis_instance = AzureOpenAIFineTuningAPI()
36
+ vertex_fine_tuning_apis_instance = VertexFineTuningAPI()
37
+ #################################################
38
+
39
+
40
+ @client
41
+ async def acreate_fine_tuning_job(
42
+ model: str,
43
+ training_file: str,
44
+ hyperparameters: Optional[dict] = {},
45
+ suffix: Optional[str] = None,
46
+ validation_file: Optional[str] = None,
47
+ integrations: Optional[List[str]] = None,
48
+ seed: Optional[int] = None,
49
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
50
+ extra_headers: Optional[Dict[str, str]] = None,
51
+ extra_body: Optional[Dict[str, str]] = None,
52
+ **kwargs,
53
+ ) -> FineTuningJob:
54
+ """
55
+ Async: Creates a fine-tuning job, which begins the process of creating a new model from a given dataset
56
+
57
+ """
58
+ verbose_logger.debug(
59
+ "inside acreate_fine_tuning_job model=%s and kwargs=%s", model, kwargs
60
+ )
61
+ try:
62
+ loop = asyncio.get_event_loop()
63
+ kwargs["acreate_fine_tuning_job"] = True
64
+
65
+ # Use a partial function to pass your keyword arguments
66
+ func = partial(
67
+ create_fine_tuning_job,
68
+ model,
69
+ training_file,
70
+ hyperparameters,
71
+ suffix,
72
+ validation_file,
73
+ integrations,
74
+ seed,
75
+ custom_llm_provider,
76
+ extra_headers,
77
+ extra_body,
78
+ **kwargs,
79
+ )
80
+
81
+ # Add the context to the function
82
+ ctx = contextvars.copy_context()
83
+ func_with_context = partial(ctx.run, func)
84
+ init_response = await loop.run_in_executor(None, func_with_context)
85
+ if asyncio.iscoroutine(init_response):
86
+ response = await init_response
87
+ else:
88
+ response = init_response # type: ignore
89
+ return response
90
+ except Exception as e:
91
+ raise e
92
+
93
+
94
+ @client
95
+ def create_fine_tuning_job(
96
+ model: str,
97
+ training_file: str,
98
+ hyperparameters: Optional[dict] = {},
99
+ suffix: Optional[str] = None,
100
+ validation_file: Optional[str] = None,
101
+ integrations: Optional[List[str]] = None,
102
+ seed: Optional[int] = None,
103
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
104
+ extra_headers: Optional[Dict[str, str]] = None,
105
+ extra_body: Optional[Dict[str, str]] = None,
106
+ **kwargs,
107
+ ) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
108
+ """
109
+ Creates a fine-tuning job which begins the process of creating a new model from a given dataset.
110
+
111
+ Response includes details of the enqueued job including job status and the name of the fine-tuned models once complete
112
+
113
+ """
114
+ try:
115
+ _is_async = kwargs.pop("acreate_fine_tuning_job", False) is True
116
+ optional_params = GenericLiteLLMParams(**kwargs)
117
+
118
+ # handle hyperparameters
119
+ hyperparameters = hyperparameters or {} # original hyperparameters
120
+ _oai_hyperparameters: Hyperparameters = Hyperparameters(
121
+ **hyperparameters
122
+ ) # Typed Hyperparameters for OpenAI Spec
123
+ ### TIMEOUT LOGIC ###
124
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
125
+ # set timeout for 10 minutes by default
126
+
127
+ if (
128
+ timeout is not None
129
+ and isinstance(timeout, httpx.Timeout)
130
+ and supports_httpx_timeout(custom_llm_provider) is False
131
+ ):
132
+ read_timeout = timeout.read or 600
133
+ timeout = read_timeout # default 10 min timeout
134
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
135
+ timeout = float(timeout) # type: ignore
136
+ elif timeout is None:
137
+ timeout = 600.0
138
+
139
+ # OpenAI
140
+ if custom_llm_provider == "openai":
141
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
142
+ api_base = (
143
+ optional_params.api_base
144
+ or litellm.api_base
145
+ or os.getenv("OPENAI_BASE_URL")
146
+ or os.getenv("OPENAI_API_BASE")
147
+ or "https://api.openai.com/v1"
148
+ )
149
+ organization = (
150
+ optional_params.organization
151
+ or litellm.organization
152
+ or os.getenv("OPENAI_ORGANIZATION", None)
153
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
154
+ )
155
+ # set API KEY
156
+ api_key = (
157
+ optional_params.api_key
158
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
159
+ or litellm.openai_key
160
+ or os.getenv("OPENAI_API_KEY")
161
+ )
162
+
163
+ create_fine_tuning_job_data = FineTuningJobCreate(
164
+ model=model,
165
+ training_file=training_file,
166
+ hyperparameters=_oai_hyperparameters,
167
+ suffix=suffix,
168
+ validation_file=validation_file,
169
+ integrations=integrations,
170
+ seed=seed,
171
+ )
172
+
173
+ create_fine_tuning_job_data_dict = create_fine_tuning_job_data.model_dump(
174
+ exclude_none=True
175
+ )
176
+
177
+ response = openai_fine_tuning_apis_instance.create_fine_tuning_job(
178
+ api_base=api_base,
179
+ api_key=api_key,
180
+ api_version=optional_params.api_version,
181
+ organization=organization,
182
+ create_fine_tuning_job_data=create_fine_tuning_job_data_dict,
183
+ timeout=timeout,
184
+ max_retries=optional_params.max_retries,
185
+ _is_async=_is_async,
186
+ client=kwargs.get(
187
+ "client", None
188
+ ), # note, when we add this to `GenericLiteLLMParams` it impacts a lot of other tests + linting
189
+ )
190
+ # Azure OpenAI
191
+ elif custom_llm_provider == "azure":
192
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
193
+
194
+ api_version = (
195
+ optional_params.api_version
196
+ or litellm.api_version
197
+ or get_secret_str("AZURE_API_VERSION")
198
+ ) # type: ignore
199
+
200
+ api_key = (
201
+ optional_params.api_key
202
+ or litellm.api_key
203
+ or litellm.azure_key
204
+ or get_secret_str("AZURE_OPENAI_API_KEY")
205
+ or get_secret_str("AZURE_API_KEY")
206
+ ) # type: ignore
207
+
208
+ extra_body = optional_params.get("extra_body", {})
209
+ if extra_body is not None:
210
+ extra_body.pop("azure_ad_token", None)
211
+ else:
212
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
213
+ create_fine_tuning_job_data = FineTuningJobCreate(
214
+ model=model,
215
+ training_file=training_file,
216
+ hyperparameters=_oai_hyperparameters,
217
+ suffix=suffix,
218
+ validation_file=validation_file,
219
+ integrations=integrations,
220
+ seed=seed,
221
+ )
222
+
223
+ create_fine_tuning_job_data_dict = create_fine_tuning_job_data.model_dump(
224
+ exclude_none=True
225
+ )
226
+
227
+ response = azure_fine_tuning_apis_instance.create_fine_tuning_job(
228
+ api_base=api_base,
229
+ api_key=api_key,
230
+ api_version=api_version,
231
+ create_fine_tuning_job_data=create_fine_tuning_job_data_dict,
232
+ timeout=timeout,
233
+ max_retries=optional_params.max_retries,
234
+ _is_async=_is_async,
235
+ organization=optional_params.organization,
236
+ )
237
+ elif custom_llm_provider == "vertex_ai":
238
+ api_base = optional_params.api_base or ""
239
+ vertex_ai_project = (
240
+ optional_params.vertex_project
241
+ or litellm.vertex_project
242
+ or get_secret_str("VERTEXAI_PROJECT")
243
+ )
244
+ vertex_ai_location = (
245
+ optional_params.vertex_location
246
+ or litellm.vertex_location
247
+ or get_secret_str("VERTEXAI_LOCATION")
248
+ )
249
+ vertex_credentials = optional_params.vertex_credentials or get_secret_str(
250
+ "VERTEXAI_CREDENTIALS"
251
+ )
252
+ create_fine_tuning_job_data = FineTuningJobCreate(
253
+ model=model,
254
+ training_file=training_file,
255
+ hyperparameters=_oai_hyperparameters,
256
+ suffix=suffix,
257
+ validation_file=validation_file,
258
+ integrations=integrations,
259
+ seed=seed,
260
+ )
261
+ response = vertex_fine_tuning_apis_instance.create_fine_tuning_job(
262
+ _is_async=_is_async,
263
+ create_fine_tuning_job_data=create_fine_tuning_job_data,
264
+ vertex_credentials=vertex_credentials,
265
+ vertex_project=vertex_ai_project,
266
+ vertex_location=vertex_ai_location,
267
+ timeout=timeout,
268
+ api_base=api_base,
269
+ kwargs=kwargs,
270
+ original_hyperparameters=hyperparameters,
271
+ )
272
+ else:
273
+ raise litellm.exceptions.BadRequestError(
274
+ message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
275
+ custom_llm_provider
276
+ ),
277
+ model="n/a",
278
+ llm_provider=custom_llm_provider,
279
+ response=httpx.Response(
280
+ status_code=400,
281
+ content="Unsupported provider",
282
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
283
+ ),
284
+ )
285
+ return response
286
+ except Exception as e:
287
+ verbose_logger.error("got exception in create_fine_tuning_job=%s", str(e))
288
+ raise e
289
+
290
+
291
+ async def acancel_fine_tuning_job(
292
+ fine_tuning_job_id: str,
293
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
294
+ extra_headers: Optional[Dict[str, str]] = None,
295
+ extra_body: Optional[Dict[str, str]] = None,
296
+ **kwargs,
297
+ ) -> FineTuningJob:
298
+ """
299
+ Async: Immediately cancel a fine-tune job.
300
+ """
301
+ try:
302
+ loop = asyncio.get_event_loop()
303
+ kwargs["acancel_fine_tuning_job"] = True
304
+
305
+ # Use a partial function to pass your keyword arguments
306
+ func = partial(
307
+ cancel_fine_tuning_job,
308
+ fine_tuning_job_id,
309
+ custom_llm_provider,
310
+ extra_headers,
311
+ extra_body,
312
+ **kwargs,
313
+ )
314
+
315
+ # Add the context to the function
316
+ ctx = contextvars.copy_context()
317
+ func_with_context = partial(ctx.run, func)
318
+ init_response = await loop.run_in_executor(None, func_with_context)
319
+ if asyncio.iscoroutine(init_response):
320
+ response = await init_response
321
+ else:
322
+ response = init_response # type: ignore
323
+ return response
324
+ except Exception as e:
325
+ raise e
326
+
327
+
328
+ def cancel_fine_tuning_job(
329
+ fine_tuning_job_id: str,
330
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
331
+ extra_headers: Optional[Dict[str, str]] = None,
332
+ extra_body: Optional[Dict[str, str]] = None,
333
+ **kwargs,
334
+ ) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
335
+ """
336
+ Immediately cancel a fine-tune job.
337
+
338
+ Response includes details of the enqueued job including job status and the name of the fine-tuned models once complete
339
+
340
+ """
341
+ try:
342
+ optional_params = GenericLiteLLMParams(**kwargs)
343
+ ### TIMEOUT LOGIC ###
344
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
345
+ # set timeout for 10 minutes by default
346
+
347
+ if (
348
+ timeout is not None
349
+ and isinstance(timeout, httpx.Timeout)
350
+ and supports_httpx_timeout(custom_llm_provider) is False
351
+ ):
352
+ read_timeout = timeout.read or 600
353
+ timeout = read_timeout # default 10 min timeout
354
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
355
+ timeout = float(timeout) # type: ignore
356
+ elif timeout is None:
357
+ timeout = 600.0
358
+
359
+ _is_async = kwargs.pop("acancel_fine_tuning_job", False) is True
360
+
361
+ # OpenAI
362
+ if custom_llm_provider == "openai":
363
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
364
+ api_base = (
365
+ optional_params.api_base
366
+ or litellm.api_base
367
+ or os.getenv("OPENAI_BASE_URL")
368
+ or os.getenv("OPENAI_API_BASE")
369
+ or "https://api.openai.com/v1"
370
+ )
371
+ organization = (
372
+ optional_params.organization
373
+ or litellm.organization
374
+ or os.getenv("OPENAI_ORGANIZATION", None)
375
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
376
+ )
377
+ # set API KEY
378
+ api_key = (
379
+ optional_params.api_key
380
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
381
+ or litellm.openai_key
382
+ or os.getenv("OPENAI_API_KEY")
383
+ )
384
+
385
+ response = openai_fine_tuning_apis_instance.cancel_fine_tuning_job(
386
+ api_base=api_base,
387
+ api_key=api_key,
388
+ api_version=optional_params.api_version,
389
+ organization=organization,
390
+ fine_tuning_job_id=fine_tuning_job_id,
391
+ timeout=timeout,
392
+ max_retries=optional_params.max_retries,
393
+ _is_async=_is_async,
394
+ client=kwargs.get("client", None),
395
+ )
396
+ # Azure OpenAI
397
+ elif custom_llm_provider == "azure":
398
+ api_base = optional_params.api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
399
+
400
+ api_version = (
401
+ optional_params.api_version
402
+ or litellm.api_version
403
+ or get_secret_str("AZURE_API_VERSION")
404
+ ) # type: ignore
405
+
406
+ api_key = (
407
+ optional_params.api_key
408
+ or litellm.api_key
409
+ or litellm.azure_key
410
+ or get_secret_str("AZURE_OPENAI_API_KEY")
411
+ or get_secret_str("AZURE_API_KEY")
412
+ ) # type: ignore
413
+
414
+ extra_body = optional_params.get("extra_body", {})
415
+ if extra_body is not None:
416
+ extra_body.pop("azure_ad_token", None)
417
+ else:
418
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
419
+
420
+ response = azure_fine_tuning_apis_instance.cancel_fine_tuning_job(
421
+ api_base=api_base,
422
+ api_key=api_key,
423
+ api_version=api_version,
424
+ fine_tuning_job_id=fine_tuning_job_id,
425
+ timeout=timeout,
426
+ max_retries=optional_params.max_retries,
427
+ _is_async=_is_async,
428
+ organization=optional_params.organization,
429
+ )
430
+ else:
431
+ raise litellm.exceptions.BadRequestError(
432
+ message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
433
+ custom_llm_provider
434
+ ),
435
+ model="n/a",
436
+ llm_provider=custom_llm_provider,
437
+ response=httpx.Response(
438
+ status_code=400,
439
+ content="Unsupported provider",
440
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
441
+ ),
442
+ )
443
+ return response
444
+ except Exception as e:
445
+ raise e
446
+
447
+
448
+ async def alist_fine_tuning_jobs(
449
+ after: Optional[str] = None,
450
+ limit: Optional[int] = None,
451
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
452
+ extra_headers: Optional[Dict[str, str]] = None,
453
+ extra_body: Optional[Dict[str, str]] = None,
454
+ **kwargs,
455
+ ):
456
+ """
457
+ Async: List your organization's fine-tuning jobs
458
+ """
459
+ try:
460
+ loop = asyncio.get_event_loop()
461
+ kwargs["alist_fine_tuning_jobs"] = True
462
+
463
+ # Use a partial function to pass your keyword arguments
464
+ func = partial(
465
+ list_fine_tuning_jobs,
466
+ after,
467
+ limit,
468
+ custom_llm_provider,
469
+ extra_headers,
470
+ extra_body,
471
+ **kwargs,
472
+ )
473
+
474
+ # Add the context to the function
475
+ ctx = contextvars.copy_context()
476
+ func_with_context = partial(ctx.run, func)
477
+ init_response = await loop.run_in_executor(None, func_with_context)
478
+ if asyncio.iscoroutine(init_response):
479
+ response = await init_response
480
+ else:
481
+ response = init_response # type: ignore
482
+ return response
483
+ except Exception as e:
484
+ raise e
485
+
486
+
487
+ def list_fine_tuning_jobs(
488
+ after: Optional[str] = None,
489
+ limit: Optional[int] = None,
490
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
491
+ extra_headers: Optional[Dict[str, str]] = None,
492
+ extra_body: Optional[Dict[str, str]] = None,
493
+ **kwargs,
494
+ ):
495
+ """
496
+ List your organization's fine-tuning jobs
497
+
498
+ Params:
499
+
500
+ - after: Optional[str] = None, Identifier for the last job from the previous pagination request.
501
+ - limit: Optional[int] = None, Number of fine-tuning jobs to retrieve. Defaults to 20
502
+ """
503
+ try:
504
+ optional_params = GenericLiteLLMParams(**kwargs)
505
+ ### TIMEOUT LOGIC ###
506
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
507
+ # set timeout for 10 minutes by default
508
+
509
+ if (
510
+ timeout is not None
511
+ and isinstance(timeout, httpx.Timeout)
512
+ and supports_httpx_timeout(custom_llm_provider) is False
513
+ ):
514
+ read_timeout = timeout.read or 600
515
+ timeout = read_timeout # default 10 min timeout
516
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
517
+ timeout = float(timeout) # type: ignore
518
+ elif timeout is None:
519
+ timeout = 600.0
520
+
521
+ _is_async = kwargs.pop("alist_fine_tuning_jobs", False) is True
522
+
523
+ # OpenAI
524
+ if custom_llm_provider == "openai":
525
+ # for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
526
+ api_base = (
527
+ optional_params.api_base
528
+ or litellm.api_base
529
+ or os.getenv("OPENAI_BASE_URL")
530
+ or os.getenv("OPENAI_API_BASE")
531
+ or "https://api.openai.com/v1"
532
+ )
533
+ organization = (
534
+ optional_params.organization
535
+ or litellm.organization
536
+ or os.getenv("OPENAI_ORGANIZATION", None)
537
+ or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
538
+ )
539
+ # set API KEY
540
+ api_key = (
541
+ optional_params.api_key
542
+ or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
543
+ or litellm.openai_key
544
+ or os.getenv("OPENAI_API_KEY")
545
+ )
546
+
547
+ response = openai_fine_tuning_apis_instance.list_fine_tuning_jobs(
548
+ api_base=api_base,
549
+ api_key=api_key,
550
+ api_version=optional_params.api_version,
551
+ organization=organization,
552
+ after=after,
553
+ limit=limit,
554
+ timeout=timeout,
555
+ max_retries=optional_params.max_retries,
556
+ _is_async=_is_async,
557
+ client=kwargs.get("client", None),
558
+ )
559
+ # Azure OpenAI
560
+ elif custom_llm_provider == "azure":
561
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
562
+
563
+ api_version = (
564
+ optional_params.api_version
565
+ or litellm.api_version
566
+ or get_secret_str("AZURE_API_VERSION")
567
+ ) # type: ignore
568
+
569
+ api_key = (
570
+ optional_params.api_key
571
+ or litellm.api_key
572
+ or litellm.azure_key
573
+ or get_secret_str("AZURE_OPENAI_API_KEY")
574
+ or get_secret_str("AZURE_API_KEY")
575
+ ) # type: ignore
576
+
577
+ extra_body = optional_params.get("extra_body", {})
578
+ if extra_body is not None:
579
+ extra_body.pop("azure_ad_token", None)
580
+ else:
581
+ get_secret("AZURE_AD_TOKEN") # type: ignore
582
+
583
+ response = azure_fine_tuning_apis_instance.list_fine_tuning_jobs(
584
+ api_base=api_base,
585
+ api_key=api_key,
586
+ api_version=api_version,
587
+ after=after,
588
+ limit=limit,
589
+ timeout=timeout,
590
+ max_retries=optional_params.max_retries,
591
+ _is_async=_is_async,
592
+ organization=optional_params.organization,
593
+ )
594
+ else:
595
+ raise litellm.exceptions.BadRequestError(
596
+ message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
597
+ custom_llm_provider
598
+ ),
599
+ model="n/a",
600
+ llm_provider=custom_llm_provider,
601
+ response=httpx.Response(
602
+ status_code=400,
603
+ content="Unsupported provider",
604
+ request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
605
+ ),
606
+ )
607
+ return response
608
+ except Exception as e:
609
+ raise e
610
+
611
+
612
+ async def aretrieve_fine_tuning_job(
613
+ fine_tuning_job_id: str,
614
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
615
+ extra_headers: Optional[Dict[str, str]] = None,
616
+ extra_body: Optional[Dict[str, str]] = None,
617
+ **kwargs,
618
+ ) -> FineTuningJob:
619
+ """
620
+ Async: Get info about a fine-tuning job.
621
+ """
622
+ try:
623
+ loop = asyncio.get_event_loop()
624
+ kwargs["aretrieve_fine_tuning_job"] = True
625
+
626
+ # Use a partial function to pass your keyword arguments
627
+ func = partial(
628
+ retrieve_fine_tuning_job,
629
+ fine_tuning_job_id,
630
+ custom_llm_provider,
631
+ extra_headers,
632
+ extra_body,
633
+ **kwargs,
634
+ )
635
+
636
+ # Add the context to the function
637
+ ctx = contextvars.copy_context()
638
+ func_with_context = partial(ctx.run, func)
639
+ init_response = await loop.run_in_executor(None, func_with_context)
640
+ if asyncio.iscoroutine(init_response):
641
+ response = await init_response
642
+ else:
643
+ response = init_response # type: ignore
644
+ return response
645
+ except Exception as e:
646
+ raise e
647
+
648
+
649
+ def retrieve_fine_tuning_job(
650
+ fine_tuning_job_id: str,
651
+ custom_llm_provider: Literal["openai", "azure", "vertex_ai"] = "openai",
652
+ extra_headers: Optional[Dict[str, str]] = None,
653
+ extra_body: Optional[Dict[str, str]] = None,
654
+ **kwargs,
655
+ ) -> Union[FineTuningJob, Coroutine[Any, Any, FineTuningJob]]:
656
+ """
657
+ Get info about a fine-tuning job.
658
+ """
659
+ try:
660
+ optional_params = GenericLiteLLMParams(**kwargs)
661
+ ### TIMEOUT LOGIC ###
662
+ timeout = optional_params.timeout or kwargs.get("request_timeout", 600) or 600
663
+ # set timeout for 10 minutes by default
664
+
665
+ if (
666
+ timeout is not None
667
+ and isinstance(timeout, httpx.Timeout)
668
+ and supports_httpx_timeout(custom_llm_provider) is False
669
+ ):
670
+ read_timeout = timeout.read or 600
671
+ timeout = read_timeout # default 10 min timeout
672
+ elif timeout is not None and not isinstance(timeout, httpx.Timeout):
673
+ timeout = float(timeout) # type: ignore
674
+ elif timeout is None:
675
+ timeout = 600.0
676
+
677
+ _is_async = kwargs.pop("aretrieve_fine_tuning_job", False) is True
678
+
679
+ # OpenAI
680
+ if custom_llm_provider == "openai":
681
+ api_base = (
682
+ optional_params.api_base
683
+ or litellm.api_base
684
+ or os.getenv("OPENAI_BASE_URL")
685
+ or os.getenv("OPENAI_API_BASE")
686
+ or "https://api.openai.com/v1"
687
+ )
688
+ organization = (
689
+ optional_params.organization
690
+ or litellm.organization
691
+ or os.getenv("OPENAI_ORGANIZATION", None)
692
+ or None
693
+ )
694
+ api_key = (
695
+ optional_params.api_key
696
+ or litellm.api_key
697
+ or litellm.openai_key
698
+ or os.getenv("OPENAI_API_KEY")
699
+ )
700
+
701
+ response = openai_fine_tuning_apis_instance.retrieve_fine_tuning_job(
702
+ api_base=api_base,
703
+ api_key=api_key,
704
+ api_version=optional_params.api_version,
705
+ organization=organization,
706
+ fine_tuning_job_id=fine_tuning_job_id,
707
+ timeout=timeout,
708
+ max_retries=optional_params.max_retries,
709
+ _is_async=_is_async,
710
+ client=kwargs.get("client", None),
711
+ )
712
+ # Azure OpenAI
713
+ elif custom_llm_provider == "azure":
714
+ api_base = optional_params.api_base or litellm.api_base or get_secret_str("AZURE_API_BASE") # type: ignore
715
+
716
+ api_version = (
717
+ optional_params.api_version
718
+ or litellm.api_version
719
+ or get_secret_str("AZURE_API_VERSION")
720
+ ) # type: ignore
721
+
722
+ api_key = (
723
+ optional_params.api_key
724
+ or litellm.api_key
725
+ or litellm.azure_key
726
+ or get_secret_str("AZURE_OPENAI_API_KEY")
727
+ or get_secret_str("AZURE_API_KEY")
728
+ ) # type: ignore
729
+
730
+ extra_body = optional_params.get("extra_body", {})
731
+ if extra_body is not None:
732
+ extra_body.pop("azure_ad_token", None)
733
+ else:
734
+ get_secret_str("AZURE_AD_TOKEN") # type: ignore
735
+
736
+ response = azure_fine_tuning_apis_instance.retrieve_fine_tuning_job(
737
+ api_base=api_base,
738
+ api_key=api_key,
739
+ api_version=api_version,
740
+ fine_tuning_job_id=fine_tuning_job_id,
741
+ timeout=timeout,
742
+ max_retries=optional_params.max_retries,
743
+ _is_async=_is_async,
744
+ organization=optional_params.organization,
745
+ )
746
+ else:
747
+ raise litellm.exceptions.BadRequestError(
748
+ message="LiteLLM doesn't support {} for 'retrieve_fine_tuning_job'. Only 'openai' and 'azure' are supported.".format(
749
+ custom_llm_provider
750
+ ),
751
+ model="n/a",
752
+ llm_provider=custom_llm_provider,
753
+ response=httpx.Response(
754
+ status_code=400,
755
+ content="Unsupported provider",
756
+ request=httpx.Request(method="retrieve_fine_tuning_job", url="https://github.com/BerriAI/litellm"), # type: ignore
757
+ ),
758
+ )
759
+ return response
760
+ except Exception as e:
761
+ raise e
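
To make the flow of `litellm/fine_tuning/main.py` concrete, here is a hedged end-to-end sketch of the main helpers it defines (create, retrieve, list, cancel). It assumes these functions are re-exported at the `litellm` package level, that `OPENAI_API_KEY` is set, and that `file-abc123` is a placeholder id of an uploaded training file; the model name and `n_epochs` value are arbitrary.

```python
import litellm

# create a job (the async variant is acreate_fine_tuning_job)
job = litellm.create_fine_tuning_job(
    model="gpt-3.5-turbo",
    training_file="file-abc123",
    hyperparameters={"n_epochs": 2},  # converted to the typed Hyperparameters object internally
    custom_llm_provider="openai",
)
print(job.id, job.status)

# poll the job
job = litellm.retrieve_fine_tuning_job(
    fine_tuning_job_id=job.id,
    custom_llm_provider="openai",
)

# list recent jobs, then cancel the one created above
jobs = litellm.list_fine_tuning_jobs(limit=5, custom_llm_provider="openai")
cancelled = litellm.cancel_fine_tuning_job(
    fine_tuning_job_id=job.id,
    custom_llm_provider="openai",
)
print(cancelled.status)
```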
litellm/integrations/Readme.md ADDED
@@ -0,0 +1,5 @@
1
+ # Integrations
2
+
3
+ This folder contains logging integrations for litellm.
4
+
5
+ For example: logging to Datadog, Langfuse, Prometheus, S3, GCS Buckets, etc.
litellm/integrations/SlackAlerting/Readme.md ADDED
@@ -0,0 +1,13 @@
1
+ # Slack Alerting on LiteLLM Gateway
2
+
3
+ This folder contains the Slack Alerting integration for LiteLLM Gateway.
4
+
5
+ ## Folder Structure
6
+
7
+ - `slack_alerting.py`: This is the main file that handles sending different types of alerts
8
+ - `batching_handler.py`: Handles batching and sending HTTPX POST requests to Slack. Slack alerts are sent every 10s or once the number of queued events exceeds a threshold, to keep LiteLLM performant under high traffic
9
+ - `types.py`: This file contains the AlertType enum which is used to define the different types of alerts that can be sent to Slack.
10
+ - `utils.py`: This file contains common utils used specifically for slack alerting
11
+
12
+ ## Further Reading
13
+ - [Doc setting up Alerting on LiteLLM Proxy (Gateway)](https://docs.litellm.ai/docs/proxy/alerting)
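
As a rough illustration of how this folder is wired together (not part of the commit), the sketch below constructs `SlackAlerting` directly and fires a single alert. The webhook URL is a placeholder, and the constructor and `send_alert` arguments are taken from the class added in `slack_alerting.py` further down in this diff.

```python
import asyncio

from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
from litellm.proxy._types import AlertType

slack_alerting = SlackAlerting(
    alerting=["slack"],                  # enable slack as an alerting destination
    alerting_threshold=300,              # seconds before a request counts as slow / hanging
    alert_types=[AlertType.llm_too_slow],
    default_webhook_url="https://hooks.slack.com/services/XXX/YYY/ZZZ",  # placeholder URL
)

# send_alert is async; level / alert_type mirror the values used inside slack_alerting.py
asyncio.run(
    slack_alerting.send_alert(
        message="test alert from litellm",
        level="Low",
        alert_type=AlertType.llm_too_slow,
        alerting_metadata={},
    )
)
```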
litellm/integrations/SlackAlerting/batching_handler.py ADDED
@@ -0,0 +1,81 @@
1
+ """
2
+ Handles Batching + sending Httpx Post requests to slack
3
+
4
+ Slack alerts are sent every 10s or when events are greater than X events
5
+
6
+ see custom_batch_logger.py for more details / defaults
7
+ """
8
+
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from litellm._logging import verbose_proxy_logger
12
+
13
+ if TYPE_CHECKING:
14
+ from .slack_alerting import SlackAlerting as _SlackAlerting
15
+
16
+ SlackAlertingType = _SlackAlerting
17
+ else:
18
+ SlackAlertingType = Any
19
+
20
+
21
+ def squash_payloads(queue):
22
+ squashed = {}
23
+ if len(queue) == 0:
24
+ return squashed
25
+ if len(queue) == 1:
26
+ return {"key": {"item": queue[0], "count": 1}}
27
+
28
+ for item in queue:
29
+ url = item["url"]
30
+ alert_type = item["alert_type"]
31
+ _key = (url, alert_type)
32
+
33
+ if _key in squashed:
34
+ squashed[_key]["count"] += 1
35
+ # Merge the payloads
36
+
37
+ else:
38
+ squashed[_key] = {"item": item, "count": 1}
39
+
40
+ return squashed
41
+
42
+
43
+ def _print_alerting_payload_warning(
44
+ payload: dict, slackAlertingInstance: SlackAlertingType
45
+ ):
46
+ """
47
+ Print the payload to the console when
48
+ slackAlertingInstance.alerting_args.log_to_console is True
49
+
50
+ Relevant issue: https://github.com/BerriAI/litellm/issues/7372
51
+ """
52
+ if slackAlertingInstance.alerting_args.log_to_console is True:
53
+ verbose_proxy_logger.warning(payload)
54
+
55
+
56
+ async def send_to_webhook(slackAlertingInstance: SlackAlertingType, item, count):
57
+ """
58
+ Send a single slack alert to the webhook
59
+ """
60
+ import json
61
+
62
+ payload = item.get("payload", {})
63
+ try:
64
+ if count > 1:
65
+ payload["text"] = f"[Num Alerts: {count}]\n\n{payload['text']}"
66
+
67
+ response = await slackAlertingInstance.async_http_handler.post(
68
+ url=item["url"],
69
+ headers=item["headers"],
70
+ data=json.dumps(payload),
71
+ )
72
+ if response.status_code != 200:
73
+ verbose_proxy_logger.debug(
74
+ f"Error sending slack alert to url={item['url']}. Error={response.text}"
75
+ )
76
+ except Exception as e:
77
+ verbose_proxy_logger.debug(f"Error sending slack alert: {str(e)}")
78
+ finally:
79
+ _print_alerting_payload_warning(
80
+ payload, slackAlertingInstance=slackAlertingInstance
81
+ )
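
A small, self-contained sketch of how `squash_payloads()` groups a queue of pending alerts by `(url, alert_type)` before `send_to_webhook()` posts one message per group; the queue items and webhook URLs below are made up for demonstration.

```python
from litellm.integrations.SlackAlerting.batching_handler import squash_payloads

queue = [
    {"url": "https://hooks.slack.com/services/A", "alert_type": "llm_too_slow", "payload": {"text": "slow #1"}},
    {"url": "https://hooks.slack.com/services/A", "alert_type": "llm_too_slow", "payload": {"text": "slow #2"}},
    {"url": "https://hooks.slack.com/services/B", "alert_type": "budget_alerts", "payload": {"text": "budget"}},
]

squashed = squash_payloads(queue)
for (url, alert_type), entry in squashed.items():
    # send_to_webhook() would post entry["item"] once, prefixing the text with the count
    print(url, alert_type, entry["count"])
# https://hooks.slack.com/services/A llm_too_slow 2
# https://hooks.slack.com/services/B budget_alerts 1
```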
litellm/integrations/SlackAlerting/slack_alerting.py ADDED
@@ -0,0 +1,1825 @@
1
+ #### What this does ####
2
+ # Class for sending Slack Alerts #
3
+ import asyncio
4
+ import datetime
5
+ import os
6
+ import random
7
+ import time
8
+ from datetime import timedelta
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
10
+
11
+ from openai import APIError
12
+
13
+ import litellm
14
+ import litellm.litellm_core_utils
15
+ import litellm.litellm_core_utils.litellm_logging
16
+ import litellm.types
17
+ from litellm._logging import verbose_logger, verbose_proxy_logger
18
+ from litellm.caching.caching import DualCache
19
+ from litellm.constants import HOURS_IN_A_DAY
20
+ from litellm.integrations.custom_batch_logger import CustomBatchLogger
21
+ from litellm.litellm_core_utils.duration_parser import duration_in_seconds
22
+ from litellm.litellm_core_utils.exception_mapping_utils import (
23
+ _add_key_name_and_team_to_alert,
24
+ )
25
+ from litellm.llms.custom_httpx.http_handler import (
26
+ get_async_httpx_client,
27
+ httpxSpecialProvider,
28
+ )
29
+ from litellm.proxy._types import AlertType, CallInfo, VirtualKeyEvent, WebhookEvent
30
+ from litellm.types.integrations.slack_alerting import *
31
+
32
+ from ..email_templates.templates import *
33
+ from .batching_handler import send_to_webhook, squash_payloads
34
+ from .utils import _add_langfuse_trace_id_to_alert, process_slack_alerting_variables
35
+
36
+ if TYPE_CHECKING:
37
+ from litellm.router import Router as _Router
38
+
39
+ Router = _Router
40
+ else:
41
+ Router = Any
42
+
43
+
44
+ class SlackAlerting(CustomBatchLogger):
45
+ """
46
+ Class for sending Slack Alerts
47
+ """
48
+
49
+ # Class variables or attributes
50
+ def __init__(
51
+ self,
52
+ internal_usage_cache: Optional[DualCache] = None,
53
+ alerting_threshold: Optional[
54
+ float
55
+ ] = None, # threshold for slow / hanging llm responses (in seconds)
56
+ alerting: Optional[List] = [],
57
+ alert_types: List[AlertType] = DEFAULT_ALERT_TYPES,
58
+ alert_to_webhook_url: Optional[
59
+ Dict[AlertType, Union[List[str], str]]
60
+ ] = None, # if user wants to separate alerts to diff channels
61
+ alerting_args={},
62
+ default_webhook_url: Optional[str] = None,
63
+ **kwargs,
64
+ ):
65
+ if alerting_threshold is None:
66
+ alerting_threshold = 300
67
+ self.alerting_threshold = alerting_threshold
68
+ self.alerting = alerting
69
+ self.alert_types = alert_types
70
+ self.internal_usage_cache = internal_usage_cache or DualCache()
71
+ self.async_http_handler = get_async_httpx_client(
72
+ llm_provider=httpxSpecialProvider.LoggingCallback
73
+ )
74
+ self.alert_to_webhook_url = process_slack_alerting_variables(
75
+ alert_to_webhook_url=alert_to_webhook_url
76
+ )
77
+ self.is_running = False
78
+ self.alerting_args = SlackAlertingArgs(**alerting_args)
79
+ self.default_webhook_url = default_webhook_url
80
+ self.flush_lock = asyncio.Lock()
81
+ super().__init__(**kwargs, flush_lock=self.flush_lock)
82
+
83
+ def update_values(
84
+ self,
85
+ alerting: Optional[List] = None,
86
+ alerting_threshold: Optional[float] = None,
87
+ alert_types: Optional[List[AlertType]] = None,
88
+ alert_to_webhook_url: Optional[Dict[AlertType, Union[List[str], str]]] = None,
89
+ alerting_args: Optional[Dict] = None,
90
+ llm_router: Optional[Router] = None,
91
+ ):
92
+ if alerting is not None:
93
+ self.alerting = alerting
94
+ asyncio.create_task(self.periodic_flush())
95
+ if alerting_threshold is not None:
96
+ self.alerting_threshold = alerting_threshold
97
+ if alert_types is not None:
98
+ self.alert_types = alert_types
99
+ if alerting_args is not None:
100
+ self.alerting_args = SlackAlertingArgs(**alerting_args)
101
+ if alert_to_webhook_url is not None:
102
+ # update the dict
103
+ if self.alert_to_webhook_url is None:
104
+ self.alert_to_webhook_url = process_slack_alerting_variables(
105
+ alert_to_webhook_url=alert_to_webhook_url
106
+ )
107
+ else:
108
+ _new_values = (
109
+ process_slack_alerting_variables(
110
+ alert_to_webhook_url=alert_to_webhook_url
111
+ )
112
+ or {}
113
+ )
114
+ self.alert_to_webhook_url.update(_new_values)
115
+ if llm_router is not None:
116
+ self.llm_router = llm_router
117
+
118
+ async def deployment_in_cooldown(self):
119
+ pass
120
+
121
+ async def deployment_removed_from_cooldown(self):
122
+ pass
123
+
124
+ def _all_possible_alert_types(self):
125
+ # used by the UI to show all supported alert types
126
+ # Note: This is not the alerts the user has configured, instead it's all possible alert types a user can select
127
+ # return list of all values AlertType enum
128
+ return list(AlertType)
129
+
130
+ def _response_taking_too_long_callback_helper(
131
+ self,
132
+ kwargs, # kwargs to completion
133
+ start_time,
134
+ end_time, # start/end time
135
+ ):
136
+ try:
137
+ time_difference = end_time - start_time
138
+ # Convert the timedelta to float (in seconds)
139
+ time_difference_float = time_difference.total_seconds()
140
+ litellm_params = kwargs.get("litellm_params", {})
141
+ model = kwargs.get("model", "")
142
+ api_base = litellm.get_api_base(model=model, optional_params=litellm_params)
143
+ messages = kwargs.get("messages", None)
144
+ # if messages does not exist fallback to "input"
145
+ if messages is None:
146
+ messages = kwargs.get("input", None)
147
+
148
+ # only use first 100 chars for alerting
149
+ _messages = str(messages)[:100]
150
+
151
+ return time_difference_float, model, api_base, _messages
152
+ except Exception as e:
153
+ raise e
154
+
155
+ def _get_deployment_latencies_to_alert(self, metadata=None):
156
+ if metadata is None:
157
+ return None
158
+
159
+ if "_latency_per_deployment" in metadata:
160
+ # Translate model_id to -> api_base
161
+ # _latency_per_deployment is a dictionary that looks like this:
162
+ """
163
+ _latency_per_deployment: {
164
+ api_base: 0.01336697916666667
165
+ }
166
+ """
167
+ _message_to_send = ""
168
+ _deployment_latencies = metadata["_latency_per_deployment"]
169
+ if len(_deployment_latencies) == 0:
170
+ return None
171
+ _deployment_latency_map: Optional[dict] = None
172
+ try:
173
+ # try sorting deployments by latency
174
+ _deployment_latencies = sorted(
175
+ _deployment_latencies.items(), key=lambda x: x[1]
176
+ )
177
+ _deployment_latency_map = dict(_deployment_latencies)
178
+ except Exception:
179
+ pass
180
+
181
+ if _deployment_latency_map is None:
182
+ return
183
+
184
+ for api_base, latency in _deployment_latency_map.items():
185
+ _message_to_send += f"\n{api_base}: {round(latency,2)}s"
186
+ _message_to_send = "```" + _message_to_send + "```"
187
+ return _message_to_send
188
+
189
+ async def response_taking_too_long_callback(
190
+ self,
191
+ kwargs, # kwargs to completion
192
+ completion_response, # response from completion
193
+ start_time,
194
+ end_time, # start/end time
195
+ ):
196
+ if self.alerting is None or self.alert_types is None:
197
+ return
198
+
199
+ (
200
+ time_difference_float,
201
+ model,
202
+ api_base,
203
+ messages,
204
+ ) = self._response_taking_too_long_callback_helper(
205
+ kwargs=kwargs,
206
+ start_time=start_time,
207
+ end_time=end_time,
208
+ )
209
+ if litellm.turn_off_message_logging or litellm.redact_messages_in_exceptions:
210
+ messages = "Message not logged. litellm.redact_messages_in_exceptions=True"
211
+ request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
212
+ slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
213
+ alerting_metadata: dict = {}
214
+ if time_difference_float > self.alerting_threshold:
215
+ # add deployment latencies to alert
216
+ if (
217
+ kwargs is not None
218
+ and "litellm_params" in kwargs
219
+ and "metadata" in kwargs["litellm_params"]
220
+ ):
221
+ _metadata: dict = kwargs["litellm_params"]["metadata"]
222
+ request_info = _add_key_name_and_team_to_alert(
223
+ request_info=request_info, metadata=_metadata
224
+ )
225
+
226
+ _deployment_latency_map = self._get_deployment_latencies_to_alert(
227
+ metadata=_metadata
228
+ )
229
+ if _deployment_latency_map is not None:
230
+ request_info += (
231
+ f"\nAvailable Deployment Latencies\n{_deployment_latency_map}"
232
+ )
233
+
234
+ if "alerting_metadata" in _metadata:
235
+ alerting_metadata = _metadata["alerting_metadata"]
236
+ await self.send_alert(
237
+ message=slow_message + request_info,
238
+ level="Low",
239
+ alert_type=AlertType.llm_too_slow,
240
+ alerting_metadata=alerting_metadata,
241
+ )
242
+
243
+ async def async_update_daily_reports(
244
+ self, deployment_metrics: DeploymentMetrics
245
+ ) -> int:
246
+ """
247
+ Store the perf by deployment in cache
248
+ - Number of failed requests per deployment
249
+ - Latency / output tokens per deployment
250
+
251
+ 'deployment_id:daily_metrics:failed_requests'
252
+ 'deployment_id:daily_metrics:latency_per_output_token'
253
+
254
+ Returns
255
+ int - count of metrics set (1 - if just latency, 2 - if failed + latency)
256
+ """
257
+
258
+ return_val = 0
259
+ try:
260
+ ## FAILED REQUESTS ##
261
+ if deployment_metrics.failed_request:
262
+ await self.internal_usage_cache.async_increment_cache(
263
+ key="{}:{}".format(
264
+ deployment_metrics.id,
265
+ SlackAlertingCacheKeys.failed_requests_key.value,
266
+ ),
267
+ value=1,
268
+ parent_otel_span=None, # no attached request, this is a background operation
269
+ )
270
+
271
+ return_val += 1
272
+
273
+ ## LATENCY ##
274
+ if deployment_metrics.latency_per_output_token is not None:
275
+ await self.internal_usage_cache.async_increment_cache(
276
+ key="{}:{}".format(
277
+ deployment_metrics.id, SlackAlertingCacheKeys.latency_key.value
278
+ ),
279
+ value=deployment_metrics.latency_per_output_token,
280
+ parent_otel_span=None, # no attached request, this is a background operation
281
+ )
282
+
283
+ return_val += 1
284
+
285
+ return return_val
286
+ except Exception:
287
+ return 0
288
+
289
+ async def send_daily_reports(self, router) -> bool: # noqa: PLR0915
290
+ """
291
+ Send a daily report on:
292
+ - Top 5 deployments with most failed requests
293
+ - Top 5 slowest deployments (normalized by latency/output tokens)
294
+
295
+ Get the value from redis cache (if available) or in-memory and send it
296
+
297
+ Cleanup:
298
+ - reset values in cache -> prevent memory leak
299
+
300
+ Returns:
301
+ True -> if successfully sent
302
+ False -> if not sent
303
+ """
304
+
305
+ ids = router.get_model_ids()
306
+
307
+ # get keys
308
+ failed_request_keys = [
309
+ "{}:{}".format(id, SlackAlertingCacheKeys.failed_requests_key.value)
310
+ for id in ids
311
+ ]
312
+ latency_keys = [
313
+ "{}:{}".format(id, SlackAlertingCacheKeys.latency_key.value) for id in ids
314
+ ]
315
+
316
+ combined_metrics_keys = failed_request_keys + latency_keys # reduce cache calls
317
+
318
+ combined_metrics_values = await self.internal_usage_cache.async_batch_get_cache(
319
+ keys=combined_metrics_keys
320
+ ) # [1, 2, None, ..]
321
+
322
+ if combined_metrics_values is None:
323
+ return False
324
+
325
+ all_none = True
326
+ for val in combined_metrics_values:
327
+ if val is not None and val > 0:
328
+ all_none = False
329
+ break
330
+
331
+ if all_none:
332
+ return False
333
+
334
+ failed_request_values = combined_metrics_values[
335
+ : len(failed_request_keys)
336
+ ] # # [1, 2, None, ..]
337
+ latency_values = combined_metrics_values[len(failed_request_keys) :]
338
+
339
+ # find top 5 failed
340
+ ## Replace None values with a placeholder value (0 in this case)
341
+ placeholder_value = 0
342
+ replaced_failed_values = [
343
+ value if value is not None else placeholder_value
344
+ for value in failed_request_values
345
+ ]
346
+
347
+ ## Get the indices of top 5 keys with the highest numerical values (ignoring None and 0 values)
348
+ top_5_failed = sorted(
349
+ range(len(replaced_failed_values)),
350
+ key=lambda i: replaced_failed_values[i],
351
+ reverse=True,
352
+ )[:5]
353
+ top_5_failed = [
354
+ index for index in top_5_failed if replaced_failed_values[index] > 0
355
+ ]
356
+
357
+ # find top 5 slowest
358
+ # Replace None values with a placeholder value (0 in this case)
359
+ placeholder_value = 0
360
+ replaced_slowest_values = [
361
+ value if value is not None else placeholder_value
362
+ for value in latency_values
363
+ ]
364
+
365
+ # Get the indices of top 5 values with the highest numerical values (ignoring None and 0 values)
366
+ top_5_slowest = sorted(
367
+ range(len(replaced_slowest_values)),
368
+ key=lambda i: replaced_slowest_values[i],
369
+ reverse=True,
370
+ )[:5]
371
+ top_5_slowest = [
372
+ index for index in top_5_slowest if replaced_slowest_values[index] > 0
373
+ ]
374
+
375
+ # format alert -> return the litellm model name + api base
376
+ message = f"\n\nTime: `{time.time()}`s\nHere are today's key metrics 📈: \n\n"
377
+
378
+ message += "\n\n*❗️ Top Deployments with Most Failed Requests:*\n\n"
379
+ if not top_5_failed:
380
+ message += "\tNone\n"
381
+ for i in range(len(top_5_failed)):
382
+ key = failed_request_keys[top_5_failed[i]].split(":")[0]
383
+ _deployment = router.get_model_info(key)
384
+ if isinstance(_deployment, dict):
385
+ deployment_name = _deployment["litellm_params"].get("model", "")
386
+ else:
387
+ return False
388
+
389
+ api_base = litellm.get_api_base(
390
+ model=deployment_name,
391
+ optional_params=(
392
+ _deployment["litellm_params"] if _deployment is not None else {}
393
+ ),
394
+ )
395
+ if api_base is None:
396
+ api_base = ""
397
+ value = replaced_failed_values[top_5_failed[i]]
398
+ message += f"\t{i+1}. Deployment: `{deployment_name}`, Failed Requests: `{value}`, API Base: `{api_base}`\n"
399
+
400
+ message += "\n\n*😅 Top Slowest Deployments:*\n\n"
401
+ if not top_5_slowest:
402
+ message += "\tNone\n"
403
+ for i in range(len(top_5_slowest)):
404
+ key = latency_keys[top_5_slowest[i]].split(":")[0]
405
+ _deployment = router.get_model_info(key)
406
+ if _deployment is not None:
407
+ deployment_name = _deployment["litellm_params"].get("model", "")
408
+ else:
409
+ deployment_name = ""
410
+ api_base = litellm.get_api_base(
411
+ model=deployment_name,
412
+ optional_params=(
413
+ _deployment["litellm_params"] if _deployment is not None else {}
414
+ ),
415
+ )
416
+ value = round(replaced_slowest_values[top_5_slowest[i]], 3)
417
+ message += f"\t{i+1}. Deployment: `{deployment_name}`, Latency per output token: `{value}s/token`, API Base: `{api_base}`\n\n"
418
+
419
+ # cache cleanup -> reset values to 0
420
+ latency_cache_keys = [(key, 0) for key in latency_keys]
421
+ failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
422
+ combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
423
+ await self.internal_usage_cache.async_set_cache_pipeline(
424
+ cache_list=combined_metrics_cache_keys
425
+ )
426
+
427
+ message += f"\n\nNext Run is at: `{time.time() + self.alerting_args.daily_report_frequency}`s"
428
+
429
+ # send alert
430
+ await self.send_alert(
431
+ message=message,
432
+ level="Low",
433
+ alert_type=AlertType.daily_reports,
434
+ alerting_metadata={},
435
+ )
436
+
437
+ return True
438
+
439
+ async def response_taking_too_long(
440
+ self,
441
+ start_time: Optional[datetime.datetime] = None,
442
+ end_time: Optional[datetime.datetime] = None,
443
+ type: Literal["hanging_request", "slow_response"] = "hanging_request",
444
+ request_data: Optional[dict] = None,
445
+ ):
446
+ if self.alerting is None or self.alert_types is None:
447
+ return
448
+ model: str = ""
449
+ if request_data is not None:
450
+ model = request_data.get("model", "")
451
+ messages = request_data.get("messages", None)
452
+ if messages is None:
453
+ # if messages does not exist fallback to "input"
454
+ messages = request_data.get("input", None)
455
+
456
+ # try casting messages to str and get the first 100 characters, else mark as None
457
+ try:
458
+ messages = str(messages)
459
+ messages = messages[:100]
460
+ except Exception:
461
+ messages = ""
462
+
463
+ if (
464
+ litellm.turn_off_message_logging
465
+ or litellm.redact_messages_in_exceptions
466
+ ):
467
+ messages = (
468
+ "Message not logged. litellm.redact_messages_in_exceptions=True"
469
+ )
470
+ request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
471
+ else:
472
+ request_info = ""
473
+
474
+ if type == "hanging_request":
475
+ await asyncio.sleep(
476
+ self.alerting_threshold
477
+ ) # wait for the alerting threshold (default 5 minutes) - this may need to differ for streaming, non-streaming, and non-completion (embedding + image) requests
478
+ alerting_metadata: dict = {}
479
+ if await self._request_is_completed(request_data=request_data) is True:
480
+ return
481
+
482
+ if request_data is not None:
483
+ if request_data.get("deployment", None) is not None and isinstance(
484
+ request_data["deployment"], dict
485
+ ):
486
+ _api_base = litellm.get_api_base(
487
+ model=model,
488
+ optional_params=request_data["deployment"].get(
489
+ "litellm_params", {}
490
+ ),
491
+ )
492
+
493
+ if _api_base is None:
494
+ _api_base = ""
495
+
496
+ request_info += f"\nAPI Base: {_api_base}"
497
+ elif request_data.get("metadata", None) is not None and isinstance(
498
+ request_data["metadata"], dict
499
+ ):
500
+ # In hanging requests sometime it has not made it to the point where the deployment is passed to the `request_data``
501
+ # in that case we fallback to the api base set in the request metadata
502
+ _metadata: dict = request_data["metadata"]
503
+ _api_base = _metadata.get("api_base", "")
504
+
505
+ request_info = _add_key_name_and_team_to_alert(
506
+ request_info=request_info, metadata=_metadata
507
+ )
508
+
509
+ if _api_base is None:
510
+ _api_base = ""
511
+
512
+ if "alerting_metadata" in _metadata:
513
+ alerting_metadata = _metadata["alerting_metadata"]
514
+ request_info += f"\nAPI Base: `{_api_base}`"
515
+ # only alert hanging responses if they have not been marked as success
516
+ alerting_message = (
517
+ f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
518
+ )
519
+
520
+ if "langfuse" in litellm.success_callback:
521
+ langfuse_url = await _add_langfuse_trace_id_to_alert(
522
+ request_data=request_data,
523
+ )
524
+
525
+ if langfuse_url is not None:
526
+ request_info += "\n🪢 Langfuse Trace: {}".format(langfuse_url)
527
+
528
+ # add deployment latencies to alert
529
+ _deployment_latency_map = self._get_deployment_latencies_to_alert(
530
+ metadata=request_data.get("metadata", {})
531
+ )
532
+ if _deployment_latency_map is not None:
533
+ request_info += f"\nDeployment Latencies\n{_deployment_latency_map}"
534
+
535
+ await self.send_alert(
536
+ message=alerting_message + request_info,
537
+ level="Medium",
538
+ alert_type=AlertType.llm_requests_hanging,
539
+ alerting_metadata=alerting_metadata,
540
+ )
541
+
542
+ async def failed_tracking_alert(self, error_message: str, failing_model: str):
543
+ """
544
+ Raise alert when tracking failed for specific model
545
+
546
+ Args:
547
+ error_message (str): Error message
548
+ failing_model (str): Model that failed tracking
549
+ """
550
+ if self.alerting is None or self.alert_types is None:
551
+ # do nothing if alerting is not switched on
552
+ return
553
+ if "failed_tracking_spend" not in self.alert_types:
554
+ return
555
+
556
+ _cache: DualCache = self.internal_usage_cache
557
+ message = "Failed Tracking Cost for " + error_message
558
+ _cache_key = "budget_alerts:failed_tracking:{}".format(failing_model)
559
+ result = await _cache.async_get_cache(key=_cache_key)
560
+ if result is None:
561
+ await self.send_alert(
562
+ message=message,
563
+ level="High",
564
+ alert_type=AlertType.failed_tracking_spend,
565
+ alerting_metadata={},
566
+ )
567
+ await _cache.async_set_cache(
568
+ key=_cache_key,
569
+ value="SENT",
570
+ ttl=self.alerting_args.budget_alert_ttl,
571
+ )
572
+
573
+ async def budget_alerts( # noqa: PLR0915
574
+ self,
575
+ type: Literal[
576
+ "token_budget",
577
+ "soft_budget",
578
+ "user_budget",
579
+ "team_budget",
580
+ "proxy_budget",
581
+ "projected_limit_exceeded",
582
+ ],
583
+ user_info: CallInfo,
584
+ ):
585
+ ## PREVENTIVE ALERTING ## - https://github.com/BerriAI/litellm/issues/2727
586
+ # - Alert once within 24hr period
587
+ # - Cache this information
588
+ # - Don't re-alert, if alert already sent
589
+ _cache: DualCache = self.internal_usage_cache
590
+
591
+ if self.alerting is None or self.alert_types is None:
592
+ # do nothing if alerting is not switched on
593
+ return
594
+ if "budget_alerts" not in self.alert_types:
595
+ return
596
+ _id: Optional[str] = "default_id" # used for caching
597
+ user_info_json = user_info.model_dump(exclude_none=True)
598
+ user_info_str = self._get_user_info_str(user_info)
599
+ event: Optional[
600
+ Literal[
601
+ "budget_crossed",
602
+ "threshold_crossed",
603
+ "projected_limit_exceeded",
604
+ "soft_budget_crossed",
605
+ ]
606
+ ] = None
607
+ event_group: Optional[
608
+ Literal["internal_user", "team", "key", "proxy", "customer"]
609
+ ] = None
610
+ event_message: str = ""
611
+ webhook_event: Optional[WebhookEvent] = None
612
+ if type == "proxy_budget":
613
+ event_group = "proxy"
614
+ event_message += "Proxy Budget: "
615
+ elif type == "soft_budget":
616
+ event_group = "proxy"
617
+ event_message += "Soft Budget Crossed: "
618
+ elif type == "user_budget":
619
+ event_group = "internal_user"
620
+ event_message += "User Budget: "
621
+ _id = user_info.user_id or _id
622
+ elif type == "team_budget":
623
+ event_group = "team"
624
+ event_message += "Team Budget: "
625
+ _id = user_info.team_id or _id
626
+ elif type == "token_budget":
627
+ event_group = "key"
628
+ event_message += "Key Budget: "
629
+ _id = user_info.token
630
+ elif type == "projected_limit_exceeded":
631
+ event_group = "key"
632
+ event_message += "Key Budget: Projected Limit Exceeded"
633
+ event = "projected_limit_exceeded"
634
+ _id = user_info.token
635
+
636
+ # percent of max_budget left to spend
637
+ if user_info.max_budget is None and user_info.soft_budget is None:
638
+ return
639
+ percent_left: float = 0
640
+ if user_info.max_budget is not None:
641
+ if user_info.max_budget > 0:
642
+ percent_left = (
643
+ user_info.max_budget - user_info.spend
644
+ ) / user_info.max_budget
645
+
646
+ # check if crossed budget
647
+ if user_info.max_budget is not None:
648
+ if user_info.spend >= user_info.max_budget:
649
+ event = "budget_crossed"
650
+ event_message += (
651
+ f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
652
+ )
653
+ elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
654
+ event = "threshold_crossed"
655
+ event_message += "5% Threshold Crossed "
656
+ elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
657
+ event = "threshold_crossed"
658
+ event_message += "15% Threshold Crossed"
659
+ elif user_info.soft_budget is not None:
660
+ if user_info.spend >= user_info.soft_budget:
661
+ event = "soft_budget_crossed"
662
+ if event is not None and event_group is not None:
663
+ _cache_key = "budget_alerts:{}:{}".format(event, _id)
664
+ result = await _cache.async_get_cache(key=_cache_key)
665
+ if result is None:
666
+ webhook_event = WebhookEvent(
667
+ event=event,
668
+ event_group=event_group,
669
+ event_message=event_message,
670
+ **user_info_json,
671
+ )
672
+ await self.send_alert(
673
+ message=event_message + "\n\n" + user_info_str,
674
+ level="High",
675
+ alert_type=AlertType.budget_alerts,
676
+ user_info=webhook_event,
677
+ alerting_metadata={},
678
+ )
679
+ await _cache.async_set_cache(
680
+ key=_cache_key,
681
+ value="SENT",
682
+ ttl=self.alerting_args.budget_alert_ttl,
683
+ )
684
+
685
+ return
686
+ return
687
+
688
+ def _get_user_info_str(self, user_info: CallInfo) -> str:
689
+ """
690
+ Create a standard message for a budget alert
691
+ """
692
+ _all_fields_as_dict = user_info.model_dump(exclude_none=True)
693
+ _all_fields_as_dict.pop("token")
694
+ msg = ""
695
+ for k, v in _all_fields_as_dict.items():
696
+ msg += f"*{k}:* `{v}`\n"
697
+
698
+ return msg
699
+
700
+ async def customer_spend_alert(
701
+ self,
702
+ token: Optional[str],
703
+ key_alias: Optional[str],
704
+ end_user_id: Optional[str],
705
+ response_cost: Optional[float],
706
+ max_budget: Optional[float],
707
+ ):
708
+ if (
709
+ self.alerting is not None
710
+ and "webhook" in self.alerting
711
+ and end_user_id is not None
712
+ and token is not None
713
+ and response_cost is not None
714
+ ):
715
+ # log customer spend
716
+ event = WebhookEvent(
717
+ spend=response_cost,
718
+ max_budget=max_budget,
719
+ token=token,
720
+ customer_id=end_user_id,
721
+ user_id=None,
722
+ team_id=None,
723
+ user_email=None,
724
+ key_alias=key_alias,
725
+ projected_exceeded_date=None,
726
+ projected_spend=None,
727
+ event="spend_tracked",
728
+ event_group="customer",
729
+ event_message="Customer spend tracked. Customer={}, spend={}".format(
730
+ end_user_id, response_cost
731
+ ),
732
+ )
733
+
734
+ await self.send_webhook_alert(webhook_event=event)
735
+
736
+ def _count_outage_alerts(self, alerts: List[int]) -> str:
737
+ """
738
+ Parameters:
739
+ - alerts: List[int] -> list of error codes (either 408 or 500+)
740
+
741
+ Returns:
742
+ - str -> formatted string. This is an alert message, giving a human-friendly description of the errors.
743
+ """
744
+ error_breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
745
+ for alert in alerts:
746
+ if alert == 408:
747
+ error_breakdown["Timeout Errors"] += 1
748
+ elif alert >= 500:
749
+ error_breakdown["API Errors"] += 1
750
+ else:
751
+ error_breakdown["Unknown Errors"] += 1
752
+
753
+ error_msg = ""
754
+ for key, value in error_breakdown.items():
755
+ if value > 0:
756
+ error_msg += "\n{}: {}\n".format(key, value)
757
+
758
+ return error_msg
759
+
760
+ def _outage_alert_msg_factory(
761
+ self,
762
+ alert_type: Literal["Major", "Minor"],
763
+ key: Literal["Model", "Region"],
764
+ key_val: str,
765
+ provider: str,
766
+ api_base: Optional[str],
767
+ outage_value: BaseOutageModel,
768
+ ) -> str:
769
+ """Format an alert message for slack"""
770
+ headers = {f"{key} Name": key_val, "Provider": provider}
771
+ if api_base is not None:
772
+ headers["API Base"] = api_base # type: ignore
773
+
774
+ headers_str = "\n"
775
+ for k, v in headers.items():
776
+ headers_str += f"*{k}:* `{v}`\n"
777
+ return f"""\n\n
778
+ *⚠️ {alert_type} Service Outage*
779
+
780
+ {headers_str}
781
+
782
+ *Errors:*
783
+ {self._count_outage_alerts(alerts=outage_value["alerts"])}
784
+
785
+ *Last Check:* `{round(time.time() - outage_value["last_updated_at"], 4)}s ago`\n\n
786
+ """
787
+
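For reference, a minimal standalone sketch of the error-code bucketing that `_count_outage_alerts` applies when building the message above; the function name below is illustrative and not part of this diff:

```python
# Illustrative restatement of the 408 / >=500 bucketing used in the outage alert message.
from typing import List


def count_outage_alerts(alerts: List[int]) -> str:
    breakdown = {"Timeout Errors": 0, "API Errors": 0, "Unknown Errors": 0}
    for code in alerts:
        if code == 408:
            breakdown["Timeout Errors"] += 1
        elif code >= 500:
            breakdown["API Errors"] += 1
        else:
            breakdown["Unknown Errors"] += 1
    return "".join(f"\n{k}: {v}\n" for k, v in breakdown.items() if v > 0)


print(count_outage_alerts([408, 500, 502]))  # -> Timeout Errors: 1 / API Errors: 2
```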
788
+ async def region_outage_alerts(
789
+ self,
790
+ exception: APIError,
791
+ deployment_id: str,
792
+ ) -> None:
793
+ """
794
+ Send slack alert if specific provider region is having an outage.
795
+
796
+ Track for 408 (Timeout) and >=500 Error codes
797
+ """
798
+ ## CREATE (PROVIDER+REGION) ID ##
799
+ if self.llm_router is None:
800
+ return
801
+
802
+ deployment = self.llm_router.get_deployment(model_id=deployment_id)
803
+
804
+ if deployment is None:
805
+ return
806
+
807
+ model = deployment.litellm_params.model
808
+ ### GET PROVIDER ###
809
+ provider = deployment.litellm_params.custom_llm_provider
810
+ if provider is None:
811
+ model, provider, _, _ = litellm.get_llm_provider(model=model)
812
+
813
+ ### GET REGION ###
814
+ region_name = deployment.litellm_params.region_name
815
+ if region_name is None:
816
+ region_name = litellm.utils._get_model_region(
817
+ custom_llm_provider=provider, litellm_params=deployment.litellm_params
818
+ )
819
+
820
+ if region_name is None:
821
+ return
822
+
823
+ ### UNIQUE CACHE KEY ###
824
+ cache_key = provider + region_name
825
+
826
+ outage_value: Optional[
827
+ ProviderRegionOutageModel
828
+ ] = await self.internal_usage_cache.async_get_cache(key=cache_key)
829
+
830
+ if (
831
+ getattr(exception, "status_code", None) is None
832
+ or (
833
+ exception.status_code != 408 # type: ignore
834
+ and exception.status_code < 500 # type: ignore
835
+ )
836
+ or self.llm_router is None
837
+ ):
838
+ return
839
+
840
+ if outage_value is None:
841
+ _deployment_set = set()
842
+ _deployment_set.add(deployment_id)
843
+ outage_value = ProviderRegionOutageModel(
844
+ provider_region_id=cache_key,
845
+ alerts=[exception.status_code], # type: ignore
846
+ minor_alert_sent=False,
847
+ major_alert_sent=False,
848
+ last_updated_at=time.time(),
849
+ deployment_ids=_deployment_set,
850
+ )
851
+
852
+ ## add to cache ##
853
+ await self.internal_usage_cache.async_set_cache(
854
+ key=cache_key,
855
+ value=outage_value,
856
+ ttl=self.alerting_args.region_outage_alert_ttl,
857
+ )
858
+ return
859
+
860
+ if len(outage_value["alerts"]) < self.alerting_args.max_outage_alert_list_size:
861
+ outage_value["alerts"].append(exception.status_code) # type: ignore
862
+ else: # prevent memory leaks
863
+ pass
864
+ _deployment_set = outage_value["deployment_ids"]
865
+ _deployment_set.add(deployment_id)
866
+ outage_value["deployment_ids"] = _deployment_set
867
+ outage_value["last_updated_at"] = time.time()
868
+
869
+ ## MINOR OUTAGE ALERT SENT ##
870
+ if (
871
+ outage_value["minor_alert_sent"] is False
872
+ and len(outage_value["alerts"])
873
+ >= self.alerting_args.minor_outage_alert_threshold
874
+ and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment
875
+ ):
876
+ msg = self._outage_alert_msg_factory(
877
+ alert_type="Minor",
878
+ key="Region",
879
+ key_val=region_name,
880
+ api_base=None,
881
+ outage_value=outage_value,
882
+ provider=provider,
883
+ )
884
+ # send minor alert
885
+ await self.send_alert(
886
+ message=msg,
887
+ level="Medium",
888
+ alert_type=AlertType.outage_alerts,
889
+ alerting_metadata={},
890
+ )
891
+ # set to true
892
+ outage_value["minor_alert_sent"] = True
893
+
894
+ ## MAJOR OUTAGE ALERT SENT ##
895
+ elif (
896
+ outage_value["major_alert_sent"] is False
897
+ and len(outage_value["alerts"])
898
+ >= self.alerting_args.major_outage_alert_threshold
899
+ and len(_deployment_set) > 1 # make sure it's not just 1 bad deployment
900
+ ):
901
+ msg = self._outage_alert_msg_factory(
902
+ alert_type="Major",
903
+ key="Region",
904
+ key_val=region_name,
905
+ api_base=None,
906
+ outage_value=outage_value,
907
+ provider=provider,
908
+ )
909
+
910
+ # send major alert
911
+ await self.send_alert(
912
+ message=msg,
913
+ level="High",
914
+ alert_type=AlertType.outage_alerts,
915
+ alerting_metadata={},
916
+ )
917
+ # set to true
918
+ outage_value["major_alert_sent"] = True
919
+
920
+ ## update cache ##
921
+ await self.internal_usage_cache.async_set_cache(
922
+ key=cache_key, value=outage_value
923
+ )
924
+
925
+ async def outage_alerts(
926
+ self,
927
+ exception: APIError,
928
+ deployment_id: str,
929
+ ) -> None:
930
+ """
931
+ Send slack alert if model is badly configured / having an outage (408 or >=500 error codes).
932
+
933
+ key = model_id
934
+
935
+ value = {
936
+ - model_id
937
+ - threshold
938
+ - alerts []
939
+ }
940
+
941
+ ttl = 1hr
942
+ max_alerts_size = 10
943
+ """
944
+ try:
945
+ outage_value: Optional[OutageModel] = await self.internal_usage_cache.async_get_cache(key=deployment_id) # type: ignore
946
+ if (
947
+ getattr(exception, "status_code", None) is None
948
+ or (
949
+ exception.status_code != 408 # type: ignore
950
+ and exception.status_code < 500 # type: ignore
951
+ )
952
+ or self.llm_router is None
953
+ ):
954
+ return
955
+
956
+ ### EXTRACT MODEL DETAILS ###
957
+ deployment = self.llm_router.get_deployment(model_id=deployment_id)
958
+ if deployment is None:
959
+ return
960
+
961
+ model = deployment.litellm_params.model
962
+ provider = deployment.litellm_params.custom_llm_provider
963
+ if provider is None:
964
+ try:
965
+ model, provider, _, _ = litellm.get_llm_provider(model=model)
966
+ except Exception:
967
+ provider = ""
968
+ api_base = litellm.get_api_base(
969
+ model=model, optional_params=deployment.litellm_params
970
+ )
971
+
972
+ if outage_value is None:
973
+ outage_value = OutageModel(
974
+ model_id=deployment_id,
975
+ alerts=[exception.status_code], # type: ignore
976
+ minor_alert_sent=False,
977
+ major_alert_sent=False,
978
+ last_updated_at=time.time(),
979
+ )
980
+
981
+ ## add to cache ##
982
+ await self.internal_usage_cache.async_set_cache(
983
+ key=deployment_id,
984
+ value=outage_value,
985
+ ttl=self.alerting_args.outage_alert_ttl,
986
+ )
987
+ return
988
+
989
+ if (
990
+ len(outage_value["alerts"])
991
+ < self.alerting_args.max_outage_alert_list_size
992
+ ):
993
+ outage_value["alerts"].append(exception.status_code) # type: ignore
994
+ else: # prevent memory leaks
995
+ pass
996
+
997
+ outage_value["last_updated_at"] = time.time()
998
+
999
+ ## MINOR OUTAGE ALERT SENT ##
1000
+ if (
1001
+ outage_value["minor_alert_sent"] is False
1002
+ and len(outage_value["alerts"])
1003
+ >= self.alerting_args.minor_outage_alert_threshold
1004
+ ):
1005
+ msg = self._outage_alert_msg_factory(
1006
+ alert_type="Minor",
1007
+ key="Model",
1008
+ key_val=model,
1009
+ api_base=api_base,
1010
+ outage_value=outage_value,
1011
+ provider=provider,
1012
+ )
1013
+ # send minor alert
1014
+ await self.send_alert(
1015
+ message=msg,
1016
+ level="Medium",
1017
+ alert_type=AlertType.outage_alerts,
1018
+ alerting_metadata={},
1019
+ )
1020
+ # set to true
1021
+ outage_value["minor_alert_sent"] = True
1022
+ elif (
1023
+ outage_value["major_alert_sent"] is False
1024
+ and len(outage_value["alerts"])
1025
+ >= self.alerting_args.major_outage_alert_threshold
1026
+ ):
1027
+ msg = self._outage_alert_msg_factory(
1028
+ alert_type="Major",
1029
+ key="Model",
1030
+ key_val=model,
1031
+ api_base=api_base,
1032
+ outage_value=outage_value,
1033
+ provider=provider,
1034
+ )
1035
+ # send major alert
1036
+ await self.send_alert(
1037
+ message=msg,
1038
+ level="High",
1039
+ alert_type=AlertType.outage_alerts,
1040
+ alerting_metadata={},
1041
+ )
1042
+ # set to true
1043
+ outage_value["major_alert_sent"] = True
1044
+
1045
+ ## update cache ##
1046
+ await self.internal_usage_cache.async_set_cache(
1047
+ key=deployment_id, value=outage_value
1048
+ )
1049
+ except Exception:
1050
+ pass
1051
+
1052
+ async def model_added_alert(
1053
+ self, model_name: str, litellm_model_name: str, passed_model_info: Any
1054
+ ):
1055
+ base_model_from_user = getattr(passed_model_info, "base_model", None)
1056
+ model_info = {}
1057
+ base_model = ""
1058
+ if base_model_from_user is not None:
1059
+ model_info = litellm.model_cost.get(base_model_from_user, {})
1060
+ base_model = f"Base Model: `{base_model_from_user}`\n"
1061
+ else:
1062
+ model_info = litellm.model_cost.get(litellm_model_name, {})
1063
+ model_info_str = ""
1064
+ for k, v in model_info.items():
1065
+ if k == "input_cost_per_token" or k == "output_cost_per_token":
1066
+ # format as fixed-point so the value is not rendered in scientific notation (e.g. 1.63e-06)
1067
+ v = "{:.8f}".format(v)
1068
+
1069
+ model_info_str += f"{k}: {v}\n"
1070
+
1071
+ message = f"""
1072
+ *🚅 New Model Added*
1073
+ Model Name: `{model_name}`
1074
+ {base_model}
1075
+
1076
+ Usage OpenAI Python SDK:
1077
+ ```
1078
+ import openai
1079
+ client = openai.OpenAI(
1080
+ api_key="your_api_key",
1081
+ base_url="{os.getenv("PROXY_BASE_URL", "http://0.0.0.0:4000")}"
1082
+ )
1083
+
1084
+ response = client.chat.completions.create(
1085
+ model="{model_name}", # model to send to the proxy
1086
+ messages = [
1087
+ {{
1088
+ "role": "user",
1089
+ "content": "this is a test request, write a short poem"
1090
+ }}
1091
+ ]
1092
+ )
1093
+ ```
1094
+
1095
+ Model Info:
1096
+ ```
1097
+ {model_info_str}
1098
+ ```
1099
+ """
1100
+
1101
+ alert_val = self.send_alert(
1102
+ message=message,
1103
+ level="Low",
1104
+ alert_type=AlertType.new_model_added,
1105
+ alerting_metadata={},
1106
+ )
1107
+
1108
+ if alert_val is not None and asyncio.iscoroutine(alert_val):
1109
+ await alert_val
1110
+
1111
+ async def model_removed_alert(self, model_name: str):
1112
+ pass
1113
+
1114
+ async def send_webhook_alert(self, webhook_event: WebhookEvent) -> bool:
1115
+ """
1116
+ Sends structured alert to webhook, if set.
1117
+
1118
+ Currently only implemented for budget alerts
1119
+
1120
+ Returns -> True if sent, False if not.
1121
+
1122
+ Raises Exception
1123
+ - if WEBHOOK_URL is not set
1124
+ """
1125
+
1126
+ webhook_url = os.getenv("WEBHOOK_URL", None)
1127
+ if webhook_url is None:
1128
+ raise Exception("Missing webhook_url from environment")
1129
+
1130
+ payload = webhook_event.model_dump_json()
1131
+ headers = {"Content-type": "application/json"}
1132
+
1133
+ response = await self.async_http_handler.post(
1134
+ url=webhook_url,
1135
+ headers=headers,
1136
+ data=payload,
1137
+ )
1138
+ if response.status_code == 200:
1139
+ return True
1140
+ else:
1141
+ print("Error sending webhook alert. Error=", response.text) # noqa
1142
+
1143
+ return False
1144
+
1145
+ async def _check_if_using_premium_email_feature(
1146
+ self,
1147
+ premium_user: bool,
1148
+ email_logo_url: Optional[str] = None,
1149
+ email_support_contact: Optional[str] = None,
1150
+ ):
1151
+ from litellm.proxy.proxy_server import CommonProxyErrors, premium_user
1152
+
1153
+ if premium_user is not True:
1154
+ if email_logo_url is not None or email_support_contact is not None:
1155
+ raise ValueError(
1156
+ f"Trying to Customize Email Alerting\n {CommonProxyErrors.not_premium_user.value}"
1157
+ )
1158
+ return
1159
+
1160
+ async def send_key_created_or_user_invited_email(
1161
+ self, webhook_event: WebhookEvent
1162
+ ) -> bool:
1163
+ try:
1164
+ from litellm.proxy.utils import send_email
1165
+
1166
+ if self.alerting is None or "email" not in self.alerting:
1167
+ # do nothing if user does not want email alerts
1168
+ verbose_proxy_logger.error(
1169
+ "Error sending email alert - 'email' not in self.alerting %s",
1170
+ self.alerting,
1171
+ )
1172
+ return False
1173
+ from litellm.proxy.proxy_server import premium_user, prisma_client
1174
+
1175
+ email_logo_url = os.getenv(
1176
+ "SMTP_SENDER_LOGO", os.getenv("EMAIL_LOGO_URL", None)
1177
+ )
1178
+ email_support_contact = os.getenv("EMAIL_SUPPORT_CONTACT", None)
1179
+ await self._check_if_using_premium_email_feature(
1180
+ premium_user, email_logo_url, email_support_contact
1181
+ )
1182
+ if email_logo_url is None:
1183
+ email_logo_url = LITELLM_LOGO_URL
1184
+ if email_support_contact is None:
1185
+ email_support_contact = LITELLM_SUPPORT_CONTACT
1186
+
1187
+ event_name = webhook_event.event_message
1188
+ recipient_email = webhook_event.user_email
1189
+ recipient_user_id = webhook_event.user_id
1190
+ if (
1191
+ recipient_email is None
1192
+ and recipient_user_id is not None
1193
+ and prisma_client is not None
1194
+ ):
1195
+ user_row = await prisma_client.db.litellm_usertable.find_unique(
1196
+ where={"user_id": recipient_user_id}
1197
+ )
1198
+
1199
+ if user_row is not None:
1200
+ recipient_email = user_row.user_email
1201
+
1202
+ key_token = webhook_event.token
1203
+ key_budget = webhook_event.max_budget
1204
+ base_url = os.getenv("PROXY_BASE_URL", "http://0.0.0.0:4000")
1205
+
1206
+ email_html_content = "Alert from LiteLLM Server"
1207
+ if recipient_email is None:
1208
+ verbose_proxy_logger.error(
1209
+ "Trying to send email alert to no recipient",
1210
+ extra=webhook_event.dict(),
1211
+ )
1212
+
1213
+ if webhook_event.event == "key_created":
1214
+ email_html_content = KEY_CREATED_EMAIL_TEMPLATE.format(
1215
+ email_logo_url=email_logo_url,
1216
+ recipient_email=recipient_email,
1217
+ key_budget=key_budget,
1218
+ key_token=key_token,
1219
+ base_url=base_url,
1220
+ email_support_contact=email_support_contact,
1221
+ )
1222
+ elif webhook_event.event == "internal_user_created":
1223
+ # GET TEAM NAME
1224
+ team_id = webhook_event.team_id
1225
+ team_name = "Default Team"
1226
+ if team_id is not None and prisma_client is not None:
1227
+ team_row = await prisma_client.db.litellm_teamtable.find_unique(
1228
+ where={"team_id": team_id}
1229
+ )
1230
+ if team_row is not None:
1231
+ team_name = team_row.team_alias or "-"
1232
+ email_html_content = USER_INVITED_EMAIL_TEMPLATE.format(
1233
+ email_logo_url=email_logo_url,
1234
+ recipient_email=recipient_email,
1235
+ team_name=team_name,
1236
+ base_url=base_url,
1237
+ email_support_contact=email_support_contact,
1238
+ )
1239
+ else:
1240
+ verbose_proxy_logger.error(
1241
+ "Trying to send email alert on unknown webhook event",
1242
+ extra=webhook_event.model_dump(),
1243
+ )
1244
+
1245
+ webhook_event.model_dump_json()
1246
+ email_event = {
1247
+ "to": recipient_email,
1248
+ "subject": f"LiteLLM: {event_name}",
1249
+ "html": email_html_content,
1250
+ }
1251
+
1252
+ await send_email(
1253
+ receiver_email=email_event["to"],
1254
+ subject=email_event["subject"],
1255
+ html=email_event["html"],
1256
+ )
1257
+
1258
+ return True
1259
+
1260
+ except Exception as e:
1261
+ verbose_proxy_logger.error("Error sending email alert %s", str(e))
1262
+ return False
1263
+
1264
+ async def send_email_alert_using_smtp(
1265
+ self, webhook_event: WebhookEvent, alert_type: str
1266
+ ) -> bool:
1267
+ """
1268
+ Sends structured Email alert to an SMTP server
1269
+
1270
+ Currently only implemented for budget alerts
1271
+
1272
+ Returns -> True if sent, False if not.
1273
+ """
1274
+ from litellm.proxy.proxy_server import premium_user
1275
+ from litellm.proxy.utils import send_email
1276
+
1277
+ email_logo_url = os.getenv(
1278
+ "SMTP_SENDER_LOGO", os.getenv("EMAIL_LOGO_URL", None)
1279
+ )
1280
+ email_support_contact = os.getenv("EMAIL_SUPPORT_CONTACT", None)
1281
+ await self._check_if_using_premium_email_feature(
1282
+ premium_user, email_logo_url, email_support_contact
1283
+ )
1284
+
1285
+ if email_logo_url is None:
1286
+ email_logo_url = LITELLM_LOGO_URL
1287
+ if email_support_contact is None:
1288
+ email_support_contact = LITELLM_SUPPORT_CONTACT
1289
+
1290
+ event_name = webhook_event.event_message
1291
+ recipient_email = webhook_event.user_email
1292
+ user_name = webhook_event.user_id
1293
+ max_budget = webhook_event.max_budget
1294
+ email_html_content = "Alert from LiteLLM Server"
1295
+ if recipient_email is None:
1296
+ verbose_proxy_logger.error(
1297
+ "Trying to send email alert to no recipient", extra=webhook_event.dict()
1298
+ )
1299
+
1300
+ if webhook_event.event == "budget_crossed":
1301
+ email_html_content = f"""
1302
+ <img src="{email_logo_url}" alt="LiteLLM Logo" width="150" height="50" />
1303
+
1304
+ <p> Hi {user_name}, <br/>
1305
+
1306
+ Your LLM API usage this month has reached your account's <b> monthly budget of ${max_budget} </b> <br /> <br />
1307
+
1308
+ API requests will be rejected until either (a) you increase your monthly budget or (b) your monthly usage resets at the beginning of the next calendar month. <br /> <br />
1309
+
1310
+ If you have any questions, please send an email to {email_support_contact} <br /> <br />
1311
+
1312
+ Best, <br />
1313
+ The LiteLLM team <br />
1314
+ """
1315
+
1316
+ webhook_event.model_dump_json()
1317
+ email_event = {
1318
+ "to": recipient_email,
1319
+ "subject": f"LiteLLM: {event_name}",
1320
+ "html": email_html_content,
1321
+ }
1322
+
1323
+ await send_email(
1324
+ receiver_email=email_event["to"],
1325
+ subject=email_event["subject"],
1326
+ html=email_event["html"],
1327
+ )
1328
+ if webhook_event.event_group == "team":
1329
+ from litellm.integrations.email_alerting import send_team_budget_alert
1330
+
1331
+ await send_team_budget_alert(webhook_event=webhook_event)
1332
+
1333
+ return False
1334
+
1335
+ async def send_alert(
1336
+ self,
1337
+ message: str,
1338
+ level: Literal["Low", "Medium", "High"],
1339
+ alert_type: AlertType,
1340
+ alerting_metadata: dict,
1341
+ user_info: Optional[WebhookEvent] = None,
1342
+ **kwargs,
1343
+ ):
1344
+ """
1345
+ Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
1346
+
1347
+ - Responses taking too long
1348
+ - Requests are hanging
1349
+ - Calls are failing
1350
+ - DB Read/Writes are failing
1351
+ - Proxy Close to max budget
1352
+ - Key Close to max budget
1353
+
1354
+ Parameters:
1355
+ level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
1356
+ message: str - what is the alert about
1357
+ """
1358
+ if self.alerting is None:
1359
+ return
1360
+
1361
+ if (
1362
+ "webhook" in self.alerting
1363
+ and alert_type == "budget_alerts"
1364
+ and user_info is not None
1365
+ ):
1366
+ await self.send_webhook_alert(webhook_event=user_info)
1367
+
1368
+ if (
1369
+ "email" in self.alerting
1370
+ and alert_type == "budget_alerts"
1371
+ and user_info is not None
1372
+ ):
1373
+ # only send budget alerts over Email
1374
+ await self.send_email_alert_using_smtp(
1375
+ webhook_event=user_info, alert_type=alert_type
1376
+ )
1377
+
1378
+ if "slack" not in self.alerting:
1379
+ return
1380
+ if alert_type not in self.alert_types:
1381
+ return
1382
+
1383
+ from datetime import datetime
1384
+
1385
+ # Get the current timestamp
1386
+ current_time = datetime.now().strftime("%H:%M:%S")
1387
+ _proxy_base_url = os.getenv("PROXY_BASE_URL", None)
1388
+ if alert_type == "daily_reports" or alert_type == "new_model_added":
1389
+ formatted_message = message
1390
+ else:
1391
+ formatted_message = (
1392
+ f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
1393
+ )
1394
+
1395
+ if kwargs:
1396
+ for key, value in kwargs.items():
1397
+ formatted_message += f"\n\n{key}: `{value}`\n\n"
1398
+ if alerting_metadata:
1399
+ for key, value in alerting_metadata.items():
1400
+ formatted_message += f"\n\n*Alerting Metadata*: \n{key}: `{value}`\n\n"
1401
+ if _proxy_base_url is not None:
1402
+ formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
1403
+
1404
+ # check if we find the slack webhook url in self.alert_to_webhook_url
1405
+ if (
1406
+ self.alert_to_webhook_url is not None
1407
+ and alert_type in self.alert_to_webhook_url
1408
+ ):
1409
+ slack_webhook_url: Optional[
1410
+ Union[str, List[str]]
1411
+ ] = self.alert_to_webhook_url[alert_type]
1412
+ elif self.default_webhook_url is not None:
1413
+ slack_webhook_url = self.default_webhook_url
1414
+ else:
1415
+ slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
1416
+
1417
+ if slack_webhook_url is None:
1418
+ raise ValueError("Missing SLACK_WEBHOOK_URL from environment")
1419
+ payload = {"text": formatted_message}
1420
+ headers = {"Content-type": "application/json"}
1421
+
1422
+ if isinstance(slack_webhook_url, list):
1423
+ for url in slack_webhook_url:
1424
+ self.log_queue.append(
1425
+ {
1426
+ "url": url,
1427
+ "headers": headers,
1428
+ "payload": payload,
1429
+ "alert_type": alert_type,
1430
+ }
1431
+ )
1432
+ else:
1433
+ self.log_queue.append(
1434
+ {
1435
+ "url": slack_webhook_url,
1436
+ "headers": headers,
1437
+ "payload": payload,
1438
+ "alert_type": alert_type,
1439
+ }
1440
+ )
1441
+
1442
+ if len(self.log_queue) >= self.batch_size:
1443
+ await self.flush_queue()
1444
+
1445
+ async def async_send_batch(self):
1446
+ if not self.log_queue:
1447
+ return
1448
+
1449
+ squashed_queue = squash_payloads(self.log_queue)
1450
+ tasks = [
1451
+ send_to_webhook(
1452
+ slackAlertingInstance=self, item=item["item"], count=item["count"]
1453
+ )
1454
+ for item in squashed_queue.values()
1455
+ ]
1456
+ await asyncio.gather(*tasks)
1457
+ self.log_queue.clear()
1458
+
1459
+ async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
1460
+ """Log deployment latency"""
1461
+ try:
1462
+ if "daily_reports" in self.alert_types:
1463
+ litellm_params = kwargs.get("litellm_params", {}) or {}
1464
+ model_info = litellm_params.get("model_info", {}) or {}
1465
+ model_id = model_info.get("id", "") or ""
1466
+ response_s: timedelta = end_time - start_time
1467
+
1468
+ final_value = response_s
1469
+
1470
+ if isinstance(response_obj, litellm.ModelResponse) and (
1471
+ hasattr(response_obj, "usage")
1472
+ and response_obj.usage is not None # type: ignore
1473
+ and hasattr(response_obj.usage, "completion_tokens") # type: ignore
1474
+ ):
1475
+ completion_tokens = response_obj.usage.completion_tokens # type: ignore
1476
+ if completion_tokens is not None and completion_tokens > 0:
1477
+ final_value = float(
1478
+ response_s.total_seconds() / completion_tokens
1479
+ )
1480
+ if isinstance(final_value, timedelta):
1481
+ final_value = final_value.total_seconds()
1482
+
1483
+ await self.async_update_daily_reports(
1484
+ DeploymentMetrics(
1485
+ id=model_id,
1486
+ failed_request=False,
1487
+ latency_per_output_token=final_value,
1488
+ updated_at=litellm.utils.get_utc_datetime(),
1489
+ )
1490
+ )
1491
+ except Exception as e:
1492
+ verbose_proxy_logger.error(
1493
+ f"[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: {str(e)}"
1494
+ )
1495
+ pass
1496
+
1497
+ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
1498
+ """Log failure + deployment latency"""
1499
+ _litellm_params = kwargs.get("litellm_params", {})
1500
+ _model_info = _litellm_params.get("model_info", {}) or {}
1501
+ model_id = _model_info.get("id", "")
1502
+ try:
1503
+ if "daily_reports" in self.alert_types:
1504
+ try:
1505
+ await self.async_update_daily_reports(
1506
+ DeploymentMetrics(
1507
+ id=model_id,
1508
+ failed_request=True,
1509
+ latency_per_output_token=None,
1510
+ updated_at=litellm.utils.get_utc_datetime(),
1511
+ )
1512
+ )
1513
+ except Exception as e:
1514
+ verbose_logger.debug(f"Exception raises -{str(e)}")
1515
+
1516
+ if isinstance(kwargs.get("exception", ""), APIError):
1517
+ if "outage_alerts" in self.alert_types:
1518
+ await self.outage_alerts(
1519
+ exception=kwargs["exception"],
1520
+ deployment_id=model_id,
1521
+ )
1522
+
1523
+ if "region_outage_alerts" in self.alert_types:
1524
+ await self.region_outage_alerts(
1525
+ exception=kwargs["exception"], deployment_id=model_id
1526
+ )
1527
+ except Exception:
1528
+ pass
1529
+
1530
+ async def _run_scheduler_helper(self, llm_router) -> bool:
1531
+ """
1532
+ Returns:
1533
+ - True -> report sent
1534
+ - False -> report not sent
1535
+ """
1536
+ report_sent_bool = False
1537
+
1538
+ report_sent = await self.internal_usage_cache.async_get_cache(
1539
+ key=SlackAlertingCacheKeys.report_sent_key.value,
1540
+ parent_otel_span=None,
1541
+ ) # None | float
1542
+
1543
+ current_time = time.time()
1544
+
1545
+ if report_sent is None:
1546
+ await self.internal_usage_cache.async_set_cache(
1547
+ key=SlackAlertingCacheKeys.report_sent_key.value,
1548
+ value=current_time,
1549
+ )
1550
+ elif isinstance(report_sent, float):
1551
+ # Check if current time - interval >= time last sent
1552
+ interval_seconds = self.alerting_args.daily_report_frequency
1553
+
1554
+ if current_time - report_sent >= interval_seconds:
1555
+ # the reporting interval has elapsed - send the daily report
1556
+ await self.send_daily_reports(router=llm_router)
1557
+ # update the report_sent timestamp after sending the report
1558
+ await self.internal_usage_cache.async_set_cache(
1559
+ key=SlackAlertingCacheKeys.report_sent_key.value,
1560
+ value=current_time,
1561
+ )
1562
+ report_sent_bool = True
1563
+
1564
+ return report_sent_bool
1565
+
1566
+ async def _run_scheduled_daily_report(self, llm_router: Optional[Any] = None):
1567
+ """
1568
+ If 'daily_reports' enabled
1569
+
1570
+ Poll the internal usage cache every `report_check_interval` seconds to check if we should send the report
1571
+
1572
+ If yes -> call send_daily_reports()
1573
+ """
1574
+ if llm_router is None or self.alert_types is None:
1575
+ return
1576
+
1577
+ if "daily_reports" in self.alert_types:
1578
+ while True:
1579
+ await self._run_scheduler_helper(llm_router=llm_router)
1580
+ interval = random.randint(
1581
+ self.alerting_args.report_check_interval - 3,
1582
+ self.alerting_args.report_check_interval + 3,
1583
+ ) # add jitter to prevent collisions
1584
+ await asyncio.sleep(interval)
1585
+ return
1586
+
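As a quick illustration of the scheduling pattern above (poll on a jittered interval, send at most once per reporting window), here is a minimal sketch that uses a plain dict in place of the internal usage cache; all names and defaults are illustrative:

```python
# Minimal sketch: poll with jitter, send the daily report at most once per window.
import asyncio
import random
import time

_cache: dict = {}  # stand-in for the internal usage cache


async def maybe_send_report(frequency_s: float) -> bool:
    last_sent = _cache.get("report_sent")
    now = time.time()
    if last_sent is None:
        _cache["report_sent"] = now  # first run only records the timestamp
        return False
    if now - last_sent >= frequency_s:
        print("sending daily report ...")  # send_daily_reports(...) would run here
        _cache["report_sent"] = now
        return True
    return False


async def scheduler(check_interval_s: int = 300, frequency_s: float = 12 * 60 * 60):
    while True:
        await maybe_send_report(frequency_s)
        # jitter the poll interval so multiple proxy instances don't fire together
        await asyncio.sleep(random.randint(check_interval_s - 3, check_interval_s + 3))
```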
1587
+ async def send_weekly_spend_report(
1588
+ self,
1589
+ time_range: str = "7d",
1590
+ ):
1591
+ """
1592
+ Send a spend report for a configurable time range.
1593
+
1594
+ Args:
1595
+ time_range: A string specifying the time range for the report, e.g., "1d", "7d", "30d"
1596
+ """
1597
+ if self.alerting is None or "spend_reports" not in self.alert_types:
1598
+ return
1599
+
1600
+ try:
1601
+ from litellm.proxy.spend_tracking.spend_management_endpoints import (
1602
+ _get_spend_report_for_time_range,
1603
+ )
1604
+
1605
+ # Parse the time range
1606
+ if time_range[-1].lower() != "d":
1607
+ raise ValueError("Time range must be specified in days, e.g., '7d'")
1608
+ days = int(time_range[:-1])
1609
+
1610
+ todays_date = datetime.datetime.now().date()
1611
+ start_date = todays_date - datetime.timedelta(days=days)
1612
+
1613
+ _event_cache_key = f"weekly_spend_report_sent_{start_date.strftime('%Y-%m-%d')}_{todays_date.strftime('%Y-%m-%d')}"
1614
+ if await self.internal_usage_cache.async_get_cache(key=_event_cache_key):
1615
+ return
1616
+
1617
+ _resp = await _get_spend_report_for_time_range(
1618
+ start_date=start_date.strftime("%Y-%m-%d"),
1619
+ end_date=todays_date.strftime("%Y-%m-%d"),
1620
+ )
1621
+ if _resp is None or _resp == ([], []):
1622
+ return
1623
+
1624
+ spend_per_team, spend_per_tag = _resp
1625
+
1626
+ _spend_message = f"*💸 Spend Report for `{start_date.strftime('%m-%d-%Y')} - {todays_date.strftime('%m-%d-%Y')}` ({days} days)*\n"
1627
+
1628
+ if spend_per_team is not None:
1629
+ _spend_message += "\n*Team Spend Report:*\n"
1630
+ for spend in spend_per_team:
1631
+ _team_spend = round(float(spend["total_spend"]), 4)
1632
+ _spend_message += (
1633
+ f"Team: `{spend['team_alias']}` | Spend: `${_team_spend}`\n"
1634
+ )
1635
+
1636
+ if spend_per_tag is not None:
1637
+ _spend_message += "\n*Tag Spend Report:*\n"
1638
+ for spend in spend_per_tag:
1639
+ _tag_spend = round(float(spend["total_spend"]), 4)
1640
+ _spend_message += f"Tag: `{spend['individual_request_tag']}` | Spend: `${_tag_spend}`\n"
1641
+
1642
+ await self.send_alert(
1643
+ message=_spend_message,
1644
+ level="Low",
1645
+ alert_type=AlertType.spend_reports,
1646
+ alerting_metadata={},
1647
+ )
1648
+
1649
+ await self.internal_usage_cache.async_set_cache(
1650
+ key=_event_cache_key,
1651
+ value="SENT",
1652
+ ttl=duration_in_seconds(time_range),
1653
+ )
1654
+
1655
+ except ValueError as ve:
1656
+ verbose_proxy_logger.error(f"Invalid time range format: {ve}")
1657
+ except Exception as e:
1658
+ verbose_proxy_logger.error(f"Error sending spend report: {e}")
1659
+
1660
+ async def send_monthly_spend_report(self):
1661
+ """ """
1662
+ try:
1663
+ from calendar import monthrange
1664
+
1665
+ from litellm.proxy.spend_tracking.spend_management_endpoints import (
1666
+ _get_spend_report_for_time_range,
1667
+ )
1668
+
1669
+ todays_date = datetime.datetime.now().date()
1670
+ first_day_of_month = todays_date.replace(day=1)
1671
+ _, last_day_of_month = monthrange(todays_date.year, todays_date.month)
1672
+ last_day_of_month = first_day_of_month + datetime.timedelta(
1673
+ days=last_day_of_month - 1
1674
+ )
1675
+
1676
+ _event_cache_key = f"monthly_spend_report_sent_{first_day_of_month.strftime('%Y-%m-%d')}_{last_day_of_month.strftime('%Y-%m-%d')}"
1677
+ if await self.internal_usage_cache.async_get_cache(key=_event_cache_key):
1678
+ return
1679
+
1680
+ _resp = await _get_spend_report_for_time_range(
1681
+ start_date=first_day_of_month.strftime("%Y-%m-%d"),
1682
+ end_date=last_day_of_month.strftime("%Y-%m-%d"),
1683
+ )
1684
+
1685
+ if _resp is None or _resp == ([], []):
1686
+ return
1687
+
1688
+ monthly_spend_per_team, monthly_spend_per_tag = _resp
1689
+
1690
+ _spend_message = f"*💸 Monthly Spend Report for `{first_day_of_month.strftime('%m-%d-%Y')} - {last_day_of_month.strftime('%m-%d-%Y')}` *\n"
1691
+
1692
+ if monthly_spend_per_team is not None:
1693
+ _spend_message += "\n*Team Spend Report:*\n"
1694
+ for spend in monthly_spend_per_team:
1695
+ _team_spend = spend["total_spend"]
1696
+ _team_spend = float(_team_spend)
1697
+ # round to 4 decimal places
1698
+ _team_spend = round(_team_spend, 4)
1699
+ _spend_message += (
1700
+ f"Team: `{spend['team_alias']}` | Spend: `${_team_spend}`\n"
1701
+ )
1702
+
1703
+ if monthly_spend_per_tag is not None:
1704
+ _spend_message += "\n*Tag Spend Report:*\n"
1705
+ for spend in monthly_spend_per_tag:
1706
+ _tag_spend = spend["total_spend"]
1707
+ _tag_spend = float(_tag_spend)
1708
+ # round to 4 decimal places
1709
+ _tag_spend = round(_tag_spend, 4)
1710
+ _spend_message += f"Tag: `{spend['individual_request_tag']}` | Spend: `${_tag_spend}`\n"
1711
+
1712
+ await self.send_alert(
1713
+ message=_spend_message,
1714
+ level="Low",
1715
+ alert_type=AlertType.spend_reports,
1716
+ alerting_metadata={},
1717
+ )
1718
+
1719
+ await self.internal_usage_cache.async_set_cache(
1720
+ key=_event_cache_key,
1721
+ value="SENT",
1722
+ ttl=(30 * HOURS_IN_A_DAY * 60 * 60), # 1 month
1723
+ )
1724
+
1725
+ except Exception as e:
1726
+ verbose_proxy_logger.exception("Error sending weekly spend report %s", e)
1727
+
1728
+ async def send_fallback_stats_from_prometheus(self):
1729
+ """
1730
+ Helper to send fallback statistics from prometheus server -> to slack
1731
+
1732
+ This runs once per day and sends an overview of all the fallback statistics
1733
+ """
1734
+ try:
1735
+ from litellm.integrations.prometheus_helpers.prometheus_api import (
1736
+ get_fallback_metric_from_prometheus,
1737
+ )
1738
+
1739
+ # query prometheus for the fallback metrics
1740
+ fallback_success_info_prometheus = (
1741
+ await get_fallback_metric_from_prometheus()
1742
+ )
1743
+
1744
+ fallback_message = (
1745
+ f"*Fallback Statistics:*\n{falllback_success_info_prometheus}"
1746
+ )
1747
+
1748
+ await self.send_alert(
1749
+ message=fallback_message,
1750
+ level="Low",
1751
+ alert_type=AlertType.fallback_reports,
1752
+ alerting_metadata={},
1753
+ )
1754
+
1755
+ except Exception as e:
1756
+ verbose_proxy_logger.error("Error sending weekly spend report %s", e)
1757
+
1758
+ pass
1759
+
1760
+ async def send_virtual_key_event_slack(
1761
+ self,
1762
+ key_event: VirtualKeyEvent,
1763
+ alert_type: AlertType,
1764
+ event_name: str,
1765
+ ):
1766
+ """
1767
+ Handles sending Virtual Key related alerts
1768
+
1769
+ Example:
1770
+ - New Virtual Key Created
1771
+ - Internal User Updated
1772
+ - Team Created, Updated, Deleted
1773
+ """
1774
+ try:
1775
+ message = f"`{event_name}`\n"
1776
+
1777
+ key_event_dict = key_event.model_dump()
1778
+
1779
+ # Add Created by information first
1780
+ message += "*Action Done by:*\n"
1781
+ for key, value in key_event_dict.items():
1782
+ if "created_by" in key:
1783
+ message += f"{key}: `{value}`\n"
1784
+
1785
+ # Add args sent to function in the alert
1786
+ message += "\n*Arguments passed:*\n"
1787
+ request_kwargs = key_event.request_kwargs
1788
+ for key, value in request_kwargs.items():
1789
+ if key == "user_api_key_dict":
1790
+ continue
1791
+ message += f"{key}: `{value}`\n"
1792
+
1793
+ await self.send_alert(
1794
+ message=message,
1795
+ level="High",
1796
+ alert_type=alert_type,
1797
+ alerting_metadata={},
1798
+ )
1799
+
1800
+ except Exception as e:
1801
+ verbose_proxy_logger.error(
1802
+ "Error sending send_virtual_key_event_slack %s", e
1803
+ )
1804
+
1805
+ return
1806
+
1807
+ async def _request_is_completed(self, request_data: Optional[dict]) -> bool:
1808
+ """
1809
+ Returns True if the request is completed - either as a success or failure
1810
+ """
1811
+ if request_data is None:
1812
+ return False
1813
+
1814
+ if (
1815
+ request_data.get("litellm_status", "") != "success"
1816
+ and request_data.get("litellm_status", "") != "fail"
1817
+ ):
1818
+ ## CHECK IF CACHE IS UPDATED
1819
+ litellm_call_id = request_data.get("litellm_call_id", "")
1820
+ status: Optional[str] = await self.internal_usage_cache.async_get_cache(
1821
+ key="request_status:{}".format(litellm_call_id), local_only=True
1822
+ )
1823
+ if status is not None and (status == "success" or status == "fail"):
1824
+ return True
1825
+ return False
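Taken together, `send_alert` resolves a Slack webhook per alert type (falling back to the default webhook or the `SLACK_WEBHOOK_URL` env var), wraps the formatted message in Slack's `{"text": ...}` payload, and batches it onto `log_queue`. A minimal standalone sketch of that resolution-and-payload step, with illustrative names rather than the class API:

```python
# Illustrative sketch of the webhook resolution + payload construction done by send_alert.
import os
from typing import Dict, List, Optional, Union


def resolve_slack_webhook(
    alert_type: str,
    alert_to_webhook_url: Optional[Dict[str, Union[str, List[str]]]] = None,
    default_webhook_url: Optional[str] = None,
) -> Union[str, List[str]]:
    if alert_to_webhook_url and alert_type in alert_to_webhook_url:
        return alert_to_webhook_url[alert_type]
    if default_webhook_url is not None:
        return default_webhook_url
    url = os.getenv("SLACK_WEBHOOK_URL")
    if url is None:
        raise ValueError("Missing SLACK_WEBHOOK_URL from environment")
    return url


payload = {"text": "Level: `High`\nMessage: example alert"}  # Slack incoming-webhook body
headers = {"Content-type": "application/json"}
# each queued item looks like {"url": ..., "headers": headers, "payload": payload, "alert_type": ...}
```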
litellm/integrations/SlackAlerting/utils.py ADDED
@@ -0,0 +1,92 @@
1
+ """
2
+ Utils used for slack alerting
3
+ """
4
+
5
+ import asyncio
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
7
+
8
+ from litellm.proxy._types import AlertType
9
+ from litellm.secret_managers.main import get_secret
10
+
11
+ if TYPE_CHECKING:
12
+ from litellm.litellm_core_utils.litellm_logging import Logging as _Logging
13
+
14
+ Logging = _Logging
15
+ else:
16
+ Logging = Any
17
+
18
+
19
+ def process_slack_alerting_variables(
20
+ alert_to_webhook_url: Optional[Dict[AlertType, Union[List[str], str]]]
21
+ ) -> Optional[Dict[AlertType, Union[List[str], str]]]:
22
+ """
23
+ process alert_to_webhook_url
24
+ - check if any urls are set as os.environ/SLACK_WEBHOOK_URL_1 read env var and set the correct value
25
+ """
26
+ if alert_to_webhook_url is None:
27
+ return None
28
+
29
+ for alert_type, webhook_urls in alert_to_webhook_url.items():
30
+ if isinstance(webhook_urls, list):
31
+ _webhook_values: List[str] = []
32
+ for webhook_url in webhook_urls:
33
+ if "os.environ/" in webhook_url:
34
+ _env_value = get_secret(secret_name=webhook_url)
35
+ if not isinstance(_env_value, str):
36
+ raise ValueError(
37
+ f"Invalid webhook url value for: {webhook_url}. Got type={type(_env_value)}"
38
+ )
39
+ _webhook_values.append(_env_value)
40
+ else:
41
+ _webhook_values.append(webhook_url)
42
+
43
+ alert_to_webhook_url[alert_type] = _webhook_values
44
+ else:
45
+ _webhook_value_str: str = webhook_urls
46
+ if "os.environ/" in webhook_urls:
47
+ _env_value = get_secret(secret_name=webhook_urls)
48
+ if not isinstance(_env_value, str):
49
+ raise ValueError(
50
+ f"Invalid webhook url value for: {webhook_urls}. Got type={type(_env_value)}"
51
+ )
52
+ _webhook_value_str = _env_value
53
+ else:
54
+ _webhook_value_str = webhook_urls
55
+
56
+ alert_to_webhook_url[alert_type] = _webhook_value_str
57
+
58
+ return alert_to_webhook_url
59
+
60
+
61
+ async def _add_langfuse_trace_id_to_alert(
62
+ request_data: Optional[dict] = None,
63
+ ) -> Optional[str]:
64
+ """
65
+ Returns langfuse trace url
66
+
67
+ - check:
68
+ -> existing_trace_id
69
+ -> trace_id
70
+ -> litellm_call_id
71
+ """
72
+ # do nothing for now
73
+ if (
74
+ request_data is not None
75
+ and request_data.get("litellm_logging_obj", None) is not None
76
+ ):
77
+ trace_id: Optional[str] = None
78
+ litellm_logging_obj: Logging = request_data["litellm_logging_obj"]
79
+
80
+ for _ in range(3):
81
+ trace_id = litellm_logging_obj._get_trace_id(service_name="langfuse")
82
+ if trace_id is not None:
83
+ break
84
+ await asyncio.sleep(3) # wait 3s before retrying for trace id
85
+
86
+ _langfuse_object = litellm_logging_obj._get_callback_object(
87
+ service_name="langfuse"
88
+ )
89
+ if _langfuse_object is not None:
90
+ base_url = _langfuse_object.Langfuse.base_url
91
+ return f"{base_url}/trace/{trace_id}"
92
+ return None
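A hedged usage example for `process_slack_alerting_variables`: values written as `os.environ/<VAR>` are resolved through `get_secret` (which reads the environment), so a mapping like the one below ends up holding the concrete webhook URL. The env var name and URL are placeholders, and the import path is inferred from this file's location:

```python
# Hypothetical usage of process_slack_alerting_variables (env var name / URL are examples).
import os

from litellm.integrations.SlackAlerting.utils import process_slack_alerting_variables
from litellm.proxy._types import AlertType

os.environ["SLACK_WEBHOOK_URL_1"] = "https://hooks.slack.com/services/T000/B000/XXXX"

resolved = process_slack_alerting_variables(
    alert_to_webhook_url={
        AlertType.budget_alerts: "os.environ/SLACK_WEBHOOK_URL_1",
        AlertType.llm_requests_hanging: ["os.environ/SLACK_WEBHOOK_URL_1"],
    }
)
print(resolved)  # both entries now hold the resolved https://hooks.slack.com/... url
```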
litellm/integrations/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import *
litellm/integrations/_types/open_inference.py ADDED
@@ -0,0 +1,389 @@
1
+ from enum import Enum
2
+
3
+
4
+ class SpanAttributes:
5
+ OUTPUT_VALUE = "output.value"
6
+ OUTPUT_MIME_TYPE = "output.mime_type"
7
+ """
8
+ The type of output.value. If unspecified, the type is plain text by default.
9
+ If type is JSON, the value is a string representing a JSON object.
10
+ """
11
+ INPUT_VALUE = "input.value"
12
+ INPUT_MIME_TYPE = "input.mime_type"
13
+ """
14
+ The type of input.value. If unspecified, the type is plain text by default.
15
+ If type is JSON, the value is a string representing a JSON object.
16
+ """
17
+
18
+ EMBEDDING_EMBEDDINGS = "embedding.embeddings"
19
+ """
20
+ A list of objects containing embedding data, including the vector and represented piece of text.
21
+ """
22
+ EMBEDDING_MODEL_NAME = "embedding.model_name"
23
+ """
24
+ The name of the embedding model.
25
+ """
26
+
27
+ LLM_FUNCTION_CALL = "llm.function_call"
28
+ """
29
+ For models and APIs that support function calling. Records attributes such as the function
30
+ name and arguments to the called function.
31
+ """
32
+ LLM_INVOCATION_PARAMETERS = "llm.invocation_parameters"
33
+ """
34
+ Invocation parameters passed to the LLM or API, such as the model name, temperature, etc.
35
+ """
36
+ LLM_INPUT_MESSAGES = "llm.input_messages"
37
+ """
38
+ Messages provided to a chat API.
39
+ """
40
+ LLM_OUTPUT_MESSAGES = "llm.output_messages"
41
+ """
42
+ Messages received from a chat API.
43
+ """
44
+ LLM_MODEL_NAME = "llm.model_name"
45
+ """
46
+ The name of the model being used.
47
+ """
48
+ LLM_PROVIDER = "llm.provider"
49
+ """
50
+ The provider of the model, such as OpenAI, Azure, Google, etc.
51
+ """
52
+ LLM_SYSTEM = "llm.system"
53
+ """
54
+ The AI product as identified by the client or server
55
+ """
56
+ LLM_PROMPTS = "llm.prompts"
57
+ """
58
+ Prompts provided to a completions API.
59
+ """
60
+ LLM_PROMPT_TEMPLATE = "llm.prompt_template.template"
61
+ """
62
+ The prompt template as a Python f-string.
63
+ """
64
+ LLM_PROMPT_TEMPLATE_VARIABLES = "llm.prompt_template.variables"
65
+ """
66
+ A list of input variables to the prompt template.
67
+ """
68
+ LLM_PROMPT_TEMPLATE_VERSION = "llm.prompt_template.version"
69
+ """
70
+ The version of the prompt template being used.
71
+ """
72
+ LLM_TOKEN_COUNT_PROMPT = "llm.token_count.prompt"
73
+ """
74
+ Number of tokens in the prompt.
75
+ """
76
+ LLM_TOKEN_COUNT_PROMPT_DETAILS_CACHE_WRITE = "llm.token_count.prompt_details.cache_write"
77
+ """
78
+ Number of tokens in the prompt that were written to cache.
79
+ """
80
+ LLM_TOKEN_COUNT_PROMPT_DETAILS_CACHE_READ = "llm.token_count.prompt_details.cache_read"
81
+ """
82
+ Number of tokens in the prompt that were read from cache.
83
+ """
84
+ LLM_TOKEN_COUNT_PROMPT_DETAILS_AUDIO = "llm.token_count.prompt_details.audio"
85
+ """
86
+ The number of audio input tokens presented in the prompt
87
+ """
88
+ LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
89
+ """
90
+ Number of tokens in the completion.
91
+ """
92
+ LLM_TOKEN_COUNT_COMPLETION_DETAILS_REASONING = "llm.token_count.completion_details.reasoning"
93
+ """
94
+ Number of tokens used for reasoning steps in the completion.
95
+ """
96
+ LLM_TOKEN_COUNT_COMPLETION_DETAILS_AUDIO = "llm.token_count.completion_details.audio"
97
+ """
98
+ The number of audio input tokens generated by the model
99
+ """
100
+ LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
101
+ """
102
+ Total number of tokens, including both prompt and completion.
103
+ """
104
+
105
+ LLM_TOOLS = "llm.tools"
106
+ """
107
+ List of tools that are advertised to the LLM to be able to call
108
+ """
109
+
110
+ TOOL_NAME = "tool.name"
111
+ """
112
+ Name of the tool being used.
113
+ """
114
+ TOOL_DESCRIPTION = "tool.description"
115
+ """
116
+ Description of the tool's purpose, typically used to select the tool.
117
+ """
118
+ TOOL_PARAMETERS = "tool.parameters"
119
+ """
120
+ Parameters of the tool represented as a dictionary serialized to a JSON string, e.g.
121
+ see https://platform.openai.com/docs/guides/gpt/function-calling
122
+ """
123
+
124
+ RETRIEVAL_DOCUMENTS = "retrieval.documents"
125
+
126
+ METADATA = "metadata"
127
+ """
128
+ Metadata attributes are used to store user-defined key-value pairs.
129
+ For example, LangChain uses metadata to store user-defined attributes for a chain.
130
+ """
131
+
132
+ TAG_TAGS = "tag.tags"
133
+ """
134
+ Custom categorical tags for the span.
135
+ """
136
+
137
+ OPENINFERENCE_SPAN_KIND = "openinference.span.kind"
138
+
139
+ SESSION_ID = "session.id"
140
+ """
141
+ The id of the session
142
+ """
143
+ USER_ID = "user.id"
144
+ """
145
+ The id of the user
146
+ """
147
+
148
+ PROMPT_VENDOR = "prompt.vendor"
149
+ """
150
+ The vendor or origin of the prompt, e.g. a prompt library, a specialized service, etc.
151
+ """
152
+ PROMPT_ID = "prompt.id"
153
+ """
154
+ A vendor-specific id used to locate the prompt.
155
+ """
156
+ PROMPT_URL = "prompt.url"
157
+ """
158
+ A vendor-specific url used to locate the prompt.
159
+ """
160
+
161
+
162
+ class MessageAttributes:
163
+ """
164
+ Attributes for a message sent to or from an LLM
165
+ """
166
+
167
+ MESSAGE_ROLE = "message.role"
168
+ """
169
+ The role of the message, such as "user", "agent", "function".
170
+ """
171
+ MESSAGE_CONTENT = "message.content"
172
+ """
173
+ The content of the message to or from the llm, must be a string.
174
+ """
175
+ MESSAGE_CONTENTS = "message.contents"
176
+ """
177
+ The message contents sent to the llm; an array of
178
+ `message_content`-prefixed attributes.
179
+ """
180
+ MESSAGE_NAME = "message.name"
181
+ """
182
+ The name of the message, often used to identify the function
183
+ that was used to generate the message.
184
+ """
185
+ MESSAGE_TOOL_CALLS = "message.tool_calls"
186
+ """
187
+ The tool calls generated by the model, such as function calls.
188
+ """
189
+ MESSAGE_FUNCTION_CALL_NAME = "message.function_call_name"
190
+ """
191
+ The function name that is a part of the message list.
192
+ This is populated for role 'function' or 'agent' as a mechanism to identify
193
+ the function that was called during the execution of a tool.
194
+ """
195
+ MESSAGE_FUNCTION_CALL_ARGUMENTS_JSON = "message.function_call_arguments_json"
196
+ """
197
+ The JSON string representing the arguments passed to the function
198
+ during a function call.
199
+ """
200
+ MESSAGE_TOOL_CALL_ID = "message.tool_call_id"
201
+ """
202
+ The id of the tool call.
203
+ """
204
+
205
+
206
+ class MessageContentAttributes:
207
+ """
208
+ Attributes for the contents of user messages sent to an LLM.
209
+ """
210
+
211
+ MESSAGE_CONTENT_TYPE = "message_content.type"
212
+ """
213
+ The type of the content, such as "text" or "image".
214
+ """
215
+ MESSAGE_CONTENT_TEXT = "message_content.text"
216
+ """
217
+ The text content of the message, if the type is "text".
218
+ """
219
+ MESSAGE_CONTENT_IMAGE = "message_content.image"
220
+ """
221
+ The image content of the message, if the type is "image".
222
+ An image can be made available to the model by passing a link to
223
+ the image or by passing the base64 encoded image directly in the
224
+ request.
225
+ """
226
+
227
+
228
+ class ImageAttributes:
229
+ """
230
+ Attributes for images
231
+ """
232
+
233
+ IMAGE_URL = "image.url"
234
+ """
235
+ An http or base64 image url
236
+ """
237
+
238
+
239
+ class AudioAttributes:
240
+ """
241
+ Attributes for audio
242
+ """
243
+
244
+ AUDIO_URL = "audio.url"
245
+ """
246
+ The url to an audio file
247
+ """
248
+ AUDIO_MIME_TYPE = "audio.mime_type"
249
+ """
250
+ The mime type of the audio file
251
+ """
252
+ AUDIO_TRANSCRIPT = "audio.transcript"
253
+ """
254
+ The transcript of the audio file
255
+ """
256
+
257
+
258
+ class DocumentAttributes:
259
+ """
260
+ Attributes for a document.
261
+ """
262
+
263
+ DOCUMENT_ID = "document.id"
264
+ """
265
+ The id of the document.
266
+ """
267
+ DOCUMENT_SCORE = "document.score"
268
+ """
269
+ The score of the document
270
+ """
271
+ DOCUMENT_CONTENT = "document.content"
272
+ """
273
+ The content of the document.
274
+ """
275
+ DOCUMENT_METADATA = "document.metadata"
276
+ """
277
+ The metadata of the document represented as a dictionary
278
+ JSON string, e.g. `"{ 'title': 'foo' }"`
279
+ """
280
+
281
+
282
+ class RerankerAttributes:
283
+ """
284
+ Attributes for a reranker
285
+ """
286
+
287
+ RERANKER_INPUT_DOCUMENTS = "reranker.input_documents"
288
+ """
289
+ List of documents as input to the reranker
290
+ """
291
+ RERANKER_OUTPUT_DOCUMENTS = "reranker.output_documents"
292
+ """
293
+ List of documents as output from the reranker
294
+ """
295
+ RERANKER_QUERY = "reranker.query"
296
+ """
297
+ Query string for the reranker
298
+ """
299
+ RERANKER_MODEL_NAME = "reranker.model_name"
300
+ """
301
+ Model name of the reranker
302
+ """
303
+ RERANKER_TOP_K = "reranker.top_k"
304
+ """
305
+ Top K parameter of the reranker
306
+ """
307
+
308
+
309
+ class EmbeddingAttributes:
310
+ """
311
+ Attributes for an embedding
312
+ """
313
+
314
+ EMBEDDING_TEXT = "embedding.text"
315
+ """
316
+ The text represented by the embedding.
317
+ """
318
+ EMBEDDING_VECTOR = "embedding.vector"
319
+ """
320
+ The embedding vector.
321
+ """
322
+
323
+
324
+ class ToolCallAttributes:
325
+ """
326
+ Attributes for a tool call
327
+ """
328
+
329
+ TOOL_CALL_ID = "tool_call.id"
330
+ """
331
+ The id of the tool call.
332
+ """
333
+ TOOL_CALL_FUNCTION_NAME = "tool_call.function.name"
334
+ """
335
+ The name of function that is being called during a tool call.
336
+ """
337
+ TOOL_CALL_FUNCTION_ARGUMENTS_JSON = "tool_call.function.arguments"
338
+ """
339
+ The JSON string representing the arguments passed to the function
340
+ during a tool call.
341
+ """
342
+
343
+
344
+ class ToolAttributes:
345
+ """
346
+ Attributes for tools
347
+ """
348
+
349
+ TOOL_JSON_SCHEMA = "tool.json_schema"
350
+ """
351
+ The json schema of a tool input, It is RECOMMENDED that this be in the
352
+ OpenAI tool calling format: https://platform.openai.com/docs/assistants/tools
353
+ """
354
+
355
+
356
+ class OpenInferenceSpanKindValues(Enum):
357
+ TOOL = "TOOL"
358
+ CHAIN = "CHAIN"
359
+ LLM = "LLM"
360
+ RETRIEVER = "RETRIEVER"
361
+ EMBEDDING = "EMBEDDING"
362
+ AGENT = "AGENT"
363
+ RERANKER = "RERANKER"
364
+ UNKNOWN = "UNKNOWN"
365
+ GUARDRAIL = "GUARDRAIL"
366
+ EVALUATOR = "EVALUATOR"
367
+
368
+
369
+ class OpenInferenceMimeTypeValues(Enum):
370
+ TEXT = "text/plain"
371
+ JSON = "application/json"
372
+
373
+
374
+ class OpenInferenceLLMSystemValues(Enum):
375
+ OPENAI = "openai"
376
+ ANTHROPIC = "anthropic"
377
+ COHERE = "cohere"
378
+ MISTRALAI = "mistralai"
379
+ VERTEXAI = "vertexai"
380
+
381
+
382
+ class OpenInferenceLLMProviderValues(Enum):
383
+ OPENAI = "openai"
384
+ ANTHROPIC = "anthropic"
385
+ COHERE = "cohere"
386
+ MISTRALAI = "mistralai"
387
+ GOOGLE = "google"
388
+ AZURE = "azure"
389
+ AWS = "aws"
litellm/integrations/additional_logging_utils.py ADDED
@@ -0,0 +1,36 @@
1
+ """
2
+ Base class for Additional Logging Utils for CustomLoggers
3
+
4
+ - Health Check for the logging util
5
+ - Get Request / Response Payload for the logging util
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from datetime import datetime
10
+ from typing import Optional
11
+
12
+ from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
13
+
14
+
15
+ class AdditionalLoggingUtils(ABC):
16
+ def __init__(self):
17
+ super().__init__()
18
+
19
+ @abstractmethod
20
+ async def async_health_check(self) -> IntegrationHealthCheckStatus:
21
+ """
22
+ Check if the service is healthy
23
+ """
24
+ pass
25
+
26
+ @abstractmethod
27
+ async def get_request_response_payload(
28
+ self,
29
+ request_id: str,
30
+ start_time_utc: Optional[datetime],
31
+ end_time_utc: Optional[datetime],
32
+ ) -> Optional[dict]:
33
+ """
34
+ Get the request and response payload for a given `request_id`
35
+ """
36
+ return None
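A concrete logger would subclass `AdditionalLoggingUtils` and implement both abstract methods. A minimal sketch, assuming `IntegrationHealthCheckStatus` accepts `status` and `error_message` fields (not confirmed by this diff; check `litellm/types/integrations/base_health_check.py`):

```python
# Hedged sketch: a custom logger implementing the AdditionalLoggingUtils interface.
from datetime import datetime
from typing import Optional

from litellm.integrations.additional_logging_utils import AdditionalLoggingUtils
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus


class MyLogger(AdditionalLoggingUtils):
    async def async_health_check(self) -> IntegrationHealthCheckStatus:
        # assumed field names - verify against base_health_check.py
        return IntegrationHealthCheckStatus(status="healthy", error_message=None)

    async def get_request_response_payload(
        self,
        request_id: str,
        start_time_utc: Optional[datetime],
        end_time_utc: Optional[datetime],
    ) -> Optional[dict]:
        # look up the stored payload for request_id in your backing store
        return {"request_id": request_id}
```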
litellm/integrations/agentops/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .agentops import AgentOps
2
+
3
+ __all__ = ["AgentOps"]