api

APIServer

APIServer(
    model: str,
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
    custom_llm_provider: Optional[str] = None,
    cost_currency: str = "USD",
    **kwargs: Any
)

Bases: BaseServer

The server for API-based models. It is a wrapper around litellm.completion.

See LiteLLM (https://docs.litellm.ai/docs/providers) for available models and providers. See completion (https://docs.litellm.ai/docs/completion/input#input-params-1) for available options.
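
A minimal construction sketch. The model names, endpoint URL, and extra keyword arguments below are illustrative placeholders, and the import path is inferred from the source location shown below:

from appl.servers.api import APIServer

# A hosted model served through LiteLLM; any provider/model LiteLLM supports works here.
server = APIServer("gpt-4o-mini")

# A self-hosted OpenAI-compatible endpoint. When custom_llm_provider is set and no
# api_key is given, the constructor fills in a placeholder key to bypass LiteLLM's check.
local_server = APIServer(
    "my-local-model",
    base_url="http://localhost:8000/v1",
    custom_llm_provider="openai",
    temperature=0.0,  # remaining kwargs are stored as default completion arguments
)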

Source code in src/appl/servers/api.py
def __init__(
    self,
    model: str,
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
    custom_llm_provider: Optional[str] = None,
    cost_currency: str = "USD",
    **kwargs: Any,
) -> None:
    """Initialize the API server.

    See [LiteLLM](https://docs.litellm.ai/docs/providers)
    for available models and providers.
    See [completion](https://docs.litellm.ai/docs/completion/input#input-params-1)
    for available options.
    """
    super().__init__()
    self._model = model
    self._base_url = base_url
    self._api_key = api_key
    self._custom_llm_provider = custom_llm_provider
    if custom_llm_provider is not None and api_key is None:
        self._api_key = "NotRequired"  # bypass the api_key check of litellm
    self._cost_currency = cost_currency
    self._default_args = kwargs

model_name property

model_name

The model name.

close

close()

Close the server.

Source code in src/appl/servers/api.py
def close(self):
    """Close the server."""
    pass

create

create(
    args: GenArgs, gen_id: str, **kwargs: Any
) -> CompletionResponse

Create a CompletionResponse from the model with the given arguments.

Parameters:

  • args (GenArgs) –

    The arguments for generating the response

  • gen_id (str) –

    The ID of the generation

  • **kwargs (Any, default: {}) –

    Additional keyword arguments

Returns: The response from the model.

Source code in src/appl/core/server.py
def create(self, args: GenArgs, gen_id: str, **kwargs: Any) -> CompletionResponse:
    """Create a CompletionResponse from the model with given arguments.

    Args:
        args: The arguments for generating the response
        gen_id: The ID of the generation
        **kwargs: Additional keyword arguments
    Returns:
        The response from the model.
    """
    create_args = self._get_create_args(args, **kwargs)
    results = self._create(gen_id=gen_id, **create_args)
    return results

chat_completion

chat_completion(**kwargs: Any) -> CompletionResponse

Wrap the litellm.completion function to add tracing and logging.

Source code in src/appl/servers/api.py
@wraps(litellm.completion)
def chat_completion(**kwargs: Any) -> CompletionResponse:
    """Wrap the litellm.completion function to add tracing and logging."""
    if "gen_id" not in kwargs:
        raise ValueError("gen_id is required for tracing completion generation.")
    gen_id = kwargs.pop("gen_id")
    raw_response_holder = []
    if "_raw_response_holder" in kwargs:
        raw_response_holder = kwargs.pop("_raw_response_holder")
    add_to_trace(CompletionRequestEvent(name=gen_id))

    log_llm_call_args = configs.getattrs("settings.logging.display.llm_raw_call_args")
    log_llm_response = configs.getattrs("settings.logging.display.llm_raw_response")
    log_llm_usage = configs.getattrs("settings.logging.display.llm_raw_usage")
    log_llm_cache = configs.getattrs("settings.logging.display.llm_cache")
    if log_llm_call_args:
        logger.info(f"Call completion [{gen_id}] with args: {kwargs}")

    @_langsmith_traceable(
        name=f"ChatCompletion_{gen_id}",
        run_type="llm",
        metadata={"appl": "completion", "appl_version": __version__},
    )  # type: ignore
    def wrapped(**inner_kwargs: Any) -> Tuple[Any, bool]:
        if cache_ret := find_in_cache(gen_id, inner_kwargs):
            if log_llm_cache:
                logger.info("Found in cache, using cached response...")
            # ? support rebuild the stream from cached response
            if inner_kwargs.get("stream", False):
                logger.warning(
                    "Using cached complete response for a streaming generation."
                )
            raw_response = cache_ret
        else:
            # if log_llm_cache:
            #     logger.info("Not found in cache, creating response...")
            raw_response = litellm.completion(**inner_kwargs)
        return raw_response, cache_ret is not None

    try:
        raw_response, use_cache = wrapped(**kwargs)
    except Exception as e:
        # log the error information for debugging
        logger.error(f"Error encountered for the completion: {e}")
        logger.info(f"kwargs:\n{kwargs}")
        raise e

    if raw_response_holder is not None:
        raw_response_holder.append(raw_response)

    def post_completion(response: CompletionResponse) -> None:
        raw_response = response.complete_response
        cost = 0.0 if use_cache else response.cost
        response.cost = cost  # update the cost
        event = CompletionResponseEvent(
            name=gen_id, args=kwargs, ret=raw_response, cost=cost
        )
        add_to_trace(event)
        if log_llm_response:
            logger.info(f"Completion [{gen_id}] response: {response}")
        if log_llm_usage and response.usage is not None:
            logger.info(f"Completion [{gen_id}] usage: {response.usage}")

    return CompletionResponse(
        raw_response=raw_response, post_finish_callbacks=[post_completion]
    )  # type: ignore
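
A hedged usage sketch of chat_completion, assuming it is importable from the same module; the model name and message content are placeholders. gen_id is required, as enforced above, and the call is traced (and cached) under that ID:

from appl.servers.api import chat_completion

response = chat_completion(
    gen_id="example-gen-0",  # required for tracing and cache lookup
    model="gpt-4o-mini",     # forwarded to litellm.completion
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response)  # a CompletionResponse with post-finish callbacks for tracing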