"""Pydantic schemas for LLM chat proxying and data-import analysis jobs."""
from __future__ import annotations
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field, HttpUrl
class LLMRole(str, Enum):
    """Chat-message author roles accepted by the LLM providers.

    Inherits from ``str`` so members compare equal to their plain string
    values and serialise transparently in JSON payloads.
    """

    USER = "user"  # end-user input turn
    ASSISTANT = "assistant"  # model-generated turn
    SYSTEM = "system"  # system / instruction prompt
class LLMMessage(BaseModel):
    """One chat message exchanged with an LLM (text-only, no multimodal parts)."""

    # Who authored the message: user, assistant, or system.
    role: LLMRole = Field(default=..., description="Message author role.")
    # The message text itself.
    content: str = Field(default=..., description="Plain text content of the message.")
class LLMProvider(str, Enum):
    """Identifiers for the upstream LLM providers this service can target.

    A ``str`` subclass so values serialise directly in request/response JSON.
    """

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    OPENROUTER = "openrouter"
    GEMINI = "gemini"
    QWEN = "qwen"
    DEEPSEEK = "deepseek"
class LLMRequest(BaseModel):
    """Provider-agnostic chat-completion request payload.

    Optional sampling fields should be forwarded only when the target
    provider supports them; ``extra_params`` carries provider-specific
    keys to merge into the outgoing payload verbatim.
    """

    provider: LLMProvider = Field(..., description="Target LLM provider identifier.")
    model: str = Field(..., description="Model name understood by the provider.")
    messages: List[LLMMessage] = Field(..., description="Ordered chat messages.")
    # Defaults to 0.7 rather than None: a deliberate baseline temperature.
    temperature: Optional[float] = Field(
        0.7, description="Sampling temperature when supported."
    )
    top_p: Optional[float] = Field(
        None, description="Top-p nucleus sampling when supported."
    )
    max_tokens: Optional[int] = Field(
        None, description="Maximum tokens to generate when supported."
    )
    stream: Optional[bool] = Field(
        False, description="Enable provider streaming if both sides support it."
    )
    # typing.Dict rather than builtin dict, for consistency with the
    # annotation style used throughout this module (List/Dict/Optional).
    extra_params: Optional[Dict[str, Any]] = Field(
        None, description="Provider-specific parameters to merge into the payload."
    )
class LLMChoice(BaseModel):
    """A single completion choice within an LLM response."""

    # Zero-based position of this choice in the provider's response.
    index: int
    # The generated message for this choice.
    message: LLMMessage
class LLMResponse(BaseModel):
    """Normalised chat-completion response returned to callers."""

    # Provider that actually served the request.
    provider: LLMProvider
    # Model name as reported/used by the provider.
    model: str
    # One or more completion choices (typically a single element).
    choices: List[LLMChoice]
    # typing.Dict rather than builtin dict, for consistency with the
    # annotation style used throughout this module (List/Dict/Optional).
    raw: Optional[Dict[str, Any]] = Field(
        None, description="Raw provider response for debugging."
    )
class DataImportAnalysisRequest(BaseModel):
    """Synchronous request to analyse a sample of imported tabular data."""

    # Correlates this analysis with the originating import run.
    import_record_id: str = Field(default=..., description="Unique identifier for this import run.")
    # Sample of the payload; the 30k character cap bounds prompt size.
    example_data: str = Field(
        default=...,
        max_length=30_000,
        description="Sample rows from the import payload. Limited to 30k characters.",
    )
    # At least one header is required (min_length=1 on the list).
    table_headers: List[str] = Field(
        default=...,
        min_length=1,
        description="Ordered list of table headers associated with the data.",
    )
    llm_model: str = Field(
        default=...,
        description="Model identifier. Accepts 'provider:model' format or plain model name.",
    )
    # None means "use the service default" rather than a fixed value.
    temperature: Optional[float] = Field(
        default=None,
        description="Optional override for LLM temperature when generating recognition output.",
    )
    max_tokens: Optional[int] = Field(
        default=None,
        description="Optional override for maximum tokens generated during recognition.",
    )
class DataImportAnalysisResponse(BaseModel):
    """Result envelope for a synchronous data-import analysis."""

    # Echo of the identifier from the originating request.
    import_record_id: str
    # Full normalised LLM response produced by the analysis.
    llm_response: LLMResponse
class DataImportAnalysisJobRequest(BaseModel):
    """Asynchronous data-import analysis job; results are delivered to ``callback_url``."""

    # Correlates the eventual callback with this submission.
    import_record_id: str = Field(
        default=..., description="Unique identifier for this import request run."
    )
    # Rows may be dict-shaped (header -> value) or positional lists.
    rows: List[Union[Dict[str, Any], List[Any]]] = Field(
        default=...,
        description="Sample rows from the import payload. Accepts list of dicts or list of lists.",
    )
    headers: Optional[List[str]] = Field(
        default=None,
        description="Ordered list of table headers associated with the data sample.",
    )
    # Callers may supply a pre-rendered CSV to skip re-serialising rows.
    raw_csv: Optional[str] = Field(
        default=None,
        description="Optional raw CSV representation of the sample rows, if already prepared.",
    )
    # Deliberately Any: accepts a string or any JSON-serialisable structure.
    table_schema: Optional[Any] = Field(
        default=None,
        description="Optional schema description for the table. Can be a string or JSON-serialisable structure.",
    )
    # HttpUrl gives URL validation at model construction time.
    callback_url: HttpUrl = Field(
        default=...,
        description="URL to notify when the analysis completes. Receives JSON payload with status/results.",
    )
    llm_model: str = Field(
        default="gpt-4.1-mini",
        description="Target LLM model identifier. Defaults to gpt-4.1-mini.",
    )
    temperature: Optional[float] = Field(
        default=None,
        description="Optional override for model temperature when generating analysis output.",
    )
    max_output_tokens: Optional[int] = Field(
        default=None,
        description="Optional maximum number of tokens to generate in the analysis response.",
    )
class DataImportAnalysisJobAck(BaseModel):
    """Immediate acknowledgement returned when an analysis job is accepted for processing."""

    # Echoed back so callers can correlate the ack with their submission.
    import_record_id: str = Field(default=..., description="Echo of the import record identifier")
    status: str = Field(default="accepted", description="Processing status acknowledgement.")
|