data-ge/app/models.py

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field, HttpUrl

from app.settings import DEFAULT_IMPORT_MODEL


class LLMRole(str, Enum):
    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"


class LLMMessage(BaseModel):
    role: LLMRole = Field(..., description="Message author role.")
    content: str = Field(..., description="Plain text content of the message.")


class LLMProvider(str, Enum):
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    OPENROUTER = "openrouter"
    GEMINI = "gemini"
    QWEN = "qwen"
    DEEPSEEK = "deepseek"


class LLMRequest(BaseModel):
    provider: LLMProvider = Field(..., description="Target LLM provider identifier.")
    model: str = Field(..., description="Model name understood by the provider.")
    messages: List[LLMMessage] = Field(..., description="Ordered chat messages.")
    temperature: Optional[float] = Field(
        0.7, description="Sampling temperature when supported."
    )
    top_p: Optional[float] = Field(
        None, description="Top-p nucleus sampling when supported."
    )
    max_tokens: Optional[int] = Field(
        None, description="Maximum tokens to generate when supported."
    )
    stream: Optional[bool] = Field(
        False, description="Enable provider streaming if both sides support it."
    )
    extra_params: Optional[dict[str, Any]] = Field(
        None, description="Provider-specific parameters to merge into the payload."
    )


class LLMChoice(BaseModel):
    index: int
    message: LLMMessage


class LLMResponse(BaseModel):
    provider: LLMProvider
    model: str
    choices: List[LLMChoice]
    raw: Optional[dict[str, Any]] = Field(
        None, description="Raw provider response for debugging."
    )


class DataImportAnalysisRequest(BaseModel):
    import_record_id: str = Field(..., description="Unique identifier for this import run.")
    example_data: str = Field(
        ...,
        max_length=30_000,
        description="Sample rows from the import payload. Limited to 30k characters.",
    )
    table_headers: List[str] = Field(
        ...,
        min_length=1,
        description="Ordered list of table headers associated with the data.",
    )
    llm_model: str = Field(
        None,
        description="Model identifier. Accepts 'provider:model_name' format or custom model alias.",
    )
    temperature: Optional[float] = Field(
        None,
        description="Optional override for LLM temperature when generating recognition output.",
    )
    max_tokens: Optional[int] = Field(
        None,
        description="Optional override for maximum tokens generated during recognition.",
    )


class DataImportAnalysisResponse(BaseModel):
    import_record_id: str
    llm_response: LLMResponse


class DataImportAnalysisJobRequest(BaseModel):
    import_record_id: str = Field(
        ..., description="Unique identifier for this import request run."
    )
    rows: List[Union[Dict[str, Any], List[Any]]] = Field(
        ...,
        description="Sample rows from the import payload. Accepts list of dicts or list of lists.",
    )
    headers: Optional[List[str]] = Field(
        None,
        description="Ordered list of table headers associated with the data sample.",
    )
    raw_csv: Optional[str] = Field(
        None,
        description="Optional raw CSV representation of the sample rows, if already prepared.",
    )
    table_schema: Optional[Any] = Field(
        None,
        description="Optional schema description for the table. Can be a string or JSON-serialisable structure.",
    )
    callback_url: HttpUrl = Field(
        ...,
        description="URL to notify when the analysis completes. Receives JSON payload with status/results.",
    )
    llm_model: str = Field(
        DEFAULT_IMPORT_MODEL,
        description="Target LLM model identifier. Defaults to DEFAULT_IMPORT_MODEL.",
    )
    temperature: Optional[float] = Field(
        None,
        description="Optional override for model temperature when generating analysis output.",
    )
    max_output_tokens: Optional[int] = Field(
        None,
        description="Optional maximum number of tokens to generate in the analysis response.",
    )


class DataImportAnalysisJobAck(BaseModel):
    import_record_id: str = Field(..., description="Echo of the import record identifier")
    status: str = Field("accepted", description="Processing status acknowledgement.")


class ActionType(str, Enum):
    GE_PROFILING = "ge_profiling"
    GE_RESULT_DESC = "ge_result_desc"
    SNIPPET = "snippet"
    SNIPPET_ALIAS = "snippet_alias"


class ActionStatus(str, Enum):
    PENDING = "pending"
    RUNNING = "running"
    SUCCESS = "success"
    FAILED = "failed"
    PARTIAL = "partial"


class TableProfilingJobRequest(BaseModel):
    table_id: str = Field(..., description="Unique identifier for the table to profile.")
    version_ts: str = Field(
        ...,
        pattern=r"^\d{14}$",
        description="Version timestamp expressed as fourteen digit string (yyyyMMddHHmmss).",
    )
    callback_url: HttpUrl = Field(
        ...,
        description="Callback endpoint invoked after each pipeline action completes.",
    )
    llm_model: Optional[str] = Field(
        None,
        description="Default LLM model spec applied to prompt-based actions when overrides are omitted.",
    )
    table_schema: Optional[Any] = Field(
        None,
        description="Schema structure snapshot for the current table version.",
    )
    table_schema_version_id: Optional[str] = Field(
        None,
        description="Identifier for the schema snapshot provided in table_schema.",
    )
    table_link_info: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Information describing how to locate the source table for profiling. "
            "For example: {'type': 'sql', 'connection_string': 'mysql+pymysql://user:pass@host/db', "
            "'table': 'schema.table_name'}."
        ),
    )
    table_access_info: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Credentials or supplemental parameters required to access the table described in table_link_info. "
            "These values can be merged into the connection string using Python format placeholders."
        ),
    )
    ge_batch_request: Optional[Dict[str, Any]] = Field(
        None,
        description="Optional Great Expectations batch request payload used for profiling.",
    )
    ge_expectation_suite_name: Optional[str] = Field(
        None,
        description="Expectation suite name used during profiling. Created automatically when absent.",
    )
    ge_data_context_root: Optional[str] = Field(
        None,
        description="Custom root directory for the Great Expectations data context. Defaults to project ./gx.",
    )
    ge_datasource_name: Optional[str] = Field(
        None,
        description="Datasource name registered inside the GE context when batch_request is not supplied.",
    )
    ge_data_asset_name: Optional[str] = Field(
        None,
        description="Data asset reference used when inferring batch request from datasource configuration.",
    )
    ge_profiler_type: str = Field(
        "user_configurable",
        description="Profiler implementation identifier. Currently supports 'user_configurable' or 'data_assistant'.",
    )

    result_desc_model: Optional[str] = Field(
        None,
        description="LLM model override used for GE result description (action 2).",
    )
    snippet_model: Optional[str] = Field(
        None,
        description="LLM model override used for snippet generation (action 3).",
    )
    snippet_alias_model: Optional[str] = Field(
        None,
        description="LLM model override used for snippet alias enrichment (action 4).",
    )
    extra_options: Optional[Dict[str, Any]] = Field(
        None,
        description="Miscellaneous execution flags applied across pipeline steps.",
    )
    workspace_id: Optional[int] = Field(
        None,
        ge=0,
        description="Optional workspace identifier forwarded to snippet_alias callback for RAG ingestion.",
    )
    rag_item_type: Optional[str] = Field(
        "SNIPPET",
        description="Optional RAG item type forwarded to snippet_alias callback.",
    )


class TableProfilingJobAck(BaseModel):
    table_id: str = Field(..., description="Echo of the table identifier.")
    version_ts: str = Field(..., description="Echo of the profiling version timestamp (yyyyMMddHHmmss).")
    status: str = Field("accepted", description="Processing acknowledgement status.")


class TableSnippetUpsertRequest(BaseModel):
    table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
    version_ts: int = Field(
        ...,
        ge=0,
        description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
    )
    workspace_id: Optional[int] = Field(
        None,
        ge=0,
        description="Optional workspace identifier for RAG ingestion; when provided and action_type=snippet_alias "
        "with status=success, merged snippets will be written to rag_snippet and pushed to RAG.",
    )
    rag_item_type: Optional[str] = Field(
        "SNIPPET",
        description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
    )
    action_type: ActionType = Field(..., description="Pipeline action type for this record.")
    status: ActionStatus = Field(
        ActionStatus.SUCCESS, description="Execution status for the action."
    )
    callback_url: HttpUrl = Field(..., description="Callback URL associated with the action run.")
    table_schema_version_id: int = Field(..., ge=0, description="Identifier for the schema snapshot.")
    table_schema: Any = Field(..., description="Schema snapshot payload for the table.")
    model: Optional[str] = Field(
        None,
        description="LLM model identifier (can be provider alias) used for this action, when applicable.",
    )
    model_provider: Optional[str] = Field(
        None,
        description="LLM provider responsible for executing the action's model.",
    )
    model_params: Optional[Dict[str, Any]] = Field(
        None,
        description="Optional model parameter overrides (e.g., temperature) associated with the action.",
    )
    llm_usage: Optional[Any] = Field(
        None,
        description="Optional token usage metrics reported by the LLM provider.",
    )
    ge_profiling_json: Optional[Any] = Field(
        None, description="Full GE profiling result payload for the profiling action."
    )
    ge_profiling_json_size_bytes: Optional[int] = Field(
        None, ge=0, description="Size in bytes of the GE profiling result JSON."
    )
    ge_profiling_summary: Optional[Any] = Field(
        None, description="Sanitised GE profiling summary payload."
    )
    ge_profiling_summary_size_bytes: Optional[int] = Field(
        None, ge=0, description="Size in bytes of the GE profiling summary JSON."
    )
    ge_profiling_total_size_bytes: Optional[int] = Field(
        None, ge=0, description="Combined size (bytes) of profiling result + summary."
    )
    ge_profiling_html_report_url: Optional[str] = Field(
        None, description="Optional URL to the generated GE profiling HTML report."
    )
    ge_result_desc_json: Optional[Any] = Field(
        None, description="Result JSON for the GE result description action."
    )
    ge_result_desc_json_size_bytes: Optional[int] = Field(
        None, ge=0, description="Size in bytes of the GE result description JSON."
    )
    snippet_json: Optional[Any] = Field(
        None, description="Snippet generation action result JSON."
    )
    snippet_json_size_bytes: Optional[int] = Field(
        None, ge=0, description="Size in bytes of the snippet result JSON."
    )
    snippet_alias_json: Optional[Any] = Field(
        None, description="Snippet alias expansion result JSON."
    )
    snippet_alias_json_size_bytes: Optional[int] = Field(
        None, ge=0, description="Size in bytes of the snippet alias result JSON."
    )
    error_code: Optional[str] = Field(None, description="Optional error code when status indicates a failure.")
    error_message: Optional[str] = Field(None, description="Optional error message when status indicates a failure.")
    started_at: Optional[datetime] = Field(
        None, description="Timestamp when the action started executing."
    )
    finished_at: Optional[datetime] = Field(
        None, description="Timestamp when the action finished executing."
    )
    duration_ms: Optional[int] = Field(
        None,
        ge=0,
        description="Optional execution duration in milliseconds.",
    )


class TableSnippetRagIngestRequest(BaseModel):
    table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
    version_ts: int = Field(
        ...,
        ge=0,
        description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
    )
    workspace_id: int = Field(..., ge=0, description="Workspace id used when pushing snippets to RAG.")
    rag_item_type: Optional[str] = Field(
        "SNIPPET",
        description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
    )


class TableSnippetRagIngestResponse(BaseModel):
    rag_item_ids: List[int] = Field(..., description="List of ingested rag_item_ids.")
    result_checksum: Optional[str] = Field(
        None,
        description="Optional checksum for the result payload (e.g., MD5).",
    )


class TableSnippetUpsertResponse(BaseModel):
    table_id: int
    version_ts: int
    action_type: ActionType
    status: ActionStatus
    updated: bool