"""Pydantic schemas for LLM chat calls, data-import analysis and table-profiling jobs."""

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field, HttpUrl

from app.settings import DEFAULT_IMPORT_MODEL


class LLMRole(str, Enum):
    """Author role attached to a chat message."""

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"


class LLMMessage(BaseModel):
    """A single chat message: who wrote it and its plain-text content."""

    role: LLMRole = Field(..., description="Message author role.")
    content: str = Field(..., description="Plain text content of the message.")


class LLMProvider(str, Enum):
    """Provider identifiers accepted by ``LLMRequest`` and ``LLMResponse``."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    OPENROUTER = "openrouter"
    GEMINI = "gemini"
    QWEN = "qwen"
    DEEPSEEK = "deepseek"


class LLMRequest(BaseModel):
    """Chat request addressed to one provider/model pair.

    Only ``provider``, ``model`` and ``messages`` are required. The sampling
    fields are optional; ``temperature`` defaults to 0.7 and ``stream`` to
    False.
    """

    provider: LLMProvider = Field(..., description="Target LLM provider identifier.")
    model: str = Field(..., description="Model name understood by the provider.")
    messages: List[LLMMessage] = Field(..., description="Ordered chat messages.")
    temperature: Optional[float] = Field(
        0.7, description="Sampling temperature when supported."
    )
    top_p: Optional[float] = Field(
        None, description="Top-p nucleus sampling when supported."
    )
    max_tokens: Optional[int] = Field(
        None, description="Maximum tokens to generate when supported."
    )
    stream: Optional[bool] = Field(
        False, description="Enable provider streaming if both sides support it."
    )
    # typing.Dict (not builtin dict[...]) to match the rest of the module and to
    # stay resolvable when annotations are evaluated on older interpreters.
    extra_params: Optional[Dict[str, Any]] = Field(
        None, description="Provider-specific parameters to merge into the payload."
    )


class LLMChoice(BaseModel):
    """One indexed message inside an ``LLMResponse``."""

    index: int
    message: LLMMessage


class LLMResponse(BaseModel):
    """Response for a provider/model pair, optionally carrying the raw payload."""

    provider: LLMProvider
    model: str
    choices: List[LLMChoice]
    raw: Optional[Dict[str, Any]] = Field(
        None, description="Raw provider response for debugging."
    )


class DataImportAnalysisRequest(BaseModel):
    """Request to analyse a sample of import data with an LLM.

    ``example_data`` is capped at 30,000 characters and ``table_headers``
    must contain at least one entry.
    """

    import_record_id: str = Field(..., description="Unique identifier for this import run.")
    example_data: str = Field(
        ...,
        max_length=30_000,
        description="Sample rows from the import payload. Limited to 30k characters.",
    )
    table_headers: List[str] = Field(
        ...,
        min_length=1,
        description="Ordered list of table headers associated with the data.",
    )
    llm_model: str = Field(
        ...,
        description="Model identifier. Accepts 'provider:model' format or plain model name.",
    )
    temperature: Optional[float] = Field(
        None,
        description="Optional override for LLM temperature when generating recognition output.",
    )
    max_tokens: Optional[int] = Field(
        None,
        description="Optional override for maximum tokens generated during recognition.",
    )


class DataImportAnalysisResponse(BaseModel):
    """Pairs an import record identifier with the LLM response produced for it."""

    import_record_id: str
    llm_response: LLMResponse


class DataImportAnalysisJobRequest(BaseModel):
    """Job request for import analysis whose outcome is sent to ``callback_url``.

    ``rows`` and ``callback_url`` are required alongside ``import_record_id``;
    ``llm_model`` falls back to ``DEFAULT_IMPORT_MODEL``.
    """

    import_record_id: str = Field(
        ..., description="Unique identifier for this import request run."
    )
    rows: List[Union[Dict[str, Any], List[Any]]] = Field(
        ...,
        description="Sample rows from the import payload. Accepts list of dicts or list of lists.",
    )
    headers: Optional[List[str]] = Field(
        None,
        description="Ordered list of table headers associated with the data sample.",
    )
    raw_csv: Optional[str] = Field(
        None,
        description="Optional raw CSV representation of the sample rows, if already prepared.",
    )
    table_schema: Optional[Any] = Field(
        None,
        description="Optional schema description for the table. Can be a string or JSON-serialisable structure.",
    )
    callback_url: HttpUrl = Field(
        ...,
        description="URL to notify when the analysis completes. Receives JSON payload with status/results.",
    )
    llm_model: str = Field(
        DEFAULT_IMPORT_MODEL,
        description="Target LLM model identifier. Defaults to DEFAULT_IMPORT_MODEL.",
    )
    temperature: Optional[float] = Field(
        None,
        description="Optional override for model temperature when generating analysis output.",
    )
    max_output_tokens: Optional[int] = Field(
        None,
        description="Optional maximum number of tokens to generate in the analysis response.",
    )


class DataImportAnalysisJobAck(BaseModel):
    """Acknowledgement for an import-analysis job; ``status`` defaults to "accepted"."""

    import_record_id: str = Field(..., description="Echo of the import record identifier")
    status: str = Field("accepted", description="Processing status acknowledgement.")


class TableProfilingJobRequest(BaseModel):
    """Request to profile one version of a table.

    ``table_id``, ``version_ts`` (exactly fourteen digits) and
    ``callback_url`` are required. The remaining fields are optional inputs
    describing the schema, table location/access, Great Expectations settings
    and per-action LLM model overrides.
    """

    table_id: str = Field(..., description="Unique identifier for the table to profile.")
    version_ts: str = Field(
        ...,
        pattern=r"^\d{14}$",
        description="Version timestamp expressed as fourteen digit string (yyyyMMddHHmmss).",
    )
    callback_url: HttpUrl = Field(
        ...,
        description="Callback endpoint invoked after each pipeline action completes.",
    )
    table_schema: Optional[Any] = Field(
        None,
        description="Schema structure snapshot for the current table version.",
    )
    table_schema_version_id: Optional[str] = Field(
        None,
        description="Identifier for the schema snapshot provided in table_schema.",
    )
    table_link_info: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Information describing how to locate the source table for profiling. "
            "For example: {'type': 'sql', 'connection_string': 'mysql+pymysql://user:pass@host/db', "
            "'table': 'schema.table_name'}."
        ),
    )
    table_access_info: Optional[Dict[str, Any]] = Field(
        None,
        description=(
            "Credentials or supplemental parameters required to access the table described in table_link_info. "
            "These values can be merged into the connection string using Python format placeholders."
        ),
    )
    ge_batch_request: Optional[Dict[str, Any]] = Field(
        None,
        description="Optional Great Expectations batch request payload used for profiling.",
    )
    ge_expectation_suite_name: Optional[str] = Field(
        None,
        description="Expectation suite name used during profiling. Created automatically when absent.",
    )
    ge_data_context_root: Optional[str] = Field(
        None,
        description="Custom root directory for the Great Expectations data context. Defaults to project ./gx.",
    )
    ge_datasource_name: Optional[str] = Field(
        None,
        description="Datasource name registered inside the GE context when batch_request is not supplied.",
    )
    ge_data_asset_name: Optional[str] = Field(
        None,
        description="Data asset reference used when inferring batch request from datasource configuration.",
    )
    ge_profiler_type: str = Field(
        "user_configurable",
        description="Profiler implementation identifier. Currently supports 'user_configurable' or 'data_assistant'.",
    )
    llm_model: Optional[str] = Field(
        None,
        description="Default LLM model spec applied to prompt-based actions when overrides are omitted.",
    )
    result_desc_model: Optional[str] = Field(
        None,
        description="LLM model override used for GE result description (action 2).",
    )
    snippet_model: Optional[str] = Field(
        None,
        description="LLM model override used for snippet generation (action 3).",
    )
    snippet_alias_model: Optional[str] = Field(
        None,
        description="LLM model override used for snippet alias enrichment (action 4).",
    )
    extra_options: Optional[Dict[str, Any]] = Field(
        None,
        description="Miscellaneous execution flags applied across pipeline steps.",
    )


class TableProfilingJobAck(BaseModel):
    """Acknowledgement for a table-profiling job; ``status`` defaults to "accepted"."""

    table_id: str = Field(..., description="Echo of the table identifier.")
    version_ts: str = Field(..., description="Echo of the profiling version timestamp (yyyyMMddHHmmss).")
    status: str = Field("accepted", description="Processing acknowledgement status.")