294 lines
10 KiB
Python
294 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from typing import Any, Dict, List, Optional, Union
|
|
|
|
from pydantic import BaseModel, Field, HttpUrl
|
|
|
|
from app.settings import DEFAULT_IMPORT_MODEL
|
|
|
|
|
|
class LLMRole(str, Enum):
    # Author role attached to a chat message. Mixing in ``str`` makes each
    # member compare equal to (and serialise as) its plain string value.
    USER = "user"            # message written by the end user
    ASSISTANT = "assistant"  # message produced by the model
    SYSTEM = "system"        # system-level instruction message
|
|
|
|
|
|
class LLMMessage(BaseModel):
    # One chat message: who wrote it and its text. Both fields are required.
    role: LLMRole = Field(
        ...,
        description="Message author role.",
    )
    content: str = Field(
        ...,
        description="Plain text content of the message.",
    )
|
|
|
|
|
|
class LLMProvider(str, Enum):
    # Closed set of provider identifiers accepted in LLM requests/responses.
    # The ``str`` mixin lets members be compared with raw strings directly.
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    OPENROUTER = "openrouter"
    GEMINI = "gemini"
    QWEN = "qwen"
    DEEPSEEK = "deepseek"
|
|
|
|
|
|
class LLMRequest(BaseModel):
    # Chat request addressed to a specific provider/model pair.
    # ``provider``, ``model`` and ``messages`` are required; the remaining
    # fields are optional tuning knobs with the defaults shown below.
    provider: LLMProvider = Field(..., description="Target LLM provider identifier.")
    model: str = Field(..., description="Model name understood by the provider.")
    messages: list[LLMMessage] = Field(..., description="Ordered chat messages.")

    # Sampling / generation controls.
    temperature: Optional[float] = Field(
        default=0.7,
        description="Sampling temperature when supported.",
    )
    top_p: Optional[float] = Field(
        default=None,
        description="Top-p nucleus sampling when supported.",
    )
    max_tokens: Optional[int] = Field(
        default=None,
        description="Maximum tokens to generate when supported.",
    )
    stream: Optional[bool] = Field(
        default=False,
        description="Enable provider streaming if both sides support it.",
    )

    # Free-form escape hatch for provider-specific options.
    extra_params: Optional[dict[str, Any]] = Field(
        default=None,
        description="Provider-specific parameters to merge into the payload.",
    )
|
|
|
|
|
|
class LLMChoice(BaseModel):
    # A single candidate entry of an LLM response.
    index: int           # position of this choice (required)
    message: LLMMessage  # message carried by this choice (required)
|
|
|
|
|
|
class LLMResponse(BaseModel):
    # Normalised LLM reply: originating provider/model plus the returned
    # choices, with the untouched provider payload optionally attached.
    provider: LLMProvider
    model: str
    choices: list[LLMChoice]
    raw: Optional[dict[str, Any]] = Field(
        default=None,
        description="Raw provider response for debugging.",
    )
|
|
|
|
|
|
class DataImportAnalysisRequest(BaseModel):
    # Synchronous data-import analysis request.
    #
    # Required: ``import_record_id``, ``example_data`` (at most 30,000
    # characters) and ``table_headers`` (at least one entry).
    # Optional: ``llm_model``, ``temperature`` and ``max_tokens``; all three
    # default to None.
    import_record_id: str = Field(..., description="Unique identifier for this import run.")
    example_data: str = Field(
        ...,
        max_length=30_000,
        description="Sample rows from the import payload. Limited to 30k characters.",
    )
    table_headers: List[str] = Field(
        ...,
        min_length=1,
        description="Ordered list of table headers associated with the data.",
    )
    # The default is None, so the annotation must admit None. With a bare
    # ``str`` annotation an explicit ``null`` from a client was rejected even
    # though omitting the field produced None, and the generated schema
    # described a non-nullable string with a null default.
    llm_model: Optional[str] = Field(
        None,
        description="Model identifier. Accepts 'provider:model_name' format or custom model alias.",
    )
    temperature: Optional[float] = Field(
        None,
        description="Optional override for LLM temperature when generating recognition output.",
    )
    max_tokens: Optional[int] = Field(
        None,
        description="Optional override for maximum tokens generated during recognition.",
    )
|
|
|
|
|
|
class DataImportAnalysisResponse(BaseModel):
    # Result of a data-import analysis: the import record identifier paired
    # with the LLM response produced for it. Both fields are required.
    import_record_id: str
    llm_response: LLMResponse
|
|
|
|
|
|
class DataImportAnalysisJobRequest(BaseModel):
    # Job-style data-import analysis request that carries a callback URL.
    # Required: ``import_record_id``, ``rows`` and ``callback_url``.
    import_record_id: str = Field(
        ...,
        description="Unique identifier for this import request run.",
    )

    # --- sample data -------------------------------------------------------
    rows: list[Union[dict[str, Any], list[Any]]] = Field(
        ...,
        description="Sample rows from the import payload. Accepts list of dicts or list of lists.",
    )
    headers: Optional[list[str]] = Field(
        default=None,
        description="Ordered list of table headers associated with the data sample.",
    )
    raw_csv: Optional[str] = Field(
        default=None,
        description="Optional raw CSV representation of the sample rows, if already prepared.",
    )
    table_schema: Optional[Any] = Field(
        default=None,
        description="Optional schema description for the table. Can be a string or JSON-serialisable structure.",
    )

    # --- delivery ----------------------------------------------------------
    callback_url: HttpUrl = Field(
        ...,
        description="URL to notify when the analysis completes. Receives JSON payload with status/results.",
    )

    # --- model selection and generation limits -----------------------------
    llm_model: str = Field(
        default=DEFAULT_IMPORT_MODEL,
        description="Target LLM model identifier. Defaults to DEFAULT_IMPORT_MODEL.",
    )
    temperature: Optional[float] = Field(
        default=None,
        description="Optional override for model temperature when generating analysis output.",
    )
    max_output_tokens: Optional[int] = Field(
        default=None,
        description="Optional maximum number of tokens to generate in the analysis response.",
    )
|
|
|
|
|
|
class DataImportAnalysisJobAck(BaseModel):
    # Acknowledgement for a data-import analysis job; ``status`` defaults to
    # "accepted" when not supplied.
    import_record_id: str = Field(
        ...,
        description="Echo of the import record identifier",
    )
    status: str = Field(
        default="accepted",
        description="Processing status acknowledgement.",
    )
|
|
|
|
|
|
class ActionType(str, Enum):
    # Identifiers for the pipeline action types. Members are also plain
    # strings thanks to the ``str`` mixin.
    GE_PROFILING = "ge_profiling"
    GE_RESULT_DESC = "ge_result_desc"
    SNIPPET = "snippet"
    SNIPPET_ALIAS = "snippet_alias"
|
|
|
|
|
|
class ActionStatus(str, Enum):
    # Execution status values recorded for a pipeline action. Members are
    # also plain strings thanks to the ``str`` mixin.
    PENDING = "pending"
    RUNNING = "running"
    SUCCESS = "success"
    FAILED = "failed"
    PARTIAL = "partial"
|
|
|
|
|
|
class TableProfilingJobRequest(BaseModel):
    # Request to run the table profiling pipeline.
    # Required: ``table_id``, ``version_ts`` (exactly fourteen digits) and
    # ``callback_url``. Everything else is optional configuration.

    # --- identity and delivery ---------------------------------------------
    table_id: str = Field(..., description="Unique identifier for the table to profile.")
    version_ts: str = Field(
        ...,
        pattern=r"^\d{14}$",
        description="Version timestamp expressed as fourteen digit string (yyyyMMddHHmmss).",
    )
    callback_url: HttpUrl = Field(
        ...,
        description="Callback endpoint invoked after each pipeline action completes.",
    )
    llm_model: Optional[str] = Field(
        default=None,
        description="Default LLM model spec applied to prompt-based actions when overrides are omitted.",
    )

    # --- schema snapshot ---------------------------------------------------
    table_schema: Optional[Any] = Field(
        default=None,
        description="Schema structure snapshot for the current table version.",
    )
    table_schema_version_id: Optional[str] = Field(
        default=None,
        description="Identifier for the schema snapshot provided in table_schema.",
    )

    # --- how to reach the source table -------------------------------------
    table_link_info: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Information describing how to locate the source table for profiling. "
            "For example: {'type': 'sql', 'connection_string': 'mysql+pymysql://user:pass@host/db', "
            "'table': 'schema.table_name'}."
        ),
    )
    table_access_info: Optional[dict[str, Any]] = Field(
        default=None,
        description=(
            "Credentials or supplemental parameters required to access the table described in table_link_info. "
            "These values can be merged into the connection string using Python format placeholders."
        ),
    )

    # --- Great Expectations configuration ----------------------------------
    ge_batch_request: Optional[dict[str, Any]] = Field(
        default=None,
        description="Optional Great Expectations batch request payload used for profiling.",
    )
    ge_expectation_suite_name: Optional[str] = Field(
        default=None,
        description="Expectation suite name used during profiling. Created automatically when absent.",
    )
    ge_data_context_root: Optional[str] = Field(
        default=None,
        description="Custom root directory for the Great Expectations data context. Defaults to project ./gx.",
    )
    ge_datasource_name: Optional[str] = Field(
        default=None,
        description="Datasource name registered inside the GE context when batch_request is not supplied.",
    )
    ge_data_asset_name: Optional[str] = Field(
        default=None,
        description="Data asset reference used when inferring batch request from datasource configuration.",
    )
    ge_profiler_type: str = Field(
        default="user_configurable",
        description="Profiler implementation identifier. Currently supports 'user_configurable' or 'data_assistant'.",
    )

    # --- per-action model overrides ----------------------------------------
    result_desc_model: Optional[str] = Field(
        default=None,
        description="LLM model override used for GE result description (action 2).",
    )
    snippet_model: Optional[str] = Field(
        default=None,
        description="LLM model override used for snippet generation (action 3).",
    )
    snippet_alias_model: Optional[str] = Field(
        default=None,
        description="LLM model override used for snippet alias enrichment (action 4).",
    )

    # --- miscellaneous -----------------------------------------------------
    extra_options: Optional[dict[str, Any]] = Field(
        default=None,
        description="Miscellaneous execution flags applied across pipeline steps.",
    )
|
|
|
|
|
|
class TableProfilingJobAck(BaseModel):
    # Acknowledgement for a table profiling job; ``status`` defaults to
    # "accepted" when not supplied.
    table_id: str = Field(
        ...,
        description="Echo of the table identifier.",
    )
    version_ts: str = Field(
        ...,
        description="Echo of the profiling version timestamp (yyyyMMddHHmmss).",
    )
    status: str = Field(
        default="accepted",
        description="Processing acknowledgement status.",
    )
|
|
|
|
|
|
class TableSnippetUpsertRequest(BaseModel):
    # Upsert payload for one pipeline action record of a table version.
    # Note that ``table_id``, ``version_ts`` and ``table_schema_version_id``
    # are integers in this model.

    # --- record key --------------------------------------------------------
    table_id: int = Field(
        ...,
        ge=1,
        description="Unique identifier for the table.",
    )
    version_ts: int = Field(
        ...,
        ge=0,
        description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
    )
    action_type: ActionType = Field(
        ...,
        description="Pipeline action type for this record.",
    )

    # --- run metadata ------------------------------------------------------
    status: ActionStatus = Field(
        default=ActionStatus.SUCCESS,
        description="Execution status for the action.",
    )
    callback_url: HttpUrl = Field(
        ...,
        description="Callback URL associated with the action run.",
    )
    table_schema_version_id: int = Field(
        ...,
        ge=0,
        description="Identifier for the schema snapshot.",
    )
    table_schema: Any = Field(
        ...,
        description="Schema snapshot payload for the table.",
    )

    # --- results -----------------------------------------------------------
    result_json: Optional[Any] = Field(
        default=None,
        description="Primary result payload for the action (e.g., profiling output, snippet array).",
    )
    result_summary_json: Optional[Any] = Field(
        default=None,
        description="Optional summary payload (e.g., profiling summary) for the action.",
    )
    html_report_url: Optional[str] = Field(
        default=None,
        description="Optional HTML report URL generated by the action.",
    )

    # --- failure details ---------------------------------------------------
    error_code: Optional[str] = Field(
        default=None,
        description="Optional error code when status indicates a failure.",
    )
    error_message: Optional[str] = Field(
        default=None,
        description="Optional error message when status indicates a failure.",
    )

    # --- timing and integrity ----------------------------------------------
    started_at: Optional[datetime] = Field(
        default=None,
        description="Timestamp when the action started executing.",
    )
    finished_at: Optional[datetime] = Field(
        default=None,
        description="Timestamp when the action finished executing.",
    )
    duration_ms: Optional[int] = Field(
        default=None,
        ge=0,
        description="Optional execution duration in milliseconds.",
    )
    result_checksum: Optional[str] = Field(
        default=None,
        description="Optional checksum for the result payload (e.g., MD5).",
    )
|
|
|
|
|
|
class TableSnippetUpsertResponse(BaseModel):
    # Reply to a snippet upsert: the record key and status plus an
    # ``updated`` boolean. All fields are required.
    table_id: int
    version_ts: int
    action_type: ActionType
    status: ActionStatus
    updated: bool
|