Compare commits

1 Commit
main ... dev

Author SHA1 Message Date
82fe7b93b5 Guide to installing and starting the data analysis and governance service 2025-10-30 23:01:19 +08:00
66 changed files with 140 additions and 18372 deletions

.env

@@ -16,11 +16,8 @@ DEFAULT_IMPORT_MODEL=deepseek:deepseek-chat
 # Service configuration
 IMPORT_GATEWAY_BASE_URL=http://localhost:8000

-# prod nbackend base url
-NBACKEND_BASE_URL=https://chatbi.agentcarrier.cn/chatbi/api
-
 # HTTP client configuration
-HTTP_CLIENT_TIMEOUT=120
+HTTP_CLIENT_TIMEOUT=30
 HTTP_CLIENT_TRUST_ENV=false
 # HTTP_CLIENT_PROXY=
@@ -30,5 +27,3 @@ IMPORT_CHAT_TIMEOUT_SECONDS=120
 # Logging
 LOG_LEVEL=INFO
 # LOG_FORMAT=%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s
-NEW_API_BASE_URL=http://localhost:3000
-NEW_API_AUTH_TOKEN="sk-Q79KGFJRs5Vk9HsfFqoiJk948uLMDhAVe037AeCb31URyWGL"
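As a side note, a minimal sketch of how these variables are typically read, assuming a simple app/settings.py module (the real settings module is referenced elsewhere in this diff but not shown; names mirror NEW_API_BASE_URL, NEW_API_AUTH_TOKEN, and HTTP_CLIENT_TIMEOUT above):

# Hypothetical sketch of app/settings.py; the actual module is not part of this diff.
import os

# Base URL and optional token for the new-api component (removed on the dev side).
NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://localhost:3000")
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN", "")

# HTTP client tuning; main keeps 120 seconds, dev falls back to 30.
HTTP_CLIENT_TIMEOUT = float(os.getenv("HTTP_CLIENT_TIMEOUT", "30"))
HTTP_CLIENT_TRUST_ENV = os.getenv("HTTP_CLIENT_TRUST_ENV", "false").lower() == "true"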

.gitignore vendored

@@ -4,5 +4,3 @@ gx/uncommitted/
 **/__pycache__/
 *.pyc
 .DS_Store
-gx/
-logs/


@@ -1,17 +0,0 @@
FROM python:3.11-slim
# Make pip use a China-based mirror globally
ENV PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple/
ENV PIP_TRUSTED_HOST=pypi.tuna.tsinghua.edu.cn
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -2,7 +2,7 @@
 This project exposes a FastAPI-based microservice that provides:

-- A unified chat completions gateway that now forwards requests to the internal `new-api` service (default `http://localhost:3000`) while preserving the same client-facing schema.
+- A unified chat completions gateway supporting multiple LLM providers (OpenAI, Anthropic, OpenRouter, Gemini, Qwen, DeepSeek, etc.)
 - An asynchronous data import analysis pipeline that orchestrates LLM calls to produce structured metadata and processing recommendations

 The following instructions cover environment setup, dependency installation, and running the backend service.
@@ -56,7 +56,6 @@ Copy `.env.example` to `.env` (if provided) or edit `.env` to supply API keys an
 - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`, etc.
 - `HTTP_CLIENT_TIMEOUT`, `IMPORT_CHAT_TIMEOUT_SECONDS`
 - `LOG_LEVEL`, `LOG_FORMAT` for logging
-- `NEW_API_BASE_URL` (defaults to `http://localhost:3000`) and optional `NEW_API_AUTH_TOKEN` if the new-api component enforces authentication.

 ## Run the Backend Service
@@ -78,9 +77,6 @@ nohup uvicorn app.main:app --host 0.0.0.0 --port 8000 > server.log 2>&1 &
 Or use a process manager such as `pm2`, `supervisor`, or systemd for production deployments.

-## API List
-
-1. Import analysis schema endpoint: http://localhost:8000/v1/import/analyze

 ## Additional Commands
 - Run the data import analysis example: `python test/data_import_analysis_example.py`
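As a usage illustration for the gateway described in the README above, a minimal sketch, assuming the service runs on localhost:8000 and that the request body follows the provider/model/messages shape used by LLMRequest elsewhere in this diff (the exact schema is not shown here):

# Hedged example: call the chat completions gateway exposed by app.main.
# Payload field names mirror LLMRequest/LLMMessage as referenced in this diff; adjust if the real schema differs.
import httpx

payload = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "messages": [{"role": "user", "content": "Summarise this table schema."}],
}
response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
response.raise_for_status()
print(response.json())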


@@ -1,26 +0,0 @@
from __future__ import annotations
import os
from functools import lru_cache
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
@lru_cache(maxsize=1)
def get_engine() -> Engine:
"""Return a cached SQLAlchemy engine configured from DATABASE_URL."""
database_url = os.getenv(
"DATABASE_URL",
"mysql+pymysql://root:12345678@localhost:3306/data-ge?charset=utf8mb4",
)
connect_args = {}
if database_url.startswith("sqlite"):
connect_args["check_same_thread"] = False
return create_engine(
database_url,
pool_pre_ping=True,
future=True,
connect_args=connect_args,
)
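A short usage sketch for the get_engine() helper shown above; the import path follows the `from app.db import get_engine` usage elsewhere in this diff, and the SQLite URL plus SELECT 1 are purely illustrative:

# Illustrative only: exercise the cached engine with a trivial query.
import os
from sqlalchemy import text
from app.db import get_engine

# Assumption: any SQLAlchemy URL works; set it before the first (cached) get_engine() call.
os.environ.setdefault("DATABASE_URL", "sqlite:///./local.db")

engine = get_engine()
with engine.begin() as conn:
    print(conn.execute(text("SELECT 1")).scalar())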


@@ -2,65 +2,43 @@ from __future__ import annotations
 import asyncio
 import logging
-import logging.config
 import os
 from contextlib import asynccontextmanager
 from typing import Any

-import yaml
 import httpx
 from fastapi import Depends, FastAPI, HTTPException, Request
-from fastapi.exceptions import RequestValidationError
-from fastapi.responses import JSONResponse

 from app.exceptions import ProviderAPICallError, ProviderConfigurationError
 from app.models import (
-    ActionStatus,
-    ActionType,
     DataImportAnalysisJobAck,
     DataImportAnalysisJobRequest,
     LLMRequest,
     LLMResponse,
-    TableProfilingJobAck,
-    TableProfilingJobRequest,
-    TableSnippetRagIngestRequest,
-    TableSnippetRagIngestResponse,
-    TableSnippetUpsertRequest,
-    TableSnippetUpsertResponse,
 )
-from app.routers import chat_router, metrics_router
 from app.services import LLMGateway
 from app.services.import_analysis import process_import_analysis_job
-from app.services.table_profiling import process_table_profiling_job
-from app.services.table_snippet import ingest_snippet_rag_from_db, upsert_action_result


-def _ensure_log_directories(config: dict[str, Any]) -> None:
-    handlers = config.get("handlers", {})
-    for handler_config in handlers.values():
-        filename = handler_config.get("filename")
-        if not filename:
-            continue
-        directory = os.path.dirname(filename)
-        if directory and not os.path.exists(directory):
-            os.makedirs(directory, exist_ok=True)
-
-
 def _configure_logging() -> None:
-    config_path = os.getenv("LOGGING_CONFIG", "logging.yaml")
-    if os.path.exists(config_path):
-        with open(config_path, "r", encoding="utf-8") as fh:
-            config = yaml.safe_load(fh)
-        if isinstance(config, dict):
-            _ensure_log_directories(config)
-            logging.config.dictConfig(config)
-            return
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s",
-    )
+    level_name = os.getenv("LOG_LEVEL", "INFO").upper()
+    level = getattr(logging, level_name, logging.INFO)
+    log_format = os.getenv(
+        "LOG_FORMAT",
+        "%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s",
+    )
+    root = logging.getLogger()
+    if not root.handlers:
+        logging.basicConfig(level=level, format=log_format)
+    else:
+        root.setLevel(level)
+        formatter = logging.Formatter(log_format)
+        for handler in root.handlers:
+            handler.setLevel(level)
+            handler.setFormatter(formatter)


 _configure_logging()
 logger = logging.getLogger(__name__)
@@ -140,27 +118,6 @@ def create_app() -> FastAPI:
version="0.1.0",
lifespan=lifespan,
)
# Chat/metric management APIs
application.include_router(chat_router)
application.include_router(metrics_router)
@application.exception_handler(RequestValidationError)
async def request_validation_exception_handler(
request: Request, exc: RequestValidationError
) -> JSONResponse:
try:
raw_body = await request.body()
except Exception: # pragma: no cover - defensive
raw_body = b"<unavailable>"
truncated_body = raw_body[:4096]
logger.warning(
"Validation error on %s %s: %s | body preview=%s",
request.method,
request.url.path,
exc.errors(),
truncated_body.decode("utf-8", errors="ignore"),
)
return JSONResponse(status_code=422, content={"detail": exc.errors()})
@application.post(
"/v1/chat/completions",
@@ -207,109 +164,6 @@ def create_app() -> FastAPI:
return DataImportAnalysisJobAck(import_record_id=payload.import_record_id, status="accepted")
@application.post(
"/v1/table/profiling",
response_model=TableProfilingJobAck,
summary="Run end-to-end GE profiling pipeline and notify via callback per action",
status_code=202,
)
async def run_table_profiling(
payload: TableProfilingJobRequest,
gateway: LLMGateway = Depends(get_gateway),
client: httpx.AsyncClient = Depends(get_http_client),
) -> TableProfilingJobAck:
request_copy = payload.model_copy(deep=True)
async def _runner() -> None:
await process_table_profiling_job(request_copy, gateway, client)
asyncio.create_task(_runner())
return TableProfilingJobAck(
table_id=payload.table_id,
version_ts=payload.version_ts,
status="accepted",
)
@application.post(
"/v1/table/snippet",
response_model=TableSnippetUpsertResponse,
summary="Persist or update action results, such as table snippets.",
)
async def upsert_table_snippet(
payload: TableSnippetUpsertRequest,
client: httpx.AsyncClient = Depends(get_http_client),
) -> TableSnippetUpsertResponse:
request_copy = payload.model_copy(deep=True)
try:
response = await asyncio.to_thread(upsert_action_result, request_copy)
except Exception as exc:
logger.error(
"Failed to upsert table snippet: table_id=%s version_ts=%s action_type=%s",
payload.table_id,
payload.version_ts,
payload.action_type,
exc_info=True,
)
raise HTTPException(status_code=500, detail=str(exc)) from exc
else:
# After snippet_alias is stored, automatically trigger RAG ingest when configured.
if (
payload.action_type == ActionType.SNIPPET_ALIAS
and payload.status == ActionStatus.SUCCESS
and payload.rag_workspace_id is not None
):
try:
await ingest_snippet_rag_from_db(
table_id=payload.table_id,
version_ts=payload.version_ts,
workspace_id=payload.rag_workspace_id,
rag_item_type=payload.rag_item_type or "SNIPPET",
client=client,
)
except Exception:
logger.exception(
"Failed to ingest snippet RAG artifacts after snippet_alias upsert",
extra={
"table_id": payload.table_id,
"version_ts": payload.version_ts,
"workspace_id": payload.rag_workspace_id,
},
)
return response
@application.post(
"/v1/table/snippet/rag_ingest",
response_model=TableSnippetRagIngestResponse,
summary="Merge snippet+alias results from action_results and ingest into RAG.",
)
async def ingest_snippet_rag(
payload: TableSnippetRagIngestRequest,
client: httpx.AsyncClient = Depends(get_http_client),
) -> TableSnippetRagIngestResponse:
try:
rag_item_ids = await ingest_snippet_rag_from_db(
table_id=payload.table_id,
version_ts=payload.version_ts,
workspace_id=payload.workspace_id,
rag_item_type=payload.rag_item_type or "SNIPPET",
client=client,
)
except Exception as exc:
logger.exception(
"Failed to ingest snippet RAG artifacts",
extra={
"table_id": payload.table_id,
"version_ts": payload.version_ts,
"workspace_id": payload.workspace_id,
},
)
raise HTTPException(status_code=500, detail=str(exc)) from exc
return TableSnippetRagIngestResponse(rag_item_ids=rag_item_ids)
@application.post("/__mock__/import-callback") @application.post("/__mock__/import-callback")
async def mock_import_callback(payload: dict[str, Any]) -> dict[str, str]: async def mock_import_callback(payload: dict[str, Any]) -> dict[str, str]:
logger.info("Received import analysis callback: %s", payload) logger.info("Received import analysis callback: %s", payload)


@@ -1,6 +1,5 @@
 from __future__ import annotations

-from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, List, Optional, Union
@@ -77,8 +76,8 @@ class DataImportAnalysisRequest(BaseModel):
         description="Ordered list of table headers associated with the data.",
     )
     llm_model: str = Field(
-        None,
-        description="Model identifier. Accepts 'provider:model_name' format or custom model alias.",
+        ...,
+        description="Model identifier. Accepts 'provider:model' format or plain model name.",
     )
     temperature: Optional[float] = Field(
         None,
@@ -136,235 +135,3 @@ class DataImportAnalysisJobRequest(BaseModel):
 class DataImportAnalysisJobAck(BaseModel):
     import_record_id: str = Field(..., description="Echo of the import record identifier")
     status: str = Field("accepted", description="Processing status acknowledgement.")
class ActionType(str, Enum):
GE_PROFILING = "ge_profiling"
GE_RESULT_DESC = "ge_result_desc"
SNIPPET = "snippet"
SNIPPET_ALIAS = "snippet_alias"
class ActionStatus(str, Enum):
PENDING = "pending"
RUNNING = "running"
SUCCESS = "success"
FAILED = "failed"
PARTIAL = "partial"
class TableProfilingJobRequest(BaseModel):
table_id: str = Field(..., description="Unique identifier for the table to profile.")
version_ts: str = Field(
...,
pattern=r"^\d{14}$",
description="Version timestamp expressed as fourteen digit string (yyyyMMddHHmmss).",
)
callback_url: HttpUrl = Field(
...,
description="Callback endpoint invoked after each pipeline action completes.",
)
llm_model: Optional[str] = Field(
None,
description="Default LLM model spec applied to prompt-based actions when overrides are omitted.",
)
table_schema: Optional[Any] = Field(
None,
description="Schema structure snapshot for the current table version.",
)
table_schema_version_id: Optional[str] = Field(
None,
description="Identifier for the schema snapshot provided in table_schema.",
)
table_link_info: Optional[Dict[str, Any]] = Field(
None,
description=(
"Information describing how to locate the source table for profiling. "
"For example: {'type': 'sql', 'connection_string': 'mysql+pymysql://user:pass@host/db', "
"'table': 'schema.table_name'}."
),
)
table_access_info: Optional[Dict[str, Any]] = Field(
None,
description=(
"Credentials or supplemental parameters required to access the table described in table_link_info. "
"These values can be merged into the connection string using Python format placeholders."
),
)
ge_batch_request: Optional[Dict[str, Any]] = Field(
None,
description="Optional Great Expectations batch request payload used for profiling.",
)
ge_expectation_suite_name: Optional[str] = Field(
None,
description="Expectation suite name used during profiling. Created automatically when absent.",
)
ge_data_context_root: Optional[str] = Field(
None,
description="Custom root directory for the Great Expectations data context. Defaults to project ./gx.",
)
ge_datasource_name: Optional[str] = Field(
None,
description="Datasource name registered inside the GE context when batch_request is not supplied.",
)
ge_data_asset_name: Optional[str] = Field(
None,
description="Data asset reference used when inferring batch request from datasource configuration.",
)
ge_profiler_type: str = Field(
"user_configurable",
description="Profiler implementation identifier. Currently supports 'user_configurable' or 'data_assistant'.",
)
result_desc_model: Optional[str] = Field(
None,
description="LLM model override used for GE result description (action 2).",
)
snippet_model: Optional[str] = Field(
None,
description="LLM model override used for snippet generation (action 3).",
)
snippet_alias_model: Optional[str] = Field(
None,
description="LLM model override used for snippet alias enrichment (action 4).",
)
extra_options: Optional[Dict[str, Any]] = Field(
None,
description="Miscellaneous execution flags applied across pipeline steps.",
)
workspace_id: Optional[int] = Field(
None,
ge=0,
description="Optional workspace identifier forwarded to snippet_alias callback for RAG ingestion.",
)
rag_item_type: Optional[str] = Field(
"SNIPPET",
description="Optional RAG item type forwarded to snippet_alias callback.",
)
class TableProfilingJobAck(BaseModel):
table_id: str = Field(..., description="Echo of the table identifier.")
version_ts: str = Field(..., description="Echo of the profiling version timestamp (yyyyMMddHHmmss).")
status: str = Field("accepted", description="Processing acknowledgement status.")
class TableSnippetUpsertRequest(BaseModel):
table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
version_ts: int = Field(
...,
ge=0,
description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
)
workspace_id: Optional[int] = Field(
None,
ge=0,
description="Optional workspace identifier for RAG ingestion; when provided and action_type=snippet_alias "
"with status=success, merged snippets will be written to rag_snippet and pushed to RAG.",
)
rag_item_type: Optional[str] = Field(
"SNIPPET",
description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
)
action_type: ActionType = Field(..., description="Pipeline action type for this record.")
status: ActionStatus = Field(
ActionStatus.SUCCESS, description="Execution status for the action."
)
callback_url: HttpUrl = Field(..., description="Callback URL associated with the action run.")
table_schema_version_id: int = Field(..., ge=0, description="Identifier for the schema snapshot.")
table_schema: Any = Field(..., description="Schema snapshot payload for the table.")
model: Optional[str] = Field(
None,
description="LLM model identifier (can be provider alias) used for this action, when applicable.",
)
model_provider: Optional[str] = Field(
None,
description="LLM provider responsible for executing the action's model.",
)
model_params: Optional[Dict[str, Any]] = Field(
None,
description="Optional model parameter overrides (e.g., temperature) associated with the action.",
)
llm_usage: Optional[Any] = Field(
None,
description="Optional token usage metrics reported by the LLM provider.",
)
ge_profiling_json: Optional[Any] = Field(
None, description="Full GE profiling result payload for the profiling action."
)
ge_profiling_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the GE profiling result JSON."
)
ge_profiling_summary: Optional[Any] = Field(
None, description="Sanitised GE profiling summary payload."
)
ge_profiling_summary_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the GE profiling summary JSON."
)
ge_profiling_total_size_bytes: Optional[int] = Field(
None, ge=0, description="Combined size (bytes) of profiling result + summary."
)
ge_profiling_html_report_url: Optional[str] = Field(
None, description="Optional URL to the generated GE profiling HTML report."
)
ge_result_desc_json: Optional[Any] = Field(
None, description="Result JSON for the GE result description action."
)
ge_result_desc_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the GE result description JSON."
)
snippet_json: Optional[Any] = Field(
None, description="Snippet generation action result JSON."
)
snippet_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the snippet result JSON."
)
snippet_alias_json: Optional[Any] = Field(
None, description="Snippet alias expansion result JSON."
)
snippet_alias_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the snippet alias result JSON."
)
error_code: Optional[str] = Field(None, description="Optional error code when status indicates a failure.")
error_message: Optional[str] = Field(None, description="Optional error message when status indicates a failure.")
started_at: Optional[datetime] = Field(
None, description="Timestamp when the action started executing."
)
finished_at: Optional[datetime] = Field(
None, description="Timestamp when the action finished executing."
)
duration_ms: Optional[int] = Field(
None,
ge=0,
description="Optional execution duration in milliseconds.",
)
class TableSnippetRagIngestRequest(BaseModel):
table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
version_ts: int = Field(
...,
ge=0,
description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
)
workspace_id: int = Field(..., ge=0, description="Workspace id used when pushing snippets to RAG.")
rag_item_type: Optional[str] = Field(
"SNIPPET",
description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
)
class TableSnippetRagIngestResponse(BaseModel):
rag_item_ids: List[int] = Field(..., description="List of ingested rag_item_ids.")
result_checksum: Optional[str] = Field(
None,
description="Optional checksum for the result payload (e.g., MD5).",
)
class TableSnippetUpsertResponse(BaseModel):
table_id: int
version_ts: int
action_type: ActionType
status: ActionStatus
updated: bool
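For reference, an example request body for the removed /v1/table/profiling endpoint, built only from the TableProfilingJobRequest fields defined above; all values are placeholders:

# Example payload matching TableProfilingJobRequest as defined above; values are placeholders.
example_profiling_job = {
    "table_id": "sales_orders",
    "version_ts": "20251030230119",  # must match the ^\d{14}$ pattern (yyyyMMddHHmmss)
    "callback_url": "http://localhost:8000/__mock__/import-callback",
    "llm_model": "deepseek:deepseek-chat",
    "table_link_info": {
        "type": "sql",
        "connection_string": "mysql+pymysql://user:pass@host/db",
        "table": "schema.table_name",
    },
    "ge_profiler_type": "user_configurable",
    "workspace_id": 1,
    "rag_item_type": "SNIPPET",
}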


@@ -1,4 +0,0 @@
from .chat import router as chat_router
from .metrics import router as metrics_router
__all__ = ["chat_router", "metrics_router"]


@@ -1,102 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, HTTPException, Query
from app.schemas.chat import (
ChatSessionCreate,
ChatSessionUpdate,
ChatTurnCreate,
ChatTurnRetrievalBatch,
)
from app.services import metric_store
router = APIRouter(prefix="/api/v1/chat", tags=["chat"])
@router.post("/sessions")
def create_session(payload: ChatSessionCreate) -> Any:
"""Create a chat session."""
return metric_store.create_chat_session(payload)
@router.post("/sessions/{session_id}/update")
def update_session(session_id: int, payload: ChatSessionUpdate) -> Any:
try:
return metric_store.update_chat_session(session_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Session not found")
@router.post("/sessions/{session_id}/close")
def close_session(session_id: int) -> Any:
"""Close a chat session and stamp end_time."""
try:
return metric_store.close_chat_session(session_id)
except KeyError:
raise HTTPException(status_code=404, detail="Session not found")
@router.get("/sessions/{session_id}")
def get_session(session_id: int) -> Any:
"""Fetch one session."""
session = metric_store.get_chat_session(session_id)
if not session:
raise HTTPException(status_code=404, detail="Session not found")
return session
@router.get("/sessions")
def list_sessions(
user_id: Optional[int] = None,
status: Optional[str] = None,
start_from: Optional[datetime] = Query(None, description="Filter by start time lower bound."),
start_to: Optional[datetime] = Query(None, description="Filter by start time upper bound."),
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
return metric_store.list_chat_sessions(
user_id=user_id,
status=status,
start_from=start_from,
start_to=start_to,
limit=limit,
offset=offset,
)
@router.post("/sessions/{session_id}/turns")
def create_turn(session_id: int, payload: ChatTurnCreate) -> Any:
"""Create a turn under a session."""
try:
return metric_store.create_chat_turn(session_id, payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/sessions/{session_id}/turns")
def list_turns(session_id: int) -> List[Any]:
return metric_store.list_chat_turns(session_id)
@router.get("/turns/{turn_id}")
def get_turn(turn_id: int) -> Any:
turn = metric_store.get_chat_turn(turn_id)
if not turn:
raise HTTPException(status_code=404, detail="Turn not found")
return turn
@router.post("/turns/{turn_id}/retrievals")
def write_retrievals(turn_id: int, payload: ChatTurnRetrievalBatch) -> Any:
"""Batch write retrieval records for a turn."""
count = metric_store.create_retrievals(turn_id, payload.retrievals)
return {"turn_id": turn_id, "inserted": count}
@router.get("/turns/{turn_id}/retrievals")
def list_retrievals(turn_id: int) -> List[Any]:
return metric_store.list_retrievals(turn_id)


@@ -1,166 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, HTTPException, Query
from app.schemas.metrics import (
MetricCreate,
MetricResultsWriteRequest,
MetricRunTrigger,
MetricScheduleCreate,
MetricScheduleUpdate,
MetricUpdate,
)
from app.services import metric_store
router = APIRouter(prefix="/api/v1", tags=["metrics"])
@router.post("/metrics")
def create_metric(payload: MetricCreate) -> Any:
"""Create a metric definition."""
try:
return metric_store.create_metric(payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.post("/metrics/{metric_id}")
def update_metric(metric_id: int, payload: MetricUpdate) -> Any:
"""Update fields of a metric definition."""
try:
return metric_store.update_metric(metric_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Metric not found")
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metrics/{metric_id}")
def get_metric(metric_id: int) -> Any:
"""Fetch a metric definition by id."""
metric = metric_store.get_metric(metric_id)
if not metric:
raise HTTPException(status_code=404, detail="Metric not found")
return metric
@router.get("/metrics")
def list_metrics(
biz_domain: Optional[str] = None,
is_active: Optional[bool] = None,
keyword: Optional[str] = Query(None, description="Search by code/name"),
limit: int = Query(100, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""List metrics with optional filters."""
return metric_store.list_metrics(
biz_domain=biz_domain,
is_active=is_active,
keyword=keyword,
limit=limit,
offset=offset,
)
@router.post("/metric-schedules")
def create_schedule(payload: MetricScheduleCreate) -> Any:
"""Create a metric schedule."""
try:
return metric_store.create_metric_schedule(payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.post("/metric-schedules/{schedule_id}")
def update_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Any:
"""Update a metric schedule."""
try:
return metric_store.update_metric_schedule(schedule_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Schedule not found")
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metrics/{metric_id}/schedules")
def list_schedules(metric_id: int) -> List[Any]:
"""List schedules for one metric."""
return metric_store.list_schedules_for_metric(metric_id=metric_id)
@router.post("/metric-runs/trigger")
def trigger_run(payload: MetricRunTrigger) -> Any:
"""Insert a run record (execution handled externally)."""
try:
return metric_store.trigger_metric_run(payload)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metric-runs")
def list_runs(
metric_id: Optional[int] = None,
status: Optional[str] = None,
limit: int = Query(100, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""List run records."""
return metric_store.list_metric_runs(
metric_id=metric_id, status=status, limit=limit, offset=offset
)
@router.get("/metric-runs/{run_id}")
def get_run(run_id: int) -> Any:
"""Fetch run details."""
run = metric_store.get_metric_run(run_id)
if not run:
raise HTTPException(status_code=404, detail="Run not found")
return run
@router.post("/metric-results/{metric_id}")
def write_results(metric_id: int, payload: MetricResultsWriteRequest) -> Any:
# Align path metric_id with payload to avoid mismatch.
if payload.metric_id != metric_id:
raise HTTPException(status_code=400, detail="metric_id in path/body mismatch")
try:
inserted = metric_store.write_metric_results(payload)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
return {"metric_id": metric_id, "inserted": inserted}
@router.get("/metric-results")
def query_results(
metric_id: int,
stat_from: Optional[datetime] = None,
stat_to: Optional[datetime] = None,
limit: int = Query(200, ge=1, le=1000),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""Query metric results by time range."""
return metric_store.query_metric_results(
metric_id=metric_id,
stat_from=stat_from,
stat_to=stat_to,
limit=limit,
offset=offset,
)
@router.get("/metric-results/latest")
def latest_result(metric_id: int) -> Any:
"""Fetch the latest metric result."""
result = metric_store.latest_metric_result(metric_id)
if not result:
raise HTTPException(status_code=404, detail="Metric result not found")
return result
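A hedged sketch of calling the removed metric endpoints above; field names follow MetricCreate and MetricRunTrigger defined later in this diff, and the base URL is an assumption:

# Hedged example: exercise the removed metric endpoints; field names follow MetricCreate/MetricRunTrigger.
import httpx

BASE = "http://localhost:8000/api/v1"  # assumption: service host and port

metric = httpx.post(f"{BASE}/metrics", json={
    "metric_code": "daily_gmv",
    "metric_name": "Daily GMV",
    "biz_domain": "sales",
    "base_sql": "SELECT DATE(order_time) AS stat_time, SUM(amount) FROM orders GROUP BY 1",
    "time_grain": "DAY",
    "dim_binding": ["region"],
    "update_strategy": "FULL",
}).json()

# Record a run for the newly created metric (execution itself is handled externally).
run = httpx.post(f"{BASE}/metric-runs/trigger", json={"metric_id": metric["id"]}).json()
print(metric, run)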


@@ -1,53 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class ChatSessionCreate(BaseModel):
"""Create a chat session to group multiple turns for a user."""
user_id: int = Field(..., description="User ID owning the session.")
session_uuid: Optional[str] = Field(None, description="Optional externally provided UUID.")
status: Optional[str] = Field("OPEN", description="Session status, default OPEN.")
end_time: Optional[datetime] = Field(None, description="Optional end time.")
ext_context: Optional[dict[str, Any]] = Field(None, description="Arbitrary business context.")
class ChatSessionUpdate(BaseModel):
"""Partial update for a chat session."""
status: Optional[str] = Field(None, description="New session status.")
end_time: Optional[datetime] = Field(None, description="Close time override.")
last_turn_id: Optional[int] = Field(None, description="Pointer to last chat turn.")
ext_context: Optional[dict[str, Any]] = Field(None, description="Context patch.")
class ChatTurnCreate(BaseModel):
"""Create a single chat turn with intent/SQL context."""
user_id: int = Field(..., description="User ID for this turn.")
user_query: str = Field(..., description="Raw user query content.")
intent: Optional[str] = Field(None, description="Intent tag such as METRIC_QUERY.")
ast_json: Optional[dict[str, Any]] = Field(None, description="Parsed AST payload.")
generated_sql: Optional[str] = Field(None, description="Final SQL text, if generated.")
sql_status: Optional[str] = Field(None, description="SQL generation/execution status.")
error_msg: Optional[str] = Field(None, description="Error message when SQL failed.")
main_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs referenced in this turn.")
created_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs created in this turn.")
end_time: Optional[datetime] = Field(None, description="Turn end time.")
class ChatTurnRetrievalItem(BaseModel):
"""Record of one retrieved item contributing to a turn."""
item_type: str = Field(..., description="METRIC/SNIPPET/CHAT etc.")
item_id: str = Field(..., description="Identifier such as metric_id or snippet_id.")
item_extra: Optional[dict[str, Any]] = Field(None, description="Additional context like column name.")
similarity_score: Optional[float] = Field(None, description="Similarity score.")
rank_no: Optional[int] = Field(None, description="Ranking position.")
used_in_reasoning: Optional[bool] = Field(False, description="Flag if used in reasoning.")
used_in_sql: Optional[bool] = Field(False, description="Flag if used in final SQL.")
class ChatTurnRetrievalBatch(BaseModel):
"""Batch insert wrapper for retrieval records."""
retrievals: List[ChatTurnRetrievalItem]


@@ -1,99 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class MetricCreate(BaseModel):
"""Create a metric definition with business and technical metadata."""
metric_code: str = Field(..., description="Internal metric code, unique.")
metric_name: str = Field(..., description="Display name.")
metric_aliases: Optional[List[str]] = Field(None, description="Optional alias list.")
biz_domain: str = Field(..., description="Business domain identifier.")
biz_desc: Optional[str] = Field(None, description="Business definition.")
chat_turn_id: Optional[int] = Field(None, description="Source chat turn ID.")
tech_desc: Optional[str] = Field(None, description="Technical definition.")
formula_expr: Optional[str] = Field(None, description="Formula expression text.")
base_sql: str = Field(..., description="Canonical SQL used to compute the metric.")
time_grain: str = Field(..., description="DAY/HOUR/WEEK/MONTH etc.")
dim_binding: List[str] = Field(..., description="Dimension columns bound to the metric.")
update_strategy: str = Field(..., description="FULL/INCR/REALTIME.")
schedule_id: Optional[int] = Field(None, description="Linked schedule id if any.")
schedule_type: Optional[int] = Field(None, description="Scheduler type identifier.")
is_active: bool = Field(True, description="Whether the metric is enabled.")
created_by: Optional[int] = Field(None, description="Creator user id.")
updated_by: Optional[int] = Field(None, description="Updater user id.")
class MetricUpdate(BaseModel):
"""Partial update for an existing metric definition."""
metric_name: Optional[str] = None
metric_aliases: Optional[List[str]] = None
biz_domain: Optional[str] = None
biz_desc: Optional[str] = None
tech_desc: Optional[str] = None
formula_expr: Optional[str] = None
base_sql: Optional[str] = None
time_grain: Optional[str] = None
dim_binding: Optional[List[str]] = None
update_strategy: Optional[str] = None
schedule_id: Optional[int] = None
schedule_type: Optional[int] = None
is_active: Optional[bool] = None
updated_by: Optional[int] = None
class MetricScheduleCreate(BaseModel):
"""Create a cron-based schedule for a metric."""
metric_id: int
cron_expr: str
enabled: bool = True
priority: int = 10
backfill_allowed: bool = True
max_runtime_sec: Optional[int] = None
retry_times: int = 0
owner_team: Optional[str] = None
owner_user_id: Optional[int] = None
class MetricScheduleUpdate(BaseModel):
"""Update fields of an existing metric schedule."""
cron_expr: Optional[str] = None
enabled: Optional[bool] = None
priority: Optional[int] = None
backfill_allowed: Optional[bool] = None
max_runtime_sec: Optional[int] = None
retry_times: Optional[int] = None
owner_team: Optional[str] = None
owner_user_id: Optional[int] = None
class MetricRunTrigger(BaseModel):
"""Trigger a metric run, optionally linking to a chat turn or schedule."""
metric_id: int
schedule_id: Optional[int] = None
source_turn_id: Optional[int] = None
data_time_from: Optional[datetime] = None
data_time_to: Optional[datetime] = None
metric_version: Optional[int] = None
base_sql_snapshot: Optional[str] = None
triggered_by: str = Field("API", description="SCHEDULER/MANUAL/API/QA_TURN")
triggered_at: Optional[datetime] = None
class MetricResultItem(BaseModel):
"""Single metric result row to be persisted."""
stat_time: datetime
metric_value: float
metric_version: Optional[int] = None
extra_dims: Optional[dict[str, Any]] = None
load_time: Optional[datetime] = None
data_version: Optional[int] = None
class MetricResultsWriteRequest(BaseModel):
"""Batch write request for metric results."""
metric_id: int
results: List[MetricResultItem]


@@ -1,46 +0,0 @@
from __future__ import annotations
from typing import Any, List
from pydantic import BaseModel, ConfigDict, Field
class RagItemPayload(BaseModel):
"""Payload for creating or updating a single RAG item."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
id: int = Field(..., description="Unique identifier for the RAG item.")
workspace_id: int = Field(..., alias="workspaceId", description="Workspace identifier.")
name: str = Field(..., description="Readable name of the item.")
embedding_data: str = Field(..., alias="embeddingData", description="Serialized embedding payload.")
type: str = Field(..., description='Item type, e.g. "METRIC".')
class RagDeleteRequest(BaseModel):
"""Payload for deleting a single RAG item."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
id: int = Field(..., description="Identifier of the item to delete.")
type: str = Field(..., description="Item type matching the stored record.")
class RagRetrieveRequest(BaseModel):
"""Payload for retrieving RAG items by semantic query."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
query: str = Field(..., description="Search query text.")
num: int = Field(..., description="Number of items to return.")
workspace_id: int = Field(..., alias="workspaceId", description="Workspace scope for the search.")
type: str = Field(..., description="Item type to search, e.g. METRIC.")
class RagRetrieveResponse(BaseModel):
"""Generic RAG retrieval response wrapper."""
model_config = ConfigDict(extra="allow")
data: List[Any] = Field(default_factory=list, description="Retrieved items.")
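A small sketch of the alias handling these models enable through populate_by_name; the values are placeholders:

# Illustrates the alias handling enabled by populate_by_name on RagItemPayload above.
item = RagItemPayload(
    id=1,
    workspace_id=42,        # snake_case accepted because populate_by_name=True
    name="orders snippet",
    embedding_data="{...}",  # serialized embedding payload
    type="SNIPPET",
)
print(item.model_dump(by_alias=True))  # emits workspaceId / embeddingData for the RAG API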


@@ -1,4 +1,3 @@
 from .gateway import LLMGateway
-from .rag_client import RagAPIClient

-__all__ = ["LLMGateway", "RagAPIClient"]
+__all__ = ["LLMGateway"]


@@ -1,93 +1,53 @@
 from __future__ import annotations

-import logging
+import os
+from typing import Dict, Type

 import httpx
-from pydantic import ValidationError

-from app.exceptions import ProviderAPICallError
-from app.models import LLMChoice, LLMMessage, LLMRequest, LLMResponse
-from app.settings import NEW_API_AUTH_TOKEN, NEW_API_BASE_URL
-
-logger = logging.getLogger(__name__)
+from app.exceptions import ProviderConfigurationError
+from app.models import LLMProvider, LLMRequest, LLMResponse
+from app.providers import (
+    AnthropicProvider,
+    DeepSeekProvider,
+    GeminiProvider,
+    LLMProviderClient,
+    OpenAIProvider,
+    OpenRouterProvider,
+    QwenProvider,
+)


 class LLMGateway:
-    """Forward chat requests to the configured new-api component."""
+    """Simple registry that dispatches chat requests to provider clients."""

-    def __init__(
-        self,
-        *,
-        base_url: str | None = None,
-        auth_token: str | None = None,
-    ) -> None:
-        resolved_base = base_url or NEW_API_BASE_URL
-        self._base_url = resolved_base.rstrip("/")
-        self._auth_token = auth_token or NEW_API_AUTH_TOKEN
+    def __init__(self) -> None:
+        self._providers: Dict[LLMProvider, LLMProviderClient] = {}
+        self._factory: Dict[LLMProvider, Type[LLMProviderClient]] = {
+            LLMProvider.OPENAI: OpenAIProvider,
+            LLMProvider.ANTHROPIC: AnthropicProvider,
+            LLMProvider.OPENROUTER: OpenRouterProvider,
+            LLMProvider.GEMINI: GeminiProvider,
+            LLMProvider.QWEN: QwenProvider,
+            LLMProvider.DEEPSEEK: DeepSeekProvider,
+        }
+
+    def get_provider(self, provider: LLMProvider) -> LLMProviderClient:
+        if provider not in self._factory:
+            raise ProviderConfigurationError(f"Unsupported provider '{provider.value}'.")
+        if provider not in self._providers:
+            self._providers[provider] = self._build_provider(provider)
+        return self._providers[provider]
+
+    def _build_provider(self, provider: LLMProvider) -> LLMProviderClient:
+        provider_cls = self._factory[provider]
+        api_key_env = getattr(provider_cls, "api_key_env", None)
+        api_key = os.getenv(api_key_env) if api_key_env else None
+        return provider_cls(api_key)

     async def chat(
         self, request: LLMRequest, client: httpx.AsyncClient
     ) -> LLMResponse:
-        url = f"{self._base_url}/v1/chat/completions"
-        payload = request.model_dump(mode="json", exclude_none=True)
-        headers = {"Content-Type": "application/json"}
-        if self._auth_token:
-            headers["Authorization"] = f"Bearer {self._auth_token}"
-        logger.info("Forwarding chat request to new-api at %s", url)
-        try:
-            response = await client.post(url, json=payload, headers=headers)
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            status_code = exc.response.status_code if exc.response else None
-            response_text = exc.response.text if exc.response else ""
-            logger.error(
-                "new-api upstream returned %s: %s",
-                status_code,
-                response_text,
-                exc_info=True,
-            )
-            raise ProviderAPICallError(
-                "Chat completion request failed.",
-                status_code=status_code,
-                response_text=response_text,
-            ) from exc
-        except httpx.HTTPError as exc:
-            logger.error("new-api transport error: %s", exc, exc_info=True)
-            raise ProviderAPICallError(f"Chat completion request failed: {exc}") from exc
-        try:
-            data = response.json()
-        except ValueError as exc:
-            logger.error("new-api responded with invalid JSON.", exc_info=True)
-            raise ProviderAPICallError(
-                "Chat completion response was not valid JSON."
-            ) from exc
-        logger.info("new-api payload: %s", data)
-        normalized_choices: list[LLMChoice] = []
-        for idx, choice in enumerate(data.get("choices", []) or []):
-            message_payload = choice.get("message") or {}
-            message = LLMMessage(
-                role=message_payload.get("role", "assistant"),
-                content=message_payload.get("content", ""),
-            )
-            normalized_choices.append(
-                LLMChoice(index=choice.get("index", idx), message=message)
-            )
-        try:
-            normalized_response = LLMResponse(
-                provider=request.provider,
-                model=data.get("model", request.model),
-                choices=normalized_choices,
-                raw=data,
-            )
-            return normalized_response
-        except ValidationError as exc:
-            logger.error(
-                "new-api response did not match expected schema: %s", data, exc_info=True
-            )
-            raise ProviderAPICallError(
-                "Chat completion response was not in the expected format."
-            ) from exc
+        provider_client = self.get_provider(request.provider)
+        return await provider_client.chat(request, client)
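A hedged usage sketch for LLMGateway.chat, which has the same signature on both sides of this diff; the LLMRequest/LLMMessage field names and LLMRole.USER member are assumptions drawn from how they are used elsewhere in the diff:

# Hedged usage sketch; field names assumed from LLMRequest/LLMMessage usage in this diff.
import asyncio
import httpx
from app.models import LLMMessage, LLMRequest, LLMRole
from app.services import LLMGateway

async def main() -> None:
    gateway = LLMGateway()
    request = LLMRequest(
        provider="deepseek",  # assumption: coerced to the LLMProvider enum
        model="deepseek-chat",
        messages=[LLMMessage(role=LLMRole.USER, content="ping")],  # assumption: LLMRole.USER exists
    )
    async with httpx.AsyncClient(timeout=120) as client:
        response = await gateway.chat(request, client)
        print(response.choices[0].message.content)

asyncio.run(main())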


@@ -22,24 +22,13 @@ from app.models import (
     LLMResponse,
     LLMRole,
 )
-from app.settings import (
-    DEFAULT_IMPORT_MODEL,
-    NEW_API_AUTH_TOKEN,
-    NEW_API_BASE_URL,
-    get_supported_import_models,
-)
-from app.utils.llm_usage import extract_usage
+from app.settings import DEFAULT_IMPORT_MODEL, get_supported_import_models

 logger = logging.getLogger(__name__)

-IMPORT_GATEWAY_BASE_URL = os.getenv("IMPORT_GATEWAY_BASE_URL", NEW_API_BASE_URL)
-
-
-def build_import_gateway_headers() -> dict[str, str]:
-    headers = {"Content-Type": "application/json"}
-    if NEW_API_AUTH_TOKEN:
-        headers["Authorization"] = f"Bearer {NEW_API_AUTH_TOKEN}"
-    return headers
+IMPORT_GATEWAY_BASE_URL = os.getenv(
+    "IMPORT_GATEWAY_BASE_URL", "http://localhost:8000"
+)


 def _env_float(name: str, default: float) -> float:
@@ -53,7 +42,7 @@ def _env_float(name: str, default: float) -> float:
     return default


-IMPORT_CHAT_TIMEOUT_SECONDS = _env_float("IMPORT_CHAT_TIMEOUT_SECONDS", 120.0)
+IMPORT_CHAT_TIMEOUT_SECONDS = _env_float("IMPORT_CHAT_TIMEOUT_SECONDS", 90.0)

 SUPPORTED_IMPORT_MODELS = get_supported_import_models()
@@ -309,7 +298,7 @@ def parse_llm_analysis_json(llm_response: LLMResponse) -> Dict[str, Any]:
     try:
         return json.loads(json_payload)
     except json.JSONDecodeError as exc:
-        preview = json_payload[:10000]
+        preview = json_payload[:2000]
         logger.error("Failed to parse JSON from LLM response content: %s", preview, exc_info=True)
         raise ProviderAPICallError("LLM response JSON could not be parsed.") from exc
@@ -324,18 +313,16 @@ async def dispatch_import_analysis_job(
     url = f"{IMPORT_GATEWAY_BASE_URL.rstrip('/')}/v1/chat/completions"
     logger.info(
-        "Dispatching import %s to %s using provider=%s model=%s",
+        "Dispatching import %s to %s: %s",
         request.import_record_id,
         url,
-        payload.get("provider"),
-        payload.get("model"),
+        json.dumps(payload, ensure_ascii=False),
     )
     timeout = httpx.Timeout(IMPORT_CHAT_TIMEOUT_SECONDS)
-    headers = build_import_gateway_headers()
     try:
-        response = await client.post(url, json=payload, timeout=timeout, headers=headers)
+        response = await client.post(url, json=payload, timeout=timeout)
         response.raise_for_status()
     except httpx.HTTPStatusError as exc:
         body_preview = ""
@@ -360,10 +347,9 @@ async def dispatch_import_analysis_job(
         response.status_code,
     )
     logger.info(
-        "LLM response received for %s (status %s, choices=%s)",
+        "LLM response for %s: %s",
         request.import_record_id,
-        response.status_code,
-        len(response_data.get("choices") or []),
+        json.dumps(response_data, ensure_ascii=False),
     )

     try:
@@ -389,6 +375,18 @@ async def dispatch_import_analysis_job(
     return result


+# Normalise usage-field extraction across multiple model providers
+def extract_usage(resp_json: dict) -> dict:
+    usage = resp_json.get("usage") or resp_json.get("usageMetadata") or {}
+    return {
+        "prompt_tokens": usage.get("prompt_tokens") or usage.get("input_tokens") or usage.get("promptTokenCount"),
+        "completion_tokens": usage.get("completion_tokens") or usage.get("output_tokens") or usage.get("candidatesTokenCount"),
+        "total_tokens": usage.get("total_tokens") or usage.get("totalTokenCount") or (
+            (usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
+            + (usage.get("completion_tokens") or usage.get("output_tokens") or 0)
+        )
+    }
+
+
 async def notify_import_analysis_callback(
     callback_url: str,
     payload: Dict[str, Any],
@@ -417,7 +415,6 @@ async def process_import_analysis_job(
     request: DataImportAnalysisJobRequest,
     client: httpx.AsyncClient,
 ) -> None:
-    # Run the import analysis and ensure the callback fires regardless of success/failure.
     try:
         payload = await dispatch_import_analysis_job(request, client)
     except ProviderAPICallError as exc:
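A quick check of the extract_usage helper added above against the two usage shapes it targets, an OpenAI-style "usage" block and a Gemini-style "usageMetadata" block; the sample payloads are made up for illustration:

# Illustrative inputs for extract_usage(); the payloads follow the two shapes it handles.
openai_style = {"usage": {"prompt_tokens": 120, "completion_tokens": 30, "total_tokens": 150}}
gemini_style = {"usageMetadata": {"promptTokenCount": 120, "candidatesTokenCount": 30, "totalTokenCount": 150}}

print(extract_usage(openai_style))   # {'prompt_tokens': 120, 'completion_tokens': 30, 'total_tokens': 150}
print(extract_usage(gemini_style))   # {'prompt_tokens': 120, 'completion_tokens': 30, 'total_tokens': 150}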


@@ -1,842 +0,0 @@
from __future__ import annotations
import hashlib
import json
import logging
from datetime import datetime
from typing import Any, Dict, Iterable, List, Optional
from uuid import uuid4
from sqlalchemy import text
from sqlalchemy.engine import Row
from app.db import get_engine
from app.schemas.chat import (
ChatSessionCreate,
ChatSessionUpdate,
ChatTurnCreate,
ChatTurnRetrievalItem,
)
from app.schemas.metrics import (
MetricCreate,
MetricResultItem,
MetricResultsWriteRequest,
MetricRunTrigger,
MetricScheduleCreate,
MetricScheduleUpdate,
MetricUpdate,
)
logger = logging.getLogger(__name__)
# Common helpers
def _json_dump(value: Any) -> Optional[str]:
"""Safe JSON dumper; returns None on failure to keep DB writes simple."""
if value is None:
return None
if isinstance(value, str):
return value
try:
return json.dumps(value, ensure_ascii=False)
except (TypeError, ValueError):
return None
def _parse_json_fields(payload: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
"""Parse select fields from JSON strings into dict/list for responses."""
for field in fields:
raw = payload.get(field)
if raw is None or isinstance(raw, (dict, list)):
continue
if isinstance(raw, (bytes, bytearray)):
raw = raw.decode("utf-8", errors="ignore")
if isinstance(raw, str):
try:
payload[field] = json.loads(raw)
except ValueError:
pass
return payload
def _row_to_dict(row: Row[Any]) -> Dict[str, Any]:
return dict(row._mapping)
# Chat sessions & turns
def create_chat_session(payload: ChatSessionCreate) -> Dict[str, Any]:
"""Create a chat session row with optional external UUID."""
engine = get_engine()
session_uuid = payload.session_uuid or str(uuid4())
now = datetime.utcnow()
params = {
"user_id": payload.user_id,
"session_uuid": session_uuid,
"end_time": payload.end_time,
"status": payload.status or "OPEN",
"ext_context": _json_dump(payload.ext_context),
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO chat_session (user_id, session_uuid, end_time, status, ext_context)
VALUES (:user_id, :session_uuid, :end_time, :status, :ext_context)
"""
),
params,
)
session_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
raise RuntimeError("Failed to create chat session.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def update_chat_session(session_id: int, payload: ChatSessionUpdate) -> Dict[str, Any]:
"""Patch selected chat session fields."""
updates = {}
if payload.status is not None:
updates["status"] = payload.status
if payload.end_time is not None:
updates["end_time"] = payload.end_time
if payload.last_turn_id is not None:
updates["last_turn_id"] = payload.last_turn_id
if payload.ext_context is not None:
updates["ext_context"] = _json_dump(payload.ext_context)
if not updates:
current = get_chat_session(session_id)
if not current:
raise KeyError(f"Session {session_id} not found.")
return current
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = session_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE chat_session SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
raise KeyError(f"Session {session_id} not found.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def close_chat_session(session_id: int) -> Dict[str, Any]:
"""Mark a chat session as CLOSED with end_time."""
now = datetime.utcnow()
return update_chat_session(
session_id,
ChatSessionUpdate(status="CLOSED", end_time=now),
)
def get_chat_session(session_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def list_chat_sessions(
*,
user_id: Optional[int] = None,
status: Optional[str] = None,
start_from: Optional[datetime] = None,
start_to: Optional[datetime] = None,
limit: int = 50,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""List chat sessions with optional filters and pagination."""
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if user_id is not None:
conditions.append("user_id=:user_id")
params["user_id"] = user_id
if status is not None:
conditions.append("status=:status")
params["status"] = status
if start_from is not None:
conditions.append("created_at>=:start_from")
params["start_from"] = start_from
if start_to is not None:
conditions.append("created_at<=:start_to")
params["start_to"] = start_to
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM chat_session {where_clause} "
"ORDER BY created_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
results.append(data)
return results
def _next_turn_no(conn, session_id: int) -> int:
row = conn.execute(
text("SELECT COALESCE(MAX(turn_no), 0) + 1 AS next_no FROM chat_turn WHERE session_id=:sid"),
{"sid": session_id},
).first()
if not row:
return 1
return int(row._mapping["next_no"])
def create_chat_turn(session_id: int, payload: ChatTurnCreate) -> Dict[str, Any]:
"""Insert a chat turn and auto-increment turn number within the session."""
engine = get_engine()
now = datetime.utcnow()
params = {
"session_id": session_id,
"user_id": payload.user_id,
"user_query": payload.user_query,
"intent": payload.intent,
"ast_json": _json_dump(payload.ast_json),
"generated_sql": payload.generated_sql,
"sql_status": payload.sql_status,
"error_msg": payload.error_msg,
"main_metric_ids": _json_dump(payload.main_metric_ids),
"created_metric_ids": _json_dump(payload.created_metric_ids),
"end_time": payload.end_time,
}
with engine.begin() as conn:
turn_no = _next_turn_no(conn, session_id)
params["turn_no"] = turn_no
result = conn.execute(
text(
"""
INSERT INTO chat_turn (
session_id, turn_no, user_id,
user_query, intent, ast_json,
generated_sql, sql_status, error_msg,
main_metric_ids, created_metric_ids,
end_time
)
VALUES (
:session_id, :turn_no, :user_id,
:user_query, :intent, :ast_json,
:generated_sql, :sql_status, :error_msg,
:main_metric_ids, :created_metric_ids,
:end_time
)
"""
),
params,
)
turn_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
).first()
if not row:
raise RuntimeError("Failed to create chat turn.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
return data
def get_chat_turn(turn_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
return data
def list_chat_turns(session_id: int) -> List[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
"SELECT * FROM chat_turn WHERE session_id=:session_id ORDER BY turn_no ASC"
),
{"session_id": session_id},
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
results.append(data)
return results
def create_retrievals(turn_id: int, retrievals: List[ChatTurnRetrievalItem]) -> int:
"""Batch insert retrieval records for a turn."""
if not retrievals:
return 0
engine = get_engine()
params_list = []
for item in retrievals:
params_list.append(
{
"turn_id": turn_id,
"item_type": item.item_type,
"item_id": item.item_id,
"item_extra": _json_dump(item.item_extra),
"similarity_score": item.similarity_score,
"rank_no": item.rank_no,
"used_in_reasoning": 1 if item.used_in_reasoning else 0,
"used_in_sql": 1 if item.used_in_sql else 0,
}
)
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO chat_turn_retrieval (
turn_id, item_type, item_id, item_extra,
similarity_score, rank_no, used_in_reasoning, used_in_sql
)
VALUES (
:turn_id, :item_type, :item_id, :item_extra,
:similarity_score, :rank_no, :used_in_reasoning, :used_in_sql
)
"""
),
params_list,
)
return len(retrievals)
def list_retrievals(turn_id: int) -> List[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
"SELECT * FROM chat_turn_retrieval WHERE turn_id=:turn_id ORDER BY created_at ASC, rank_no ASC"
),
{"turn_id": turn_id},
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["item_extra"])
data["used_in_reasoning"] = bool(data.get("used_in_reasoning"))
data["used_in_sql"] = bool(data.get("used_in_sql"))
results.append(data)
return results
# Metric registry
def _metric_sql_hash(sql_text: str) -> str:
"""Compute a stable hash to detect SQL definition changes."""
return hashlib.md5(sql_text.encode("utf-8")).hexdigest()
def create_metric(payload: MetricCreate) -> Dict[str, Any]:
"""Insert a new metric definition; version starts at 1."""
engine = get_engine()
now = datetime.utcnow()
sql_hash = _metric_sql_hash(payload.base_sql)
params = {
"metric_code": payload.metric_code,
"metric_name": payload.metric_name,
"metric_aliases": _json_dump(payload.metric_aliases),
"biz_domain": payload.biz_domain,
"biz_desc": payload.biz_desc,
"chat_turn_id": payload.chat_turn_id,
"tech_desc": payload.tech_desc,
"formula_expr": payload.formula_expr,
"base_sql": payload.base_sql,
"time_grain": payload.time_grain,
"dim_binding": _json_dump(payload.dim_binding),
"update_strategy": payload.update_strategy,
"schedule_id": payload.schedule_id,
"schedule_type": payload.schedule_type,
"version": 1,
"is_active": 1 if payload.is_active else 0,
"sql_hash": sql_hash,
"created_by": payload.created_by,
"updated_by": payload.updated_by,
"created_at": now,
"updated_at": now,
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_def (
metric_code, metric_name, metric_aliases, biz_domain, biz_desc,
chat_turn_id, tech_desc, formula_expr, base_sql,
time_grain, dim_binding, update_strategy,
schedule_id, schedule_type, version, is_active,
sql_hash, created_by, updated_by, created_at, updated_at
)
VALUES (
:metric_code, :metric_name, :metric_aliases, :biz_domain, :biz_desc,
:chat_turn_id, :tech_desc, :formula_expr, :base_sql,
:time_grain, :dim_binding, :update_strategy,
:schedule_id, :schedule_type, :version, :is_active,
:sql_hash, :created_by, :updated_by, :created_at, :updated_at
)
"""
),
params,
)
metric_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
raise RuntimeError("Failed to create metric definition.")
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def update_metric(metric_id: int, payload: MetricUpdate) -> Dict[str, Any]:
"""Update mutable fields of a metric definition and refresh sql_hash when needed."""
updates: Dict[str, Any] = {}
for field in (
"metric_name",
"biz_domain",
"biz_desc",
"tech_desc",
"formula_expr",
"base_sql",
"time_grain",
"update_strategy",
"schedule_id",
"schedule_type",
"updated_by",
):
value = getattr(payload, field)
if value is not None:
updates[field] = value
if payload.metric_aliases is not None:
updates["metric_aliases"] = _json_dump(payload.metric_aliases)
if payload.dim_binding is not None:
updates["dim_binding"] = _json_dump(payload.dim_binding)
if payload.is_active is not None:
updates["is_active"] = 1 if payload.is_active else 0
if payload.base_sql is not None:
updates["sql_hash"] = _metric_sql_hash(payload.base_sql)
if not updates:
current = get_metric(metric_id)
if not current:
raise KeyError(f"Metric {metric_id} not found.")
return current
updates["updated_at"] = datetime.utcnow()
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = metric_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE metric_def SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
raise KeyError(f"Metric {metric_id} not found.")
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def get_metric(metric_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def list_metrics(
*,
biz_domain: Optional[str] = None,
is_active: Optional[bool] = None,
keyword: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""List metric definitions with simple filters and pagination."""
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if biz_domain:
conditions.append("biz_domain=:biz_domain")
params["biz_domain"] = biz_domain
if is_active is not None:
conditions.append("is_active=:is_active")
params["is_active"] = 1 if is_active else 0
if keyword:
conditions.append("(metric_code LIKE :kw OR metric_name LIKE :kw)")
params["kw"] = f"%{keyword}%"
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_def {where_clause} "
"ORDER BY updated_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
results.append(data)
return results
# Metric schedules
def create_metric_schedule(payload: MetricScheduleCreate) -> Dict[str, Any]:
"""Create a schedule record for a metric."""
engine = get_engine()
params = {
"metric_id": payload.metric_id,
"cron_expr": payload.cron_expr,
"enabled": 1 if payload.enabled else 0,
"priority": payload.priority,
"backfill_allowed": 1 if payload.backfill_allowed else 0,
"max_runtime_sec": payload.max_runtime_sec,
"retry_times": payload.retry_times,
"owner_team": payload.owner_team,
"owner_user_id": payload.owner_user_id,
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_schedule (
metric_id, cron_expr, enabled, priority,
backfill_allowed, max_runtime_sec, retry_times,
owner_team, owner_user_id
) VALUES (
:metric_id, :cron_expr, :enabled, :priority,
:backfill_allowed, :max_runtime_sec, :retry_times,
:owner_team, :owner_user_id
)
"""
),
params,
)
schedule_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
).first()
if not row:
raise RuntimeError("Failed to create metric schedule.")
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
return data
def update_metric_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Dict[str, Any]:
updates: Dict[str, Any] = {}
for field in (
"cron_expr",
"priority",
"max_runtime_sec",
"retry_times",
"owner_team",
"owner_user_id",
):
value = getattr(payload, field)
if value is not None:
updates[field] = value
if payload.enabled is not None:
updates["enabled"] = 1 if payload.enabled else 0
if payload.backfill_allowed is not None:
updates["backfill_allowed"] = 1 if payload.backfill_allowed else 0
if not updates:
current = list_schedules_for_metric(schedule_id=schedule_id)
if current:
return current[0]
raise KeyError(f"Schedule {schedule_id} not found.")
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = schedule_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE metric_schedule SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
).first()
if not row:
raise KeyError(f"Schedule {schedule_id} not found.")
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
return data
def list_schedules_for_metric(metric_id: Optional[int] = None, schedule_id: Optional[int] = None) -> List[Dict[str, Any]]:
conditions = []
params: Dict[str, Any] = {}
if metric_id is not None:
conditions.append("metric_id=:metric_id")
params["metric_id"] = metric_id
if schedule_id is not None:
conditions.append("id=:id")
params["id"] = schedule_id
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(f"SELECT * FROM metric_schedule {where_clause} ORDER BY id DESC"),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
results.append(data)
return results
# Metric runs
def trigger_metric_run(payload: MetricRunTrigger) -> Dict[str, Any]:
"""Create a metric_job_run entry; execution is orchestrated elsewhere."""
metric = get_metric(payload.metric_id)
if not metric:
raise KeyError(f"Metric {payload.metric_id} not found.")
metric_version = payload.metric_version or metric.get("version", 1)
base_sql_snapshot = payload.base_sql_snapshot or metric.get("base_sql")
triggered_at = payload.triggered_at or datetime.utcnow()
params = {
"metric_id": payload.metric_id,
"schedule_id": payload.schedule_id,
"source_turn_id": payload.source_turn_id,
"data_time_from": payload.data_time_from,
"data_time_to": payload.data_time_to,
"metric_version": metric_version,
"base_sql_snapshot": base_sql_snapshot,
"status": "RUNNING",
"error_msg": None,
"affected_rows": None,
"runtime_ms": None,
"triggered_by": payload.triggered_by,
"triggered_at": triggered_at,
"started_at": None,
"finished_at": None,
}
engine = get_engine()
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_job_run (
metric_id, schedule_id, source_turn_id,
data_time_from, data_time_to, metric_version,
base_sql_snapshot, status, error_msg,
affected_rows, runtime_ms,
triggered_by, triggered_at, started_at, finished_at
) VALUES (
:metric_id, :schedule_id, :source_turn_id,
:data_time_from, :data_time_to, :metric_version,
:base_sql_snapshot, :status, :error_msg,
:affected_rows, :runtime_ms,
:triggered_by, :triggered_at, :started_at, :finished_at
)
"""
),
params,
)
run_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
).first()
if not row:
raise RuntimeError("Failed to create metric job run.")
return _row_to_dict(row)
def get_metric_run(run_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
).first()
if not row:
return None
return _row_to_dict(row)
def list_metric_runs(
*,
metric_id: Optional[int] = None,
status: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if metric_id is not None:
conditions.append("metric_id=:metric_id")
params["metric_id"] = metric_id
if status is not None:
conditions.append("status=:status")
params["status"] = status
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_job_run {where_clause} "
"ORDER BY triggered_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
return [_row_to_dict(row) for row in rows]
# Metric results
def write_metric_results(payload: MetricResultsWriteRequest) -> int:
"""Bulk insert metric_result rows for a metric/version."""
metric = get_metric(payload.metric_id)
if not metric:
raise KeyError(f"Metric {payload.metric_id} not found.")
default_version = metric.get("version", 1)
now = datetime.utcnow()
rows: List[Dict[str, Any]] = []
for item in payload.results:
rows.append(
{
"metric_id": payload.metric_id,
"metric_version": item.metric_version or default_version,
"stat_time": item.stat_time,
"extra_dims": _json_dump(item.extra_dims),
"metric_value": item.metric_value,
"load_time": item.load_time or now,
"data_version": item.data_version,
}
)
if not rows:
return 0
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO metric_result (
metric_id, metric_version, stat_time,
extra_dims, metric_value, load_time, data_version
) VALUES (
:metric_id, :metric_version, :stat_time,
:extra_dims, :metric_value, :load_time, :data_version
)
"""
),
rows,
)
return len(rows)
def query_metric_results(
*,
metric_id: int,
stat_from: Optional[datetime] = None,
stat_to: Optional[datetime] = None,
limit: int = 200,
offset: int = 0,
) -> List[Dict[str, Any]]:
conditions = ["metric_id=:metric_id"]
params: Dict[str, Any] = {
"metric_id": metric_id,
"limit": limit,
"offset": offset,
}
if stat_from is not None:
conditions.append("stat_time>=:stat_from")
params["stat_from"] = stat_from
if stat_to is not None:
conditions.append("stat_time<=:stat_to")
params["stat_to"] = stat_to
where_clause = "WHERE " + " AND ".join(conditions)
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_result {where_clause} "
"ORDER BY stat_time DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["extra_dims"])
results.append(data)
return results
def latest_metric_result(metric_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text(
"""
SELECT * FROM metric_result
WHERE metric_id=:metric_id
ORDER BY stat_time DESC
LIMIT 1
"""
),
{"metric_id": metric_id},
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["extra_dims"])
return data
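# Illustrative end-to-end sketch, not part of the original module: chain the registry helpers to
# define a metric, record a manual run, and read the latest materialised value back. The field
# values are hypothetical; only fields referenced by the helpers above are set, assuming the
# remaining model fields are optional.
def _example_metric_lifecycle() -> Dict[str, Any]:
    metric = create_metric(
        MetricCreate(
            metric_code="dau",
            metric_name="Daily Active Users",
            base_sql="SELECT dt, COUNT(DISTINCT user_id) AS dau FROM events GROUP BY dt",
            is_active=True,
        )
    )
    # Version starts at 1; editing base_sql later through update_metric refreshes sql_hash.
    run = trigger_metric_run(
        MetricRunTrigger(metric_id=metric["id"], triggered_by="manual")
    )
    latest = latest_metric_result(metric["id"])  # None until write_metric_results has stored rows
    return {"metric": metric, "run": run, "latest": latest}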

View File

@ -1,83 +0,0 @@
from __future__ import annotations
import logging
from typing import Any, Sequence
import httpx
from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.settings import RAG_API_AUTH_TOKEN, RAG_API_BASE_URL
logger = logging.getLogger(__name__)
class RagAPIClient:
"""Thin async client wrapper around the RAG endpoints described in doc/rag-api.md."""
def __init__(self, *, base_url: str | None = None, auth_token: str | None = None) -> None:
resolved_base = base_url or RAG_API_BASE_URL
self._base_url = resolved_base.rstrip("/")
self._auth_token = auth_token or RAG_API_AUTH_TOKEN
def _headers(self) -> dict[str, str]:
headers = {"Content-Type": "application/json"}
if self._auth_token:
headers["Authorization"] = f"Bearer {self._auth_token}"
return headers
async def _post(
self,
client: httpx.AsyncClient,
path: str,
payload: Any,
) -> Any:
url = f"{self._base_url}{path}"
try:
response = await client.post(url, json=payload, headers=self._headers())
response.raise_for_status()
except httpx.HTTPStatusError as exc:
status_code = exc.response.status_code if exc.response else None
response_text = exc.response.text if exc.response else ""
logger.error(
"RAG API responded with an error (%s) for %s: %s",
status_code,
url,
response_text,
exc_info=True,
)
raise ProviderAPICallError(
"RAG API call failed.",
status_code=status_code,
response_text=response_text,
) from exc
except httpx.HTTPError as exc:
logger.error("Transport error calling RAG API %s: %s", url, exc, exc_info=True)
raise ProviderAPICallError(f"RAG API call failed: {exc}") from exc
try:
return response.json()
except ValueError:
logger.warning("RAG API returned non-JSON response for %s; returning raw text.", url)
return {"raw": response.text}
async def add(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/add", body)
async def add_batch(self, client: httpx.AsyncClient, items: Sequence[RagItemPayload]) -> Any:
body = [item.model_dump(by_alias=True, exclude_none=True) for item in items]
return await self._post(client, "/rag/addBatch", body)
async def update(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/update", body)
async def delete(self, client: httpx.AsyncClient, payload: RagDeleteRequest) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/delete", body)
async def retrieve(self, client: httpx.AsyncClient, payload: RagRetrieveRequest) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/retrieve", body)

View File

@ -1,857 +0,0 @@
from __future__ import annotations
import asyncio
import json
import logging
import os
import re
from datetime import date, datetime
from dataclasses import asdict, dataclass, is_dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import httpx
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.data_context import AbstractDataContext
from great_expectations.exceptions import DataContextError, MetricResolutionError
from app.exceptions import ProviderAPICallError
from app.models import TableProfilingJobRequest
from app.services import LLMGateway
from app.settings import DEFAULT_IMPORT_MODEL
from app.services.import_analysis import (
IMPORT_GATEWAY_BASE_URL,
resolve_provider_from_model,
)
from app.utils.llm_usage import extract_usage as extract_llm_usage
logger = logging.getLogger(__name__)
GE_REPORT_RELATIVE_PATH = Path("uncommitted") / "data_docs" / "local_site" / "index.html"
PROMPT_FILENAMES = {
"ge_result_desc": "ge_result_desc_prompt.md",
"snippet_generator": "snippet_generator.md",
"snippet_alias": "snippet_alias_generator.md",
}
DEFAULT_CHAT_TIMEOUT_SECONDS = 180.0
@dataclass
class GEProfilingArtifacts:
profiling_result: Dict[str, Any]
profiling_summary: Dict[str, Any]
docs_path: str
@dataclass
class LLMCallResult:
data: Any
usage: Optional[Dict[str, Any]] = None
class PipelineActionType:
GE_PROFILING = "ge_profiling"
GE_RESULT_DESC = "ge_result_desc"
SNIPPET = "snippet"
SNIPPET_ALIAS = "snippet_alias"
def _project_root() -> Path:
return Path(__file__).resolve().parents[2]
def _prompt_dir() -> Path:
return _project_root() / "prompt"
@lru_cache(maxsize=None)
def _load_prompt_parts(filename: str) -> Tuple[str, str]:
prompt_path = _prompt_dir() / filename
if not prompt_path.exists():
raise FileNotFoundError(f"Prompt template not found: {prompt_path}")
raw = prompt_path.read_text(encoding="utf-8")
splitter = "用户消息User"  # section marker separating the system and user parts in the prompt markdown files
if splitter not in raw:
raise ValueError(f"Prompt template '{filename}' missing separator '{splitter}'.")
system_raw, user_raw = raw.split(splitter, maxsplit=1)
system_text = system_raw.replace("系统角色System", "").strip()
user_text = user_raw.strip()
return system_text, user_text
def _render_prompt(template_key: str, replacements: Dict[str, str]) -> Tuple[str, str]:
filename = PROMPT_FILENAMES[template_key]
system_text, user_template = _load_prompt_parts(filename)
rendered_user = user_template
for key, value in replacements.items():
rendered_user = rendered_user.replace(key, value)
return system_text, rendered_user
def _extract_timeout_seconds(options: Optional[Dict[str, Any]]) -> Optional[float]:
if not options:
return None
value = options.get("llm_timeout_seconds")
if value is None:
return None
try:
timeout = float(value)
if timeout <= 0:
raise ValueError
return timeout
except (TypeError, ValueError):
logger.warning(
"Invalid llm_timeout_seconds value in extra_options: %r. Falling back to default.",
value,
)
return DEFAULT_CHAT_TIMEOUT_SECONDS
def _extract_json_payload(content: str) -> str:
fenced = re.search(
r"```(?:json)?\s*([\s\S]+?)```",
content,
flags=re.IGNORECASE,
)
if fenced:
snippet = fenced.group(1).strip()
if snippet:
return snippet
stripped = content.strip()
if not stripped:
raise ValueError("Empty LLM content.")
decoder = json.JSONDecoder()
for idx, char in enumerate(stripped):
if char not in {"{", "["}:
continue
try:
_, end = decoder.raw_decode(stripped[idx:])
except json.JSONDecodeError:
continue
candidate = stripped[idx : idx + end].strip()
if candidate:
return candidate
return stripped
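# For example (illustrative): both '```json\n{"a": 1}\n```' and 'Answer: {"a": 1} done' reduce to
# the substring '{"a": 1}', which _parse_completion_payload below hands to json.loads.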
def _parse_completion_payload(response_payload: Dict[str, Any]) -> Any:
choices = response_payload.get("choices") or []
if not choices:
raise ProviderAPICallError("LLM response did not contain choices to parse.")
message = choices[0].get("message") or {}
content = message.get("content") or ""
if not content.strip():
raise ProviderAPICallError("LLM response content is empty.")
json_payload = _extract_json_payload(content)
try:
return json.loads(json_payload)
except json.JSONDecodeError as exc:
preview = json_payload[:800]
logger.error("Failed to parse JSON from LLM response: %s", preview, exc_info=True)
raise ProviderAPICallError("LLM response JSON parsing failed.") from exc
async def _post_callback(callback_url: str, payload: Dict[str, Any], client: httpx.AsyncClient) -> None:
safe_payload = _normalize_for_json(payload)
try:
logger.info(
"Posting pipeline action callback to %s: %s",
callback_url,
json.dumps(safe_payload, ensure_ascii=False),
)
response = await client.post(callback_url, json=safe_payload)
response.raise_for_status()
except httpx.HTTPError as exc:
logger.error("Callback delivery to %s failed: %s", callback_url, exc, exc_info=True)
def _sanitize_value_set(value: Any, max_values: int) -> Tuple[Any, Optional[Dict[str, int]]]:
if not isinstance(value, list):
return value, None
original_len = len(value)
if original_len <= max_values:
return value, None
trimmed = value[:max_values]
return trimmed, {"original_length": original_len, "retained": max_values}
def _sanitize_expectation_suite(suite: ExpectationSuite, max_value_set_values: int = 100) -> Dict[str, Any]:
suite_dict = suite.to_json_dict()
remarks: List[Dict[str, Any]] = []
for expectation in suite_dict.get("expectations", []):
kwargs = expectation.get("kwargs", {})
if "value_set" in kwargs:
sanitized_value, note = _sanitize_value_set(kwargs["value_set"], max_value_set_values)
kwargs["value_set"] = sanitized_value
if note:
expectation.setdefault("meta", {})
expectation["meta"]["value_set_truncated"] = note
remarks.append(
{
"column": kwargs.get("column"),
"expectation": expectation.get("expectation_type"),
"note": note,
}
)
if remarks:
suite_dict.setdefault("meta", {})
suite_dict["meta"]["value_set_truncations"] = remarks
return suite_dict
def _summarize_expectation_suite(suite_dict: Dict[str, Any]) -> Dict[str, Any]:
column_map: Dict[str, Dict[str, Any]] = {}
table_expectations: List[Dict[str, Any]] = []
for expectation in suite_dict.get("expectations", []):
expectation_type = expectation.get("expectation_type")
kwargs = expectation.get("kwargs", {})
column = kwargs.get("column")
summary_entry: Dict[str, Any] = {"expectation": expectation_type}
if "value_set" in kwargs and isinstance(kwargs["value_set"], list):
summary_entry["value_set_size"] = len(kwargs["value_set"])
summary_entry["value_set_preview"] = kwargs["value_set"][:5]
if column:
column_entry = column_map.setdefault(
column,
{"name": column, "expectations": []},
)
column_entry["expectations"].append(summary_entry)
else:
table_expectations.append(summary_entry)
summary = {
"column_profiles": list(column_map.values()),
"table_level_expectations": table_expectations,
"total_expectations": len(suite_dict.get("expectations", [])),
}
return summary
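# Illustrative summary shape produced above (actual values depend on the generated suite):
# {
#     "column_profiles": [
#         {"name": "user_id", "expectations": [{"expectation": "expect_column_values_to_not_be_null"}]}
#     ],
#     "table_level_expectations": [{"expectation": "expect_table_row_count_to_be_between"}],
#     "total_expectations": 2,
# }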
def _sanitize_identifier(raw: Optional[str], fallback: str) -> str:
if not raw:
return fallback
candidate = re.sub(r"[^0-9A-Za-z_]+", "_", raw).strip("_")
return candidate or fallback
def _format_connection_string(template: str, access_info: Dict[str, Any]) -> str:
if not access_info:
return template
try:
return template.format_map({k: v for k, v in access_info.items()})
except KeyError as exc:
missing = exc.args[0]
raise ValueError(f"table_access_info missing key '{missing}' required by connection_string.") from exc
def _ensure_sql_runtime_datasource(
context: AbstractDataContext,
datasource_name: str,
connection_string: str,
) -> None:
try:
datasource = context.get_datasource(datasource_name)
except (DataContextError, ValueError) as exc:
message = str(exc)
if "Could not find a datasource" in message or "Unable to load datasource" in message:
datasource = None
else: # pragma: no cover - defensive
raise RuntimeError(f"Failed to inspect datasource '{datasource_name}'.") from exc
except Exception as exc: # pragma: no cover - defensive
raise RuntimeError(f"Failed to inspect datasource '{datasource_name}'.") from exc
if datasource is not None:
execution_engine = getattr(datasource, "execution_engine", None)
current_conn = getattr(execution_engine, "connection_string", None)
if current_conn and current_conn != connection_string:
logger.info(
"Existing datasource %s uses different connection string; creating dedicated runtime datasource.",
datasource_name,
)
try:
context.delete_datasource(datasource_name)
except Exception as exc: # pragma: no cover - defensive
logger.warning(
"Failed to delete datasource %s before recreation: %s",
datasource_name,
exc,
)
else:
datasource = None
if datasource is not None:
return
runtime_datasource_config = {
"name": datasource_name,
"class_name": "Datasource",
"execution_engine": {
"class_name": "SqlAlchemyExecutionEngine",
"connection_string": connection_string,
},
"data_connectors": {
"runtime_connector": {
"class_name": "RuntimeDataConnector",
"batch_identifiers": ["default_identifier_name"],
}
},
}
try:
context.add_datasource(**runtime_datasource_config)
except Exception as exc: # pragma: no cover - defensive
raise RuntimeError(f"Failed to create runtime datasource '{datasource_name}'.") from exc
def _build_sql_runtime_batch_request(
context: AbstractDataContext,
request: TableProfilingJobRequest,
) -> RuntimeBatchRequest:
link_info = request.table_link_info or {}
access_info = request.table_access_info or {}
connection_template = link_info.get("connection_string")
if not connection_template:
raise ValueError("table_link_info.connection_string is required when using table_link_info.")
connection_string = _format_connection_string(connection_template, access_info)
source_type = (link_info.get("type") or "sql").lower()
if source_type != "sql":
raise ValueError(f"Unsupported table_link_info.type='{source_type}'. Only 'sql' is supported.")
query = link_info.get("query")
table_name = link_info.get("table") or link_info.get("table_name")
schema_name = link_info.get("schema")
if not query and not table_name:
raise ValueError("Either table_link_info.query or table_link_info.table must be provided.")
if not query:
if not table_name:
raise ValueError("table_link_info.table must be provided when query is omitted.")
identifier = re.compile(r"^[A-Za-z_][A-Za-z0-9_$]*$")
def _quote(name: str) -> str:
if identifier.match(name):
return name
return f"`{name.replace('`', '``')}`"
if schema_name:
schema_part = schema_name if "." not in schema_name else schema_name.split(".")[-1]
table_part = table_name if "." not in table_name else table_name.split(".")[-1]
qualified_table = f"{_quote(schema_part)}.{_quote(table_part)}"
else:
qualified_table = _quote(table_name)
query = f"SELECT * FROM {qualified_table}"
limit = link_info.get("limit")
if isinstance(limit, int) and limit > 0:
query = f"{query} LIMIT {limit}"
datasource_name = request.ge_datasource_name or _sanitize_identifier(
f"{request.table_id}_runtime_ds", "runtime_ds"
)
data_asset_name = request.ge_data_asset_name or _sanitize_identifier(
table_name or "runtime_query", "runtime_query"
)
_ensure_sql_runtime_datasource(context, datasource_name, connection_string)
batch_identifiers = {
"default_identifier_name": f"{request.table_id}:{request.version_ts}",
}
return RuntimeBatchRequest(
datasource_name=datasource_name,
data_connector_name="runtime_connector",
data_asset_name=data_asset_name,
runtime_parameters={"query": query},
batch_identifiers=batch_identifiers,
)
def _run_onboarding_assistant(
context: AbstractDataContext,
batch_request: Any,
suite_name: str,
) -> Tuple[ExpectationSuite, Any]:
assistant = context.assistants.onboarding
assistant_result = assistant.run(batch_request=batch_request)
suite = assistant_result.get_expectation_suite(expectation_suite_name=suite_name)
context.save_expectation_suite(suite, expectation_suite_name=suite_name)
validation_getter = getattr(assistant_result, "get_validation_result", None)
if callable(validation_getter):
validation_result = validation_getter()
else:
validation_result = getattr(assistant_result, "validation_result", None)
if validation_result is None:
# Fallback: rerun validation using the freshly generated expectation suite.
validator = context.get_validator(
batch_request=batch_request,
expectation_suite_name=suite_name,
)
validation_result = validator.validate()
return suite, validation_result
def _resolve_context(request: TableProfilingJobRequest) -> AbstractDataContext:
context_kwargs: Dict[str, Any] = {}
if request.ge_data_context_root:
context_kwargs["project_root_dir"] = request.ge_data_context_root
elif os.environ.get("GE_DATA_CONTEXT_ROOT"):
context_kwargs["project_root_dir"] = os.environ["GE_DATA_CONTEXT_ROOT"]
else:
context_kwargs["project_root_dir"] = str(_project_root())
return gx.get_context(**context_kwargs)
def _build_batch_request(
context: AbstractDataContext,
request: TableProfilingJobRequest,
) -> Any:
if request.ge_batch_request:
from great_expectations.core.batch import BatchRequest
return BatchRequest(**request.ge_batch_request)
if request.table_link_info:
return _build_sql_runtime_batch_request(context, request)
if not request.ge_datasource_name or not request.ge_data_asset_name:
raise ValueError(
"ge_batch_request or (ge_datasource_name and ge_data_asset_name) must be provided."
)
datasource = context.get_datasource(request.ge_datasource_name)
data_asset = datasource.get_asset(request.ge_data_asset_name)
return data_asset.build_batch_request()
async def _run_ge_profiling(request: TableProfilingJobRequest) -> GEProfilingArtifacts:
def _execute() -> GEProfilingArtifacts:
context = _resolve_context(request)
suite_name = (
request.ge_expectation_suite_name
or f"{request.table_id}_profiling"
)
batch_request = _build_batch_request(context, request)
try:
context.get_expectation_suite(suite_name)
except DataContextError:
context.add_expectation_suite(suite_name)
validator = context.get_validator(
batch_request=batch_request,
expectation_suite_name=suite_name,
)
profiler_type = (request.ge_profiler_type or "user_configurable").lower()
if profiler_type == "data_assistant":
suite, validation_result = _run_onboarding_assistant(
context,
batch_request,
suite_name,
)
else:
try:
from great_expectations.profile.user_configurable_profiler import (
UserConfigurableProfiler,
)
except ImportError as err: # pragma: no cover - dependency guard
raise RuntimeError(
"UserConfigurableProfiler is unavailable; install great_expectations profiling extra or switch profiler."
) from err
profiler = UserConfigurableProfiler(profile_dataset=validator)
try:
suite = profiler.build_suite()
context.save_expectation_suite(suite, expectation_suite_name=suite_name)
validator.expectation_suite = suite
validation_result = validator.validate()
except MetricResolutionError as exc:
logger.warning(
"UserConfigurableProfiler failed (%s); falling back to data assistant profiling.",
exc,
)
suite, validation_result = _run_onboarding_assistant(
context,
batch_request,
suite_name,
)
sanitized_suite = _sanitize_expectation_suite(suite)
summary = _summarize_expectation_suite(sanitized_suite)
validation_dict = validation_result.to_json_dict()
context.build_data_docs()
docs_path = Path(context.root_directory) / GE_REPORT_RELATIVE_PATH
profiling_result = {
"expectation_suite": sanitized_suite,
"validation_result": validation_dict,
"batch_request": getattr(batch_request, "to_json_dict", lambda: None)() or getattr(batch_request, "dict", lambda: None)(),
}
return GEProfilingArtifacts(
profiling_result=profiling_result,
profiling_summary=summary,
docs_path=str(docs_path),
)
return await asyncio.to_thread(_execute)
async def _call_chat_completions(
*,
model_spec: str,
system_prompt: str,
user_prompt: str,
client: httpx.AsyncClient,
temperature: float = 0.2,
timeout_seconds: Optional[float] = None,
) -> Any:
provider, model_name = resolve_provider_from_model(model_spec)
payload = {
"provider": provider.value,
"model": model_name,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"temperature": temperature,
}
payload_size_bytes = len(json.dumps(payload, ensure_ascii=False).encode("utf-8"))
url = f"{IMPORT_GATEWAY_BASE_URL.rstrip('/')}/v1/chat/completions"
try:
# Log the full request payload for traceability.
logger.info(
"Calling chat completions API %s with model %s and size %s and payload %s",
url,
model_name,
payload_size_bytes,
payload,
)
response = await client.post(url, json=payload, timeout=timeout_seconds)
response.raise_for_status()
except httpx.HTTPError as exc:
error_name = exc.__class__.__name__
detail = str(exc).strip()
if detail:
message = f"Chat completions request failed ({error_name}): {detail}"
else:
message = f"Chat completions request failed ({error_name})."
raise ProviderAPICallError(message) from exc
try:
response_payload = response.json()
except ValueError as exc:
raise ProviderAPICallError("Chat completions response was not valid JSON.") from exc
parsed_payload = _parse_completion_payload(response_payload)
usage_info = extract_llm_usage(response_payload)
return LLMCallResult(data=parsed_payload, usage=usage_info)
def _normalize_for_json(value: Any) -> Any:
if value is None or isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, (datetime, date)):
return str(value)
if hasattr(value, "model_dump"):
try:
return value.model_dump()
except Exception: # pragma: no cover - defensive
pass
if is_dataclass(value):
return asdict(value)
if isinstance(value, dict):
return {k: _normalize_for_json(v) for k, v in value.items()}
if isinstance(value, (list, tuple, set)):
return [_normalize_for_json(v) for v in value]
if hasattr(value, "to_json_dict"):
try:
return value.to_json_dict()
except Exception: # pragma: no cover - defensive
pass
if hasattr(value, "__dict__"):
return _normalize_for_json(value.__dict__)
return repr(value)
def _json_dumps(data: Any) -> str:
normalised = _normalize_for_json(data)
return json.dumps(normalised, ensure_ascii=False, indent=2)
def _preview_for_log(data: Any) -> str:
try:
serialised = _json_dumps(data)
except Exception:
serialised = repr(data)
return serialised
def _profiling_request_for_log(request: TableProfilingJobRequest) -> Dict[str, Any]:
payload = request.model_dump()
access_info = payload.get("table_access_info")
if isinstance(access_info, dict):
payload["table_access_info"] = {key: "***" for key in access_info.keys()}
return payload
async def _execute_result_desc(
profiling_json: Dict[str, Any],
_request: TableProfilingJobRequest,
llm_model: str,
client: httpx.AsyncClient,
timeout_seconds: Optional[float],
) -> LLMCallResult:
system_prompt, user_prompt = _render_prompt(
"ge_result_desc",
{"{{GE_RESULT_JSON}}": _json_dumps(profiling_json)},
)
llm_output = await _call_chat_completions(
model_spec=llm_model,
system_prompt=system_prompt,
user_prompt=user_prompt,
client=client,
timeout_seconds=timeout_seconds,
)
if not isinstance(llm_output.data, dict):
raise ProviderAPICallError("GE result description payload must be a JSON object.")
return llm_output
async def _execute_snippet_generation(
table_desc_json: Dict[str, Any],
_request: TableProfilingJobRequest,
llm_model: str,
client: httpx.AsyncClient,
timeout_seconds: Optional[float],
) -> LLMCallResult:
system_prompt, user_prompt = _render_prompt(
"snippet_generator",
{"{{TABLE_PROFILE_JSON}}": _json_dumps(table_desc_json)},
)
llm_output = await _call_chat_completions(
model_spec=llm_model,
system_prompt=system_prompt,
user_prompt=user_prompt,
client=client,
timeout_seconds=timeout_seconds,
)
if not isinstance(llm_output.data, list):
raise ProviderAPICallError("Snippet generator must return a JSON array.")
return llm_output
async def _execute_snippet_alias(
snippets_json: List[Dict[str, Any]],
_request: TableProfilingJobRequest,
llm_model: str,
client: httpx.AsyncClient,
timeout_seconds: Optional[float],
) -> LLMCallResult:
system_prompt, user_prompt = _render_prompt(
"snippet_alias",
{"{{SNIPPET_ARRAY}}": _json_dumps(snippets_json)},
)
llm_output = await _call_chat_completions(
model_spec=llm_model,
system_prompt=system_prompt,
user_prompt=user_prompt,
client=client,
timeout_seconds=timeout_seconds,
)
if not isinstance(llm_output.data, list):
raise ProviderAPICallError("Snippet alias generator must return a JSON array.")
return llm_output
async def _run_action_with_callback(
*,
action_type: str,
runner,
callback_base: Dict[str, Any],
client: httpx.AsyncClient,
callback_url: str,
input_payload: Any = None,
model_spec: Optional[str] = None,
) -> Any:
if input_payload is not None:
logger.info(
"Pipeline action %s input: %s",
action_type,
_preview_for_log(input_payload),
)
try:
result = await runner()
except Exception as exc:
failure_payload = dict(callback_base)
failure_payload.update(
{
"status": "failed",
"action_type": action_type,
"error": str(exc),
}
)
if model_spec is not None:
failure_payload["model"] = model_spec
await _post_callback(callback_url, failure_payload, client)
raise
usage_info: Optional[Dict[str, Any]] = None
result_payload = result
if isinstance(result, LLMCallResult):
usage_info = result.usage
result_payload = result.data
success_payload = dict(callback_base)
success_payload.update(
{
"status": "success",
"action_type": action_type,
}
)
if model_spec is not None:
success_payload["model"] = model_spec
logger.info(
"Pipeline action %s output: %s",
action_type,
_preview_for_log(result_payload),
)
if action_type == PipelineActionType.GE_PROFILING:
artifacts: GEProfilingArtifacts = result_payload
success_payload["ge_profiling_json"] = artifacts.profiling_result
success_payload["ge_profiling_summary"] = artifacts.profiling_summary
success_payload["ge_report_path"] = artifacts.docs_path
elif action_type == PipelineActionType.GE_RESULT_DESC:
success_payload["ge_result_desc_json"] = result_payload
elif action_type == PipelineActionType.SNIPPET:
success_payload["snippet_json"] = result_payload
elif action_type == PipelineActionType.SNIPPET_ALIAS:
success_payload["snippet_alias_json"] = result_payload
if usage_info:
success_payload["llm_usage"] = usage_info
await _post_callback(callback_url, success_payload, client)
return result_payload
async def process_table_profiling_job(
request: TableProfilingJobRequest,
_gateway: LLMGateway,
client: httpx.AsyncClient,
) -> None:
"""Sequentially execute the four-step profiling pipeline and emit callbacks per action."""
timeout_seconds = _extract_timeout_seconds(request.extra_options)
if timeout_seconds is None:
timeout_seconds = DEFAULT_CHAT_TIMEOUT_SECONDS
base_payload = {
"table_id": request.table_id,
"version_ts": request.version_ts,
"callback_url": str(request.callback_url),
"table_schema": request.table_schema,
"table_schema_version_id": request.table_schema_version_id,
"llm_model": request.llm_model,
"llm_timeout_seconds": timeout_seconds,
"workspace_id": request.workspace_id,
"rag_item_type": request.rag_item_type,
}
logging_request_payload = _profiling_request_for_log(request)
try:
artifacts: GEProfilingArtifacts = await _run_action_with_callback(
action_type=PipelineActionType.GE_PROFILING,
runner=lambda: _run_ge_profiling(request),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=logging_request_payload,
model_spec=request.llm_model,
)
table_desc_json: Dict[str, Any] = await _run_action_with_callback(
action_type=PipelineActionType.GE_RESULT_DESC,
runner=lambda: _execute_result_desc(
artifacts.profiling_result,
request,
request.llm_model,
client,
timeout_seconds,
),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=artifacts.profiling_result,
model_spec=request.llm_model,
)
snippet_json: List[Dict[str, Any]] = await _run_action_with_callback(
action_type=PipelineActionType.SNIPPET,
runner=lambda: _execute_snippet_generation(
table_desc_json,
request,
request.llm_model,
client,
timeout_seconds,
),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=table_desc_json,
model_spec=request.llm_model,
)
await _run_action_with_callback(
action_type=PipelineActionType.SNIPPET_ALIAS,
runner=lambda: _execute_snippet_alias(
snippet_json,
request,
request.llm_model,
client,
timeout_seconds,
),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=snippet_json,
model_spec=request.llm_model,
)
except Exception: # pragma: no cover - defensive catch
logger.exception(
"Table profiling pipeline failed for table_id=%s version_ts=%s",
request.table_id,
request.version_ts,
)
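# Illustrative wiring sketch, not part of the original module: the pipeline is designed to run as a
# fire-and-forget background task, with per-action progress delivered to request.callback_url, so a
# caller only needs to hand it a request, a gateway, and a shared httpx client.
async def _example_schedule_profiling(
    request: TableProfilingJobRequest, gateway: LLMGateway
) -> None:
    async with httpx.AsyncClient(timeout=DEFAULT_CHAT_TIMEOUT_SECONDS) as client:
        await process_table_profiling_job(request, gateway, client)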

View File

@ -1,640 +0,0 @@
from __future__ import annotations
import hashlib
import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional, Sequence, Tuple
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
from app.db import get_engine
from app.models import ActionType, TableSnippetUpsertRequest, TableSnippetUpsertResponse
from app.schemas.rag import RagItemPayload
from app.services.rag_client import RagAPIClient
logger = logging.getLogger(__name__)
def _serialize_json(value: Any) -> Tuple[str | None, int | None]:
logger.debug("Serializing JSON payload: %s", value)
if value is None:
return None, None
if isinstance(value, str):
encoded = value.encode("utf-8")
return value, len(encoded)
serialized = json.dumps(value, ensure_ascii=False)
encoded = serialized.encode("utf-8")
return serialized, len(encoded)
def _prepare_table_schema(value: Any) -> str:
logger.debug("Preparing table_schema payload.")
if isinstance(value, str):
return value
return json.dumps(value, ensure_ascii=False)
def _prepare_model_params(params: Dict[str, Any] | None) -> str | None:
if not params:
return None
serialized, _ = _serialize_json(params)
return serialized
def _collect_common_columns(request: TableSnippetUpsertRequest) -> Dict[str, Any]:
# Build the base column set shared by all action types; action-specific fields are populated later.
logger.debug(
"Collecting common columns for table_id=%s version_ts=%s action_type=%s",
request.table_id,
request.version_ts,
request.action_type,
)
payload: Dict[str, Any] = {
"table_id": request.table_id,
"version_ts": request.version_ts,
"action_type": request.action_type.value,
"status": request.status.value,
"callback_url": str(request.callback_url),
"table_schema_version_id": request.table_schema_version_id,
"table_schema": _prepare_table_schema(request.table_schema),
"model": request.model,
"model_provider": request.model_provider,
}
payload.update(
{
"ge_profiling_json": None,
"ge_profiling_json_size_bytes": None,
"ge_profiling_summary": None,
"ge_profiling_summary_size_bytes": None,
"ge_profiling_total_size_bytes": None,
"ge_profiling_html_report_url": None,
"ge_result_desc_json": None,
"ge_result_desc_json_size_bytes": None,
"snippet_json": None,
"snippet_json_size_bytes": None,
"snippet_alias_json": None,
"snippet_alias_json_size_bytes": None,
}
)
payload["model_params"] = _prepare_model_params(request.model_params)
if request.llm_usage is not None:
llm_usage_json, _ = _serialize_json(request.llm_usage)
if llm_usage_json is not None:
payload["llm_usage"] = llm_usage_json
if request.error_code is not None:
logger.debug("Adding error_code: %s", request.error_code)
payload["error_code"] = request.error_code
if request.error_message is not None:
logger.debug("Adding error_message: %s", request.error_message)
payload["error_message"] = request.error_message
if request.started_at is not None:
payload["started_at"] = request.started_at
if request.finished_at is not None:
payload["finished_at"] = request.finished_at
if request.duration_ms is not None:
payload["duration_ms"] = request.duration_ms
if request.result_checksum is not None:
payload["result_checksum"] = request.result_checksum
logger.debug("Collected common payload: %s", payload)
return payload
def _apply_action_payload(
request: TableSnippetUpsertRequest,
payload: Dict[str, Any],
) -> None:
logger.debug("Applying action-specific payload for action_type=%s", request.action_type)
if request.action_type == ActionType.GE_PROFILING:
full_json, full_size = _serialize_json(request.ge_profiling_json)
summary_json, summary_size = _serialize_json(request.ge_profiling_summary)
if full_json is not None:
payload["ge_profiling_json"] = full_json
payload["ge_profiling_json_size_bytes"] = full_size
if summary_json is not None:
payload["ge_profiling_summary"] = summary_json
payload["ge_profiling_summary_size_bytes"] = summary_size
if request.ge_profiling_total_size_bytes is not None:
payload["ge_profiling_total_size_bytes"] = request.ge_profiling_total_size_bytes
elif full_size is not None or summary_size is not None:
payload["ge_profiling_total_size_bytes"] = (full_size or 0) + (summary_size or 0)
if request.ge_profiling_html_report_url:
payload["ge_profiling_html_report_url"] = request.ge_profiling_html_report_url
elif request.action_type == ActionType.GE_RESULT_DESC:
full_json, full_size = _serialize_json(request.ge_result_desc_json)
if full_json is not None:
payload["ge_result_desc_json"] = full_json
payload["ge_result_desc_json_size_bytes"] = full_size
elif request.action_type == ActionType.SNIPPET:
full_json, full_size = _serialize_json(request.snippet_json)
if full_json is not None:
payload["snippet_json"] = full_json
payload["snippet_json_size_bytes"] = full_size
elif request.action_type == ActionType.SNIPPET_ALIAS:
full_json, full_size = _serialize_json(request.snippet_alias_json)
if full_json is not None:
payload["snippet_alias_json"] = full_json
payload["snippet_alias_json_size_bytes"] = full_size
else:
logger.error("Unsupported action type encountered: %s", request.action_type)
raise ValueError(f"Unsupported action type '{request.action_type}'.")
logger.debug("Payload after applying action-specific data: %s", payload)
def _build_insert_statement(columns: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
logger.debug("Building insert statement for columns: %s", list(columns.keys()))
column_names = list(columns.keys())
placeholders = [f":{name}" for name in column_names]
update_assignments = [
f"{name}=VALUES({name})"
for name in column_names
if name not in {"table_id", "version_ts", "action_type"}
]
update_assignments.append("updated_at=CURRENT_TIMESTAMP")
sql = (
"INSERT INTO action_results ({cols}) VALUES ({vals}) "
"ON DUPLICATE KEY UPDATE {updates}"
).format(
cols=", ".join(column_names),
vals=", ".join(placeholders),
updates=", ".join(update_assignments),
)
logger.debug("Generated SQL: %s", sql)
return sql, columns
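# Illustrative output shape: given columns table_id, version_ts, action_type, status, the generated
# statement is roughly
#   INSERT INTO action_results (table_id, version_ts, action_type, status)
#   VALUES (:table_id, :version_ts, :action_type, :status)
#   ON DUPLICATE KEY UPDATE status=VALUES(status), updated_at=CURRENT_TIMESTAMP
# i.e. the natural key (table_id, version_ts, action_type) is never overwritten on conflict.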
def _execute_upsert(engine: Engine, sql: str, params: Dict[str, Any]) -> int:
logger.info("Executing upsert for table_id=%s version_ts=%s action_type=%s", params.get("table_id"), params.get("version_ts"), params.get("action_type"))
with engine.begin() as conn:
result = conn.execute(text(sql), params)
logger.info("Rows affected: %s", result.rowcount)
return result.rowcount
def upsert_action_result(request: TableSnippetUpsertRequest) -> TableSnippetUpsertResponse:
logger.info(
"Received upsert request: table_id=%s version_ts=%s action_type=%s status=%s",
request.table_id,
request.version_ts,
request.action_type,
request.status,
)
logger.debug("Request payload: %s", request.model_dump())
columns = _collect_common_columns(request)
_apply_action_payload(request, columns)
sql, params = _build_insert_statement(columns)
logger.debug("Final SQL params: %s", params)
engine = get_engine()
try:
rowcount = _execute_upsert(engine, sql, params)
except SQLAlchemyError as exc:
logger.exception(
"Failed to upsert action result: table_id=%s version_ts=%s action_type=%s",
request.table_id,
request.version_ts,
request.action_type,
)
raise RuntimeError(f"Database operation failed: {exc}") from exc
# MySQL reports rowcount 1 for a fresh insert and 2 when ON DUPLICATE KEY UPDATE modifies an
# existing row, so rowcount > 1 flags that an existing record was updated rather than created.
updated = rowcount > 1
return TableSnippetUpsertResponse(
table_id=request.table_id,
version_ts=request.version_ts,
action_type=request.action_type,
status=request.status,
updated=updated,
)
def _decode_json_field(value: Any) -> Any:
"""Decode JSON columns that may be returned as str/bytes/dicts/lists."""
if value is None:
return None
if isinstance(value, (dict, list)):
return value
if isinstance(value, (bytes, bytearray)):
try:
value = value.decode("utf-8")
except Exception: # pragma: no cover - defensive
return None
if isinstance(value, str):
try:
return json.loads(value)
except json.JSONDecodeError:
logger.warning("Failed to decode JSON field: %s", value)
return None
return None
def _coerce_json_array(value: Any) -> List[Any]:
decoded = _decode_json_field(value)
return decoded if isinstance(decoded, list) else []
def _fetch_action_payload(
engine: Engine, table_id: int, version_ts: int, action_type: ActionType
) -> Optional[Dict[str, Any]]:
sql = text(
"""
SELECT id AS action_result_id, snippet_json, snippet_alias_json, updated_at, status
FROM action_results
WHERE table_id = :table_id
AND version_ts = :version_ts
AND action_type = :action_type
AND status IN ('success', 'partial')
ORDER BY CASE status WHEN 'success' THEN 0 ELSE 1 END, updated_at DESC
LIMIT 1
"""
)
with engine.connect() as conn:
row = conn.execute(
sql,
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": action_type.value,
},
).mappings().first()
return dict(row) if row else None
def _load_snippet_sources(
engine: Engine, table_id: int, version_ts: int
) -> Tuple[List[Any], List[Any], Optional[datetime], Optional[int], Optional[int]]:
alias_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET_ALIAS)
snippet_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET)
snippet_json = _coerce_json_array(alias_row.get("snippet_json") if alias_row else None)
alias_json = _coerce_json_array(alias_row.get("snippet_alias_json") if alias_row else None)
updated_at: Optional[datetime] = alias_row.get("updated_at") if alias_row else None
alias_action_id: Optional[int] = alias_row.get("action_result_id") if alias_row else None
snippet_action_id: Optional[int] = snippet_row.get("action_result_id") if snippet_row else None
if not snippet_json and snippet_row:
snippet_json = _coerce_json_array(snippet_row.get("snippet_json"))
if updated_at is None:
updated_at = snippet_row.get("updated_at")
if alias_action_id is None:
alias_action_id = snippet_action_id
if not updated_at and alias_row:
updated_at = alias_row.get("updated_at")
return snippet_json, alias_json, updated_at, alias_action_id, snippet_action_id
def _normalize_aliases(raw_aliases: Any) -> List[Dict[str, Any]]:
aliases: List[Dict[str, Any]] = []
seen: set[str] = set()
if not raw_aliases:
return aliases
if not isinstance(raw_aliases, list):
return aliases
for item in raw_aliases:
if isinstance(item, dict):
text_val = item.get("text")
if not text_val or text_val in seen:
continue
seen.add(text_val)
aliases.append({"text": text_val, "tone": item.get("tone")})
elif isinstance(item, str):
if item in seen:
continue
seen.add(item)
aliases.append({"text": item})
return aliases
def _normalize_str_list(values: Any) -> List[str]:
if not values:
return []
if not isinstance(values, list):
return []
seen: set[str] = set()
normalised: List[str] = []
for val in values:
if not isinstance(val, str):
continue
if val in seen:
continue
seen.add(val)
normalised.append(val)
return normalised
def _merge_alias_lists(primary: List[Dict[str, Any]], secondary: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
merged: List[Dict[str, Any]] = []
seen: set[str] = set()
for source in (primary, secondary):
for item in source:
if not isinstance(item, dict):
continue
text_val = item.get("text")
if not text_val or text_val in seen:
continue
seen.add(text_val)
merged.append({"text": text_val, "tone": item.get("tone")})
return merged
def _merge_str_lists(primary: List[str], secondary: List[str]) -> List[str]:
merged: List[str] = []
seen: set[str] = set()
for source in (primary, secondary):
for item in source:
if item in seen:
continue
seen.add(item)
merged.append(item)
return merged
def _build_alias_map(alias_payload: List[Any]) -> Dict[str, Dict[str, Any]]:
alias_map: Dict[str, Dict[str, Any]] = {}
for item in alias_payload:
if not isinstance(item, dict):
continue
alias_id = item.get("id")
if not alias_id:
continue
existing = alias_map.setdefault(
alias_id,
{"aliases": [], "keywords": [], "intent_tags": []},
)
existing["aliases"] = _merge_alias_lists(
existing["aliases"], _normalize_aliases(item.get("aliases"))
)
existing["keywords"] = _merge_str_lists(
existing["keywords"], _normalize_str_list(item.get("keywords"))
)
existing["intent_tags"] = _merge_str_lists(
existing["intent_tags"], _normalize_str_list(item.get("intent_tags"))
)
return alias_map
def merge_snippet_records_from_db(
table_id: int,
version_ts: int,
*,
engine: Optional[Engine] = None,
) -> List[Dict[str, Any]]:
"""
Load snippet + snippet_alias JSON from action_results after snippet_alias is stored,
then merge into a unified snippet object list ready for downstream RAG.
"""
engine = engine or get_engine()
snippets, aliases, updated_at, alias_action_id, snippet_action_id = _load_snippet_sources(
engine, table_id, version_ts
)
alias_map = _build_alias_map(aliases)
merged: List[Dict[str, Any]] = []
seen_ids: set[str] = set()
for snippet in snippets:
if not isinstance(snippet, dict):
continue
snippet_id = snippet.get("id")
if not snippet_id:
continue
alias_info = alias_map.get(snippet_id)
record = dict(snippet)
record_aliases = _normalize_aliases(record.get("aliases"))
record_keywords = _normalize_str_list(record.get("keywords"))
record_intents = _normalize_str_list(record.get("intent_tags"))
if alias_info:
record_aliases = _merge_alias_lists(record_aliases, alias_info["aliases"])
record_keywords = _merge_str_lists(record_keywords, alias_info["keywords"])
record_intents = _merge_str_lists(record_intents, alias_info["intent_tags"])
record["aliases"] = record_aliases
record["keywords"] = record_keywords
record["intent_tags"] = record_intents
record["table_id"] = table_id
record["version_ts"] = version_ts
record["updated_at_from_action"] = updated_at
record["source"] = "snippet"
record["action_result_id"] = alias_action_id or snippet_action_id
merged.append(record)
seen_ids.add(snippet_id)
for alias_id, alias_info in alias_map.items():
if alias_id in seen_ids:
continue
if alias_action_id is None and snippet_action_id is None:
continue
merged.append(
{
"id": alias_id,
"aliases": alias_info["aliases"],
"keywords": alias_info["keywords"],
"intent_tags": alias_info["intent_tags"],
"table_id": table_id,
"version_ts": version_ts,
"updated_at_from_action": updated_at,
"source": "alias_only",
"action_result_id": alias_action_id or snippet_action_id,
}
)
return merged
def _stable_rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
digest = hashlib.md5(f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")).hexdigest()
return int(digest[:16], 16) % 9_000_000_000_000_000_000
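# The id above is deterministic for a given (table_id, version_ts, snippet_id), so re-running the
# ingestion overwrites the same rag_snippet row (via the delete-then-insert in
# _upsert_rag_snippet_rows below) instead of accumulating duplicates.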
def _to_serializable(value: Any) -> Any:
if value is None or isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, datetime):
return value.isoformat()
if isinstance(value, dict):
return {k: _to_serializable(v) for k, v in value.items()}
if isinstance(value, list):
return [_to_serializable(v) for v in value]
return str(value)
def _build_rag_text(snippet: Dict[str, Any]) -> str:
# Deterministic text concatenation for embedding input.
parts: List[str] = []
def _add(label: str, value: Any) -> None:
if value is None:
return
if isinstance(value, list):
value = ", ".join([str(v) for v in value if v])
elif isinstance(value, dict):
value = json.dumps(value, ensure_ascii=False)
if value:
parts.append(f"{label}: {value}")
_add("Title", snippet.get("title") or snippet.get("id"))
_add("Description", snippet.get("desc"))
_add("Business", snippet.get("business_caliber"))
_add("Type", snippet.get("type"))
_add("Examples", snippet.get("examples") or [])
_add("Aliases", [a.get("text") for a in snippet.get("aliases") or [] if isinstance(a, dict)])
_add("Keywords", snippet.get("keywords") or [])
_add("IntentTags", snippet.get("intent_tags") or [])
_add("Applicability", snippet.get("applicability"))
_add("DialectSQL", snippet.get("dialect_sql"))
return "\n".join(parts)
def _prepare_rag_payloads(
snippets: List[Dict[str, Any]],
table_id: int,
version_ts: int,
workspace_id: int,
rag_item_type: str = "SNIPPET",
) -> Tuple[List[Dict[str, Any]], List[RagItemPayload]]:
rows: List[Dict[str, Any]] = []
payloads: List[RagItemPayload] = []
now = datetime.utcnow()
for snippet in snippets:
snippet_id = snippet.get("id")
if not snippet_id:
continue
action_result_id = snippet.get("action_result_id")
if action_result_id is None:
logger.warning(
"Skipping snippet without action_result_id for RAG ingestion (table_id=%s version_ts=%s snippet_id=%s)",
table_id,
version_ts,
snippet_id,
)
continue
rag_item_id = _stable_rag_item_id(table_id, version_ts, snippet_id)
rag_text = _build_rag_text(snippet)
serializable_snippet = _to_serializable(snippet)
merged_json = json.dumps(serializable_snippet, ensure_ascii=False)
updated_at_raw = snippet.get("updated_at_from_action") or now
if isinstance(updated_at_raw, str):
try:
updated_at = datetime.fromisoformat(updated_at_raw)
except ValueError:
updated_at = now
else:
updated_at = updated_at_raw if isinstance(updated_at_raw, datetime) else now
created_at = updated_at
row = {
"rag_item_id": rag_item_id,
"workspace_id": workspace_id,
"table_id": table_id,
"version_ts": version_ts,
"created_at": created_at,
"action_result_id": action_result_id,
"snippet_id": snippet_id,
"rag_text": rag_text,
"merged_json": merged_json,
"updated_at": updated_at,
}
rows.append(row)
payloads.append(
RagItemPayload(
id=rag_item_id,
workspaceId=workspace_id,
name=snippet.get("title") or snippet_id,
embeddingData=rag_text,
type=rag_item_type or "SNIPPET",
)
)
return rows, payloads
def _upsert_rag_snippet_rows(engine: Engine, rows: Sequence[Dict[str, Any]]) -> None:
if not rows:
return
delete_sql = text("DELETE FROM rag_snippet WHERE rag_item_id=:rag_item_id")
insert_sql = text(
"""
INSERT INTO rag_snippet (
rag_item_id,
workspace_id,
table_id,
version_ts,
created_at,
action_result_id,
snippet_id,
rag_text,
merged_json,
updated_at
) VALUES (
:rag_item_id,
:workspace_id,
:table_id,
:version_ts,
:created_at,
:action_result_id,
:snippet_id,
:rag_text,
:merged_json,
:updated_at
)
"""
)
with engine.begin() as conn:
for row in rows:
conn.execute(delete_sql, row)
conn.execute(insert_sql, row)
async def ingest_snippet_rag_from_db(
table_id: int,
version_ts: int,
*,
workspace_id: int,
rag_item_type: str = "SNIPPET",
client,
engine: Optional[Engine] = None,
rag_client: Optional[RagAPIClient] = None,
) -> List[int]:
"""
Merge snippet + alias JSON from action_results, persist to rag_snippet, then push to RAG via addBatch.
Returns list of rag_item_id ingested.
"""
engine = engine or get_engine()
snippets = merge_snippet_records_from_db(table_id, version_ts, engine=engine)
if not snippets:
logger.info(
"No snippets available for RAG ingestion (table_id=%s version_ts=%s)",
table_id,
version_ts,
)
return []
rows, payloads = _prepare_rag_payloads(
snippets,
table_id=table_id,
version_ts=version_ts,
workspace_id=workspace_id,
rag_item_type=rag_item_type,
)
_upsert_rag_snippet_rows(engine, rows)
rag_client = rag_client or RagAPIClient()
await rag_client.add_batch(client, payloads)
return [row["rag_item_id"] for row in rows]
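
A hedged usage sketch for `ingest_snippet_rag_from_db`. The import path, the IDs, and the choice of `httpx.AsyncClient` as the shared HTTP client are assumptions for illustration; `engine` and `rag_client` fall back to the module defaults shown above.

```python
import asyncio

import httpx

# Hypothetical import path; adjust to wherever this module actually lives.
from app.services.rag_ingest import ingest_snippet_rag_from_db


async def main() -> None:
    # Shared async HTTP client handed through to RagAPIClient.add_batch.
    async with httpx.AsyncClient(timeout=30.0) as client:
        rag_item_ids = await ingest_snippet_rag_from_db(
            table_id=42,             # example table id (assumption)
            version_ts=1761752207,   # example version timestamp (assumption)
            workspace_id=1,          # target RAG workspace (assumption)
            client=client,
        )
        print(f"Ingested {len(rag_item_ids)} rag_snippet rows")


if __name__ == "__main__":
    asyncio.run(main())
```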

View File

@ -20,11 +20,7 @@ PROVIDER_KEY_ENV_MAP: Dict[str, str] = {
}
DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "deepseek:deepseek-chat")
DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "openai:gpt-4.1-mini")
NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL")
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN")
RAG_API_BASE_URL = os.getenv("RAG_API_BASE_URL", "https://tchatbi.agentcarrier.cn/chatbi/api")
RAG_API_AUTH_TOKEN = os.getenv("RAG_API_AUTH_TOKEN")
@lru_cache(maxsize=1)

View File

@ -1,116 +0,0 @@
from __future__ import annotations
from typing import Any, Dict, Iterable, Optional
PROMPT_TOKEN_KEYS: tuple[str, ...] = ("prompt_tokens", "input_tokens", "promptTokenCount")
COMPLETION_TOKEN_KEYS: tuple[str, ...] = (
"completion_tokens",
"output_tokens",
"candidatesTokenCount",
)
TOTAL_TOKEN_KEYS: tuple[str, ...] = ("total_tokens", "totalTokenCount")
USAGE_CONTAINER_KEYS: tuple[str, ...] = ("usage", "usageMetadata", "usage_metadata")
def _normalize_usage_value(value: Any) -> Any:
if isinstance(value, (int, float)):
return int(value)
if isinstance(value, str):
stripped = value.strip()
if not stripped:
return None
try:
numeric = float(stripped)
except ValueError:
return None
return int(numeric)
if isinstance(value, dict):
normalized: Dict[str, Any] = {}
for key, nested_value in value.items():
normalized_value = _normalize_usage_value(nested_value)
if normalized_value is not None:
normalized[key] = normalized_value
return normalized or None
if isinstance(value, (list, tuple, set)):
normalized_list = [
item for item in (_normalize_usage_value(element) for element in value) if item is not None
]
return normalized_list or None
return None
def _first_numeric(payload: Dict[str, Any], keys: Iterable[str]) -> Optional[int]:
for key in keys:
value = payload.get(key)
if isinstance(value, (int, float)):
return int(value)
return None
def _canonicalize_counts(payload: Dict[str, Any]) -> None:
prompt = _first_numeric(payload, PROMPT_TOKEN_KEYS)
completion = _first_numeric(payload, COMPLETION_TOKEN_KEYS)
total = _first_numeric(payload, TOTAL_TOKEN_KEYS)
if prompt is not None:
payload["prompt_tokens"] = prompt
else:
payload.pop("prompt_tokens", None)
if completion is not None:
payload["completion_tokens"] = completion
else:
payload.pop("completion_tokens", None)
if total is not None:
payload["total_tokens"] = total
elif prompt is not None and completion is not None:
payload["total_tokens"] = prompt + completion
else:
payload.pop("total_tokens", None)
for alias in PROMPT_TOKEN_KEYS[1:]:
payload.pop(alias, None)
for alias in COMPLETION_TOKEN_KEYS[1:]:
payload.pop(alias, None)
for alias in TOTAL_TOKEN_KEYS[1:]:
payload.pop(alias, None)
def _extract_usage_container(candidate: Any) -> Optional[Dict[str, Any]]:
if not isinstance(candidate, dict):
return None
for key in USAGE_CONTAINER_KEYS:
value = candidate.get(key)
if isinstance(value, dict):
return value
return None
def extract_usage(payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Unified helper to parse token usage metadata from diverse provider responses."""
if not isinstance(payload, dict):
return None
usage_candidate = _extract_usage_container(payload)
if usage_candidate is None:
raw_section = payload.get("raw")
usage_candidate = _extract_usage_container(raw_section)
if usage_candidate is None:
return None
normalized = _normalize_usage_value(usage_candidate)
if not isinstance(normalized, dict):
return None
_canonicalize_counts(normalized)
return normalized or None
__all__ = ["extract_usage"]
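
For context, a small sketch of what the removed `extract_usage` helper returns for the DeepSeek `usage` block recorded in `deepseek-result.json` below; the import path is an assumption and the payload is abbreviated.

```python
# Hypothetical import path for the helper above (it was removed on this branch).
from app.llm.usage import extract_usage

response = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "raw": {
        "usage": {
            "prompt_tokens": 1078,
            "completion_tokens": 256,
            "total_tokens": 1334,
            "prompt_cache_hit_tokens": 1024,
            "prompt_cache_miss_tokens": 54,
        }
    },
}

# Falls back to the "raw" section when no top-level usage container is present,
# normalizes values to ints, and keeps provider-specific extras as-is.
print(extract_usage(response))
# {'prompt_tokens': 1078, 'completion_tokens': 256, 'total_tokens': 1334,
#  'prompt_cache_hit_tokens': 1024, 'prompt_cache_miss_tokens': 54}
```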

41
deepseek-result.json Normal file
View File

@ -0,0 +1,41 @@
{
"provider": "deepseek",
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
}
}
],
"raw": {
"id": "67f3cc80-38bc-4bb7-b336-48d4886722c4",
"object": "chat.completion",
"created": 1761752207,
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
},
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 1078,
"completion_tokens": 256,
"total_tokens": 1334,
"prompt_tokens_details": {
"cached_tokens": 1024
},
"prompt_cache_hit_tokens": 1024,
"prompt_cache_miss_tokens": 54
},
"system_fingerprint": "fp_ffc7281d48_prod0820_fp8_kvcache"
}
}

View File

@ -1 +0,0 @@
{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["service_point_id"], "table": "data-ge.water_meter_info", "columns": [{"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}, {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.03666666666666667, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.9, "metric_candidate_score": 0.0}, {"name": "service_point_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列只有一个唯一值 '有效'。", "enumish": true, "null_rate": 
0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}], "quality": {"warning_hints": ["列 'meter_status' 只有一个唯一值 '有效',可能为常量列。"], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["表角色(role)被推断为 'dimension'因为其列几乎完全由ID和类别属性构成且缺少数值指标或时间序列列。", "主键候选(primary_key_candidates) 'service_point_id' 和 'account_id' 是基于命名约定(包含'_id'推断的。其唯一性和非空性未在GE结果中直接度量因此这是一个高置信度的猜测。", "表粒度(grain)可能为 'service_point',与推断的主键 'service_point_id' 相对应。", "未根据列名或数据格式识别出时间列。"], "primary_key_candidates": [["service_point_id"], ["account_id"]]}

View File

@ -1,180 +0,0 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"aliases": [
{
"text": "各个区有多少水表",
"tone": "口语"
},
{
"text": "按维度统计用水点数",
"tone": "中性"
},
{
"text": "各维度用水点数量分布",
"tone": "专业"
}
],
"keywords": [
"用水点数",
"service_point_count",
"数量",
"统计",
"汇总",
"aggregate",
"维度",
"dimension",
"区域",
"district",
"供水所",
"分组统计",
"水表"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn-service-points-by-dimension",
"aliases": [
{
"text": "哪个地方水表最多",
"tone": "口语"
},
{
"text": "用水点数Top-N排名",
"tone": "中性"
},
{
"text": "Top-N用水点数维度排行",
"tone": "专业"
}
],
"keywords": [
"Top-N",
"top",
"排名",
"排行",
"ranking",
"最多",
"用水点数",
"service_point_count",
"维度",
"dimension",
"站点",
"station",
"水表"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_ratio-service-points-by-dimension",
"aliases": [
{
"text": "各种水表各占多少",
"tone": "口语"
},
{
"text": "各维度用水点数占比",
"tone": "中性"
},
{
"text": "用水点维度构成分析",
"tone": "专业"
}
],
"keywords": [
"占比",
"percentage",
"百分比",
"ratio",
"构成",
"分布",
"用水点数",
"水表类型",
"meter_type",
"维度",
"dimension",
"水表"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_quality-check-duplicate-spid",
"aliases": [
{
"text": "有没有重复的水表号",
"tone": "口语"
},
{
"text": "检查重复的用水点ID",
"tone": "中性"
},
{
"text": "用水点ID唯一性校验",
"tone": "专业"
}
],
"keywords": [
"数据质量",
"quality",
"检查",
"校验",
"重复",
"duplicate",
"唯一性",
"uniqueness",
"用水点ID",
"service_point_id",
"异常检测",
"主键"
],
"intent_tags": [
"quality",
"by_dimension"
]
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"aliases": [
{
"text": "给我看城区的机械表",
"tone": "口语"
},
{
"text": "按多维度筛选用水点",
"tone": "中性"
},
{
"text": "多维组合条件过滤用水点",
"tone": "专业"
}
],
"keywords": [
"筛选",
"过滤",
"filter",
"查询",
"明细",
"列表",
"sample",
"用水点",
"区域",
"district",
"水表类型",
"meter_type",
"条件查询"
],
"intent_tags": [
"sample",
"filter"
]
}
]

View File

@ -1,186 +0,0 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"desc": "按指定维度(如区域、供水所)分组,统计各分类下的用水点数量。",
"type": "aggregate",
"title": "按维度统计用水点数",
"examples": [
"按区域统计用水点数量",
"各个供水所分别有多少个用水点"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "district"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"适用于对水表档案信息进行分类汇总统计。",
"可将变量 ${dimension_column} 替换为任一维度列,如 district, supply_office, station, meter_type 等。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数,代表一个独立的服务点(通常对应一个水表)。统计粒度为“指定维度”。"
},
{
"id": "snpt_topn-service-points-by-dimension",
"desc": "按指定维度如区域、站点统计用水点数并展示数量最多的前N个分类。",
"type": "topn",
"title": "Top-N 用水点数维度排名",
"examples": [
"哪个区域的用水点最多",
"用水点数排名前5的站点是哪些"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "station"
},
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC\nLIMIT ${top_n};"
},
"applicability": {
"constraints": {
"notes": [
"维度 `station` 基数较高 (36),建议 Top-N 查询时结合业务场景合理设置 N 值。"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数。排名依据为各维度分类下的用水点总数。统计粒度为“指定维度”。"
},
{
"id": "snpt_ratio-service-points-by-dimension",
"desc": "计算在指定维度下,各分类的用水点数占总用水点数的百分比,以分析其分布构成。",
"type": "ratio",
"title": "各维度用水点数占比",
"examples": [
"不同水表类型meter_type的分布情况",
"各个区域的用水点占比是多少"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "meter_type"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count,\n COUNT(DISTINCT service_point_id) * 100.0 / SUM(COUNT(DISTINCT service_point_id)) OVER () AS percentage\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"SQL模板使用了窗口函数 SUM() OVER()请确保MySQL版本支持8.0+)。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数占比:某分类下的用水点数 / 总用水点数。用水点数以 `service_point_id` 去重计数。统计粒度为“指定维度”。"
},
{
"id": "snpt_quality-check-duplicate-spid",
"desc": "查找在用水点信息表中存在重复的 `service_point_id`,用于数据质量校验。",
"type": "quality",
"title": "检查重复的用水点ID",
"examples": [
"检查是否存在重复的水表档案",
"校验用水点ID的唯一性"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n COUNT(*) AS occurrences\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n service_point_id\nHAVING\n COUNT(*) > 1;"
},
"applicability": {
"constraints": {
"notes": [
"预期返回结果为空。若有返回,则表示数据存在一致性问题,`service_point_id` 未能作为唯一主键。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "重复项:指 `service_point_id` 出现次数大于1的记录。此ID应为表的主键理论上不应重复。"
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"desc": "根据区域、水表类型、供水所等多个维度组合条件,筛选出符合条件的用水点明细。",
"type": "sample",
"title": "多维度筛选用水点列表",
"examples": [
"查询城区的机械表有哪些",
"拉取某个供水所下特定口径水表的列表"
],
"variables": [
{
"name": "district_name",
"type": "string",
"default": "城区"
},
{
"name": "meter_type_name",
"type": "string",
"default": "机械表"
},
{
"name": "limit_num",
"type": "int",
"default": 100
}
],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n account_id,\n district,\n supply_office,\n meter_type,\n meter_subtype,\n meter_diameter\nFROM\n `data-ge.water_meter_info`\nWHERE\n district = '${district_name}'\n AND meter_type = '${meter_type_name}'\n -- AND meter_status = '有效' -- 可选:根据画像,该列为常量'有效',可不加\nLIMIT ${limit_num};"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id",
"account_id",
"district",
"supply_office",
"meter_type",
"meter_subtype",
"meter_diameter"
]
},
"business_caliber": "返回满足所有筛选条件的用水点明细信息。`meter_status` 列只有一个值 '有效',通常无需作为筛选条件。"
}
]

View File

@ -1,230 +0,0 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空11 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空36 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.1,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空13 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空8 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;单一取值(\"有效\"",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空9 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空5 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空4 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示标识列;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.6,
"metric_candidate_score": 0.05
},
{
"name": "account_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示账户标识;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.5,
"metric_candidate_score": 0.05
}
],
"quality": {
"warning_hints": [
"以下列未设置非空校验service_point_id, account_id空值情况未知",
"未识别到时间列"
],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role 判定为 dimension表内列均为枚举/分类或ID未发现数值型度量或时间列34/34 期望均为分类枚举/非空与去重比例。",
"grain 猜测为 service_point_id仅依据命名启发式缺少唯一性与非空度量佐证置信度较低。",
"未识别时间列:列名与期望均未涉及日期/时间,也无最小/最大时间范围可推断。"
],
"primary_key_candidates": []
}

View File

@ -1,372 +0,0 @@
[
{
"id": "snpt_topn_station",
"aliases": [
{
"text": "站点水表排行前N",
"tone": "中性"
},
{
"text": "哪个站点表最多",
"tone": "口语"
},
{
"text": "按站点水表TopN",
"tone": "专业"
}
],
"keywords": [
"TopN",
"排名",
"排行",
"station",
"站点",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"排序",
"榜单"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_district",
"aliases": [
{
"text": "各辖区水表占比",
"tone": "中性"
},
{
"text": "哪个辖区占比高",
"tone": "口语"
},
{
"text": "按辖区水表比例",
"tone": "专业"
}
],
"keywords": [
"占比",
"ratio",
"district",
"辖区",
"水表数",
"meter count",
"distinct",
"去重",
"百分比",
"份额",
"聚合",
"排序",
"分布"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_dist_diameter",
"aliases": [
{
"text": "表径水表数分布",
"tone": "中性"
},
{
"text": "不同口径有多少",
"tone": "口语"
},
{
"text": "按表径去重计数",
"tone": "专业"
}
],
"keywords": [
"分布",
"distribution",
"meter_diameter",
"表径",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"类别",
"category",
"条形图",
"饼图",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_type_subtype_matrix",
"aliases": [
{
"text": "类型×子类水表数",
"tone": "中性"
},
{
"text": "看各类型各子类",
"tone": "口语"
},
{
"text": "类型子类组合统计",
"tone": "专业"
}
],
"keywords": [
"类型",
"type",
"子类",
"subtype",
"组合",
"matrix",
"交叉分析",
"cross-tab",
"水表数",
"meter count",
"distinct",
"去重",
"分布",
"聚合",
"维度"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_quality_spid_uniq",
"aliases": [
{
"text": "服务点ID唯一性检",
"tone": "专业"
},
{
"text": "服务点ID有重复吗",
"tone": "口语"
},
{
"text": "服务点ID完整性评估",
"tone": "中性"
}
],
"keywords": [
"质量检查",
"quality",
"唯一性",
"uniqueness",
"重复",
"duplicate",
"空值",
"NULL",
"完整性",
"integrity",
"service_point_id",
"数据质量",
"统计",
"去重",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_quality_account_nulls",
"aliases": [
{
"text": "账户ID缺失明细",
"tone": "中性"
},
{
"text": "看看哪些账户为空",
"tone": "口语"
},
{
"text": "account_id空值样本",
"tone": "专业"
}
],
"keywords": [
"质量检查",
"缺失",
"missing",
"空值",
"NULL",
"account_id",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"排查",
"过滤",
"WHERE",
"LIMIT"
],
"intent_tags": [
"quality",
"sample"
]
},
{
"id": "snpt_sample_random_rows",
"aliases": [
{
"text": "随机抽样水表明细",
"tone": "中性"
},
{
"text": "随机取几条看看",
"tone": "口语"
},
{
"text": "RAND()样本抽取",
"tone": "专业"
}
],
"keywords": [
"随机",
"random",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"details",
"质检",
"QA",
"RAND()",
"LIMIT",
"抽取",
"数据验证"
],
"intent_tags": [
"sample"
]
},
{
"id": "snpt_filter_office_type_where",
"aliases": [
{
"text": "按所与类型过滤有效",
"tone": "专业"
},
{
"text": "筛选某所的指定类型",
"tone": "中性"
},
{
"text": "只看这所的这种表",
"tone": "口语"
}
],
"keywords": [
"过滤",
"filter",
"WHERE",
"supply_office",
"营业所",
"meter_type",
"类型",
"meter_status",
"有效",
"条件片段",
"筛选",
"查询拼接",
"字段",
"约束"
],
"intent_tags": [
"filter"
]
},
{
"id": "snpt_office_station_dist",
"aliases": [
{
"text": "所站组合水表数",
"tone": "中性"
},
{
"text": "各站在各所有多少",
"tone": "口语"
},
{
"text": "营业所×站点分布",
"tone": "专业"
}
],
"keywords": [
"supply_office",
"营业所",
"station",
"站点",
"层级",
"hierarchy",
"分布",
"distribution",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"交叉分析",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_total_meter_baseline",
"aliases": [
{
"text": "水表总量基线",
"tone": "中性"
},
{
"text": "现在有多少水表",
"tone": "口语"
},
{
"text": "全表去重总数",
"tone": "专业"
}
],
"keywords": [
"总量",
"total",
"baseline",
"基线",
"水表总数",
"meter total",
"service_point_id",
"distinct",
"去重",
"分母",
"denominator",
"占比",
"聚合",
"汇总",
"snapshot"
],
"intent_tags": [
"aggregate"
]
}
]

View File

@ -1,330 +0,0 @@
[
{
"id": "snpt_topn_station",
"desc": "按站点统计水表数量并取前N",
"type": "topn",
"title": "站点TopN水表数",
"examples": [
"各站点水表数量排名前10",
"站点水表覆盖情况排行"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY station\nORDER BY meter_cnt DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"TopN建议N<=36",
"以service_point_id去重计数",
"无时间列,无法做趋势"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=站点。仅统计当前表中的有效记录不含时间口径。安全限制用于分析排名避免扩大LIMIT造成全量导出。"
},
{
"id": "snpt_share_district",
"desc": "统计各辖区水表数及其占比",
"type": "ratio",
"title": "辖区水表占比",
"examples": [
"各辖区水表占比",
"哪个辖区水表最多"
],
"variables": [],
"dialect_sql": {
"mysql": "WITH by_district AS (\n SELECT district, COUNT(DISTINCT service_point_id) AS meter_cnt\n FROM `data-ge`.`water_meter_info`\n GROUP BY district\n), tot AS (\n SELECT COUNT(DISTINCT service_point_id) AS total_cnt\n FROM `data-ge`.`water_meter_info`\n)\nSELECT b.district,\n b.meter_cnt,\n ROUND(b.meter_cnt / NULLIF(t.total_cnt, 0) * 100, 2) AS pct\nFROM by_district b\nCROSS JOIN tot t\nORDER BY pct DESC, b.district;"
},
"applicability": {
"constraints": {
"notes": [
"占比分母为全表service_point_id去重总数",
"service_point_id为空将被忽略"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": null,
"required_columns": [
"district",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=辖区。占比=辖区水表数/全表水表总数。安全限制:仅基于本表,不代表全市/全网口径;无时间维度。"
},
{
"id": "snpt_dist_diameter",
"desc": "按表径统计水表数量分布",
"type": "aggregate",
"title": "表径分布统计",
"examples": [
"不同口径水表有多少",
"查看表径分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_diameter,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_diameter\nORDER BY meter_cnt DESC, meter_diameter;"
},
"applicability": {
"constraints": {
"notes": [
"以service_point_id去重计数",
"适合绘制条形图/饼图"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": null,
"required_columns": [
"meter_diameter",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=表径。安全限制:仅用于分布分析,不含时间过滤;避免用于明细导出。"
},
{
"id": "snpt_type_subtype_matrix",
"desc": "统计水表类型与子类组合的数量",
"type": "aggregate",
"title": "类型子类分布",
"examples": [
"不同类型与子类的水表数量",
"查看类型与子类的组合分布"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type,\n meter_subtype,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_type, meter_subtype\nORDER BY meter_cnt DESC, meter_type, meter_subtype;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=5×9=45",
"以service_point_id去重计数"
],
"fk_join_available": false,
"dim_cardinality_hint": 45
},
"time_column": null,
"required_columns": [
"meter_type",
"meter_subtype",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=类型×子类组合。安全限制:仅用于汇总分析,不包含时间或业务状态变化。"
},
{
"id": "snpt_quality_spid_uniq",
"desc": "评估service_point_id的空值与重复情况",
"type": "quality",
"title": "服务点唯一性检",
"examples": [
"检查服务点ID是否唯一",
"统计service_point_id空值与重复情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n COUNT(*) AS total_rows,\n SUM(service_point_id IS NULL) AS null_cnt,\n COUNT(DISTINCT service_point_id) AS distinct_cnt,\n (COUNT(*) - COUNT(DISTINCT service_point_id)) AS duplicate_rows_est,\n (\n SELECT COUNT(*) FROM (\n SELECT service_point_id\n FROM `data-ge`.`water_meter_info`\n GROUP BY service_point_id\n HAVING COUNT(*) > 1\n ) AS dup\n ) AS dup_key_groups\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"用于键完整性检查",
"重复行估算=总行数-去重数"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "质量检查口径在本表内评估service_point_id的非空与唯一性不代表跨表全局唯一。安全限制仅输出汇总指标不暴露明细重复值。"
},
{
"id": "snpt_quality_account_nulls",
"desc": "抽取account_id为空的记录用于排查",
"type": "quality",
"title": "账户ID缺失明细",
"examples": [
"列出account_id为空的水表",
"抽样查看账户缺失的数据行"
],
"variables": [
{
"name": "limit_n",
"type": "int",
"default": 50
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nWHERE account_id IS NULL\nLIMIT {{limit_n}};"
},
"applicability": {
"constraints": {
"notes": [
"明细仅限小样本抽取",
"建议LIMIT<=100避免全量导出"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"account_id"
]
},
"business_caliber": "质量抽样筛出账户ID缺失的水表记录便于核对。安全限制仅用于样本排查不建议在生产中全量导出如需口径统计请改为COUNT聚合。"
},
{
"id": "snpt_sample_random_rows",
"desc": "随机抽取水表信息用于人工核验",
"type": "sample",
"title": "随机抽样明细",
"examples": [
"抽样查看水表信息",
"随机抽取20条做质检"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"使用RAND()随机,样本不可复现",
"建议限制样本量"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "样本抽取从本表随机返回若干行明细。安全限制避免扩大LIMIT进行全量下载如需可复现样本请改用带种子的随机方法MySQL不原生支持。"
},
{
"id": "snpt_filter_office_type_where",
"desc": "常用WHERE筛选条件片段按营业所与类型且为有效",
"type": "sample",
"title": "机构类型筛选片",
"examples": [
"筛选A营业所的机械表",
"仅查看某营业所的指定类型水表"
],
"variables": [
{
"name": "supply_office",
"type": "string"
},
{
"name": "meter_type",
"type": "string"
}
],
"dialect_sql": {
"mysql": "WHERE supply_office = '{{supply_office}}'\n AND meter_type = '{{meter_type}}'\n AND meter_status = '有效'"
},
"applicability": {
"constraints": {
"notes": [
"这是条件片段,可拼接到其他查询",
"meter_status当前为单一值“有效”"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": null,
"required_columns": [
"supply_office",
"meter_type",
"meter_status"
]
},
"business_caliber": "过滤口径仅保留指定营业所与指定水表类型、且状态为“有效”的记录。安全限制为片段用途需拼接在SELECT…FROM之后使用。"
},
{
"id": "snpt_office_station_dist",
"desc": "按营业所与站点组合统计水表数",
"type": "aggregate",
"title": "所站层级分布",
"examples": [
"按营业所查看各站点水表数",
"所站两级的水表分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT supply_office,\n station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY supply_office, station\nORDER BY supply_office, meter_cnt DESC, station;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=11×36=396",
"以service_point_id去重计数",
"如结果过长可再按TopN筛选"
],
"fk_join_available": false,
"dim_cardinality_hint": 396
},
"time_column": null,
"required_columns": [
"supply_office",
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=营业所×站点。安全限制:结果行数可能较多,建议在可视化端增加筛选或分页。"
},
{
"id": "snpt_total_meter_baseline",
"desc": "获取全表水表去重总量基线",
"type": "aggregate",
"title": "水表总量基线",
"examples": [
"当前有多少只水表",
"作为占比分析的分母基线"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT COUNT(DISTINCT service_point_id) AS meter_total\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"作为其他占比/分摊分母基线",
"忽略service_point_id为空的记录"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "水表总量=按service_point_id去重计数基于当前表的全量记录。安全限制无时间维度无法反映存量随时间变化。"
}
]

View File

@ -1,415 +0,0 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"account_id",
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {},
"comment": "供水管理所名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "宝山供水管理所"
},
{
"pct": null,
"value": "黄浦供水管理所"
},
{
"pct": null,
"value": "青东供水管理所"
},
{
"pct": null,
"value": "虹口供水管理所"
},
{
"pct": null,
"value": "闸北供水管理所"
},
{
"pct": null,
"value": "松北供水管理所"
},
{
"pct": null,
"value": "杨浦供水管理所"
},
{
"pct": null,
"value": "长宁供水管理所"
},
{
"pct": null,
"value": "闵行供水管理所"
},
{
"pct": null,
"value": "徐汇供水管理所"
},
{
"pct": null,
"value": "普陀供水管理所"
}
],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.11,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {},
"comment": "站点名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "新闸站"
},
{
"pct": null,
"value": "宝杨站"
},
{
"pct": null,
"value": "江川站"
},
{
"pct": null,
"value": "长江站"
},
{
"pct": null,
"value": "市光站"
},
{
"pct": null,
"value": "徐泾站"
},
{
"pct": null,
"value": "真北站"
},
{
"pct": null,
"value": "半淞园站"
},
{
"pct": null,
"value": "芙蓉江站"
},
{
"pct": null,
"value": "密云站"
}
],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.36,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {},
"comment": "行政区划名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "普陀区"
},
{
"pct": null,
"value": "闵行区"
},
{
"pct": null,
"value": "嘉定区"
},
{
"pct": null,
"value": "杨浦区"
},
{
"pct": null,
"value": "徐汇区"
},
{
"pct": null,
"value": "黄浦区"
},
{
"pct": null,
"value": "松江区"
},
{
"pct": null,
"value": "长宁区"
},
{
"pct": null,
"value": "青浦区"
},
{
"pct": null,
"value": "虹口区"
}
],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.13,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {},
"comment": "水表直径规格,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "20mm"
},
{
"pct": null,
"value": "15mm"
},
{
"pct": null,
"value": "25mm"
},
{
"pct": null,
"value": "40mm"
},
{
"pct": null,
"value": "150mm"
},
{
"pct": null,
"value": "100mm"
},
{
"pct": null,
"value": "80mm"
},
{
"pct": null,
"value": "50mm"
}
],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.08,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {},
"comment": "水表状态,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "有效"
}
],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.01,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {},
"comment": "水表子类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "旋翼半液封式"
},
{
"pct": null,
"value": "超声波式"
},
{
"pct": null,
"value": "旋翼湿式(指针式)"
},
{
"pct": null,
"value": "旋翼湿式(数字指针式)"
},
{
"pct": null,
"value": "电磁式"
},
{
"pct": null,
"value": "无直管段要求超声波式"
},
{
"pct": null,
"value": "无直管段要求电磁式"
},
{
"pct": null,
"value": "垂直螺翼干式"
},
{
"pct": null,
"value": "机械容积式"
}
],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.09,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {},
"comment": "水表类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "容积式机械水表"
},
{
"pct": null,
"value": "速度式机械水表"
},
{
"pct": null,
"value": "电磁式远传水表"
},
{
"pct": null,
"value": "速度式机电远传水表"
},
{
"pct": null,
"value": "超声波式远传水表"
}
],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {},
"comment": "安装位置,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "嵌墙表"
},
{
"pct": null,
"value": "管道井表"
},
{
"pct": null,
"value": "地下表"
},
{
"pct": null,
"value": "龙头表"
}
],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.04,
"metric_candidate_score": 0.0
},
{
"name": "account_id",
"dtype": "string",
"stats": {},
"comment": "账户ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "string",
"stats": {},
"comment": "服务点ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
}
],
"quality": {
"warning_hints": [],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role判定为dimension因所有列均为枚举或ID类型无metric列",
"grain依据account_id和service_point_id为唯一标识推测",
"未发现时间列因此time字段为null"
],
"primary_key_candidates": [
[
"account_id"
],
[
"service_point_id"
]
]
}

View File

@ -1,286 +0,0 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"aliases": [
{
"text": "供水所水表排行",
"tone": "中性"
},
{
"text": "哪个供水所水表最多",
"tone": "口语"
},
{
"text": "供水管理所水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"供水管理所",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"供水所",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_station",
"aliases": [
{
"text": "站点水表数量排行",
"tone": "中性"
},
{
"text": "哪个站点水表最多",
"tone": "口语"
},
{
"text": "站点维度水表TopN分析",
"tone": "专业"
}
],
"keywords": [
"水表",
"站点",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"站点数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_district",
"aliases": [
{
"text": "区域水表数量排名",
"tone": "中性"
},
{
"text": "哪个区水表最多",
"tone": "口语"
},
{
"text": "行政区水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"区域",
"行政区",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"区",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_share_by_type",
"aliases": [
{
"text": "水表类型占比",
"tone": "中性"
},
{
"text": "哪种水表用得最多",
"tone": "口语"
},
{
"text": "水表类型分布比例",
"tone": "专业"
}
],
"keywords": [
"水表",
"类型",
"占比",
"比例",
"ratio",
"分布",
"meter_type",
"百分比",
"分类统计",
"水表类型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_water_meter_subtype_distribution",
"aliases": [
{
"text": "水表子类型分布",
"tone": "中性"
},
{
"text": "各种子类型水表情况",
"tone": "口语"
},
{
"text": "水表子类型计数与占比",
"tone": "专业"
}
],
"keywords": [
"水表",
"子类型",
"分布",
"数量",
"占比",
"meter_subtype",
"统计",
"count",
"百分比",
"分类统计",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_installation_position_stats",
"aliases": [
{
"text": "安装位置统计",
"tone": "中性"
},
{
"text": "哪种位置装表最多",
"tone": "口语"
},
{
"text": "水表安装位置分布",
"tone": "专业"
}
],
"keywords": [
"水表",
"安装位置",
"统计",
"分布",
"installation_position",
"数量",
"count",
"位置",
"安装点",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_grain_check",
"aliases": [
{
"text": "主键粒度校验",
"tone": "中性"
},
{
"text": "数据有没有重复",
"tone": "口语"
},
{
"text": "数据粒度一致性检查",
"tone": "专业"
}
],
"keywords": [
"主键",
"粒度",
"校验",
"质量",
"quality",
"重复",
"唯一性",
"account_id",
"service_point_id",
"数据校验",
"质量检查",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_water_meter_sample_records",
"aliases": [
{
"text": "水表数据抽样",
"tone": "中性"
},
{
"text": "给我看点水表数据",
"tone": "口语"
},
{
"text": "水表记录样本抽取",
"tone": "专业"
}
],
"keywords": [
"水表",
"样本",
"抽样",
"sample",
"随机",
"记录",
"抽查",
"limit",
"数据结构",
"数据示例",
"sample",
"limit_rows"
],
"intent_tags": [
"sample"
]
}
]

View File

@ -1,235 +0,0 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"desc": "统计各供水管理所下辖水表数量并排序",
"type": "topn",
"title": "供水管理所水表数量排行",
"examples": [
"列出水表最多的前10个供水管理所",
"各供水所水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT supply_office AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY supply_office ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office"
]
},
"business_caliber": "按供水管理所维度聚合水表总数,粒度=供水管理所"
},
{
"id": "snpt_water_meter_top_station",
"desc": "统计各个站点下辖水表数量并排序",
"type": "topn",
"title": "站点水表数量排行",
"examples": [
"列出水表最多的前10个站点",
"各站点水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY station ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [
"高基数维度建议LIMIT<=50"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station"
]
},
"business_caliber": "按站点维度聚合水表总数,粒度=站点"
},
{
"id": "snpt_water_meter_top_district",
"desc": "统计各区水表数量并排序",
"type": "topn",
"title": "区域水表数量排行",
"examples": [
"列出各区水表数量排名",
"哪个区的水表最多?"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT district AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY district ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district"
]
},
"business_caliber": "按行政区划维度聚合水表总数,粒度=区"
},
{
"id": "snpt_water_meter_share_by_type",
"desc": "计算各类水表占总水表的比例",
"type": "ratio",
"title": "水表类型占比分布",
"examples": [
"各类水表占比是多少?",
"哪种类型的水表使用最广泛?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type AS dim_value, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`) AS ratio_percent FROM `data-ge.water_meter_info` GROUP BY meter_type ORDER BY ratio_percent DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type"
]
},
"business_caliber": "按水表类型分类计算其占比,粒度=水表类型"
},
{
"id": "snpt_water_meter_subtype_distribution",
"desc": "展示不同水表子类型的数量及比例",
"type": "aggregate",
"title": "水表子类型分布情况",
"examples": [
"各种子类型水表的数量和占比",
"哪种子类型水表最多?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_subtype AS dim_value, COUNT(*) AS count_value, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`), 2) AS percentage FROM `data-ge.water_meter_info` GROUP BY meter_subtype ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 9
},
"time_column": "nullable",
"required_columns": [
"meter_subtype"
]
},
"business_caliber": "按水表子类型进行计数和百分比统计,粒度=水表子类型"
},
{
"id": "snpt_water_meter_installation_position_stats",
"desc": "统计不同安装位置下的水表数量",
"type": "aggregate",
"title": "安装位置分布统计",
"examples": [
"各种安装位置的水表数量",
"哪种安装位置最为常见?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT installation_position AS dim_value, COUNT(*) AS count_value FROM `data-ge.water_meter_info` GROUP BY installation_position ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 4
},
"time_column": "nullable",
"required_columns": [
"installation_position"
]
},
"business_caliber": "按安装位置对水表进行分组计数,粒度=安装位置"
},
{
"id": "snpt_water_meter_grain_check",
"desc": "验证 account_id 和 service_point_id 是否构成唯一组合",
"type": "quality",
"title": "主键粒度校验",
"examples": [
"这张表的数据粒度是否正确?",
"是否存在重复的服务点记录?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT IF(COUNT(*) = COUNT(DISTINCT account_id, service_point_id), 'PASS', 'FAIL') AS grain_check_result FROM `data-ge.water_meter_info`"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "检验数据是否符合预期的主键粒度account_id + service_point_id"
},
{
"id": "snpt_water_meter_sample_records",
"desc": "随机抽取部分水表信息用于查看结构",
"type": "sample",
"title": "样本抽取",
"examples": [
"给我看几条水表数据的例子",
"抽查一些原始数据看看格式"
],
"variables": [
{
"name": "limit_rows",
"type": "int",
"default": 5
}
],
"dialect_sql": {
"mysql": "SELECT * FROM `data-ge.water_meter_info` ORDER BY RAND() LIMIT {{limit_rows}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": []
},
"business_caliber": "从全量数据中随机采样若干条记录供参考"
}
]

File diff suppressed because one or more lines are too long

View File

@ -1,249 +0,0 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"aliases": [
{
"text": "哪个供水所用户最多?",
"tone": "口语"
},
{
"text": "按供应办公室统计账户数量",
"tone": "中性"
},
{
"text": "供应办公室账户数TopN排名",
"tone": "专业"
}
],
"keywords": [
"供应办公室",
"账户数",
"TopN",
"排行",
"统计",
"account_id",
"supply_office",
"去重",
"高占比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_station_by_account",
"aliases": [
{
"text": "哪些站点用户最多?",
"tone": "口语"
},
{
"text": "按站点统计账户分布",
"tone": "中性"
},
{
"text": "站点账户数Top20排名",
"tone": "专业"
}
],
"keywords": [
"站点",
"账户数",
"TopN",
"排行",
"统计",
"station",
"account_id",
"去重",
"高负载",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_district_by_account",
"aliases": [
{
"text": "哪个区用户最多?",
"tone": "口语"
},
{
"text": "按行政区统计账户数量",
"tone": "中性"
},
{
"text": "行政区账户数全量排名",
"tone": "专业"
}
],
"keywords": [
"行政区",
"账户数",
"TopN",
"排行",
"统计",
"district",
"account_id",
"去重",
"区域对比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_of_meter_type",
"aliases": [
{
"text": "各类水表占多少比例?",
"tone": "口语"
},
{
"text": "水表类型占比分析",
"tone": "中性"
},
{
"text": "水表类型占比分布",
"tone": "专业"
}
],
"keywords": [
"水表类型",
"占比",
"比例",
"meter_type",
"account_id",
"去重",
"分布",
"主流类型",
"技术选型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_sample_account_service_point",
"aliases": [
{
"text": "随机看10条账户信息",
"tone": "口语"
},
{
"text": "抽样账户与服务点明细",
"tone": "中性"
},
{
"text": "账户-服务点随机抽样验证",
"tone": "专业"
}
],
"keywords": [
"抽样",
"随机",
"样本",
"account_id",
"service_point_id",
"数据质量",
"验证",
"唯一性",
"格式检查",
"sample",
"quality"
],
"intent_tags": [
"sample",
"quality"
]
},
{
"id": "snpt_filter_meter_status_valid",
"aliases": [
{
"text": "只取有效的水表记录",
"tone": "口语"
},
{
"text": "筛选有效水表记录",
"tone": "中性"
},
{
"text": "水表状态有效性过滤",
"tone": "专业"
}
],
"keywords": [
"有效",
"过滤",
"筛选",
"meter_status",
"质量检查",
"断言",
"清洗",
"filter",
"quality"
],
"intent_tags": [
"filter",
"quality"
]
},
{
"id": "snpt_filter_meter_diameter_20mm",
"aliases": [
{
"text": "找出所有20mm水表用户",
"tone": "口语"
},
{
"text": "筛选20mm水表记录",
"tone": "中性"
},
{
"text": "20mm口径水表子集提取",
"tone": "专业"
}
],
"keywords": [
"20mm",
"水表直径",
"过滤",
"筛选",
"meter_diameter",
"子集",
"分析",
"住宅用水",
"规格",
"filter",
"by_dimension"
],
"intent_tags": [
"filter",
"by_dimension"
]
}
]

View File

@ -1,227 +0,0 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"desc": "统计各供应办公室对应的账户数量,识别高占比管理所",
"type": "topn",
"title": "按供应办公室统计账户数",
"examples": [
"哪个供水管理所服务的用户最多?",
"列出前5个账户数最多的供应办公室"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 11
}
],
"dialect_sql": {
"mysql": "SELECT supply_office, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY supply_office\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"供应办公室仅11个唯一值可安全展示全部建议LIMIT 11避免冗余排序"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office",
"account_id"
]
},
"business_caliber": "粒度=供应办公室,指标=去重账户数account_id仅统计水表信息表中有效账户不关联外部表"
},
{
"id": "snpt_topn_station_by_account",
"desc": "统计各站点服务的账户数量,识别高负载站点",
"type": "topn",
"title": "按站点统计账户分布",
"examples": [
"哪些站点服务的用户最多?",
"TOP10用户最多的站点是哪些"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT station, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY station\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"站点有36个唯一值建议LIMIT<=20以避免结果过长高基数维度可能影响查询性能"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station",
"account_id"
]
},
"business_caliber": "粒度=站点station指标=去重账户数account_id基于水表信息表直接聚合不涉及时间维度"
},
{
"id": "snpt_topn_district_by_account",
"desc": "统计各行政区的账户数量,辅助区域资源分配分析",
"type": "topn",
"title": "按行政区统计账户分布",
"examples": [
"哪个区的用水账户最多?",
"列出所有行政区的账户数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 13
}
],
"dialect_sql": {
"mysql": "SELECT district, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY district\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"行政区共13个可完整展示适合用于区域对比分析"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district",
"account_id"
]
},
"business_caliber": "粒度=行政区district指标=去重账户数account_id基于水表信息表聚合反映各区域用户规模"
},
{
"id": "snpt_share_of_meter_type",
"desc": "计算各类水表类型在总账户中的占比,识别主流类型",
"type": "ratio",
"title": "水表类型占比分析",
"examples": [
"各类水表在用户中的占比是多少?",
"电磁式远传水表占总用户比例多少?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type, \n COUNT(DISTINCT account_id) AS account_count,\n ROUND(COUNT(DISTINCT account_id) * 100.0 / SUM(COUNT(DISTINCT account_id)) OVER (), 2) AS percentage\nFROM water_meter_info\nGROUP BY meter_type\nORDER BY account_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"水表类型仅5种适合计算占比可直接展示全量分布"
],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type",
"account_id"
]
},
"business_caliber": "粒度=水表类型meter_type指标=去重账户数占比,分母为全表去重账户总数,反映技术选型分布"
},
{
"id": "snpt_sample_account_service_point",
"desc": "随机抽取部分账户与服务点ID的原始记录用于数据质量核查",
"type": "sample",
"title": "抽样账户与服务点明细",
"examples": [
"随机查看10条账户与服务点的详细信息",
"抽样检查水表信息是否符合预期格式"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT account_id, service_point_id, supply_office, station, district, meter_diameter, meter_type, meter_subtype, installation_position\nFROM water_meter_info\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"主键组合为account_id+service_point_id适合抽样验证唯一性建议样本量≤100"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "粒度=单条水表记录抽取样本用于验证account_id与service_point_id的组合唯一性及维度字段完整性"
},
{
"id": "snpt_filter_meter_status_valid",
"desc": "过滤出水表状态为'有效'的记录,用于后续分析",
"type": "quality",
"title": "筛选有效水表记录",
"examples": [
"只取状态为有效的水表记录",
"确认所有水表是否均为有效状态"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_status = '有效';"
},
"applicability": {
"constraints": {
"notes": [
"meter_status仅存在'有效'值,此条件恒成立;可用于数据清洗流程的显式过滤"
],
"fk_join_available": false,
"dim_cardinality_hint": 1
},
"time_column": "nullable",
"required_columns": [
"meter_status"
]
},
"business_caliber": "仅保留水表状态为'有效'的记录,因全表均为有效值,此过滤为冗余但可作为数据质量校验的显式断言"
},
{
"id": "snpt_filter_meter_diameter_20mm",
"desc": "筛选水表直径为20mm的记录用于特定口径设备分析",
"type": "quality",
"title": "筛选20mm水表记录",
"examples": [
"找出所有使用20mm水表的用户",
"20mm水表分布在哪些站点"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_diameter = '20mm';"
},
"applicability": {
"constraints": {
"notes": [
"水表直径共8种枚举值20mm为常见规格可作为子集分析的起点"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": "nullable",
"required_columns": [
"meter_diameter"
]
},
"business_caliber": "粒度=单条水表记录筛选条件为meter_diameter='20mm',用于分析标准住宅用水表的分布特征"
}
]

View File

@ -1,57 +0,0 @@
# Add a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/add' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}'
# Batch add RAG items
curl --location --request POST 'http://127.0.0.1:8000/rag/addBatch' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '[
{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}
]'
# Update a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/update' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}'
# Delete a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/delete' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"type": "METRIC"
}'
# Retrieve RAG items
curl --location --request POST 'http://127.0.0.1:8000/rag/retrieve' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"query": "string",
"num": 0,
"workspaceId": 0,
"type": "METRIC"
}'

View File

@ -1,49 +0,0 @@
# Create a session
curl -X POST "/api/v1/chat/sessions" \
-H "Content-Type: application/json" \
-d "{\"user_id\": $CHAT_USER_ID}"
# Get a session
curl "/api/v1/chat/sessions/{session_id}"
# List sessions by user
curl "/api/v1/chat/sessions?user_id=$CHAT_USER_ID"
# Update session status
curl -X POST "/api/v1/chat/sessions/{session_id}/update" \
-H "Content-Type: application/json" \
-d '{"status":"PAUSED"}'
# Close a session
curl -X POST "/api/v1/chat/sessions/{session_id}/close"
# Create a chat turn
curl -X POST "/api/v1/chat/sessions/{session_id}/turns" \
-H "Content-Type: application/json" \
-d '{
"user_id": '"$CHAT_USER_ID"',
"user_query": "展示昨天订单GMV",
"intent": "METRIC_QUERY",
"ast_json": {"select":["gmv"],"where":{"dt":"yesterday"}},
"main_metric_ids": [1234],
"created_metric_ids": []
}'
# Get a single chat turn
curl "/api/v1/chat/turns/{turn_id}"
# List turns in a session
curl "/api/v1/chat/sessions/{session_id}/turns"
# Write retrieval results
curl -X POST "/api/v1/chat/turns/{turn_id}/retrievals" \
-H "Content-Type: application/json" \
-d '{
"retrievals": [
{"item_type":"METRIC","item_id":"metric_foo","used_in_sql":true,"rank_no":1},
{"item_type":"SNIPPET","item_id":"snpt_bar","similarity_score":0.77,"rank_no":2}
]
}'
# List retrieval results for a turn
curl "/api/v1/chat/turns/{turn_id}/retrievals"

View File

@ -1,69 +0,0 @@
# Create a metric
curl -X POST "/api/v1/metrics" \
-H "Content-Type: application/json" \
-d '{
"metric_code": "metric_1234",
"metric_name": "订单数",
"biz_domain": "order",
"biz_desc": "订单总数",
"base_sql": "select count(*) as order_cnt from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"metric_aliases": ["订单量"],
"created_by": '"$METRIC_USER_ID"'
}'
# Update a metric
curl -X POST "/api/v1/metrics/{metric_id}" \
-H "Content-Type: application/json" \
-d '{"metric_name":"订单数-更新","is_active":false}'
# Get a metric
curl "/api/v1/metrics/{metric_id}"
# Create a schedule
curl -X POST "/api/v1/metric-schedules" \
-H "Content-Type: application/json" \
-d '{"metric_id":{metric_id},"cron_expr":"0 2 * * *","priority":5,"enabled":true}'
# Update a schedule
curl -X POST "/api/v1/metric-schedules/{schedule_id}" \
-H "Content-Type: application/json" \
-d '{"enabled":false,"retry_times":1}'
# List schedules for a metric
curl "/api/v1/metrics/{metric_id}/schedules"
# Trigger a run
curl -X POST "/api/v1/metric-runs/trigger" \
-H "Content-Type: application/json" \
-d '{
"metric_id": {metric_id},
"triggered_by": "API",
"data_time_from": "2024-05-01T00:00:00Z",
"data_time_to": "2024-05-02T00:00:00Z"
}'
# List runs
curl "/api/v1/metric-runs?metric_id={metric_id}"
# Get a single run
curl "/api/v1/metric-runs/{run_id}"
# Write metric results
curl -X POST "/api/v1/metric-results/{metric_id}" \
-H "Content-Type: application/json" \
-d '{
"metric_id": {metric_id},
"results": [
{"stat_time":"2024-05-01T00:00:00Z","metric_value":123.45,"data_version":"{run_id}"},
{"stat_time":"2024-05-02T00:00:00Z","metric_value":234.56,"data_version":"{run_id}"}
]
}'
# 查询指标结果
curl "/api/v1/metric-results?metric_id={metric_id}"
# 查询最新结果
curl "/api/v1/metric-results/latest?metric_id={metric_id}"

View File

@ -1,83 +0,0 @@
One user question → parsed into one chat_turn → which metrics / knowledge / sessions that turn used (chat_turn_retrieval)
→ whether a new metric was created (metric_def)
→ whether a scheduled metric run was triggered (metric_job_run.turn_id)
→ which metric results were finally produced (metric_result.metric_id + stat_time)
Chat domain
schema
Session table: chat_session
Session turn table: chat_turn
Turn-retrieval association table: chat_turn_retrieval
API
1. Create a session
POST /api/v1/chat/sessions
2. Update a session
POST /api/v1/chat/sessions/{session_id}/update
3. Close a session
POST /api/v1/chat/sessions/{session_id}/close
4. Get a session
GET /api/v1/chat/sessions/{session_id}
5. List sessions (by user, time)
GET /api/v1/chat/sessions
6. Create a Q&A turn (user issues a query)
POST /api/v1/chat/sessions/{session_id}/turns
7. List all turns of a session
GET /api/v1/chat/sessions/{session_id}/turns
8. Get the details of a single turn
GET /api/v1/chat/turns/{turn_id}
9. Batch-write a turn's retrieval results
POST /api/v1/chat/turns/{turn_id}/retrievals
10. List a turn's retrieval records
GET /api/v1/chat/turns/{turn_id}/retrievals
11. Update a turn's retrieval records (in future)
POST /api/v1/chat/turns/{turn_id}/retrievals/update
Metadata domain
schema
Metric definition table: metric_def
API
12. Create a metric (from a Q&A turn or a conventional definition)
POST /api/v1/metrics
13. Update a metric
POST /api/v1/metrics/{id}
14. Get metric details
GET /api/v1/metrics/{id}
Execution / scheduling domain (tentatively Airflow)
schema
Metric schedule configuration table: metric_schedule
Schedule run record table: metric_job_run
API
1. Create a schedule configuration
POST /api/v1/metric-schedules
2. Update a schedule configuration
POST /api/v1/metric-schedules/{id}
3. Get the schedule configurations of a metric
GET /api/v1/metrics/{metric_id}/schedules
4. Manually trigger one metric run (e.g. from a chat query)
POST /api/v1/metric-runs/trigger
5. List run records
GET /api/v1/metric-runs
6. Get the details of a single run
GET /api/v1/metric-runs/{run_id}
Data domain
schema
Metric result table (long format): metric_result
API
1. Query metric results (by time range & dimensions)
GET /api/v1/metric-results
2. Point lookup (latest value)
GET /api/v1/metric-results/latest
3. Batch-write metric results
POST /api/v1/metric-results/{metric_id}
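
To make the lineage above concrete, here is a rough Python sketch of one question flowing through the chat and metric domains; the base URL, user id, and metric id are illustrative placeholders and error handling is omitted.

import httpx

BASE_URL = "http://localhost:8000"  # assumed deployment address

with httpx.Client(base_url=BASE_URL, timeout=30) as client:
    # 1. A user question opens a chat session.
    session = client.post("/api/v1/chat/sessions", json={"user_id": 1001}).json()

    # 2. The parsed question becomes one chat_turn in that session.
    turn = client.post(
        f"/api/v1/chat/sessions/{session['id']}/turns",
        json={"user_id": 1001, "user_query": "展示昨天订单GMV", "intent": "METRIC_QUERY"},
    ).json()

    # 3. Retrieval hits (metrics / snippets) used by the turn go into chat_turn_retrieval.
    client.post(
        f"/api/v1/chat/turns/{turn['id']}/retrievals",
        json={"retrievals": [
            {"item_type": "METRIC", "item_id": "metric_gmv", "used_in_sql": True, "rank_no": 1},
        ]},
    )

    # 4. The turn can trigger a metric run, recorded in metric_job_run.
    run = client.post(
        "/api/v1/metric-runs/trigger",
        json={"metric_id": 1234, "triggered_by": "API"},  # 1234 is a placeholder metric id
    ).json()

    print("session", session["id"], "turn", turn["id"], "run", run["id"])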

View File

@ -1,13 +0,0 @@
services:
app:
build: .
ports:
- "8060:8000"
volumes:
- .:/app
environment:
- PYTHONUNBUFFERED=1
# 开发模式:启用 --reload
command: uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
# 生产模式:注释上面 command取消注释下面这行
# command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 4

File diff suppressed because it is too large

View File

@ -1,21 +0,0 @@
CREATE TABLE `ecommerce_orders` (
`order_id` char(36) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'UUID from CSV',
`customer_id` int NOT NULL,
`product_id` int NOT NULL,
`category` varchar(64) COLLATE utf8mb4_unicode_ci NOT NULL,
`price` decimal(10,2) NOT NULL,
`quantity` int NOT NULL,
`order_date` datetime(6) NOT NULL,
`shipping_date` datetime(6) NOT NULL,
`delivery_status` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`payment_method` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`device_type` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`channel` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`shipping_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
`billing_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
`customer_segment` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (`order_id`),
KEY `idx_customer` (`customer_id`),
KEY `idx_product` (`product_id`),
KEY `idx_order_date` (`order_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
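
As a quick illustration of how this table might be consumed, a SQLAlchemy sketch that computes daily GMV as price × quantity; the connection URL is a placeholder.

from sqlalchemy import create_engine, text

# Placeholder DSN; point it at the database that holds ecommerce_orders.
engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")

DAILY_GMV_SQL = text(
    """
    SELECT DATE(order_date) AS order_day,
           SUM(price * quantity) AS gmv,
           COUNT(*)              AS order_cnt
    FROM ecommerce_orders
    GROUP BY DATE(order_date)
    ORDER BY order_day
    """
)

with engine.connect() as conn:
    for row in conn.execute(DAILY_GMV_SQL):
        print(row.order_day, row.gmv, row.order_cnt)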

View File

@ -1,40 +0,0 @@
CREATE TABLE `action_results` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
`table_id` bigint NOT NULL COMMENT '表ID',
`version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
`action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
`status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
`llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
`error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
`error_message` text COLLATE utf8mb4_bin,
`started_at` datetime DEFAULT NULL,
`finished_at` datetime DEFAULT NULL,
`duration_ms` int DEFAULT NULL,
`table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
`table_schema` json NOT NULL,
`ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
`ge_profiling_json_size_bytes` bigint DEFAULT NULL,
`ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要剔除大value_set等',
`ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
`ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
`ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
`ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
`ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
`snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
`snippet_json_size_bytes` bigint DEFAULT NULL,
`snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
`snippet_alias_json_size_bytes` bigint DEFAULT NULL,
`callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
`model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
`model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
PRIMARY KEY (`id`),
UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
KEY `idx_status` (`status`),
KEY `idx_table` (`table_id`,`updated_at`),
KEY `idx_action_time` (`action_type`,`version_ts`),
KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=113 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';
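
Because of the uq_table_ver_action unique key, writes to this table can be idempotent upserts. The sketch below shows one such upsert; the column subset and values are illustrative only and not the service's exact statement.

import json
from datetime import datetime

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")  # placeholder DSN

UPSERT_SQL = text(
    """
    INSERT INTO action_results (
        table_id, version_ts, action_type, status,
        callback_url, table_schema_version_id, table_schema,
        snippet_json, updated_at
    ) VALUES (
        :table_id, :version_ts, :action_type, :status,
        :callback_url, :table_schema_version_id, :table_schema,
        :snippet_json, :updated_at
    )
    ON DUPLICATE KEY UPDATE
        status = VALUES(status),
        snippet_json = VALUES(snippet_json),
        updated_at = VALUES(updated_at)
    """
)

with engine.begin() as conn:
    conn.execute(
        UPSERT_SQL,
        {
            "table_id": 42,                      # illustrative ids
            "version_ts": 20251101200000,
            "action_type": "snippet",
            "status": "success",
            "callback_url": "http://localhost/callback",
            "table_schema_version_id": "1",
            "table_schema": json.dumps({"columns": []}),
            "snippet_json": json.dumps([{"id": "snpt_demo"}], ensure_ascii=False),
            "updated_at": datetime.utcnow(),
        },
    )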

View File

@ -1,103 +0,0 @@
CREATE TABLE IF NOT EXISTS chat_session (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
user_id BIGINT NOT NULL,
session_uuid CHAR(36) NOT NULL, -- 可用于对外展示的IDUUID
end_time DATETIME NULL,
status VARCHAR(16) NOT NULL DEFAULT 'OPEN', -- OPEN/CLOSED/ABANDONED
last_turn_id BIGINT NULL, -- 指向 chat_turn.id
ext_context JSON NULL, -- 业务上下文
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_session_uuid (session_uuid),
KEY idx_user_time (user_id, created_at),
KEY idx_status_time (status, created_at),
KEY idx_last_turn (last_turn_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE IF NOT EXISTS chat_turn (
id BIGINT AUTO_INCREMENT,
session_id BIGINT NOT NULL, -- 关联 chat_session.id
turn_no INT NOT NULL, -- 会话内轮次序号1,2,3...
user_id BIGINT NOT NULL,
user_query TEXT NOT NULL, -- 原始用户问句
intent VARCHAR(64) NULL, -- METRIC_QUERY/METRIC_EXPLAIN 等
ast_json JSON NULL, -- 解析出来的 AST
generated_sql MEDIUMTEXT NULL, -- 生成的最终SQL
sql_status VARCHAR(32) NULL, -- SUCCESS/FAILED/SKIPPED
error_msg TEXT NULL, -- SQL生成/执行错误信息
main_metric_ids JSON NULL, -- 本轮涉及的指标ID列表
created_metric_ids JSON NULL, -- 本轮新建指标ID列表
end_time DATETIME NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_session_turn (session_id, turn_no),
KEY idx_session_time (session_id, created_at),
KEY idx_intent_time (intent, created_at),
KEY idx_user_time (user_id, created_at)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
CREATE TABLE IF NOT EXISTS chat_turn_retrieval (
id BIGINT AUTO_INCREMENT,
turn_id BIGINT NOT NULL, -- 关联 qa_turn.id
item_type VARCHAR(32) NOT NULL, -- METRIC/SNIPPET/CHAT
item_id VARCHAR(128) NOT NULL, -- metric_id/snippet_id/table_name 等
item_extra JSON NULL, -- 附加信息,如字段名等
similarity_score DECIMAL(10,6) NULL, -- 相似度
rank_no INT NULL, -- 检索排名
used_in_reasoning TINYINT(1) NOT NULL DEFAULT 0, -- 是否参与推理
used_in_sql TINYINT(1) NOT NULL DEFAULT 0, -- 是否影响最终SQL
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_turn (turn_id),
KEY idx_turn_type (turn_id, item_type),
KEY idx_item (item_type, item_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
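
Since both partitioned tables end with a p_future catch-all, new monthly partitions have to be split out of it periodically. A possible maintenance snippet follows; the month shown is only an example and the DSN is a placeholder.

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")  # placeholder DSN

# Split p_future so that July 2026 rows land in their own partition; repeat per table/month.
REORGANIZE_SQL = text(
    """
    ALTER TABLE chat_turn
    REORGANIZE PARTITION p_future INTO (
        PARTITION p202607 VALUES LESS THAN ('2026-08-01'),
        PARTITION p_future VALUES LESS THAN (MAXVALUE)
    )
    """
)

with engine.begin() as conn:
    conn.execute(REORGANIZE_SQL)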

View File

@ -1,155 +0,0 @@
CREATE TABLE metric_def (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
metric_code VARCHAR(64) NOT NULL, -- 内部编码order_cnt_delivery
metric_name VARCHAR(128) NOT NULL, -- 中文名:外送订单数
metric_aliases JSON NULL, -- 别名列表
biz_domain VARCHAR(64) NOT NULL, -- 通过table tag获取支持人工配置
biz_desc TEXT NULL, -- 业务口径描述
chat_turn_id BIGINT NULL, -- 来自哪轮会话
tech_desc TEXT NULL, -- 技术口径描述
formula_expr TEXT NULL, -- 公式描述:"sum(pay_amount)"
base_sql MEDIUMTEXT NOT NULL, -- 标准计算SQL逻辑SQL/snippet
time_grain VARCHAR(32) NOT NULL, -- DAY/HOUR/WEEK/MONTH
dim_binding JSON NOT NULL, -- 维度绑定,如 ["dt","store_id","channel"]
update_strategy VARCHAR(32) NOT NULL, -- FULL/INCR/REALTIME
schedule_id BIGINT NULL, -- 调度ID
schedule_type INT NULL, -- 调度类型默认调度cron
version INT NOT NULL DEFAULT 1,
is_active TINYINT(1) NOT NULL DEFAULT 1,
sql_hash VARCHAR(64) NULL, -- base_sql hash 用于版本比较
created_by BIGINT NULL,
updated_by BIGINT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_metric_code (metric_code),
KEY idx_domain_active (biz_domain, is_active),
KEY idx_update_strategy (update_strategy),
KEY idx_name (metric_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE metric_schedule (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
metric_id BIGINT NOT NULL, -- 关联 metric_def.id
cron_expr VARCHAR(64) NOT NULL, -- 调度表达式
enabled TINYINT(1) NOT NULL DEFAULT 1, -- 是否启用
priority INT NOT NULL DEFAULT 10, -- 优先级
backfill_allowed TINYINT(1) NOT NULL DEFAULT 1, -- 是否允许补数
max_runtime_sec INT NULL, -- 最大运行时长(秒)
retry_times INT NOT NULL DEFAULT 0, -- 失败重试次数
owner_team VARCHAR(64) NULL,
owner_user_id BIGINT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY idx_metric_enabled (metric_id, enabled),
KEY idx_owner (owner_team, owner_user_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE metric_job_run (
id BIGINT AUTO_INCREMENT,
metric_id BIGINT NOT NULL, -- metric_def.id
schedule_id BIGINT NULL, -- metric_schedule.id手动触发则可为空
source_turn_id BIGINT NULL, -- 若本次运行由某次问答触发,关联 qa_turn.id
data_time_from DATETIME NULL, -- 指标统计时间窗口起
data_time_to DATETIME NULL, -- 指标统计时间窗口止
metric_version INT NOT NULL, -- 执行时使用的指标版本
base_sql_snapshot MEDIUMTEXT NOT NULL, -- 本次执行使用的SQL快照
status VARCHAR(32) NOT NULL, -- RUNNING/SUCCESS/FAILED/SKIPPED
error_msg TEXT NULL,
affected_rows BIGINT NULL, -- 写入行数
runtime_ms BIGINT NULL, -- 执行耗时
triggered_by VARCHAR(32) NOT NULL, -- SCHEDULER/MANUAL/API/QA_TURN
triggered_at DATETIME NOT NULL,
started_at DATETIME NULL,
finished_at DATETIME NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_metric_time (metric_id, data_time_from, data_time_to),
KEY idx_status_time (status, triggered_at),
KEY idx_schedule (schedule_id),
KEY idx_source_turn (source_turn_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
CREATE TABLE metric_result (
id BIGINT AUTO_INCREMENT,
metric_id BIGINT NOT NULL, -- metric_def.id
metric_version INT NOT NULL, -- metric_def.version
stat_time DATETIME NOT NULL, -- 按 time_grain 对齐后的时间
extra_dims JSON NULL, -- 其他维度JSON 存
metric_value DECIMAL(32,8) NOT NULL, -- 指标结果值
load_time DATETIME NOT NULL, -- 入库时间
data_version BIGINT NULL, -- 版本或 job_run id
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_metric_time (metric_id, stat_time),
KEY idx_load_time (load_time)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
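
For the "latest value" lookup this result table is meant to serve, a small query sketch; the metric id and DSN are placeholders.

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")  # placeholder DSN

LATEST_SQL = text(
    """
    SELECT metric_id, metric_version, stat_time, metric_value
    FROM metric_result
    WHERE metric_id = :metric_id
    ORDER BY stat_time DESC
    LIMIT 1
    """
)

with engine.connect() as conn:
    latest = conn.execute(LATEST_SQL, {"metric_id": 1234}).first()  # 1234 is illustrative
    print(latest)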

View File

@ -1,24 +0,0 @@
CREATE TABLE `rag_snippet` (
`rag_item_id` bigint NOT NULL COMMENT 'RAG item id (stable hash of table/version/snippet_id)',
`workspace_id` bigint NOT NULL COMMENT 'RAG workspace scope',
`table_id` bigint NOT NULL COMMENT '来源表ID',
`version_ts` bigint NOT NULL COMMENT '表版本号',
`action_result_id` bigint NOT NULL COMMENT '来源 action_results 主键IDsnippet_alias 或 snippet 行)',
`snippet_id` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '原始 snippet id',
`rag_text` text COLLATE utf8mb4_bin NOT NULL COMMENT '用于向量化的拼接文本',
`merged_json` json NOT NULL COMMENT '合并后的 snippet 对象',
`created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '写入时间,用于分区',
`updated_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`rag_item_id`,`created_at`),
KEY `idx_action_result` (`action_result_id`),
KEY `idx_workspace` (`workspace_id`),
KEY `idx_table_version` (`table_id`,`version_ts`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
PARTITION BY RANGE COLUMNS (`created_at`) (
PARTITION p202401 VALUES LESS THAN ('2024-02-01'),
PARTITION p202402 VALUES LESS THAN ('2024-03-01'),
PARTITION p202403 VALUES LESS THAN ('2024-04-01'),
PARTITION p202404 VALUES LESS THAN ('2024-05-01'),
PARTITION p202405 VALUES LESS THAN ('2024-06-01'),
PARTITION p_future VALUES LESS THAN (MAXVALUE)
) COMMENT='RAG snippet 索引缓存';
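
The rag_item_id column is described as a stable hash of table/version/snippet_id; the exact scheme is not shown here, so the following is only an assumed construction that fits a signed BIGINT.

import hashlib


def rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
    """Derive a deterministic 63-bit id from (table_id, version_ts, snippet_id).

    Assumption: any stable digest truncated to 63 bits would do; this is not
    necessarily the hashing scheme the service itself uses.
    """
    key = f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")
    digest = hashlib.blake2b(key, digest_size=8).digest()
    return int.from_bytes(digest, "big") & 0x7FFF_FFFF_FFFF_FFFF


print(rag_item_id(321, 20240102000000, "snpt_topn"))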

View File

@ -1,40 +0,0 @@
CREATE TABLE `action_results` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
`table_id` bigint NOT NULL COMMENT '表ID',
`version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
`action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
`status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
`model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
`model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
`model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
`llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
`error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
`error_message` text COLLATE utf8mb4_bin,
`started_at` datetime DEFAULT NULL,
`finished_at` datetime DEFAULT NULL,
`duration_ms` int DEFAULT NULL,
`table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
`table_schema` json NOT NULL,
`ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
`ge_profiling_json_size_bytes` bigint DEFAULT NULL,
`ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要剔除大value_set等',
`ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
`ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
`ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
`ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
`ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
`snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
`snippet_json_size_bytes` bigint DEFAULT NULL,
`snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
`snippet_alias_json_size_bytes` bigint DEFAULT NULL,
`callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
KEY `idx_status` (`status`),
KEY `idx_table` (`table_id`,`updated_at`),
KEY `idx_action_time` (`action_type`,`version_ts`),
KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=53 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';

View File

@ -121,7 +121,7 @@ def clean_value(value: Any) -> Any:
    if isinstance(value, (np.generic,)):
        return value.item()
    if isinstance(value, pd.Timestamp):
-       return str(value)
+       return value.isoformat()
    if pd.isna(value):
        return None
    return value
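
The practical effect of this change is that pandas timestamps are serialized as ISO-8601 strings (with a 'T' separator) instead of pandas' default string form, for example:

import numpy as np
import pandas as pd

ts = pd.Timestamp("2024-05-01 12:30:00")

print(str(ts))         # '2024-05-01 12:30:00'  -> old behaviour
print(ts.isoformat())  # '2024-05-01T12:30:00'  -> new behaviour

# The numpy-scalar branch is unchanged: values are still unwrapped to plain Python types.
print(np.int64(7).item(), type(np.int64(7).item()))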

View File

@ -1,30 +0,0 @@
version: 1
formatters:
standard:
format: "%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s"
handlers:
console:
class: logging.StreamHandler
level: INFO
formatter: standard
stream: ext://sys.stdout
file:
class: logging.handlers.RotatingFileHandler
level: INFO
formatter: standard
filename: logs/app.log
maxBytes: 10485760 # 10 MB
backupCount: 5
encoding: utf-8
loggers:
app:
level: INFO
handlers:
- console
- file
propagate: no
root:
level: INFO
handlers:
- console
- file
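
This config would typically be applied with logging.config.dictConfig; a minimal loader is sketched below, assuming the file is saved as logging.yaml at the project root (the RotatingFileHandler needs logs/ to exist first).

import logging
import logging.config
from pathlib import Path

import yaml

Path("logs").mkdir(exist_ok=True)  # logs/app.log lives here

with open("logging.yaml", "r", encoding="utf-8") as fh:
    logging.config.dictConfig(yaml.safe_load(fh))

logging.getLogger("app").info("logging configured from logging.yaml")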

View File

@ -1,6 +0,0 @@
def main():
print("Hello from data-ge-new!")
if __name__ == "__main__":
main()

View File

@ -1,23 +0,0 @@
Project structure and logic
app/main.py: creates the FastAPI application and its lifespan, initializes the shared httpx.AsyncClient and LLMGateway, and, behind unified exception handling, exposes four endpoints: chat proxy, import analysis, table profiling pipeline, and table snippet persistence.
app/models.py: defines all request/response models and enums (LLM requests, import analysis jobs, table profiling jobs, snippet persistence, etc.), with field validation and defaults.
app/services: core business logic
gateway.py forwards /v1/chat/completions requests to NEW_API_BASE_URL (with an optional Bearer token) and normalizes the response.
import_analysis.py assembles the import prompt (prompt/data_import_analysis.md), parses/truncates the sample, calls the unified chat endpoint, extracts the JSON result and token usage, and finally calls back the business side.
table_profiling.py runs a four-step pipeline sequentially: Great Expectations profiling → LLM result description (prompt/ge_result_desc_prompt.md) → snippet generation (prompt/snippet_generator.md) → snippet aliases (prompt/snippet_alias_generator.md); every step reports its status and result via callback.
table_snippet.py upserts each step's results into the database table, automatically serializing JSON/size information and building INSERT ... ON DUPLICATE KEY UPDATE statements.
app/providers/*: direct clients for each vendor (OpenAI/Anthropic/OpenRouter/Gemini/Qwen/DeepSeek) implementing the unified chat interface; the current main flow forwards through new-api, but direct access is kept available.
prompt/ holds the prompt templates; scripts/ and test/ provide API call examples and regression samples; table_snippet.sql gives the action_results table structure (used to persist snippet and profiling results).
Features / requirements
LLM gateway: POST /v1/chat/completions accepts an LLMRequest (provider + model + messages, etc.) and passes the payload through to NEW_API_BASE_URL/v1/chat/completions (with optional NEW_API_AUTH_TOKEN authentication); on failure it returns 4xx/5xx and logs the raw response.
Import analysis (asynchronous): POST /v1/import/analyze accepts an import sample (rows/headers/raw_csv/table_schema), a target model llm_model (defaults to DEFAULT_IMPORT_MODEL, optionally restricted by the IMPORT_SUPPORTED_MODELS whitelist), a temperature, and a callback address. The service converts the sample to CSV, attaches the schema, assembles system + user messages, calls the unified chat endpoint, parses the JSON in the first choice as the analysis result, and returns it together with the LLM usage via callback; on failure it calls back with status=failed and the error message.
Table profiling pipeline (asynchronous): POST /v1/table/profiling accepts the table identifier, version, callback address, and GE/LLM configuration (datasource/batch_request, connection string template, LLM model and timeout). The pipeline executes in order:
Great Expectations profiling (profiler type, datasource, runtime SQL query/table can be specified), producing the full and summary JSON plus the Data Docs path;
calling the chat endpoint to generate the GE result description JSON;
generating an array of SQL snippets from that description;
generating snippet aliases/keywords.
Every step calls back on success or failure; the payload includes action_type, the result JSON, the model, llm_usage, error information, and so on.
Snippet persistence: POST /v1/table/snippet accepts a TableSnippetUpsertRequest (table/version, action type, status, schema, model info, per-stage JSON and sizes, error code, timestamps, etc.), assembles it into the action_results table as an UPSERT, and returns whether an existing record was updated.
Configuration and runtime requirements: the core environment variables live in app/settings.py (API keys, DEFAULT_IMPORT_MODEL, IMPORT_GATEWAY_BASE_URL/NEW_API_BASE_URL, model whitelist, database URL, etc.); logging uses logging.yaml and creates logs/ automatically; HTTP client timeout/proxy are controlled via HTTP_CLIENT_TIMEOUT, HTTP_CLIENT_TRUST_ENV, HTTP_CLIENT_PROXY. For debugging, run uvicorn app.main:app --reload; Docker support is provided by Dockerfile/docker-compose.yml.
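
For example, a rough client-side sketch of the asynchronous import analysis call; the field names follow the description above, but the exact request schema lives in app/models.py, and every value here is a placeholder.

import httpx

BASE_URL = "http://localhost:8000"  # assumed local deployment

payload = {
    "headers": ["order_id", "order_dt", "gmv"],
    "rows": [["a-1", "2024-05-01", "12.30"], ["a-2", "2024-05-02", "45.60"]],
    "table_schema": {"order_id": "char(36)", "order_dt": "date", "gmv": "decimal(18,2)"},
    "llm_model": "deepseek:deepseek-chat",
    "temperature": 0.2,
    "callback_url": "http://localhost:9999/import-callback",
}

resp = httpx.post(f"{BASE_URL}/v1/import/analyze", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())  # the analysis result itself is delivered asynchronously to callback_url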

View File

@ -1,47 +0,0 @@
系统角色System
你是“数据画像抽取器”。输入是一段 Great Expectations 的 profiling/validation 结果 JSON
可能包含列级期望expect_*)、统计、样例值、类型推断等;也可能带表级/批次元数据。
请将其归一化为一个可被程序消费的“表画像”JSON对不确定项给出置信度与理由。
禁止臆造不存在的列、时间范围或数值。
用户消息User
【输入GE结果JSON】
{{GE_RESULT_JSON}}
【输出要求只输出JSON不要解释文字
{
"table": "<库.表 表名>",
"row_count": <int|null>, // 若未知可为 null
"role": "fact|dimension|unknown", // 依据指标/维度占比与唯一性启发式
"grain": ["<列1>", "<列2>", ...], // 事实粒度猜测(如含 dt/店/类目)
"time": { "column": "<name>|null", "granularity": "day|week|month|unknown", "range": ["YYYY-MM-DD","YYYY-MM-DD"]|null, "has_gaps": true|false|null },
"columns": [
{
"name": "<col>",
"dtype": "<ge推断/物理类型>",
"semantic_type": "dimension|metric|time|text|id|unknown",
"null_rate": <0~1|null>,
"distinct_count": <int|null>,
"distinct_ratio": <0~1|null>,
"stats": { "min": <number|string|null>,"max": <number|string|null>,"mean": <number|null>,"std": <number|null>,"skewness": <number|null> },
"enumish": true|false|null, // 低熵/可枚举
"top_values": [{"value":"<v>","pct":<0~1>}, ...],// 取前K个≤10
"pk_candidate_score": <0~1>, // 唯一性+非空综合评分
"metric_candidate_score": <0~1>, // 数值/偏态/业务词命中
"comment": "<列注释或GE描述|可为空>"
}
],
"primary_key_candidates": [["colA","colB"], ...], // 依据 unique/compound unique 期望
"fk_candidates": [{"from":"<col>","to":"<dim_table(col)>","confidence":<0~1>}],
"quality": {
"failed_expectations": [{"name":"<expect_*>","column":"<col|table>","summary":"<一句话>"}],
"warning_hints": ["空值率>0.2的列: ...", "时间列存在缺口: ..."]
},
"confidence_notes": ["<为什么判定role/grain/time列>"]
}
【判定规则(简要)】
- time列类型为日期/时间 OR 命中 dt/date/day 等命名;若有 min/max 可给出 range若间隔缺口≥1天记 has_gaps=true。
- semantic_type数值+右偏/方差大→更偏 metric高唯一/ID命名→id高基数+文本→text低熵+有限取值→dimension。
- rolemetric列占比高且存在time列→倾向 fact几乎全是枚举/ID且少数值→dimension。
- 置信不高时给出 null 或 unknown并写入 confidence_notes。

View File

@ -1,52 +0,0 @@
系统角色System
你是“SQL片段别名生成器”。
输入为一个或多个 SQL 片段对象(来自 snippet.json输出为针对每个片段生成的多样化别名口语 / 中性 / 专业)、关键词与意图标签。
要求逐个处理所有片段对象,输出同样数量的 JSON 元素。
用户消息User
【上下文】
SQL片段对象数组{{SNIPPET_ARRAY}} // snippet.json中的一个或多个片段
【任务要求】
请针对输入数组中的 每个 SQL 片段,输出一个 JSON 对象,结构如下:
{
"id": "<与输入片段id一致>",
"aliases": [
{"text": "…", "tone": "口语|中性|专业"},
{"text": "…", "tone": "专业"}
],
"keywords": [
"GMV","销售额","TopN","category","类目","趋势","同比","客户","订单","质量","异常检测","join","过滤","sample"
],
"intent_tags": ["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
}
生成逻辑规范
1.逐条输出输入数组中每个片段对应一个输出对象id 保持一致)。
2.aliases生成
至少 3 个别名,分别覆盖语气类型:口语 / 中性 / 专业。
≤20字语义需等价不得添加不存在的字段或业务口径。
示例:
GMV趋势分析中性
每天卖多少钱(口语)
按日GMV曲线专业
3.keywords生成
8~15个关键词需涵盖片段核心维度、指标、分析类型和语义近义词。
中英文混合(如 "GMV"/"销售额"、"同比"/"YoY"、"类目"/"category" 等)。
包含用于匹配的分析意图关键词(如 “趋势”、“排行”、“占比”、“质量检查”、“过滤” 等)。
4.intent_tags生成
从以下集合中选取与片段type及用途一致
["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
若为条件片段WHERE句型补充 "filter";若含维度分组逻辑,补充 "by_dimension"。
5.语言与内容要求
保持正式书面风格,不添加解释说明。
只输出JSON数组不包含文字描述或额外文本。

View File

@ -1,46 +0,0 @@
系统角色System
你是“SQL片段生成器”。只能基于给定“表画像”生成可复用的分析片段。
为每个片段产出标题、用途描述、片段类型、变量、适用条件、SQL模板mysql方言并注明业务口径与安全限制。
不要发明画像里没有的列。时间/维度/指标须与画像匹配。
用户消息User
【表画像JSON】
{{TABLE_PROFILE_JSON}}
【输出要求只输出JSON数组
[
{
"id": "snpt_<slug>",
"title": "中文标题≤16字",
"desc": "一句话用途",
"type": "aggregate|trend|topn|ratio|quality|join|sample",
"applicability": {
"required_columns": ["<col>", ...],
"time_column": "<dt|nullable>",
"constraints": {
"dim_cardinality_hint": <int|null>, // 用于TopN限制与性能提示
"fk_join_available": true|false,
"notes": ["高基数维度建议LIMIT<=50", "..."]
}
},
"variables": [
{"name":"start_date","type":"date"},
{"name":"end_date","type":"date"},
{"name":"top_n","type":"int","default":10}
],
"dialect_sql": {
"mysql": ""
},
"business_caliber": "清晰口径说明,如 UV以device_id去重粒度=日-类目",
"examples": ["示例问法1","示例问法2"]
}
]
【片段选择建议】
- 若存在 time 列:生成 trend_by_day / yoy_qoq / moving_avg。
- 若存在 enumish 维度distinct 5~200生成 topn_by_dimension / share_of_total。
- 若 metric 列:生成 sum/avg/max、分位数/异常检测3σ/箱线)。
- 有主键/唯一:生成 去重/明细抽样/质量检查。
- 有 fk_candidates同时生成“join维表命名版”和“纯ID版”。
- 高枚举维度:在 constraints.notes 中强调 LIMIT 建议与可能的性能风险。
- 除了完整的sql片段还有sql里部分内容的sql片段比如 where payment_method = 'Credit Card' and delivery_status = 'Delivered' 的含义是支付方式为信用卡且配送状态是已送达

View File

@ -1,21 +0,0 @@
[project]
name = "data-ge-new"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.111.0",
"uvicorn[standard]>=0.29.0",
"pydantic>=2.6.0",
"sqlalchemy>=2.0.28",
"pymysql>=1.1.0",
"great-expectations[profilers]==0.18.19",
"pandas>=2.0",
"numpy>=1.24",
"openpyxl>=3.1",
"httpx==0.27.2",
"python-dotenv==1.0.1",
"requests>=2.31.0",
"PyYAML>=6.0.1",
]

11
requirements.txt Normal file
View File

@ -0,0 +1,11 @@
fastapi>=0.111.0
uvicorn[standard]>=0.29.0
pydantic>=2.6.0
sqlalchemy>=2.0.28
pymysql>=1.1.0
great_expectations>=0.18.0,<0.19.0
pandas>=2.0
numpy>=1.24
openpyxl>=3.1
httpx==0.27.2
python-dotenv==1.0.1

View File

@ -1,226 +0,0 @@
import argparse
import logging
import os
from typing import Dict, Iterable, List, Optional
import datasets
from datasets import DownloadConfig
from huggingface_hub import snapshot_download
# 批量下载 Hugging Face 上的数据集和模型
# 支持通过命令行参数配置代理和下载参数如超时和重试次数支持批量循环下载存储到file目录下dataset和model子目录
def _parse_id_list(values: Iterable[str]) -> List[str]:
"""将多次传入以及逗号分隔的标识整理为列表."""
ids: List[str] = []
for value in values:
value = value.strip()
if not value:
continue
if "," in value:
ids.extend(v.strip() for v in value.split(",") if v.strip())
else:
ids.append(value)
return ids
def _parse_proxy_args(proxy_args: Iterable[str]) -> Dict[str, str]:
"""解析命令行传入的代理设置,格式 scheme=url."""
proxies: Dict[str, str] = {}
for item in proxy_args:
raw = item.strip()
if not raw:
continue
if "=" not in raw:
logging.warning("代理参数 %s 缺少 '=' 分隔符,将忽略该项", raw)
continue
key, value = raw.split("=", 1)
key = key.strip()
value = value.strip()
if not key or not value:
logging.warning("代理参数 %s 解析失败,将忽略该项", raw)
continue
proxies[key] = value
return proxies
def _sanitize_dir_name(name: str) -> str:
return name.replace("/", "__")
def _ensure_dirs(root_dir: str) -> Dict[str, str]:
paths = {
"dataset": os.path.join(root_dir, "dataset"),
"model": os.path.join(root_dir, "model"),
}
for path in paths.values():
os.makedirs(path, exist_ok=True)
return paths
def _build_download_config(cache_dir: str, retries: Optional[int], proxies: Dict[str, str]) -> DownloadConfig:
config_kwargs = {"cache_dir": cache_dir}
if retries is not None:
config_kwargs["max_retries"] = retries
if proxies:
config_kwargs["proxies"] = proxies
return DownloadConfig(**config_kwargs)
def _apply_timeout(timeout: Optional[float]) -> None:
if timeout is None:
return
str_timeout = str(timeout)
os.environ.setdefault("HF_DATASETS_HTTP_TIMEOUT", str_timeout)
os.environ.setdefault("HF_HUB_HTTP_TIMEOUT", str_timeout)
def _resolve_log_level(level_name: str) -> int:
if isinstance(level_name, int):
return level_name
upper_name = str(level_name).upper()
return getattr(logging, upper_name, logging.INFO)
def _build_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="批量下载 Hugging Face 数据集和模型并存储到指定目录。"
)
parser.add_argument(
"-d",
"--dataset",
action="append",
default=[],
help="要下载的数据集 ID可重复使用或传入逗号分隔列表。",
)
parser.add_argument(
"-m",
"--model",
action="append",
default=[],
help="要下载的模型 ID可重复使用或传入逗号分隔列表。",
)
parser.add_argument(
"-r",
"--root",
default="file",
help="存储根目录,默认 file。",
)
parser.add_argument(
"--retries",
type=int,
default=None,
help="失败后的重试次数,默认不重试。",
)
parser.add_argument(
"--timeout",
type=float,
default=None,
help="HTTP 超时时间(秒),默认跟随库设置。",
)
parser.add_argument(
"-p",
"--proxy",
action="append",
default=[],
help="代理设置,格式 scheme=url可多次传入例如 --proxy http=http://127.0.0.1:7890",
)
parser.add_argument(
"--log-level",
default="INFO",
help="日志级别,默认 INFO。",
)
return parser
def download_datasets(dataset_ids: Iterable[str], root_dir: str, retries: Optional[int], proxies: Dict[str, str]) -> None:
if not dataset_ids:
return
cache_dir = root_dir
download_config = _build_download_config(cache_dir, retries, proxies)
for dataset_id in dataset_ids:
try:
logging.info("开始下载数据集 %s", dataset_id)
# 使用 load_dataset 触发缓存下载
dataset = datasets.load_dataset(
dataset_id,
cache_dir=cache_dir,
download_config=download_config,
download_mode="reuse_cache_if_exists",
)
target_path = os.path.join(root_dir, _sanitize_dir_name(dataset_id))
dataset.save_to_disk(target_path)
logging.info("数据集 %s 下载完成,存储于 %s", dataset_id, target_path)
except Exception as exc: # pylint: disable=broad-except
logging.error("下载数据集 %s 失败: %s", dataset_id, exc)
def download_models(
model_ids: Iterable[str],
target_dir: str,
retries: Optional[int],
proxies: Dict[str, str],
timeout: Optional[float],
) -> None:
if not model_ids:
return
max_attempts = (retries or 0) + 1
hub_kwargs = {
"local_dir": target_dir,
"local_dir_use_symlinks": False,
"max_workers": os.cpu_count() or 4,
}
if proxies:
hub_kwargs["proxies"] = proxies
if timeout is not None:
hub_kwargs["timeout"] = timeout
for model_id in model_ids:
attempt = 0
while attempt < max_attempts:
attempt += 1
try:
logging.info("开始下载模型 %s (尝试 %s/%s)", model_id, attempt, max_attempts)
snapshot_download(
repo_id=model_id,
**hub_kwargs,
)
logging.info("模型 %s 下载完成,存储于 %s", model_id, target_dir)
break
except Exception as exc: # pylint: disable=broad-except
logging.error("下载模型 %s 失败: %s", model_id, exc)
if attempt >= max_attempts:
logging.error("模型 %s 在重试后仍未成功下载", model_id)
def main() -> None:
parser = _build_argument_parser()
args = parser.parse_args()
logging.basicConfig(
level=_resolve_log_level(args.log_level),
format="%(asctime)s - %(levelname)s - %(message)s",
)
dataset_ids = _parse_id_list(args.dataset)
model_ids = _parse_id_list(args.model)
retries = args.retries
timeout = args.timeout
proxies = _parse_proxy_args(args.proxy)
_apply_timeout(timeout)
if not dataset_ids and not model_ids:
logging.warning(
"未配置任何数据集或模型,"
"请通过参数 --dataset / --model 指定 Hugging Face ID"
)
return
dirs = _ensure_dirs(args.root)
download_datasets(dataset_ids, dirs["dataset"], retries, proxies)
download_models(model_ids, dirs["model"], retries, proxies, timeout)
if __name__ == "__main__":
main()
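
Besides the CLI, the helpers can be driven programmatically. A usage sketch, assuming the script above is saved as hf_batch_download.py (the dataset/model ids and proxy address are just examples):

# Assumes the script above is saved as hf_batch_download.py next to this file.
from hf_batch_download import _ensure_dirs, download_datasets, download_models

dirs = _ensure_dirs("file")                      # creates file/dataset and file/model
proxies = {"https": "http://127.0.0.1:7890"}     # optional; drop if no proxy is needed

download_datasets(["squad"], dirs["dataset"], retries=2, proxies=proxies)
download_models(["bert-base-uncased"], dirs["model"], retries=2, proxies=proxies, timeout=60.0)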

View File

@ -1,80 +0,0 @@
from __future__ import annotations
import json
import os
import sys
from datetime import datetime
from typing import Any, Dict
import requests
def build_demo_payload() -> Dict[str, Any]:
now = datetime.utcnow()
started_at = now.replace(microsecond=0).isoformat() + "Z"
finished_at = now.replace(microsecond=0).isoformat() + "Z"
return {
"table_id": 42,
"version_ts": 20251101200000,
"action_type": "snippet",
"status": "success",
"callback_url": "http://localhost:9999/dummy-callback",
"table_schema_version_id": 7,
"table_schema": {
"columns": [
{"name": "order_id", "type": "bigint"},
{"name": "order_dt", "type": "date"},
{"name": "gmv", "type": "decimal(18,2)"},
]
},
"result_json": [
{
"id": "snpt_daily_gmv",
"title": "按日GMV",
"desc": "统计每日GMV总额",
"type": "trend",
"dialect_sql": {
"mysql": "SELECT order_dt, SUM(gmv) AS total_gmv FROM orders GROUP BY order_dt ORDER BY order_dt"
},
}
],
"result_summary_json": {"total_snippets": 1},
"html_report_url": None,
"error_code": None,
"error_message": None,
"started_at": started_at,
"finished_at": finished_at,
"duration_ms": 1234,
"result_checksum": "demo-checksum",
}
def main() -> int:
base_url = os.getenv("TABLE_SNIPPET_DEMO_BASE_URL", "http://localhost:8000")
endpoint = f"{base_url.rstrip('/')}/v1/table/snippet"
payload = build_demo_payload()
print(f"POST {endpoint}")
print(json.dumps(payload, ensure_ascii=False, indent=2))
try:
response = requests.post(endpoint, json=payload, timeout=30)
except requests.RequestException as exc:
print(f"Request failed: {exc}", file=sys.stderr)
return 1
print(f"\nStatus: {response.status_code}")
try:
data = response.json()
print("Response JSON:")
print(json.dumps(data, ensure_ascii=False, indent=2))
except ValueError:
print("Response Text:")
print(response.text)
return 0 if response.ok else 1
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -1,142 +0,0 @@
from __future__ import annotations
import os
import random
from pathlib import Path
from typing import Generator, List
import sys
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
TEST_USER_ID = 872341
SCHEMA_PATH = Path("file/tableschema/chat.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
# Quick connectivity check
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
#_ensure_chat_schema(engine)
app = create_app()
with TestClient(app) as test_client:
yield test_client
# cleanup test artifacts
with engine.begin() as conn:
# remove retrievals and turns tied to test sessions
conn.execute(
text(
"""
DELETE FROM chat_turn_retrieval
WHERE turn_id IN (
SELECT id FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)
)
"""
),
{"uid": TEST_USER_ID},
)
conn.execute(
text("DELETE FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)"),
{"uid": TEST_USER_ID},
)
conn.execute(text("DELETE FROM chat_session WHERE user_id=:uid"), {"uid": TEST_USER_ID})
db.get_engine.cache_clear()
def test_session_lifecycle_mysql(client: TestClient) -> None:
# Create a session
resp = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID})
assert resp.status_code == 200, resp.text
session = resp.json()
session_id = session["id"]
assert session["status"] == "OPEN"
# Get session
assert client.get(f"/api/v1/chat/sessions/{session_id}").status_code == 200
# List sessions (filter by user)
resp = client.get(f"/api/v1/chat/sessions", params={"user_id": TEST_USER_ID})
assert resp.status_code == 200
assert any(item["id"] == session_id for item in resp.json())
# Update status
resp = client.post(f"/api/v1/chat/sessions/{session_id}/update", json={"status": "PAUSED"})
assert resp.status_code == 200
assert resp.json()["status"] == "PAUSED"
# Close session
resp = client.post(f"/api/v1/chat/sessions/{session_id}/close")
assert resp.status_code == 200
assert resp.json()["status"] == "CLOSED"
def test_turns_and_retrievals_mysql(client: TestClient) -> None:
session_id = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID}).json()["id"]
turn_payload = {
"user_id": TEST_USER_ID,
"user_query": "展示昨天订单GMV",
"intent": "METRIC_QUERY",
"ast_json": {"select": ["gmv"], "where": {"dt": "yesterday"}},
"main_metric_ids": [random.randint(1000, 9999)],
"created_metric_ids": [],
}
resp = client.post(f"/api/v1/chat/sessions/{session_id}/turns", json=turn_payload)
assert resp.status_code == 200, resp.text
turn = resp.json()
turn_id = turn["id"]
assert turn["turn_no"] == 1
# Fetch turn
assert client.get(f"/api/v1/chat/turns/{turn_id}").status_code == 200
# List turns under session
resp = client.get(f"/api/v1/chat/sessions/{session_id}/turns")
assert resp.status_code == 200
assert any(t["id"] == turn_id for t in resp.json())
# Insert retrievals
retrievals_payload = {
"retrievals": [
{"item_type": "METRIC", "item_id": "metric_foo", "used_in_sql": True, "rank_no": 1},
{"item_type": "SNIPPET", "item_id": "snpt_bar", "similarity_score": 0.77, "rank_no": 2},
]
}
resp = client.post(f"/api/v1/chat/turns/{turn_id}/retrievals", json=retrievals_payload)
assert resp.status_code == 200
assert resp.json()["inserted"] == 2
# List retrievals
resp = client.get(f"/api/v1/chat/turns/{turn_id}/retrievals")
assert resp.status_code == 200
items = resp.json()
assert len(items) == 2
assert {item["item_type"] for item in items} == {"METRIC", "SNIPPET"}
if __name__ == "__main__":
import pytest as _pytest
raise SystemExit(_pytest.main([__file__]))

View File

@ -1,207 +0,0 @@
from __future__ import annotations
import os
import random
from datetime import datetime, timedelta
from pathlib import Path
from typing import Generator, List
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Ensure project root on path for direct execution
ROOT = Path(__file__).resolve().parents[1]
import sys
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
TEST_USER_ID = 98765
#SCHEMA_PATH = Path("file/tableschema/metrics.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
# def _run_sql_script(engine, sql_text: str) -> None:
# """Execute semicolon-terminated SQL statements sequentially."""
# statements: List[str] = []
# buffer: List[str] = []
# for line in sql_text.splitlines():
# stripped = line.strip()
# if not stripped or stripped.startswith("--"):
# continue
# buffer.append(line)
# if stripped.endswith(";"):
# statements.append("\n".join(buffer).rstrip(";"))
# buffer = []
# if buffer:
# statements.append("\n".join(buffer))
# with engine.begin() as conn:
# for stmt in statements:
# conn.execute(text(stmt))
# def _ensure_metric_schema(engine) -> None:
# if not SCHEMA_PATH.exists():
# pytest.skip("metrics.sql schema file not found.")
# raw_sql = SCHEMA_PATH.read_text(encoding="utf-8")
# raw_sql = raw_sql.replace("CREATE TABLE metric_def", "CREATE TABLE IF NOT EXISTS metric_def")
# raw_sql = raw_sql.replace("CREATE TABLE metric_schedule", "CREATE TABLE IF NOT EXISTS metric_schedule")
# raw_sql = raw_sql.replace("CREATE TABLE metric_job_run", "CREATE TABLE IF NOT EXISTS metric_job_run")
# raw_sql = raw_sql.replace("CREATE TABLE metric_result", "CREATE TABLE IF NOT EXISTS metric_result")
# _run_sql_script(engine, raw_sql)
@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
#_ensure_metric_schema(engine)
app = create_app()
with TestClient(app) as test_client:
yield test_client
# cleanup test artifacts
with engine.begin() as conn:
conn.execute(text("DELETE FROM metric_result WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_job_run WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_schedule WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_def WHERE created_by=:uid"), {"uid": TEST_USER_ID})
db.get_engine.cache_clear()
def test_metric_crud_and_schedule_mysql(client: TestClient) -> None:
code = f"metric_{random.randint(1000,9999)}"
create_payload = {
"metric_code": code,
"metric_name": "订单数",
"biz_domain": "order",
"biz_desc": "订单总数",
"base_sql": "select count(*) as order_cnt from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"metric_aliases": ["订单量"],
"created_by": TEST_USER_ID,
}
resp = client.post("/api/v1/metrics", json=create_payload)
assert resp.status_code == 200, resp.text
metric = resp.json()
metric_id = metric["id"]
assert metric["metric_code"] == code
# Update metric
resp = client.post(f"/api/v1/metrics/{metric_id}", json={"metric_name": "订单数-更新", "is_active": False})
assert resp.status_code == 200
assert resp.json()["is_active"] is False
# Get metric
resp = client.get(f"/api/v1/metrics/{metric_id}")
assert resp.status_code == 200
assert resp.json()["metric_name"] == "订单数-更新"
# Create schedule
resp = client.post(
"/api/v1/metric-schedules",
json={"metric_id": metric_id, "cron_expr": "0 2 * * *", "priority": 5, "enabled": True},
)
assert resp.status_code == 200, resp.text
schedule = resp.json()
schedule_id = schedule["id"]
# Update schedule
resp = client.post(f"/api/v1/metric-schedules/{schedule_id}", json={"enabled": False, "retry_times": 1})
assert resp.status_code == 200
assert resp.json()["enabled"] is False
# List schedules for metric
resp = client.get(f"/api/v1/metrics/{metric_id}/schedules")
assert resp.status_code == 200
assert any(s["id"] == schedule_id for s in resp.json())
def test_metric_runs_and_results_mysql(client: TestClient) -> None:
code = f"gmv_{random.randint(1000,9999)}"
metric_id = client.post(
"/api/v1/metrics",
json={
"metric_code": code,
"metric_name": "GMV",
"biz_domain": "order",
"base_sql": "select sum(pay_amount) as gmv from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"created_by": TEST_USER_ID,
},
).json()["id"]
# Trigger run
resp = client.post(
"/api/v1/metric-runs/trigger",
json={
"metric_id": metric_id,
"triggered_by": "API",
"data_time_from": (datetime.utcnow() - timedelta(days=1)).isoformat(),
"data_time_to": datetime.utcnow().isoformat(),
},
)
assert resp.status_code == 200, resp.text
run = resp.json()
run_id = run["id"]
assert run["status"] == "RUNNING"
# List runs
resp = client.get("/api/v1/metric-runs", params={"metric_id": metric_id})
assert resp.status_code == 200
assert any(r["id"] == run_id for r in resp.json())
# Get run
resp = client.get(f"/api/v1/metric-runs/{run_id}")
assert resp.status_code == 200
# Write results
now = datetime.utcnow()
resp = client.post(
f"/api/v1/metric-results/{metric_id}",
json={
"metric_id": metric_id,
"results": [
{"stat_time": (now - timedelta(days=1)).isoformat(), "metric_value": 123.45, "data_version": run_id},
{"stat_time": now.isoformat(), "metric_value": 234.56, "data_version": run_id},
],
},
)
assert resp.status_code == 200, resp.text
assert resp.json()["inserted"] == 2
# Query results
resp = client.get("/api/v1/metric-results", params={"metric_id": metric_id})
assert resp.status_code == 200
results = resp.json()
assert len(results) >= 2
# Latest result
resp = client.get("/api/v1/metric-results/latest", params={"metric_id": metric_id})
assert resp.status_code == 200
latest = resp.json()
assert float(latest["metric_value"]) in {123.45, 234.56}
if __name__ == "__main__":
import pytest as _pytest
raise SystemExit(_pytest.main([__file__]))

View File

@ -1,91 +0,0 @@
from __future__ import annotations
import json
import httpx
import pytest
from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.services.rag_client import RagAPIClient
@pytest.mark.asyncio
async def test_add_sends_payload_and_headers() -> None:
rag_client = RagAPIClient(base_url="http://rag.test", auth_token="secret-token")
def handler(request: httpx.Request) -> httpx.Response:
assert request.method == "POST"
assert str(request.url) == "http://rag.test/rag/add"
assert request.headers["Authorization"] == "Bearer secret-token"
payload = json.loads(request.content.decode())
assert payload == {
"id": 1,
"workspaceId": 2,
"name": "demo",
"embeddingData": "vector",
"type": "METRIC",
}
return httpx.Response(200, json={"ok": True, "echo": payload})
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.add(
client,
RagItemPayload(id=1, workspaceId=2, name="demo", embeddingData="vector", type="METRIC"),
)
assert result["ok"] is True
assert result["echo"]["name"] == "demo"
@pytest.mark.asyncio
async def test_add_batch_serializes_list() -> None:
rag_client = RagAPIClient(base_url="http://rag.test", auth_token=None)
def handler(request: httpx.Request) -> httpx.Response:
payload = json.loads(request.content.decode())
assert request.url.path == "/rag/addBatch"
assert isinstance(payload, list) and len(payload) == 2
return httpx.Response(200, json={"received": len(payload)})
items = [
RagItemPayload(id=1, workspaceId=2, name="a", embeddingData="vec-a", type="METRIC"),
RagItemPayload(id=2, workspaceId=2, name="b", embeddingData="vec-b", type="METRIC"),
]
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.add_batch(client, items)
assert result == {"received": 2}
@pytest.mark.asyncio
async def test_http_error_raises_provider_error() -> None:
rag_client = RagAPIClient(base_url="http://rag.test")
def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(500, text="boom")
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
with pytest.raises(ProviderAPICallError) as excinfo:
await rag_client.delete(client, RagDeleteRequest(id=1, type="METRIC"))
err = excinfo.value
assert err.status_code == 500
assert "boom" in (err.response_text or "")
@pytest.mark.asyncio
async def test_non_json_response_returns_raw_text() -> None:
rag_client = RagAPIClient(base_url="http://rag.test")
def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(200, text="plain-text-body")
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.retrieve(
client, RagRetrieveRequest(query="foo", num=1, workspaceId=1, type="METRIC")
)
assert result == {"raw": "plain-text-body"}

View File

@ -1,157 +0,0 @@
from __future__ import annotations
import json
from datetime import datetime
import httpx
import pytest
from sqlalchemy import create_engine, text
from app.services.table_snippet import ingest_snippet_rag_from_db
def _setup_sqlite_engine():
engine = create_engine("sqlite://")
with engine.begin() as conn:
conn.execute(
text(
"""
CREATE TABLE action_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
table_id INTEGER,
version_ts INTEGER,
action_type TEXT,
status TEXT,
snippet_json TEXT,
snippet_alias_json TEXT,
updated_at TEXT
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE rag_snippet (
rag_item_id INTEGER PRIMARY KEY,
action_result_id INTEGER NOT NULL,
workspace_id INTEGER,
table_id INTEGER,
version_ts INTEGER,
created_at TEXT,
snippet_id TEXT,
rag_text TEXT,
merged_json TEXT,
updated_at TEXT
)
"""
)
)
return engine
def _insert_action_row(engine, payload: dict) -> None:
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO action_results (table_id, version_ts, action_type, status, snippet_json, snippet_alias_json, updated_at)
VALUES (:table_id, :version_ts, :action_type, :status, :snippet_json, :snippet_alias_json, :updated_at)
"""
),
{
"table_id": payload["table_id"],
"version_ts": payload["version_ts"],
"action_type": payload["action_type"],
"status": payload.get("status", "success"),
"snippet_json": json.dumps(payload.get("snippet_json"), ensure_ascii=False)
if payload.get("snippet_json") is not None
else None,
"snippet_alias_json": json.dumps(payload.get("snippet_alias_json"), ensure_ascii=False)
if payload.get("snippet_alias_json") is not None
else None,
"updated_at": payload.get("updated_at") or datetime.utcnow().isoformat(),
},
)
class _StubRagClient:
def __init__(self) -> None:
self.received = None
async def add_batch(self, _client, items):
self.received = items
return {"count": len(items)}
@pytest.mark.asyncio
async def test_ingest_snippet_rag_from_db_persists_and_calls_rag_client() -> None:
engine = _setup_sqlite_engine()
table_id = 321
version_ts = 20240102000000
snippet_payload = [
{
"id": "snpt_topn",
"title": "TopN",
"aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
"keywords": ["TopN", "站点"],
}
]
alias_payload = [
{
"id": "snpt_topn",
"aliases": [
{"text": "站点水表排行前N", "tone": "中性"},
{"text": "按站点水表TopN", "tone": "专业"},
],
"keywords": ["TopN", "排行"],
"intent_tags": ["topn", "aggregate"],
},
{
"id": "snpt_extra",
"aliases": [{"text": "额外别名"}],
"keywords": ["extra"],
},
]
_insert_action_row(
engine,
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": "snippet_alias",
"snippet_json": snippet_payload,
"snippet_alias_json": alias_payload,
"updated_at": "2024-01-02T00:00:00",
},
)
rag_stub = _StubRagClient()
async with httpx.AsyncClient() as client:
rag_ids = await ingest_snippet_rag_from_db(
table_id=table_id,
version_ts=version_ts,
workspace_id=99,
rag_item_type="SNIPPET",
client=client,
engine=engine,
rag_client=rag_stub,
)
assert rag_stub.received is not None
assert len(rag_stub.received) == 2 # includes alias-only row
assert len(rag_ids) == 2
with engine.connect() as conn:
rows = list(
conn.execute(
text("SELECT snippet_id, action_result_id, rag_text, merged_json FROM rag_snippet ORDER BY snippet_id")
)
)
assert {row[0] for row in rows} == {"snpt_extra", "snpt_topn"}
assert all(row[1] is not None for row in rows)
topn_row = next(row for row in rows if row[0] == "snpt_topn")
assert "TopN" in topn_row[2]
assert "按站点水表TopN" in topn_row[2]
assert "排行" in topn_row[2]

View File

@ -1,74 +0,0 @@
from __future__ import annotations
from app.services.table_profiling import _parse_completion_payload
from app.utils.llm_usage import extract_usage
def test_parse_completion_payload_handles_array_with_trailing_text() -> None:
response_payload = {
"choices": [
{
"message": {
"content": """
结果如下:
[
{"id": "snpt_a"},
{"id": "snpt_b"}
]
附加说明:模型可能会输出额外文本。
""".strip()
}
}
]
}
parsed = _parse_completion_payload(response_payload)
assert isinstance(parsed, list)
assert [item["id"] for item in parsed] == ["snpt_a", "snpt_b"]
def test_extract_usage_info_normalizes_numeric_fields() -> None:
response_payload = {
"raw": {
"usage": {
"prompt_tokens": 12.7,
"completion_tokens": 3,
"total_tokens": 15.7,
"prompt_tokens_details": {"cached_tokens": 8.9, "other": None},
"non_numeric": "ignored",
}
}
}
usage = extract_usage(response_payload)
assert usage == {
"prompt_tokens": 12,
"completion_tokens": 3,
"total_tokens": 15,
"prompt_tokens_details": {"cached_tokens": 8},
}
def test_extract_usage_handles_alias_keys() -> None:
response_payload = {
"raw": {
"usageMetadata": {
"input_tokens": 20,
"output_tokens": 4,
}
}
}
usage = extract_usage(response_payload)
assert usage == {
"prompt_tokens": 20,
"completion_tokens": 4,
"total_tokens": 24,
}
def test_extract_usage_returns_none_when_missing() -> None:
assert extract_usage({"raw": {}}) is None

View File

@ -1,213 +0,0 @@
from __future__ import annotations
import json
import os
import random
from datetime import datetime, timedelta
from typing import List
from pathlib import Path
import sys
import pytest
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
from app.services.table_snippet import merge_snippet_records_from_db
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
@pytest.fixture()
def mysql_engine() -> Engine:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
exists = conn.execute(text("SHOW TABLES LIKE 'action_results'")).scalar()
if not exists:
pytest.skip("action_results table not found in test database.")
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
return engine
def _insert_action_row(
engine: Engine,
*,
table_id: int,
version_ts: int,
action_type: str,
status: str = "success",
snippet_json: List[dict] | None = None,
snippet_alias_json: List[dict] | None = None,
updated_at: datetime | None = None,
) -> None:
snippet_json_str = json.dumps(snippet_json, ensure_ascii=False) if snippet_json is not None else None
snippet_alias_json_str = (
json.dumps(snippet_alias_json, ensure_ascii=False) if snippet_alias_json is not None else None
)
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO action_results (
table_id, version_ts, action_type, status,
callback_url, table_schema_version_id, table_schema,
snippet_json, snippet_alias_json, updated_at
) VALUES (
:table_id, :version_ts, :action_type, :status,
:callback_url, :table_schema_version_id, :table_schema,
:snippet_json, :snippet_alias_json, :updated_at
)
ON DUPLICATE KEY UPDATE
status=VALUES(status),
snippet_json=VALUES(snippet_json),
snippet_alias_json=VALUES(snippet_alias_json),
updated_at=VALUES(updated_at)
"""
),
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": action_type,
"status": status,
"callback_url": "http://localhost/test-callback",
"table_schema_version_id": "1",
"table_schema": json.dumps({}, ensure_ascii=False),
"snippet_json": snippet_json_str,
"snippet_alias_json": snippet_alias_json_str,
"updated_at": updated_at or datetime.utcnow(),
},
)
def _cleanup(engine: Engine, table_id: int, version_ts: int) -> None:
with engine.begin() as conn:
conn.execute(
text("DELETE FROM action_results WHERE table_id=:table_id AND version_ts=:version_ts"),
{"table_id": table_id, "version_ts": version_ts},
)
def test_merge_prefers_alias_row_and_appends_alias_only_entries(mysql_engine: Engine) -> None:
table_id = 990000000 + random.randint(1, 9999)
version_ts = int(datetime.utcnow().strftime("%Y%m%d%H%M%S"))
alias_updated = datetime(2024, 1, 2, 0, 0, 0)
snippet_payload = [
{
"id": "snpt_topn",
"aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
"keywords": ["TopN", "站点"],
}
]
alias_payload = [
{
"id": "snpt_topn",
"aliases": [
{"text": "站点水表排行前N", "tone": "中性"},
{"text": "按站点水表TopN", "tone": "专业"},
],
"keywords": ["TopN", "排行"],
"intent_tags": ["topn", "aggregate"],
},
{
"id": "snpt_extra",
"aliases": [{"text": "额外别名"}],
"keywords": ["extra"],
},
]
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet_alias",
snippet_json=snippet_payload,
snippet_alias_json=alias_payload,
updated_at=alias_updated,
)
try:
merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
assert len(merged) == 2
topn = next(item for item in merged if item["id"] == "snpt_topn")
assert topn["source"] == "snippet"
assert topn["updated_at_from_action"] == alias_updated
assert {a["text"] for a in topn["aliases"]} == {"站点水表排行前N", "按站点水表TopN"}
assert set(topn["keywords"]) == {"TopN", "站点", "排行"}
assert set(topn["intent_tags"]) == {"topn", "aggregate"}
alias_only = next(item for item in merged if item["source"] == "alias_only")
assert alias_only["id"] == "snpt_extra"
assert alias_only["aliases"][0]["text"] == "额外别名"
finally:
_cleanup(mysql_engine, table_id, version_ts)
def test_merge_falls_back_to_snippet_row_when_alias_row_missing_snippet_json(mysql_engine: Engine) -> None:
table_id = 991000000 + random.randint(1, 9999)
version_ts = int((datetime.utcnow() + timedelta(seconds=1)).strftime("%Y%m%d%H%M%S"))
alias_updated = datetime(2024, 1, 3, 0, 0, 0)
alias_payload = [
{
"id": "snpt_quality",
"aliases": [{"text": "质量检查"}],
"keywords": ["quality"],
}
]
snippet_payload = [
{
"id": "snpt_quality",
"title": "质量检查",
"keywords": ["data-quality"],
"aliases": [{"text": "质量检查"}],
}
]
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet_alias",
snippet_json=None,
snippet_alias_json=alias_payload,
updated_at=alias_updated,
)
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet",
snippet_json=snippet_payload,
snippet_alias_json=None,
updated_at=datetime(2024, 1, 2, 0, 0, 0),
)
try:
merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
assert len(merged) == 1
record = merged[0]
assert record["id"] == "snpt_quality"
assert record["source"] == "snippet"
assert record["updated_at_from_action"] == alias_updated
assert set(record["keywords"]) == {"data-quality", "quality"}
assert {a["text"] for a in record["aliases"]} == {"质量检查"}
finally:
_cleanup(mysql_engine, table_id, version_ts)

13
uv.lock generated
View File

@ -1,13 +0,0 @@
version = 1
revision = 1
requires-python = ">=3.11"
resolution-markers = [
"python_full_version >= '3.14'",
"python_full_version >= '3.12' and python_full_version < '3.14'",
"python_full_version < '3.12'",
]
[[package]]
name = "data-ge-new"
version = "0.1.0"
source = { virtual = "." }