Compare commits

1 Commit
main ... dev

Author SHA1 Message Date
82fe7b93b5 Guide to installing and starting the data analysis and governance service 2025-10-30 23:01:19 +08:00
66 changed files with 140 additions and 18372 deletions

.env

@@ -16,11 +16,8 @@ DEFAULT_IMPORT_MODEL=deepseek:deepseek-chat
 # Service configuration
 IMPORT_GATEWAY_BASE_URL=http://localhost:8000

-# prod nbackend base url
-NBACKEND_BASE_URL=https://chatbi.agentcarrier.cn/chatbi/api
-
 # HTTP client configuration
-HTTP_CLIENT_TIMEOUT=120
+HTTP_CLIENT_TIMEOUT=30
 HTTP_CLIENT_TRUST_ENV=false
 # HTTP_CLIENT_PROXY=
@@ -30,5 +27,3 @@ IMPORT_CHAT_TIMEOUT_SECONDS=120
 # Logging
 LOG_LEVEL=INFO
 # LOG_FORMAT=%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s
-NEW_API_BASE_URL=http://localhost:3000
-NEW_API_AUTH_TOKEN="sk-Q79KGFJRs5Vk9HsfFqoiJk948uLMDhAVe037AeCb31URyWGL"
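As a side note, a minimal sketch of how these variables are typically read, assuming a simple app/settings.py module (the real settings module is referenced elsewhere in this diff but not shown; names mirror NEW_API_BASE_URL, NEW_API_AUTH_TOKEN, and HTTP_CLIENT_TIMEOUT above):

# Hypothetical sketch of app/settings.py; the actual module is not part of this diff.
import os

# Base URL and optional token for the new-api component (removed on the dev side).
NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://localhost:3000")
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN", "")

# HTTP client tuning; main keeps 120 seconds, dev falls back to 30.
HTTP_CLIENT_TIMEOUT = float(os.getenv("HTTP_CLIENT_TIMEOUT", "30"))
HTTP_CLIENT_TRUST_ENV = os.getenv("HTTP_CLIENT_TRUST_ENV", "false").lower() == "true"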

.gitignore vendored

@@ -4,5 +4,3 @@ gx/uncommitted/
 **/__pycache__/
 *.pyc
 .DS_Store
-gx/
-logs/


@@ -1,17 +0,0 @@
FROM python:3.11-slim
# Make pip use a China-based mirror globally
ENV PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple/
ENV PIP_TRUSTED_HOST=pypi.tuna.tsinghua.edu.cn
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -2,7 +2,7 @@
 This project exposes a FastAPI-based microservice that provides:

-- A unified chat completions gateway that now forwards requests to the internal `new-api` service (default `http://localhost:3000`) while preserving the same client-facing schema.
+- A unified chat completions gateway supporting multiple LLM providers (OpenAI, Anthropic, OpenRouter, Gemini, Qwen, DeepSeek, etc.)
 - An asynchronous data import analysis pipeline that orchestrates LLM calls to produce structured metadata and processing recommendations

 The following instructions cover environment setup, dependency installation, and running the backend service.
@@ -56,7 +56,6 @@ Copy `.env.example` to `.env` (if provided) or edit `.env` to supply API keys an
 - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`, etc.
 - `HTTP_CLIENT_TIMEOUT`, `IMPORT_CHAT_TIMEOUT_SECONDS`
 - `LOG_LEVEL`, `LOG_FORMAT` for logging
-- `NEW_API_BASE_URL` (defaults to `http://localhost:3000`) and optional `NEW_API_AUTH_TOKEN` if the new-api component enforces authentication.

 ## Run the Backend Service
@@ -78,9 +77,6 @@ nohup uvicorn app.main:app --host 0.0.0.0 --port 8000 > server.log 2>&1 &
 Or use a process manager such as `pm2`, `supervisor`, or systemd for production deployments.

-## API List
-
-1. Import analysis schema endpoint: http://localhost:8000/v1/import/analyze

 ## Additional Commands
 - Run the data import analysis example: `python test/data_import_analysis_example.py`
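As a usage illustration for the gateway described in the README above, a minimal sketch, assuming the service runs on localhost:8000 and that the request body follows the provider/model/messages shape used by LLMRequest elsewhere in this diff (the exact schema is not shown here):

# Hedged example: call the chat completions gateway exposed by app.main.
# Payload field names mirror LLMRequest/LLMMessage as referenced in this diff; adjust if the real schema differs.
import httpx

payload = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "messages": [{"role": "user", "content": "Summarise this table schema."}],
}
response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
response.raise_for_status()
print(response.json())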


@@ -1,26 +0,0 @@
from __future__ import annotations
import os
from functools import lru_cache
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
@lru_cache(maxsize=1)
def get_engine() -> Engine:
"""Return a cached SQLAlchemy engine configured from DATABASE_URL."""
database_url = os.getenv(
"DATABASE_URL",
"mysql+pymysql://root:12345678@localhost:3306/data-ge?charset=utf8mb4",
)
connect_args = {}
if database_url.startswith("sqlite"):
connect_args["check_same_thread"] = False
return create_engine(
database_url,
pool_pre_ping=True,
future=True,
connect_args=connect_args,
)
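A short usage sketch for the get_engine() helper shown above; the import path follows the `from app.db import get_engine` usage elsewhere in this diff, and the SQLite URL plus SELECT 1 are purely illustrative:

# Illustrative only: exercise the cached engine with a trivial query.
import os
from sqlalchemy import text
from app.db import get_engine

# Assumption: any SQLAlchemy URL works; set it before the first (cached) get_engine() call.
os.environ.setdefault("DATABASE_URL", "sqlite:///./local.db")

engine = get_engine()
with engine.begin() as conn:
    print(conn.execute(text("SELECT 1")).scalar())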


@@ -2,65 +2,43 @@ from __future__ import annotations
 import asyncio
 import logging
-import logging.config
 import os
 from contextlib import asynccontextmanager
 from typing import Any

-import yaml
 import httpx
 from fastapi import Depends, FastAPI, HTTPException, Request
-from fastapi.exceptions import RequestValidationError
-from fastapi.responses import JSONResponse

 from app.exceptions import ProviderAPICallError, ProviderConfigurationError
 from app.models import (
-    ActionStatus,
-    ActionType,
     DataImportAnalysisJobAck,
     DataImportAnalysisJobRequest,
     LLMRequest,
     LLMResponse,
-    TableProfilingJobAck,
-    TableProfilingJobRequest,
-    TableSnippetRagIngestRequest,
-    TableSnippetRagIngestResponse,
-    TableSnippetUpsertRequest,
-    TableSnippetUpsertResponse,
 )
-from app.routers import chat_router, metrics_router
 from app.services import LLMGateway
 from app.services.import_analysis import process_import_analysis_job
-from app.services.table_profiling import process_table_profiling_job
-from app.services.table_snippet import ingest_snippet_rag_from_db, upsert_action_result


-def _ensure_log_directories(config: dict[str, Any]) -> None:
-    handlers = config.get("handlers", {})
-    for handler_config in handlers.values():
-        filename = handler_config.get("filename")
-        if not filename:
-            continue
-        directory = os.path.dirname(filename)
-        if directory and not os.path.exists(directory):
-            os.makedirs(directory, exist_ok=True)
-
-
 def _configure_logging() -> None:
-    config_path = os.getenv("LOGGING_CONFIG", "logging.yaml")
-    if os.path.exists(config_path):
-        with open(config_path, "r", encoding="utf-8") as fh:
-            config = yaml.safe_load(fh)
-        if isinstance(config, dict):
-            _ensure_log_directories(config)
-            logging.config.dictConfig(config)
-            return
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s",
-    )
+    level_name = os.getenv("LOG_LEVEL", "INFO").upper()
+    level = getattr(logging, level_name, logging.INFO)
+    log_format = os.getenv(
+        "LOG_FORMAT",
+        "%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s",
+    )
+    root = logging.getLogger()
+    if not root.handlers:
+        logging.basicConfig(level=level, format=log_format)
+    else:
+        root.setLevel(level)
+        formatter = logging.Formatter(log_format)
+        for handler in root.handlers:
+            handler.setLevel(level)
+            handler.setFormatter(formatter)


 _configure_logging()
 logger = logging.getLogger(__name__)
@@ -140,27 +118,6 @@ def create_app() -> FastAPI:
version="0.1.0",
lifespan=lifespan,
)
# Chat/metric management APIs
application.include_router(chat_router)
application.include_router(metrics_router)
@application.exception_handler(RequestValidationError)
async def request_validation_exception_handler(
request: Request, exc: RequestValidationError
) -> JSONResponse:
try:
raw_body = await request.body()
except Exception: # pragma: no cover - defensive
raw_body = b"<unavailable>"
truncated_body = raw_body[:4096]
logger.warning(
"Validation error on %s %s: %s | body preview=%s",
request.method,
request.url.path,
exc.errors(),
truncated_body.decode("utf-8", errors="ignore"),
)
return JSONResponse(status_code=422, content={"detail": exc.errors()})
@application.post(
"/v1/chat/completions",
@@ -207,109 +164,6 @@ def create_app() -> FastAPI:
return DataImportAnalysisJobAck(import_record_id=payload.import_record_id, status="accepted")
@application.post(
"/v1/table/profiling",
response_model=TableProfilingJobAck,
summary="Run end-to-end GE profiling pipeline and notify via callback per action",
status_code=202,
)
async def run_table_profiling(
payload: TableProfilingJobRequest,
gateway: LLMGateway = Depends(get_gateway),
client: httpx.AsyncClient = Depends(get_http_client),
) -> TableProfilingJobAck:
request_copy = payload.model_copy(deep=True)
async def _runner() -> None:
await process_table_profiling_job(request_copy, gateway, client)
asyncio.create_task(_runner())
return TableProfilingJobAck(
table_id=payload.table_id,
version_ts=payload.version_ts,
status="accepted",
)
@application.post(
"/v1/table/snippet",
response_model=TableSnippetUpsertResponse,
summary="Persist or update action results, such as table snippets.",
)
async def upsert_table_snippet(
payload: TableSnippetUpsertRequest,
client: httpx.AsyncClient = Depends(get_http_client),
) -> TableSnippetUpsertResponse:
request_copy = payload.model_copy(deep=True)
try:
response = await asyncio.to_thread(upsert_action_result, request_copy)
except Exception as exc:
logger.error(
"Failed to upsert table snippet: table_id=%s version_ts=%s action_type=%s",
payload.table_id,
payload.version_ts,
payload.action_type,
exc_info=True,
)
raise HTTPException(status_code=500, detail=str(exc)) from exc
else:
# After snippet_alias is stored, automatically trigger RAG ingest when configured.
if (
payload.action_type == ActionType.SNIPPET_ALIAS
and payload.status == ActionStatus.SUCCESS
and payload.rag_workspace_id is not None
):
try:
await ingest_snippet_rag_from_db(
table_id=payload.table_id,
version_ts=payload.version_ts,
workspace_id=payload.rag_workspace_id,
rag_item_type=payload.rag_item_type or "SNIPPET",
client=client,
)
except Exception:
logger.exception(
"Failed to ingest snippet RAG artifacts after snippet_alias upsert",
extra={
"table_id": payload.table_id,
"version_ts": payload.version_ts,
"workspace_id": payload.rag_workspace_id,
},
)
return response
@application.post(
"/v1/table/snippet/rag_ingest",
response_model=TableSnippetRagIngestResponse,
summary="Merge snippet+alias results from action_results and ingest into RAG.",
)
async def ingest_snippet_rag(
payload: TableSnippetRagIngestRequest,
client: httpx.AsyncClient = Depends(get_http_client),
) -> TableSnippetRagIngestResponse:
try:
rag_item_ids = await ingest_snippet_rag_from_db(
table_id=payload.table_id,
version_ts=payload.version_ts,
workspace_id=payload.workspace_id,
rag_item_type=payload.rag_item_type or "SNIPPET",
client=client,
)
except Exception as exc:
logger.exception(
"Failed to ingest snippet RAG artifacts",
extra={
"table_id": payload.table_id,
"version_ts": payload.version_ts,
"workspace_id": payload.workspace_id,
},
)
raise HTTPException(status_code=500, detail=str(exc)) from exc
return TableSnippetRagIngestResponse(rag_item_ids=rag_item_ids)
@application.post("/__mock__/import-callback") @application.post("/__mock__/import-callback")
async def mock_import_callback(payload: dict[str, Any]) -> dict[str, str]: async def mock_import_callback(payload: dict[str, Any]) -> dict[str, str]:
logger.info("Received import analysis callback: %s", payload) logger.info("Received import analysis callback: %s", payload)


@@ -1,6 +1,5 @@
 from __future__ import annotations

-from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, List, Optional, Union
@@ -77,8 +76,8 @@ class DataImportAnalysisRequest(BaseModel):
         description="Ordered list of table headers associated with the data.",
     )
     llm_model: str = Field(
-        None,
-        description="Model identifier. Accepts 'provider:model_name' format or custom model alias.",
+        ...,
+        description="Model identifier. Accepts 'provider:model' format or plain model name.",
     )
     temperature: Optional[float] = Field(
         None,
@@ -136,235 +135,3 @@ class DataImportAnalysisJobRequest(BaseModel):
 class DataImportAnalysisJobAck(BaseModel):
     import_record_id: str = Field(..., description="Echo of the import record identifier")
     status: str = Field("accepted", description="Processing status acknowledgement.")
class ActionType(str, Enum):
GE_PROFILING = "ge_profiling"
GE_RESULT_DESC = "ge_result_desc"
SNIPPET = "snippet"
SNIPPET_ALIAS = "snippet_alias"
class ActionStatus(str, Enum):
PENDING = "pending"
RUNNING = "running"
SUCCESS = "success"
FAILED = "failed"
PARTIAL = "partial"
class TableProfilingJobRequest(BaseModel):
table_id: str = Field(..., description="Unique identifier for the table to profile.")
version_ts: str = Field(
...,
pattern=r"^\d{14}$",
description="Version timestamp expressed as fourteen digit string (yyyyMMddHHmmss).",
)
callback_url: HttpUrl = Field(
...,
description="Callback endpoint invoked after each pipeline action completes.",
)
llm_model: Optional[str] = Field(
None,
description="Default LLM model spec applied to prompt-based actions when overrides are omitted.",
)
table_schema: Optional[Any] = Field(
None,
description="Schema structure snapshot for the current table version.",
)
table_schema_version_id: Optional[str] = Field(
None,
description="Identifier for the schema snapshot provided in table_schema.",
)
table_link_info: Optional[Dict[str, Any]] = Field(
None,
description=(
"Information describing how to locate the source table for profiling. "
"For example: {'type': 'sql', 'connection_string': 'mysql+pymysql://user:pass@host/db', "
"'table': 'schema.table_name'}."
),
)
table_access_info: Optional[Dict[str, Any]] = Field(
None,
description=(
"Credentials or supplemental parameters required to access the table described in table_link_info. "
"These values can be merged into the connection string using Python format placeholders."
),
)
ge_batch_request: Optional[Dict[str, Any]] = Field(
None,
description="Optional Great Expectations batch request payload used for profiling.",
)
ge_expectation_suite_name: Optional[str] = Field(
None,
description="Expectation suite name used during profiling. Created automatically when absent.",
)
ge_data_context_root: Optional[str] = Field(
None,
description="Custom root directory for the Great Expectations data context. Defaults to project ./gx.",
)
ge_datasource_name: Optional[str] = Field(
None,
description="Datasource name registered inside the GE context when batch_request is not supplied.",
)
ge_data_asset_name: Optional[str] = Field(
None,
description="Data asset reference used when inferring batch request from datasource configuration.",
)
ge_profiler_type: str = Field(
"user_configurable",
description="Profiler implementation identifier. Currently supports 'user_configurable' or 'data_assistant'.",
)
result_desc_model: Optional[str] = Field(
None,
description="LLM model override used for GE result description (action 2).",
)
snippet_model: Optional[str] = Field(
None,
description="LLM model override used for snippet generation (action 3).",
)
snippet_alias_model: Optional[str] = Field(
None,
description="LLM model override used for snippet alias enrichment (action 4).",
)
extra_options: Optional[Dict[str, Any]] = Field(
None,
description="Miscellaneous execution flags applied across pipeline steps.",
)
workspace_id: Optional[int] = Field(
None,
ge=0,
description="Optional workspace identifier forwarded to snippet_alias callback for RAG ingestion.",
)
rag_item_type: Optional[str] = Field(
"SNIPPET",
description="Optional RAG item type forwarded to snippet_alias callback.",
)
class TableProfilingJobAck(BaseModel):
table_id: str = Field(..., description="Echo of the table identifier.")
version_ts: str = Field(..., description="Echo of the profiling version timestamp (yyyyMMddHHmmss).")
status: str = Field("accepted", description="Processing acknowledgement status.")
class TableSnippetUpsertRequest(BaseModel):
table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
version_ts: int = Field(
...,
ge=0,
description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
)
workspace_id: Optional[int] = Field(
None,
ge=0,
description="Optional workspace identifier for RAG ingestion; when provided and action_type=snippet_alias "
"with status=success, merged snippets will be written to rag_snippet and pushed to RAG.",
)
rag_item_type: Optional[str] = Field(
"SNIPPET",
description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
)
action_type: ActionType = Field(..., description="Pipeline action type for this record.")
status: ActionStatus = Field(
ActionStatus.SUCCESS, description="Execution status for the action."
)
callback_url: HttpUrl = Field(..., description="Callback URL associated with the action run.")
table_schema_version_id: int = Field(..., ge=0, description="Identifier for the schema snapshot.")
table_schema: Any = Field(..., description="Schema snapshot payload for the table.")
model: Optional[str] = Field(
None,
description="LLM model identifier (can be provider alias) used for this action, when applicable.",
)
model_provider: Optional[str] = Field(
None,
description="LLM provider responsible for executing the action's model.",
)
model_params: Optional[Dict[str, Any]] = Field(
None,
description="Optional model parameter overrides (e.g., temperature) associated with the action.",
)
llm_usage: Optional[Any] = Field(
None,
description="Optional token usage metrics reported by the LLM provider.",
)
ge_profiling_json: Optional[Any] = Field(
None, description="Full GE profiling result payload for the profiling action."
)
ge_profiling_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the GE profiling result JSON."
)
ge_profiling_summary: Optional[Any] = Field(
None, description="Sanitised GE profiling summary payload."
)
ge_profiling_summary_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the GE profiling summary JSON."
)
ge_profiling_total_size_bytes: Optional[int] = Field(
None, ge=0, description="Combined size (bytes) of profiling result + summary."
)
ge_profiling_html_report_url: Optional[str] = Field(
None, description="Optional URL to the generated GE profiling HTML report."
)
ge_result_desc_json: Optional[Any] = Field(
None, description="Result JSON for the GE result description action."
)
ge_result_desc_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the GE result description JSON."
)
snippet_json: Optional[Any] = Field(
None, description="Snippet generation action result JSON."
)
snippet_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the snippet result JSON."
)
snippet_alias_json: Optional[Any] = Field(
None, description="Snippet alias expansion result JSON."
)
snippet_alias_json_size_bytes: Optional[int] = Field(
None, ge=0, description="Size in bytes of the snippet alias result JSON."
)
error_code: Optional[str] = Field(None, description="Optional error code when status indicates a failure.")
error_message: Optional[str] = Field(None, description="Optional error message when status indicates a failure.")
started_at: Optional[datetime] = Field(
None, description="Timestamp when the action started executing."
)
finished_at: Optional[datetime] = Field(
None, description="Timestamp when the action finished executing."
)
duration_ms: Optional[int] = Field(
None,
ge=0,
description="Optional execution duration in milliseconds.",
)
class TableSnippetRagIngestRequest(BaseModel):
table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
version_ts: int = Field(
...,
ge=0,
description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
)
workspace_id: int = Field(..., ge=0, description="Workspace id used when pushing snippets to RAG.")
rag_item_type: Optional[str] = Field(
"SNIPPET",
description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
)
class TableSnippetRagIngestResponse(BaseModel):
rag_item_ids: List[int] = Field(..., description="List of ingested rag_item_ids.")
result_checksum: Optional[str] = Field(
None,
description="Optional checksum for the result payload (e.g., MD5).",
)
class TableSnippetUpsertResponse(BaseModel):
table_id: int
version_ts: int
action_type: ActionType
status: ActionStatus
updated: bool
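For reference, an example request body for the removed /v1/table/profiling endpoint, built only from the TableProfilingJobRequest fields defined above; all values are placeholders:

# Example payload matching TableProfilingJobRequest as defined above; values are placeholders.
example_profiling_job = {
    "table_id": "sales_orders",
    "version_ts": "20251030230119",  # must match the ^\d{14}$ pattern (yyyyMMddHHmmss)
    "callback_url": "http://localhost:8000/__mock__/import-callback",
    "llm_model": "deepseek:deepseek-chat",
    "table_link_info": {
        "type": "sql",
        "connection_string": "mysql+pymysql://user:pass@host/db",
        "table": "schema.table_name",
    },
    "ge_profiler_type": "user_configurable",
    "workspace_id": 1,
    "rag_item_type": "SNIPPET",
}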


@@ -1,4 +0,0 @@
from .chat import router as chat_router
from .metrics import router as metrics_router
__all__ = ["chat_router", "metrics_router"]


@@ -1,102 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, HTTPException, Query
from app.schemas.chat import (
ChatSessionCreate,
ChatSessionUpdate,
ChatTurnCreate,
ChatTurnRetrievalBatch,
)
from app.services import metric_store
router = APIRouter(prefix="/api/v1/chat", tags=["chat"])
@router.post("/sessions")
def create_session(payload: ChatSessionCreate) -> Any:
"""Create a chat session."""
return metric_store.create_chat_session(payload)
@router.post("/sessions/{session_id}/update")
def update_session(session_id: int, payload: ChatSessionUpdate) -> Any:
try:
return metric_store.update_chat_session(session_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Session not found")
@router.post("/sessions/{session_id}/close")
def close_session(session_id: int) -> Any:
"""Close a chat session and stamp end_time."""
try:
return metric_store.close_chat_session(session_id)
except KeyError:
raise HTTPException(status_code=404, detail="Session not found")
@router.get("/sessions/{session_id}")
def get_session(session_id: int) -> Any:
"""Fetch one session."""
session = metric_store.get_chat_session(session_id)
if not session:
raise HTTPException(status_code=404, detail="Session not found")
return session
@router.get("/sessions")
def list_sessions(
user_id: Optional[int] = None,
status: Optional[str] = None,
start_from: Optional[datetime] = Query(None, description="Filter by start time lower bound."),
start_to: Optional[datetime] = Query(None, description="Filter by start time upper bound."),
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
return metric_store.list_chat_sessions(
user_id=user_id,
status=status,
start_from=start_from,
start_to=start_to,
limit=limit,
offset=offset,
)
@router.post("/sessions/{session_id}/turns")
def create_turn(session_id: int, payload: ChatTurnCreate) -> Any:
"""Create a turn under a session."""
try:
return metric_store.create_chat_turn(session_id, payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/sessions/{session_id}/turns")
def list_turns(session_id: int) -> List[Any]:
return metric_store.list_chat_turns(session_id)
@router.get("/turns/{turn_id}")
def get_turn(turn_id: int) -> Any:
turn = metric_store.get_chat_turn(turn_id)
if not turn:
raise HTTPException(status_code=404, detail="Turn not found")
return turn
@router.post("/turns/{turn_id}/retrievals")
def write_retrievals(turn_id: int, payload: ChatTurnRetrievalBatch) -> Any:
"""Batch write retrieval records for a turn."""
count = metric_store.create_retrievals(turn_id, payload.retrievals)
return {"turn_id": turn_id, "inserted": count}
@router.get("/turns/{turn_id}/retrievals")
def list_retrievals(turn_id: int) -> List[Any]:
return metric_store.list_retrievals(turn_id)


@@ -1,166 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, HTTPException, Query
from app.schemas.metrics import (
MetricCreate,
MetricResultsWriteRequest,
MetricRunTrigger,
MetricScheduleCreate,
MetricScheduleUpdate,
MetricUpdate,
)
from app.services import metric_store
router = APIRouter(prefix="/api/v1", tags=["metrics"])
@router.post("/metrics")
def create_metric(payload: MetricCreate) -> Any:
"""Create a metric definition."""
try:
return metric_store.create_metric(payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.post("/metrics/{metric_id}")
def update_metric(metric_id: int, payload: MetricUpdate) -> Any:
"""Update fields of a metric definition."""
try:
return metric_store.update_metric(metric_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Metric not found")
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metrics/{metric_id}")
def get_metric(metric_id: int) -> Any:
"""Fetch a metric definition by id."""
metric = metric_store.get_metric(metric_id)
if not metric:
raise HTTPException(status_code=404, detail="Metric not found")
return metric
@router.get("/metrics")
def list_metrics(
biz_domain: Optional[str] = None,
is_active: Optional[bool] = None,
keyword: Optional[str] = Query(None, description="Search by code/name"),
limit: int = Query(100, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""List metrics with optional filters."""
return metric_store.list_metrics(
biz_domain=biz_domain,
is_active=is_active,
keyword=keyword,
limit=limit,
offset=offset,
)
@router.post("/metric-schedules")
def create_schedule(payload: MetricScheduleCreate) -> Any:
"""Create a metric schedule."""
try:
return metric_store.create_metric_schedule(payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.post("/metric-schedules/{schedule_id}")
def update_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Any:
"""Update a metric schedule."""
try:
return metric_store.update_metric_schedule(schedule_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Schedule not found")
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metrics/{metric_id}/schedules")
def list_schedules(metric_id: int) -> List[Any]:
"""List schedules for one metric."""
return metric_store.list_schedules_for_metric(metric_id=metric_id)
@router.post("/metric-runs/trigger")
def trigger_run(payload: MetricRunTrigger) -> Any:
"""Insert a run record (execution handled externally)."""
try:
return metric_store.trigger_metric_run(payload)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metric-runs")
def list_runs(
metric_id: Optional[int] = None,
status: Optional[str] = None,
limit: int = Query(100, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""List run records."""
return metric_store.list_metric_runs(
metric_id=metric_id, status=status, limit=limit, offset=offset
)
@router.get("/metric-runs/{run_id}")
def get_run(run_id: int) -> Any:
"""Fetch run details."""
run = metric_store.get_metric_run(run_id)
if not run:
raise HTTPException(status_code=404, detail="Run not found")
return run
@router.post("/metric-results/{metric_id}")
def write_results(metric_id: int, payload: MetricResultsWriteRequest) -> Any:
# Align path metric_id with payload to avoid mismatch.
if payload.metric_id != metric_id:
raise HTTPException(status_code=400, detail="metric_id in path/body mismatch")
try:
inserted = metric_store.write_metric_results(payload)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
return {"metric_id": metric_id, "inserted": inserted}
@router.get("/metric-results")
def query_results(
metric_id: int,
stat_from: Optional[datetime] = None,
stat_to: Optional[datetime] = None,
limit: int = Query(200, ge=1, le=1000),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""Query metric results by time range."""
return metric_store.query_metric_results(
metric_id=metric_id,
stat_from=stat_from,
stat_to=stat_to,
limit=limit,
offset=offset,
)
@router.get("/metric-results/latest")
def latest_result(metric_id: int) -> Any:
"""Fetch the latest metric result."""
result = metric_store.latest_metric_result(metric_id)
if not result:
raise HTTPException(status_code=404, detail="Metric result not found")
return result
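A hedged sketch of calling the removed metric endpoints above; field names follow MetricCreate and MetricRunTrigger defined later in this diff, and the base URL is an assumption:

# Hedged example: exercise the removed metric endpoints; field names follow MetricCreate/MetricRunTrigger.
import httpx

BASE = "http://localhost:8000/api/v1"  # assumption: service host and port

metric = httpx.post(f"{BASE}/metrics", json={
    "metric_code": "daily_gmv",
    "metric_name": "Daily GMV",
    "biz_domain": "sales",
    "base_sql": "SELECT DATE(order_time) AS stat_time, SUM(amount) FROM orders GROUP BY 1",
    "time_grain": "DAY",
    "dim_binding": ["region"],
    "update_strategy": "FULL",
}).json()

# Record a run for the newly created metric (execution itself is handled externally).
run = httpx.post(f"{BASE}/metric-runs/trigger", json={"metric_id": metric["id"]}).json()
print(metric, run)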


@@ -1,53 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class ChatSessionCreate(BaseModel):
"""Create a chat session to group multiple turns for a user."""
user_id: int = Field(..., description="User ID owning the session.")
session_uuid: Optional[str] = Field(None, description="Optional externally provided UUID.")
status: Optional[str] = Field("OPEN", description="Session status, default OPEN.")
end_time: Optional[datetime] = Field(None, description="Optional end time.")
ext_context: Optional[dict[str, Any]] = Field(None, description="Arbitrary business context.")
class ChatSessionUpdate(BaseModel):
"""Partial update for a chat session."""
status: Optional[str] = Field(None, description="New session status.")
end_time: Optional[datetime] = Field(None, description="Close time override.")
last_turn_id: Optional[int] = Field(None, description="Pointer to last chat turn.")
ext_context: Optional[dict[str, Any]] = Field(None, description="Context patch.")
class ChatTurnCreate(BaseModel):
"""Create a single chat turn with intent/SQL context."""
user_id: int = Field(..., description="User ID for this turn.")
user_query: str = Field(..., description="Raw user query content.")
intent: Optional[str] = Field(None, description="Intent tag such as METRIC_QUERY.")
ast_json: Optional[dict[str, Any]] = Field(None, description="Parsed AST payload.")
generated_sql: Optional[str] = Field(None, description="Final SQL text, if generated.")
sql_status: Optional[str] = Field(None, description="SQL generation/execution status.")
error_msg: Optional[str] = Field(None, description="Error message when SQL failed.")
main_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs referenced in this turn.")
created_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs created in this turn.")
end_time: Optional[datetime] = Field(None, description="Turn end time.")
class ChatTurnRetrievalItem(BaseModel):
"""Record of one retrieved item contributing to a turn."""
item_type: str = Field(..., description="METRIC/SNIPPET/CHAT etc.")
item_id: str = Field(..., description="Identifier such as metric_id or snippet_id.")
item_extra: Optional[dict[str, Any]] = Field(None, description="Additional context like column name.")
similarity_score: Optional[float] = Field(None, description="Similarity score.")
rank_no: Optional[int] = Field(None, description="Ranking position.")
used_in_reasoning: Optional[bool] = Field(False, description="Flag if used in reasoning.")
used_in_sql: Optional[bool] = Field(False, description="Flag if used in final SQL.")
class ChatTurnRetrievalBatch(BaseModel):
"""Batch insert wrapper for retrieval records."""
retrievals: List[ChatTurnRetrievalItem]


@@ -1,99 +0,0 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class MetricCreate(BaseModel):
"""Create a metric definition with business and technical metadata."""
metric_code: str = Field(..., description="Internal metric code, unique.")
metric_name: str = Field(..., description="Display name.")
metric_aliases: Optional[List[str]] = Field(None, description="Optional alias list.")
biz_domain: str = Field(..., description="Business domain identifier.")
biz_desc: Optional[str] = Field(None, description="Business definition.")
chat_turn_id: Optional[int] = Field(None, description="Source chat turn ID.")
tech_desc: Optional[str] = Field(None, description="Technical definition.")
formula_expr: Optional[str] = Field(None, description="Formula expression text.")
base_sql: str = Field(..., description="Canonical SQL used to compute the metric.")
time_grain: str = Field(..., description="DAY/HOUR/WEEK/MONTH etc.")
dim_binding: List[str] = Field(..., description="Dimension columns bound to the metric.")
update_strategy: str = Field(..., description="FULL/INCR/REALTIME.")
schedule_id: Optional[int] = Field(None, description="Linked schedule id if any.")
schedule_type: Optional[int] = Field(None, description="Scheduler type identifier.")
is_active: bool = Field(True, description="Whether the metric is enabled.")
created_by: Optional[int] = Field(None, description="Creator user id.")
updated_by: Optional[int] = Field(None, description="Updater user id.")
class MetricUpdate(BaseModel):
"""Partial update for an existing metric definition."""
metric_name: Optional[str] = None
metric_aliases: Optional[List[str]] = None
biz_domain: Optional[str] = None
biz_desc: Optional[str] = None
tech_desc: Optional[str] = None
formula_expr: Optional[str] = None
base_sql: Optional[str] = None
time_grain: Optional[str] = None
dim_binding: Optional[List[str]] = None
update_strategy: Optional[str] = None
schedule_id: Optional[int] = None
schedule_type: Optional[int] = None
is_active: Optional[bool] = None
updated_by: Optional[int] = None
class MetricScheduleCreate(BaseModel):
"""Create a cron-based schedule for a metric."""
metric_id: int
cron_expr: str
enabled: bool = True
priority: int = 10
backfill_allowed: bool = True
max_runtime_sec: Optional[int] = None
retry_times: int = 0
owner_team: Optional[str] = None
owner_user_id: Optional[int] = None
class MetricScheduleUpdate(BaseModel):
"""Update fields of an existing metric schedule."""
cron_expr: Optional[str] = None
enabled: Optional[bool] = None
priority: Optional[int] = None
backfill_allowed: Optional[bool] = None
max_runtime_sec: Optional[int] = None
retry_times: Optional[int] = None
owner_team: Optional[str] = None
owner_user_id: Optional[int] = None
class MetricRunTrigger(BaseModel):
"""Trigger a metric run, optionally linking to a chat turn or schedule."""
metric_id: int
schedule_id: Optional[int] = None
source_turn_id: Optional[int] = None
data_time_from: Optional[datetime] = None
data_time_to: Optional[datetime] = None
metric_version: Optional[int] = None
base_sql_snapshot: Optional[str] = None
triggered_by: str = Field("API", description="SCHEDULER/MANUAL/API/QA_TURN")
triggered_at: Optional[datetime] = None
class MetricResultItem(BaseModel):
"""Single metric result row to be persisted."""
stat_time: datetime
metric_value: float
metric_version: Optional[int] = None
extra_dims: Optional[dict[str, Any]] = None
load_time: Optional[datetime] = None
data_version: Optional[int] = None
class MetricResultsWriteRequest(BaseModel):
"""Batch write request for metric results."""
metric_id: int
results: List[MetricResultItem]


@@ -1,46 +0,0 @@
from __future__ import annotations
from typing import Any, List
from pydantic import BaseModel, ConfigDict, Field
class RagItemPayload(BaseModel):
"""Payload for creating or updating a single RAG item."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
id: int = Field(..., description="Unique identifier for the RAG item.")
workspace_id: int = Field(..., alias="workspaceId", description="Workspace identifier.")
name: str = Field(..., description="Readable name of the item.")
embedding_data: str = Field(..., alias="embeddingData", description="Serialized embedding payload.")
type: str = Field(..., description='Item type, e.g. "METRIC".')
class RagDeleteRequest(BaseModel):
"""Payload for deleting a single RAG item."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
id: int = Field(..., description="Identifier of the item to delete.")
type: str = Field(..., description="Item type matching the stored record.")
class RagRetrieveRequest(BaseModel):
"""Payload for retrieving RAG items by semantic query."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
query: str = Field(..., description="Search query text.")
num: int = Field(..., description="Number of items to return.")
workspace_id: int = Field(..., alias="workspaceId", description="Workspace scope for the search.")
type: str = Field(..., description="Item type to search, e.g. METRIC.")
class RagRetrieveResponse(BaseModel):
"""Generic RAG retrieval response wrapper."""
model_config = ConfigDict(extra="allow")
data: List[Any] = Field(default_factory=list, description="Retrieved items.")
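A small sketch of the alias handling these models enable through populate_by_name; the values are placeholders:

# Illustrates the alias handling enabled by populate_by_name on RagItemPayload above.
item = RagItemPayload(
    id=1,
    workspace_id=42,        # snake_case accepted because populate_by_name=True
    name="orders snippet",
    embedding_data="{...}",  # serialized embedding payload
    type="SNIPPET",
)
print(item.model_dump(by_alias=True))  # emits workspaceId / embeddingData for the RAG API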


@@ -1,4 +1,3 @@
 from .gateway import LLMGateway
-from .rag_client import RagAPIClient

-__all__ = ["LLMGateway", "RagAPIClient"]
+__all__ = ["LLMGateway"]


@@ -1,93 +1,53 @@
 from __future__ import annotations

-import logging
+import os
+from typing import Dict, Type

 import httpx
-from pydantic import ValidationError

-from app.exceptions import ProviderAPICallError
-from app.models import LLMChoice, LLMMessage, LLMRequest, LLMResponse
-from app.settings import NEW_API_AUTH_TOKEN, NEW_API_BASE_URL
-
-logger = logging.getLogger(__name__)
+from app.exceptions import ProviderConfigurationError
+from app.models import LLMProvider, LLMRequest, LLMResponse
+from app.providers import (
+    AnthropicProvider,
+    DeepSeekProvider,
+    GeminiProvider,
+    LLMProviderClient,
+    OpenAIProvider,
+    OpenRouterProvider,
+    QwenProvider,
+)


 class LLMGateway:
-    """Forward chat requests to the configured new-api component."""
+    """Simple registry that dispatches chat requests to provider clients."""

-    def __init__(
-        self,
-        *,
-        base_url: str | None = None,
-        auth_token: str | None = None,
-    ) -> None:
-        resolved_base = base_url or NEW_API_BASE_URL
-        self._base_url = resolved_base.rstrip("/")
-        self._auth_token = auth_token or NEW_API_AUTH_TOKEN
+    def __init__(self) -> None:
+        self._providers: Dict[LLMProvider, LLMProviderClient] = {}
+        self._factory: Dict[LLMProvider, Type[LLMProviderClient]] = {
+            LLMProvider.OPENAI: OpenAIProvider,
+            LLMProvider.ANTHROPIC: AnthropicProvider,
+            LLMProvider.OPENROUTER: OpenRouterProvider,
+            LLMProvider.GEMINI: GeminiProvider,
+            LLMProvider.QWEN: QwenProvider,
+            LLMProvider.DEEPSEEK: DeepSeekProvider,
+        }
+
+    def get_provider(self, provider: LLMProvider) -> LLMProviderClient:
+        if provider not in self._factory:
+            raise ProviderConfigurationError(f"Unsupported provider '{provider.value}'.")
+        if provider not in self._providers:
+            self._providers[provider] = self._build_provider(provider)
+        return self._providers[provider]
+
+    def _build_provider(self, provider: LLMProvider) -> LLMProviderClient:
+        provider_cls = self._factory[provider]
+        api_key_env = getattr(provider_cls, "api_key_env", None)
+        api_key = os.getenv(api_key_env) if api_key_env else None
+        return provider_cls(api_key)

     async def chat(
         self, request: LLMRequest, client: httpx.AsyncClient
     ) -> LLMResponse:
-        url = f"{self._base_url}/v1/chat/completions"
-        payload = request.model_dump(mode="json", exclude_none=True)
-        headers = {"Content-Type": "application/json"}
-        if self._auth_token:
-            headers["Authorization"] = f"Bearer {self._auth_token}"
-        logger.info("Forwarding chat request to new-api at %s", url)
-        try:
-            response = await client.post(url, json=payload, headers=headers)
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            status_code = exc.response.status_code if exc.response else None
-            response_text = exc.response.text if exc.response else ""
-            logger.error(
-                "new-api upstream returned %s: %s",
-                status_code,
-                response_text,
-                exc_info=True,
-            )
-            raise ProviderAPICallError(
-                "Chat completion request failed.",
-                status_code=status_code,
-                response_text=response_text,
-            ) from exc
-        except httpx.HTTPError as exc:
-            logger.error("new-api transport error: %s", exc, exc_info=True)
-            raise ProviderAPICallError(f"Chat completion request failed: {exc}") from exc
-        try:
-            data = response.json()
-        except ValueError as exc:
-            logger.error("new-api responded with invalid JSON.", exc_info=True)
-            raise ProviderAPICallError(
-                "Chat completion response was not valid JSON."
-            ) from exc
-        logger.info("new-api payload: %s", data)
-        normalized_choices: list[LLMChoice] = []
-        for idx, choice in enumerate(data.get("choices", []) or []):
-            message_payload = choice.get("message") or {}
-            message = LLMMessage(
-                role=message_payload.get("role", "assistant"),
-                content=message_payload.get("content", ""),
-            )
-            normalized_choices.append(
-                LLMChoice(index=choice.get("index", idx), message=message)
-            )
-        try:
-            normalized_response = LLMResponse(
-                provider=request.provider,
-                model=data.get("model", request.model),
-                choices=normalized_choices,
-                raw=data,
-            )
-            return normalized_response
-        except ValidationError as exc:
-            logger.error(
-                "new-api response did not match expected schema: %s", data, exc_info=True
-            )
-            raise ProviderAPICallError(
-                "Chat completion response was not in the expected format."
-            ) from exc
+        provider_client = self.get_provider(request.provider)
+        return await provider_client.chat(request, client)
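A hedged usage sketch for LLMGateway.chat, which has the same signature on both sides of this diff; the LLMRequest/LLMMessage field names and LLMRole.USER member are assumptions drawn from how they are used elsewhere in the diff:

# Hedged usage sketch; field names assumed from LLMRequest/LLMMessage usage in this diff.
import asyncio
import httpx
from app.models import LLMMessage, LLMRequest, LLMRole
from app.services import LLMGateway

async def main() -> None:
    gateway = LLMGateway()
    request = LLMRequest(
        provider="deepseek",  # assumption: coerced to the LLMProvider enum
        model="deepseek-chat",
        messages=[LLMMessage(role=LLMRole.USER, content="ping")],  # assumption: LLMRole.USER exists
    )
    async with httpx.AsyncClient(timeout=120) as client:
        response = await gateway.chat(request, client)
        print(response.choices[0].message.content)

asyncio.run(main())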


@@ -22,24 +22,13 @@ from app.models import (
     LLMResponse,
     LLMRole,
 )
-from app.settings import (
-    DEFAULT_IMPORT_MODEL,
-    NEW_API_AUTH_TOKEN,
-    NEW_API_BASE_URL,
-    get_supported_import_models,
-)
-from app.utils.llm_usage import extract_usage
+from app.settings import DEFAULT_IMPORT_MODEL, get_supported_import_models

 logger = logging.getLogger(__name__)

-IMPORT_GATEWAY_BASE_URL = os.getenv("IMPORT_GATEWAY_BASE_URL", NEW_API_BASE_URL)
-
-
-def build_import_gateway_headers() -> dict[str, str]:
-    headers = {"Content-Type": "application/json"}
-    if NEW_API_AUTH_TOKEN:
-        headers["Authorization"] = f"Bearer {NEW_API_AUTH_TOKEN}"
-    return headers
+IMPORT_GATEWAY_BASE_URL = os.getenv(
+    "IMPORT_GATEWAY_BASE_URL", "http://localhost:8000"
+)


 def _env_float(name: str, default: float) -> float:
@@ -53,7 +42,7 @@ def _env_float(name: str, default: float) -> float:
     return default


-IMPORT_CHAT_TIMEOUT_SECONDS = _env_float("IMPORT_CHAT_TIMEOUT_SECONDS", 120.0)
+IMPORT_CHAT_TIMEOUT_SECONDS = _env_float("IMPORT_CHAT_TIMEOUT_SECONDS", 90.0)

 SUPPORTED_IMPORT_MODELS = get_supported_import_models()
@@ -309,7 +298,7 @@ def parse_llm_analysis_json(llm_response: LLMResponse) -> Dict[str, Any]:
     try:
         return json.loads(json_payload)
     except json.JSONDecodeError as exc:
-        preview = json_payload[:10000]
+        preview = json_payload[:2000]
         logger.error("Failed to parse JSON from LLM response content: %s", preview, exc_info=True)
         raise ProviderAPICallError("LLM response JSON could not be parsed.") from exc
@@ -324,18 +313,16 @@ async def dispatch_import_analysis_job(
     url = f"{IMPORT_GATEWAY_BASE_URL.rstrip('/')}/v1/chat/completions"
     logger.info(
-        "Dispatching import %s to %s using provider=%s model=%s",
+        "Dispatching import %s to %s: %s",
         request.import_record_id,
         url,
-        payload.get("provider"),
-        payload.get("model"),
+        json.dumps(payload, ensure_ascii=False),
     )
     timeout = httpx.Timeout(IMPORT_CHAT_TIMEOUT_SECONDS)
-    headers = build_import_gateway_headers()
     try:
-        response = await client.post(url, json=payload, timeout=timeout, headers=headers)
+        response = await client.post(url, json=payload, timeout=timeout)
         response.raise_for_status()
     except httpx.HTTPStatusError as exc:
         body_preview = ""
@@ -360,10 +347,9 @@ async def dispatch_import_analysis_job(
         response.status_code,
     )
     logger.info(
-        "LLM response received for %s (status %s, choices=%s)",
+        "LLM response for %s: %s",
         request.import_record_id,
-        response.status_code,
-        len(response_data.get("choices") or []),
+        json.dumps(response_data, ensure_ascii=False),
     )

     try:
@@ -389,6 +375,18 @@ async def dispatch_import_analysis_job(
     return result


+# Normalise usage-field extraction across multiple model providers
+def extract_usage(resp_json: dict) -> dict:
+    usage = resp_json.get("usage") or resp_json.get("usageMetadata") or {}
+    return {
+        "prompt_tokens": usage.get("prompt_tokens") or usage.get("input_tokens") or usage.get("promptTokenCount"),
+        "completion_tokens": usage.get("completion_tokens") or usage.get("output_tokens") or usage.get("candidatesTokenCount"),
+        "total_tokens": usage.get("total_tokens") or usage.get("totalTokenCount") or (
+            (usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
+            + (usage.get("completion_tokens") or usage.get("output_tokens") or 0)
+        )
+    }
+
+
 async def notify_import_analysis_callback(
     callback_url: str,
     payload: Dict[str, Any],
@@ -417,7 +415,6 @@ async def process_import_analysis_job(
     request: DataImportAnalysisJobRequest,
     client: httpx.AsyncClient,
 ) -> None:
-    # Run the import analysis and ensure the callback fires regardless of success/failure.
     try:
         payload = await dispatch_import_analysis_job(request, client)
     except ProviderAPICallError as exc:
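A quick check of the extract_usage helper added above against the two usage shapes it targets, an OpenAI-style "usage" block and a Gemini-style "usageMetadata" block; the sample payloads are made up for illustration:

# Illustrative inputs for extract_usage(); the payloads follow the two shapes it handles.
openai_style = {"usage": {"prompt_tokens": 120, "completion_tokens": 30, "total_tokens": 150}}
gemini_style = {"usageMetadata": {"promptTokenCount": 120, "candidatesTokenCount": 30, "totalTokenCount": 150}}

print(extract_usage(openai_style))   # {'prompt_tokens': 120, 'completion_tokens': 30, 'total_tokens': 150}
print(extract_usage(gemini_style))   # {'prompt_tokens': 120, 'completion_tokens': 30, 'total_tokens': 150}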


@@ -1,842 +0,0 @@
from __future__ import annotations
import hashlib
import json
import logging
from datetime import datetime
from typing import Any, Dict, Iterable, List, Optional
from uuid import uuid4
from sqlalchemy import text
from sqlalchemy.engine import Row
from app.db import get_engine
from app.schemas.chat import (
ChatSessionCreate,
ChatSessionUpdate,
ChatTurnCreate,
ChatTurnRetrievalItem,
)
from app.schemas.metrics import (
MetricCreate,
MetricResultItem,
MetricResultsWriteRequest,
MetricRunTrigger,
MetricScheduleCreate,
MetricScheduleUpdate,
MetricUpdate,
)
logger = logging.getLogger(__name__)
# Common helpers
def _json_dump(value: Any) -> Optional[str]:
"""Safe JSON dumper; returns None on failure to keep DB writes simple."""
if value is None:
return None
if isinstance(value, str):
return value
try:
return json.dumps(value, ensure_ascii=False)
except (TypeError, ValueError):
return None
def _parse_json_fields(payload: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
"""Parse select fields from JSON strings into dict/list for responses."""
for field in fields:
raw = payload.get(field)
if raw is None or isinstance(raw, (dict, list)):
continue
if isinstance(raw, (bytes, bytearray)):
raw = raw.decode("utf-8", errors="ignore")
if isinstance(raw, str):
try:
payload[field] = json.loads(raw)
except ValueError:
pass
return payload
def _row_to_dict(row: Row[Any]) -> Dict[str, Any]:
return dict(row._mapping)
# Chat sessions & turns
def create_chat_session(payload: ChatSessionCreate) -> Dict[str, Any]:
"""Create a chat session row with optional external UUID."""
engine = get_engine()
session_uuid = payload.session_uuid or str(uuid4())
now = datetime.utcnow()
params = {
"user_id": payload.user_id,
"session_uuid": session_uuid,
"end_time": payload.end_time,
"status": payload.status or "OPEN",
"ext_context": _json_dump(payload.ext_context),
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO chat_session (user_id, session_uuid, end_time, status, ext_context)
VALUES (:user_id, :session_uuid, :end_time, :status, :ext_context)
"""
),
params,
)
session_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
raise RuntimeError("Failed to create chat session.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def update_chat_session(session_id: int, payload: ChatSessionUpdate) -> Dict[str, Any]:
"""Patch selected chat session fields."""
updates = {}
if payload.status is not None:
updates["status"] = payload.status
if payload.end_time is not None:
updates["end_time"] = payload.end_time
if payload.last_turn_id is not None:
updates["last_turn_id"] = payload.last_turn_id
if payload.ext_context is not None:
updates["ext_context"] = _json_dump(payload.ext_context)
if not updates:
current = get_chat_session(session_id)
if not current:
raise KeyError(f"Session {session_id} not found.")
return current
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = session_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE chat_session SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
raise KeyError(f"Session {session_id} not found.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def close_chat_session(session_id: int) -> Dict[str, Any]:
"""Mark a chat session as CLOSED with end_time."""
now = datetime.utcnow()
return update_chat_session(
session_id,
ChatSessionUpdate(status="CLOSED", end_time=now),
)
def get_chat_session(session_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def list_chat_sessions(
*,
user_id: Optional[int] = None,
status: Optional[str] = None,
start_from: Optional[datetime] = None,
start_to: Optional[datetime] = None,
limit: int = 50,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""List chat sessions with optional filters and pagination."""
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if user_id is not None:
conditions.append("user_id=:user_id")
params["user_id"] = user_id
if status is not None:
conditions.append("status=:status")
params["status"] = status
if start_from is not None:
conditions.append("created_at>=:start_from")
params["start_from"] = start_from
if start_to is not None:
conditions.append("created_at<=:start_to")
params["start_to"] = start_to
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM chat_session {where_clause} "
"ORDER BY created_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
results.append(data)
return results
def _next_turn_no(conn, session_id: int) -> int:
row = conn.execute(
text("SELECT COALESCE(MAX(turn_no), 0) + 1 AS next_no FROM chat_turn WHERE session_id=:sid"),
{"sid": session_id},
).first()
if not row:
return 1
return int(row._mapping["next_no"])
def create_chat_turn(session_id: int, payload: ChatTurnCreate) -> Dict[str, Any]:
"""Insert a chat turn and auto-increment turn number within the session."""
engine = get_engine()
now = datetime.utcnow()
params = {
"session_id": session_id,
"user_id": payload.user_id,
"user_query": payload.user_query,
"intent": payload.intent,
"ast_json": _json_dump(payload.ast_json),
"generated_sql": payload.generated_sql,
"sql_status": payload.sql_status,
"error_msg": payload.error_msg,
"main_metric_ids": _json_dump(payload.main_metric_ids),
"created_metric_ids": _json_dump(payload.created_metric_ids),
"end_time": payload.end_time,
}
with engine.begin() as conn:
turn_no = _next_turn_no(conn, session_id)
params["turn_no"] = turn_no
result = conn.execute(
text(
"""
INSERT INTO chat_turn (
session_id, turn_no, user_id,
user_query, intent, ast_json,
generated_sql, sql_status, error_msg,
main_metric_ids, created_metric_ids,
end_time
)
VALUES (
:session_id, :turn_no, :user_id,
:user_query, :intent, :ast_json,
:generated_sql, :sql_status, :error_msg,
:main_metric_ids, :created_metric_ids,
:end_time
)
"""
),
params,
)
turn_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
).first()
if not row:
raise RuntimeError("Failed to create chat turn.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
return data
def get_chat_turn(turn_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
return data
def list_chat_turns(session_id: int) -> List[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
"SELECT * FROM chat_turn WHERE session_id=:session_id ORDER BY turn_no ASC"
),
{"session_id": session_id},
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
results.append(data)
return results
def create_retrievals(turn_id: int, retrievals: List[ChatTurnRetrievalItem]) -> int:
"""Batch insert retrieval records for a turn."""
if not retrievals:
return 0
engine = get_engine()
params_list = []
for item in retrievals:
params_list.append(
{
"turn_id": turn_id,
"item_type": item.item_type,
"item_id": item.item_id,
"item_extra": _json_dump(item.item_extra),
"similarity_score": item.similarity_score,
"rank_no": item.rank_no,
"used_in_reasoning": 1 if item.used_in_reasoning else 0,
"used_in_sql": 1 if item.used_in_sql else 0,
}
)
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO chat_turn_retrieval (
turn_id, item_type, item_id, item_extra,
similarity_score, rank_no, used_in_reasoning, used_in_sql
)
VALUES (
:turn_id, :item_type, :item_id, :item_extra,
:similarity_score, :rank_no, :used_in_reasoning, :used_in_sql
)
"""
),
params_list,
)
return len(retrievals)
def list_retrievals(turn_id: int) -> List[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
"SELECT * FROM chat_turn_retrieval WHERE turn_id=:turn_id ORDER BY created_at ASC, rank_no ASC"
),
{"turn_id": turn_id},
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["item_extra"])
data["used_in_reasoning"] = bool(data.get("used_in_reasoning"))
data["used_in_sql"] = bool(data.get("used_in_sql"))
results.append(data)
return results
# Metric registry
def _metric_sql_hash(sql_text: str) -> str:
"""Compute a stable hash to detect SQL definition changes."""
return hashlib.md5(sql_text.encode("utf-8")).hexdigest()
def create_metric(payload: MetricCreate) -> Dict[str, Any]:
"""Insert a new metric definition; version starts at 1."""
engine = get_engine()
now = datetime.utcnow()
sql_hash = _metric_sql_hash(payload.base_sql)
params = {
"metric_code": payload.metric_code,
"metric_name": payload.metric_name,
"metric_aliases": _json_dump(payload.metric_aliases),
"biz_domain": payload.biz_domain,
"biz_desc": payload.biz_desc,
"chat_turn_id": payload.chat_turn_id,
"tech_desc": payload.tech_desc,
"formula_expr": payload.formula_expr,
"base_sql": payload.base_sql,
"time_grain": payload.time_grain,
"dim_binding": _json_dump(payload.dim_binding),
"update_strategy": payload.update_strategy,
"schedule_id": payload.schedule_id,
"schedule_type": payload.schedule_type,
"version": 1,
"is_active": 1 if payload.is_active else 0,
"sql_hash": sql_hash,
"created_by": payload.created_by,
"updated_by": payload.updated_by,
"created_at": now,
"updated_at": now,
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_def (
metric_code, metric_name, metric_aliases, biz_domain, biz_desc,
chat_turn_id, tech_desc, formula_expr, base_sql,
time_grain, dim_binding, update_strategy,
schedule_id, schedule_type, version, is_active,
sql_hash, created_by, updated_by, created_at, updated_at
)
VALUES (
:metric_code, :metric_name, :metric_aliases, :biz_domain, :biz_desc,
:chat_turn_id, :tech_desc, :formula_expr, :base_sql,
:time_grain, :dim_binding, :update_strategy,
:schedule_id, :schedule_type, :version, :is_active,
:sql_hash, :created_by, :updated_by, :created_at, :updated_at
)
"""
),
params,
)
metric_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
raise RuntimeError("Failed to create metric definition.")
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def update_metric(metric_id: int, payload: MetricUpdate) -> Dict[str, Any]:
"""Update mutable fields of a metric definition and refresh sql_hash when needed."""
updates: Dict[str, Any] = {}
for field in (
"metric_name",
"biz_domain",
"biz_desc",
"tech_desc",
"formula_expr",
"base_sql",
"time_grain",
"update_strategy",
"schedule_id",
"schedule_type",
"updated_by",
):
value = getattr(payload, field)
if value is not None:
updates[field] = value
if payload.metric_aliases is not None:
updates["metric_aliases"] = _json_dump(payload.metric_aliases)
if payload.dim_binding is not None:
updates["dim_binding"] = _json_dump(payload.dim_binding)
if payload.is_active is not None:
updates["is_active"] = 1 if payload.is_active else 0
if payload.base_sql is not None:
updates["sql_hash"] = _metric_sql_hash(payload.base_sql)
if not updates:
current = get_metric(metric_id)
if not current:
raise KeyError(f"Metric {metric_id} not found.")
return current
updates["updated_at"] = datetime.utcnow()
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = metric_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE metric_def SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
raise KeyError(f"Metric {metric_id} not found.")
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def get_metric(metric_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def list_metrics(
*,
biz_domain: Optional[str] = None,
is_active: Optional[bool] = None,
keyword: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""List metric definitions with simple filters and pagination."""
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if biz_domain:
conditions.append("biz_domain=:biz_domain")
params["biz_domain"] = biz_domain
if is_active is not None:
conditions.append("is_active=:is_active")
params["is_active"] = 1 if is_active else 0
if keyword:
conditions.append("(metric_code LIKE :kw OR metric_name LIKE :kw)")
params["kw"] = f"%{keyword}%"
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_def {where_clause} "
"ORDER BY updated_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
results.append(data)
return results
# Metric schedules
def create_metric_schedule(payload: MetricScheduleCreate) -> Dict[str, Any]:
"""Create a schedule record for a metric."""
engine = get_engine()
params = {
"metric_id": payload.metric_id,
"cron_expr": payload.cron_expr,
"enabled": 1 if payload.enabled else 0,
"priority": payload.priority,
"backfill_allowed": 1 if payload.backfill_allowed else 0,
"max_runtime_sec": payload.max_runtime_sec,
"retry_times": payload.retry_times,
"owner_team": payload.owner_team,
"owner_user_id": payload.owner_user_id,
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_schedule (
metric_id, cron_expr, enabled, priority,
backfill_allowed, max_runtime_sec, retry_times,
owner_team, owner_user_id
) VALUES (
:metric_id, :cron_expr, :enabled, :priority,
:backfill_allowed, :max_runtime_sec, :retry_times,
:owner_team, :owner_user_id
)
"""
),
params,
)
schedule_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
).first()
if not row:
raise RuntimeError("Failed to create metric schedule.")
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
return data
def update_metric_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Dict[str, Any]:
updates: Dict[str, Any] = {}
for field in (
"cron_expr",
"priority",
"max_runtime_sec",
"retry_times",
"owner_team",
"owner_user_id",
):
value = getattr(payload, field)
if value is not None:
updates[field] = value
if payload.enabled is not None:
updates["enabled"] = 1 if payload.enabled else 0
if payload.backfill_allowed is not None:
updates["backfill_allowed"] = 1 if payload.backfill_allowed else 0
if not updates:
current = list_schedules_for_metric(schedule_id=schedule_id)
if current:
return current[0]
raise KeyError(f"Schedule {schedule_id} not found.")
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = schedule_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE metric_schedule SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
).first()
if not row:
raise KeyError(f"Schedule {schedule_id} not found.")
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
return data
def list_schedules_for_metric(metric_id: Optional[int] = None, schedule_id: Optional[int] = None) -> List[Dict[str, Any]]:
conditions = []
params: Dict[str, Any] = {}
if metric_id is not None:
conditions.append("metric_id=:metric_id")
params["metric_id"] = metric_id
if schedule_id is not None:
conditions.append("id=:id")
params["id"] = schedule_id
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(f"SELECT * FROM metric_schedule {where_clause} ORDER BY id DESC"),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
results.append(data)
return results
# Metric runs
def trigger_metric_run(payload: MetricRunTrigger) -> Dict[str, Any]:
"""Create a metric_job_run entry; execution is orchestrated elsewhere."""
metric = get_metric(payload.metric_id)
if not metric:
raise KeyError(f"Metric {payload.metric_id} not found.")
metric_version = payload.metric_version or metric.get("version", 1)
base_sql_snapshot = payload.base_sql_snapshot or metric.get("base_sql")
triggered_at = payload.triggered_at or datetime.utcnow()
params = {
"metric_id": payload.metric_id,
"schedule_id": payload.schedule_id,
"source_turn_id": payload.source_turn_id,
"data_time_from": payload.data_time_from,
"data_time_to": payload.data_time_to,
"metric_version": metric_version,
"base_sql_snapshot": base_sql_snapshot,
"status": "RUNNING",
"error_msg": None,
"affected_rows": None,
"runtime_ms": None,
"triggered_by": payload.triggered_by,
"triggered_at": triggered_at,
"started_at": None,
"finished_at": None,
}
engine = get_engine()
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_job_run (
metric_id, schedule_id, source_turn_id,
data_time_from, data_time_to, metric_version,
base_sql_snapshot, status, error_msg,
affected_rows, runtime_ms,
triggered_by, triggered_at, started_at, finished_at
) VALUES (
:metric_id, :schedule_id, :source_turn_id,
:data_time_from, :data_time_to, :metric_version,
:base_sql_snapshot, :status, :error_msg,
:affected_rows, :runtime_ms,
:triggered_by, :triggered_at, :started_at, :finished_at
)
"""
),
params,
)
run_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
).first()
if not row:
raise RuntimeError("Failed to create metric job run.")
return _row_to_dict(row)
def get_metric_run(run_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
).first()
if not row:
return None
return _row_to_dict(row)
def list_metric_runs(
*,
metric_id: Optional[int] = None,
status: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if metric_id is not None:
conditions.append("metric_id=:metric_id")
params["metric_id"] = metric_id
if status is not None:
conditions.append("status=:status")
params["status"] = status
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_job_run {where_clause} "
"ORDER BY triggered_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
return [_row_to_dict(row) for row in rows]
# Metric results
def write_metric_results(payload: MetricResultsWriteRequest) -> int:
"""Bulk insert metric_result rows for a metric/version."""
metric = get_metric(payload.metric_id)
if not metric:
raise KeyError(f"Metric {payload.metric_id} not found.")
default_version = metric.get("version", 1)
now = datetime.utcnow()
rows: List[Dict[str, Any]] = []
for item in payload.results:
rows.append(
{
"metric_id": payload.metric_id,
"metric_version": item.metric_version or default_version,
"stat_time": item.stat_time,
"extra_dims": _json_dump(item.extra_dims),
"metric_value": item.metric_value,
"load_time": item.load_time or now,
"data_version": item.data_version,
}
)
if not rows:
return 0
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO metric_result (
metric_id, metric_version, stat_time,
extra_dims, metric_value, load_time, data_version
) VALUES (
:metric_id, :metric_version, :stat_time,
:extra_dims, :metric_value, :load_time, :data_version
)
"""
),
rows,
)
return len(rows)
def query_metric_results(
*,
metric_id: int,
stat_from: Optional[datetime] = None,
stat_to: Optional[datetime] = None,
limit: int = 200,
offset: int = 0,
) -> List[Dict[str, Any]]:
conditions = ["metric_id=:metric_id"]
params: Dict[str, Any] = {
"metric_id": metric_id,
"limit": limit,
"offset": offset,
}
if stat_from is not None:
conditions.append("stat_time>=:stat_from")
params["stat_from"] = stat_from
if stat_to is not None:
conditions.append("stat_time<=:stat_to")
params["stat_to"] = stat_to
where_clause = "WHERE " + " AND ".join(conditions)
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_result {where_clause} "
"ORDER BY stat_time DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["extra_dims"])
results.append(data)
return results
def latest_metric_result(metric_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text(
"""
SELECT * FROM metric_result
WHERE metric_id=:metric_id
ORDER BY stat_time DESC
LIMIT 1
"""
),
{"metric_id": metric_id},
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["extra_dims"])
return data
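# Illustrative end-to-end sketch, not part of the original module: chain the registry helpers to
# define a metric, record a manual run, and read the latest materialised value back. The field
# values are hypothetical; only fields referenced by the helpers above are set, assuming the
# remaining model fields are optional.
def _example_metric_lifecycle() -> Dict[str, Any]:
    metric = create_metric(
        MetricCreate(
            metric_code="dau",
            metric_name="Daily Active Users",
            base_sql="SELECT dt, COUNT(DISTINCT user_id) AS dau FROM events GROUP BY dt",
            is_active=True,
        )
    )
    # Version starts at 1; editing base_sql later through update_metric refreshes sql_hash.
    run = trigger_metric_run(
        MetricRunTrigger(metric_id=metric["id"], triggered_by="manual")
    )
    latest = latest_metric_result(metric["id"])  # None until write_metric_results has stored rows
    return {"metric": metric, "run": run, "latest": latest}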

View File

@ -1,83 +0,0 @@
from __future__ import annotations
import logging
from typing import Any, Sequence
import httpx
from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.settings import RAG_API_AUTH_TOKEN, RAG_API_BASE_URL
logger = logging.getLogger(__name__)
class RagAPIClient:
"""Thin async client wrapper around the RAG endpoints described in doc/rag-api.md."""
def __init__(self, *, base_url: str | None = None, auth_token: str | None = None) -> None:
resolved_base = base_url or RAG_API_BASE_URL
self._base_url = resolved_base.rstrip("/")
self._auth_token = auth_token or RAG_API_AUTH_TOKEN
def _headers(self) -> dict[str, str]:
headers = {"Content-Type": "application/json"}
if self._auth_token:
headers["Authorization"] = f"Bearer {self._auth_token}"
return headers
async def _post(
self,
client: httpx.AsyncClient,
path: str,
payload: Any,
) -> Any:
url = f"{self._base_url}{path}"
try:
response = await client.post(url, json=payload, headers=self._headers())
response.raise_for_status()
except httpx.HTTPStatusError as exc:
status_code = exc.response.status_code if exc.response else None
response_text = exc.response.text if exc.response else ""
logger.error(
"RAG API responded with an error (%s) for %s: %s",
status_code,
url,
response_text,
exc_info=True,
)
raise ProviderAPICallError(
"RAG API call failed.",
status_code=status_code,
response_text=response_text,
) from exc
except httpx.HTTPError as exc:
logger.error("Transport error calling RAG API %s: %s", url, exc, exc_info=True)
raise ProviderAPICallError(f"RAG API call failed: {exc}") from exc
try:
return response.json()
except ValueError:
logger.warning("RAG API returned non-JSON response for %s; returning raw text.", url)
return {"raw": response.text}
async def add(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/add", body)
async def add_batch(self, client: httpx.AsyncClient, items: Sequence[RagItemPayload]) -> Any:
body = [item.model_dump(by_alias=True, exclude_none=True) for item in items]
return await self._post(client, "/rag/addBatch", body)
async def update(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/update", body)
async def delete(self, client: httpx.AsyncClient, payload: RagDeleteRequest) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/delete", body)
async def retrieve(self, client: httpx.AsyncClient, payload: RagRetrieveRequest) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/retrieve", body)

View File

@ -1,857 +0,0 @@
from __future__ import annotations
import asyncio
import json
import logging
import os
import re
from datetime import date, datetime
from dataclasses import asdict, dataclass, is_dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import httpx
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.data_context import AbstractDataContext
from great_expectations.exceptions import DataContextError, MetricResolutionError
from app.exceptions import ProviderAPICallError
from app.models import TableProfilingJobRequest
from app.services import LLMGateway
from app.settings import DEFAULT_IMPORT_MODEL
from app.services.import_analysis import (
IMPORT_GATEWAY_BASE_URL,
resolve_provider_from_model,
)
from app.utils.llm_usage import extract_usage as extract_llm_usage
logger = logging.getLogger(__name__)
GE_REPORT_RELATIVE_PATH = Path("uncommitted") / "data_docs" / "local_site" / "index.html"
PROMPT_FILENAMES = {
"ge_result_desc": "ge_result_desc_prompt.md",
"snippet_generator": "snippet_generator.md",
"snippet_alias": "snippet_alias_generator.md",
}
DEFAULT_CHAT_TIMEOUT_SECONDS = 180.0
@dataclass
class GEProfilingArtifacts:
profiling_result: Dict[str, Any]
profiling_summary: Dict[str, Any]
docs_path: str
@dataclass
class LLMCallResult:
data: Any
usage: Optional[Dict[str, Any]] = None
class PipelineActionType:
GE_PROFILING = "ge_profiling"
GE_RESULT_DESC = "ge_result_desc"
SNIPPET = "snippet"
SNIPPET_ALIAS = "snippet_alias"
def _project_root() -> Path:
return Path(__file__).resolve().parents[2]
def _prompt_dir() -> Path:
return _project_root() / "prompt"
@lru_cache(maxsize=None)
def _load_prompt_parts(filename: str) -> Tuple[str, str]:
prompt_path = _prompt_dir() / filename
if not prompt_path.exists():
raise FileNotFoundError(f"Prompt template not found: {prompt_path}")
raw = prompt_path.read_text(encoding="utf-8")
splitter = "用户消息User"  # section marker separating the system and user parts in the prompt markdown files
if splitter not in raw:
raise ValueError(f"Prompt template '{filename}' missing separator '{splitter}'.")
system_raw, user_raw = raw.split(splitter, maxsplit=1)
system_text = system_raw.replace("系统角色System", "").strip()
user_text = user_raw.strip()
return system_text, user_text
def _render_prompt(template_key: str, replacements: Dict[str, str]) -> Tuple[str, str]:
filename = PROMPT_FILENAMES[template_key]
system_text, user_template = _load_prompt_parts(filename)
rendered_user = user_template
for key, value in replacements.items():
rendered_user = rendered_user.replace(key, value)
return system_text, rendered_user
def _extract_timeout_seconds(options: Optional[Dict[str, Any]]) -> Optional[float]:
if not options:
return None
value = options.get("llm_timeout_seconds")
if value is None:
return None
try:
timeout = float(value)
if timeout <= 0:
raise ValueError
return timeout
except (TypeError, ValueError):
logger.warning(
"Invalid llm_timeout_seconds value in extra_options: %r. Falling back to default.",
value,
)
return DEFAULT_CHAT_TIMEOUT_SECONDS
def _extract_json_payload(content: str) -> str:
fenced = re.search(
r"```(?:json)?\s*([\s\S]+?)```",
content,
flags=re.IGNORECASE,
)
if fenced:
snippet = fenced.group(1).strip()
if snippet:
return snippet
stripped = content.strip()
if not stripped:
raise ValueError("Empty LLM content.")
decoder = json.JSONDecoder()
for idx, char in enumerate(stripped):
if char not in {"{", "["}:
continue
try:
_, end = decoder.raw_decode(stripped[idx:])
except json.JSONDecodeError:
continue
candidate = stripped[idx : idx + end].strip()
if candidate:
return candidate
return stripped
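# For example (illustrative): both '```json\n{"a": 1}\n```' and 'Answer: {"a": 1} done' reduce to
# the substring '{"a": 1}', which _parse_completion_payload below hands to json.loads.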
def _parse_completion_payload(response_payload: Dict[str, Any]) -> Any:
choices = response_payload.get("choices") or []
if not choices:
raise ProviderAPICallError("LLM response did not contain choices to parse.")
message = choices[0].get("message") or {}
content = message.get("content") or ""
if not content.strip():
raise ProviderAPICallError("LLM response content is empty.")
json_payload = _extract_json_payload(content)
try:
return json.loads(json_payload)
except json.JSONDecodeError as exc:
preview = json_payload[:800]
logger.error("Failed to parse JSON from LLM response: %s", preview, exc_info=True)
raise ProviderAPICallError("LLM response JSON parsing failed.") from exc
async def _post_callback(callback_url: str, payload: Dict[str, Any], client: httpx.AsyncClient) -> None:
safe_payload = _normalize_for_json(payload)
try:
logger.info(
"Posting pipeline action callback to %s: %s",
callback_url,
json.dumps(safe_payload, ensure_ascii=False),
)
response = await client.post(callback_url, json=safe_payload)
response.raise_for_status()
except httpx.HTTPError as exc:
logger.error("Callback delivery to %s failed: %s", callback_url, exc, exc_info=True)
def _sanitize_value_set(value: Any, max_values: int) -> Tuple[Any, Optional[Dict[str, int]]]:
if not isinstance(value, list):
return value, None
original_len = len(value)
if original_len <= max_values:
return value, None
trimmed = value[:max_values]
return trimmed, {"original_length": original_len, "retained": max_values}
def _sanitize_expectation_suite(suite: ExpectationSuite, max_value_set_values: int = 100) -> Dict[str, Any]:
suite_dict = suite.to_json_dict()
remarks: List[Dict[str, Any]] = []
for expectation in suite_dict.get("expectations", []):
kwargs = expectation.get("kwargs", {})
if "value_set" in kwargs:
sanitized_value, note = _sanitize_value_set(kwargs["value_set"], max_value_set_values)
kwargs["value_set"] = sanitized_value
if note:
expectation.setdefault("meta", {})
expectation["meta"]["value_set_truncated"] = note
remarks.append(
{
"column": kwargs.get("column"),
"expectation": expectation.get("expectation_type"),
"note": note,
}
)
if remarks:
suite_dict.setdefault("meta", {})
suite_dict["meta"]["value_set_truncations"] = remarks
return suite_dict
def _summarize_expectation_suite(suite_dict: Dict[str, Any]) -> Dict[str, Any]:
column_map: Dict[str, Dict[str, Any]] = {}
table_expectations: List[Dict[str, Any]] = []
for expectation in suite_dict.get("expectations", []):
expectation_type = expectation.get("expectation_type")
kwargs = expectation.get("kwargs", {})
column = kwargs.get("column")
summary_entry: Dict[str, Any] = {"expectation": expectation_type}
if "value_set" in kwargs and isinstance(kwargs["value_set"], list):
summary_entry["value_set_size"] = len(kwargs["value_set"])
summary_entry["value_set_preview"] = kwargs["value_set"][:5]
if column:
column_entry = column_map.setdefault(
column,
{"name": column, "expectations": []},
)
column_entry["expectations"].append(summary_entry)
else:
table_expectations.append(summary_entry)
summary = {
"column_profiles": list(column_map.values()),
"table_level_expectations": table_expectations,
"total_expectations": len(suite_dict.get("expectations", [])),
}
return summary
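# Illustrative summary shape produced above (actual values depend on the generated suite):
# {
#     "column_profiles": [
#         {"name": "user_id", "expectations": [{"expectation": "expect_column_values_to_not_be_null"}]}
#     ],
#     "table_level_expectations": [{"expectation": "expect_table_row_count_to_be_between"}],
#     "total_expectations": 2,
# }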
def _sanitize_identifier(raw: Optional[str], fallback: str) -> str:
if not raw:
return fallback
candidate = re.sub(r"[^0-9A-Za-z_]+", "_", raw).strip("_")
return candidate or fallback
def _format_connection_string(template: str, access_info: Dict[str, Any]) -> str:
if not access_info:
return template
try:
return template.format_map({k: v for k, v in access_info.items()})
except KeyError as exc:
missing = exc.args[0]
raise ValueError(f"table_access_info missing key '{missing}' required by connection_string.") from exc
def _ensure_sql_runtime_datasource(
context: AbstractDataContext,
datasource_name: str,
connection_string: str,
) -> None:
try:
datasource = context.get_datasource(datasource_name)
except (DataContextError, ValueError) as exc:
message = str(exc)
if "Could not find a datasource" in message or "Unable to load datasource" in message:
datasource = None
else: # pragma: no cover - defensive
raise RuntimeError(f"Failed to inspect datasource '{datasource_name}'.") from exc
except Exception as exc: # pragma: no cover - defensive
raise RuntimeError(f"Failed to inspect datasource '{datasource_name}'.") from exc
if datasource is not None:
execution_engine = getattr(datasource, "execution_engine", None)
current_conn = getattr(execution_engine, "connection_string", None)
if current_conn and current_conn != connection_string:
logger.info(
"Existing datasource %s uses different connection string; creating dedicated runtime datasource.",
datasource_name,
)
try:
context.delete_datasource(datasource_name)
except Exception as exc: # pragma: no cover - defensive
logger.warning(
"Failed to delete datasource %s before recreation: %s",
datasource_name,
exc,
)
else:
datasource = None
if datasource is not None:
return
runtime_datasource_config = {
"name": datasource_name,
"class_name": "Datasource",
"execution_engine": {
"class_name": "SqlAlchemyExecutionEngine",
"connection_string": connection_string,
},
"data_connectors": {
"runtime_connector": {
"class_name": "RuntimeDataConnector",
"batch_identifiers": ["default_identifier_name"],
}
},
}
try:
context.add_datasource(**runtime_datasource_config)
except Exception as exc: # pragma: no cover - defensive
raise RuntimeError(f"Failed to create runtime datasource '{datasource_name}'.") from exc
def _build_sql_runtime_batch_request(
context: AbstractDataContext,
request: TableProfilingJobRequest,
) -> RuntimeBatchRequest:
link_info = request.table_link_info or {}
access_info = request.table_access_info or {}
connection_template = link_info.get("connection_string")
if not connection_template:
raise ValueError("table_link_info.connection_string is required when using table_link_info.")
connection_string = _format_connection_string(connection_template, access_info)
source_type = (link_info.get("type") or "sql").lower()
if source_type != "sql":
raise ValueError(f"Unsupported table_link_info.type='{source_type}'. Only 'sql' is supported.")
query = link_info.get("query")
table_name = link_info.get("table") or link_info.get("table_name")
schema_name = link_info.get("schema")
if not query and not table_name:
raise ValueError("Either table_link_info.query or table_link_info.table must be provided.")
if not query:
if not table_name:
raise ValueError("table_link_info.table must be provided when query is omitted.")
identifier = re.compile(r"^[A-Za-z_][A-Za-z0-9_$]*$")
def _quote(name: str) -> str:
if identifier.match(name):
return name
return f"`{name.replace('`', '``')}`"
if schema_name:
schema_part = schema_name if "." not in schema_name else schema_name.split(".")[-1]
table_part = table_name if "." not in table_name else table_name.split(".")[-1]
qualified_table = f"{_quote(schema_part)}.{_quote(table_part)}"
else:
qualified_table = _quote(table_name)
query = f"SELECT * FROM {qualified_table}"
limit = link_info.get("limit")
if isinstance(limit, int) and limit > 0:
query = f"{query} LIMIT {limit}"
datasource_name = request.ge_datasource_name or _sanitize_identifier(
f"{request.table_id}_runtime_ds", "runtime_ds"
)
data_asset_name = request.ge_data_asset_name or _sanitize_identifier(
table_name or "runtime_query", "runtime_query"
)
_ensure_sql_runtime_datasource(context, datasource_name, connection_string)
batch_identifiers = {
"default_identifier_name": f"{request.table_id}:{request.version_ts}",
}
return RuntimeBatchRequest(
datasource_name=datasource_name,
data_connector_name="runtime_connector",
data_asset_name=data_asset_name,
runtime_parameters={"query": query},
batch_identifiers=batch_identifiers,
)
def _run_onboarding_assistant(
context: AbstractDataContext,
batch_request: Any,
suite_name: str,
) -> Tuple[ExpectationSuite, Any]:
assistant = context.assistants.onboarding
assistant_result = assistant.run(batch_request=batch_request)
suite = assistant_result.get_expectation_suite(expectation_suite_name=suite_name)
context.save_expectation_suite(suite, expectation_suite_name=suite_name)
validation_getter = getattr(assistant_result, "get_validation_result", None)
if callable(validation_getter):
validation_result = validation_getter()
else:
validation_result = getattr(assistant_result, "validation_result", None)
if validation_result is None:
# Fallback: rerun validation using the freshly generated expectation suite.
validator = context.get_validator(
batch_request=batch_request,
expectation_suite_name=suite_name,
)
validation_result = validator.validate()
return suite, validation_result
def _resolve_context(request: TableProfilingJobRequest) -> AbstractDataContext:
context_kwargs: Dict[str, Any] = {}
if request.ge_data_context_root:
context_kwargs["project_root_dir"] = request.ge_data_context_root
elif os.environ.get("GE_DATA_CONTEXT_ROOT"):
context_kwargs["project_root_dir"] = os.environ["GE_DATA_CONTEXT_ROOT"]
else:
context_kwargs["project_root_dir"] = str(_project_root())
return gx.get_context(**context_kwargs)
def _build_batch_request(
context: AbstractDataContext,
request: TableProfilingJobRequest,
) -> Any:
if request.ge_batch_request:
from great_expectations.core.batch import BatchRequest
return BatchRequest(**request.ge_batch_request)
if request.table_link_info:
return _build_sql_runtime_batch_request(context, request)
if not request.ge_datasource_name or not request.ge_data_asset_name:
raise ValueError(
"ge_batch_request or (ge_datasource_name and ge_data_asset_name) must be provided."
)
datasource = context.get_datasource(request.ge_datasource_name)
data_asset = datasource.get_asset(request.ge_data_asset_name)
return data_asset.build_batch_request()
async def _run_ge_profiling(request: TableProfilingJobRequest) -> GEProfilingArtifacts:
def _execute() -> GEProfilingArtifacts:
context = _resolve_context(request)
suite_name = (
request.ge_expectation_suite_name
or f"{request.table_id}_profiling"
)
batch_request = _build_batch_request(context, request)
try:
context.get_expectation_suite(suite_name)
except DataContextError:
context.add_expectation_suite(suite_name)
validator = context.get_validator(
batch_request=batch_request,
expectation_suite_name=suite_name,
)
profiler_type = (request.ge_profiler_type or "user_configurable").lower()
if profiler_type == "data_assistant":
suite, validation_result = _run_onboarding_assistant(
context,
batch_request,
suite_name,
)
else:
try:
from great_expectations.profile.user_configurable_profiler import (
UserConfigurableProfiler,
)
except ImportError as err: # pragma: no cover - dependency guard
raise RuntimeError(
"UserConfigurableProfiler is unavailable; install great_expectations profiling extra or switch profiler."
) from err
profiler = UserConfigurableProfiler(profile_dataset=validator)
try:
suite = profiler.build_suite()
context.save_expectation_suite(suite, expectation_suite_name=suite_name)
validator.expectation_suite = suite
validation_result = validator.validate()
except MetricResolutionError as exc:
logger.warning(
"UserConfigurableProfiler failed (%s); falling back to data assistant profiling.",
exc,
)
suite, validation_result = _run_onboarding_assistant(
context,
batch_request,
suite_name,
)
sanitized_suite = _sanitize_expectation_suite(suite)
summary = _summarize_expectation_suite(sanitized_suite)
validation_dict = validation_result.to_json_dict()
context.build_data_docs()
docs_path = Path(context.root_directory) / GE_REPORT_RELATIVE_PATH
profiling_result = {
"expectation_suite": sanitized_suite,
"validation_result": validation_dict,
"batch_request": getattr(batch_request, "to_json_dict", lambda: None)() or getattr(batch_request, "dict", lambda: None)(),
}
return GEProfilingArtifacts(
profiling_result=profiling_result,
profiling_summary=summary,
docs_path=str(docs_path),
)
return await asyncio.to_thread(_execute)
async def _call_chat_completions(
*,
model_spec: str,
system_prompt: str,
user_prompt: str,
client: httpx.AsyncClient,
temperature: float = 0.2,
timeout_seconds: Optional[float] = None,
) -> Any:
provider, model_name = resolve_provider_from_model(model_spec)
payload = {
"provider": provider.value,
"model": model_name,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"temperature": temperature,
}
payload_size_bytes = len(json.dumps(payload, ensure_ascii=False).encode("utf-8"))
url = f"{IMPORT_GATEWAY_BASE_URL.rstrip('/')}/v1/chat/completions"
try:
# Log the full request payload for traceability.
logger.info(
"Calling chat completions API %s with model %s and size %s and payload %s",
url,
model_name,
payload_size_bytes,
payload,
)
response = await client.post(url, json=payload, timeout=timeout_seconds)
response.raise_for_status()
except httpx.HTTPError as exc:
error_name = exc.__class__.__name__
detail = str(exc).strip()
if detail:
message = f"Chat completions request failed ({error_name}): {detail}"
else:
message = f"Chat completions request failed ({error_name})."
raise ProviderAPICallError(message) from exc
try:
response_payload = response.json()
except ValueError as exc:
raise ProviderAPICallError("Chat completions response was not valid JSON.") from exc
parsed_payload = _parse_completion_payload(response_payload)
usage_info = extract_llm_usage(response_payload)
return LLMCallResult(data=parsed_payload, usage=usage_info)
def _normalize_for_json(value: Any) -> Any:
if value is None or isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, (datetime, date)):
return str(value)
if hasattr(value, "model_dump"):
try:
return value.model_dump()
except Exception: # pragma: no cover - defensive
pass
if is_dataclass(value):
return asdict(value)
if isinstance(value, dict):
return {k: _normalize_for_json(v) for k, v in value.items()}
if isinstance(value, (list, tuple, set)):
return [_normalize_for_json(v) for v in value]
if hasattr(value, "to_json_dict"):
try:
return value.to_json_dict()
except Exception: # pragma: no cover - defensive
pass
if hasattr(value, "__dict__"):
return _normalize_for_json(value.__dict__)
return repr(value)
def _json_dumps(data: Any) -> str:
normalised = _normalize_for_json(data)
return json.dumps(normalised, ensure_ascii=False, indent=2)
def _preview_for_log(data: Any) -> str:
try:
serialised = _json_dumps(data)
except Exception:
serialised = repr(data)
return serialised
def _profiling_request_for_log(request: TableProfilingJobRequest) -> Dict[str, Any]:
payload = request.model_dump()
access_info = payload.get("table_access_info")
if isinstance(access_info, dict):
payload["table_access_info"] = {key: "***" for key in access_info.keys()}
return payload
async def _execute_result_desc(
profiling_json: Dict[str, Any],
_request: TableProfilingJobRequest,
llm_model: str,
client: httpx.AsyncClient,
timeout_seconds: Optional[float],
) -> LLMCallResult:
system_prompt, user_prompt = _render_prompt(
"ge_result_desc",
{"{{GE_RESULT_JSON}}": _json_dumps(profiling_json)},
)
llm_output = await _call_chat_completions(
model_spec=llm_model,
system_prompt=system_prompt,
user_prompt=user_prompt,
client=client,
timeout_seconds=timeout_seconds,
)
if not isinstance(llm_output.data, dict):
raise ProviderAPICallError("GE result description payload must be a JSON object.")
return llm_output
async def _execute_snippet_generation(
table_desc_json: Dict[str, Any],
_request: TableProfilingJobRequest,
llm_model: str,
client: httpx.AsyncClient,
timeout_seconds: Optional[float],
) -> LLMCallResult:
system_prompt, user_prompt = _render_prompt(
"snippet_generator",
{"{{TABLE_PROFILE_JSON}}": _json_dumps(table_desc_json)},
)
llm_output = await _call_chat_completions(
model_spec=llm_model,
system_prompt=system_prompt,
user_prompt=user_prompt,
client=client,
timeout_seconds=timeout_seconds,
)
if not isinstance(llm_output.data, list):
raise ProviderAPICallError("Snippet generator must return a JSON array.")
return llm_output
async def _execute_snippet_alias(
snippets_json: List[Dict[str, Any]],
_request: TableProfilingJobRequest,
llm_model: str,
client: httpx.AsyncClient,
timeout_seconds: Optional[float],
) -> LLMCallResult:
system_prompt, user_prompt = _render_prompt(
"snippet_alias",
{"{{SNIPPET_ARRAY}}": _json_dumps(snippets_json)},
)
llm_output = await _call_chat_completions(
model_spec=llm_model,
system_prompt=system_prompt,
user_prompt=user_prompt,
client=client,
timeout_seconds=timeout_seconds,
)
if not isinstance(llm_output.data, list):
raise ProviderAPICallError("Snippet alias generator must return a JSON array.")
return llm_output
async def _run_action_with_callback(
*,
action_type: str,
runner,
callback_base: Dict[str, Any],
client: httpx.AsyncClient,
callback_url: str,
input_payload: Any = None,
model_spec: Optional[str] = None,
) -> Any:
if input_payload is not None:
logger.info(
"Pipeline action %s input: %s",
action_type,
_preview_for_log(input_payload),
)
try:
result = await runner()
except Exception as exc:
failure_payload = dict(callback_base)
failure_payload.update(
{
"status": "failed",
"action_type": action_type,
"error": str(exc),
}
)
if model_spec is not None:
failure_payload["model"] = model_spec
await _post_callback(callback_url, failure_payload, client)
raise
usage_info: Optional[Dict[str, Any]] = None
result_payload = result
if isinstance(result, LLMCallResult):
usage_info = result.usage
result_payload = result.data
success_payload = dict(callback_base)
success_payload.update(
{
"status": "success",
"action_type": action_type,
}
)
if model_spec is not None:
success_payload["model"] = model_spec
logger.info(
"Pipeline action %s output: %s",
action_type,
_preview_for_log(result_payload),
)
if action_type == PipelineActionType.GE_PROFILING:
artifacts: GEProfilingArtifacts = result_payload
success_payload["ge_profiling_json"] = artifacts.profiling_result
success_payload["ge_profiling_summary"] = artifacts.profiling_summary
success_payload["ge_report_path"] = artifacts.docs_path
elif action_type == PipelineActionType.GE_RESULT_DESC:
success_payload["ge_result_desc_json"] = result_payload
elif action_type == PipelineActionType.SNIPPET:
success_payload["snippet_json"] = result_payload
elif action_type == PipelineActionType.SNIPPET_ALIAS:
success_payload["snippet_alias_json"] = result_payload
if usage_info:
success_payload["llm_usage"] = usage_info
await _post_callback(callback_url, success_payload, client)
return result_payload
async def process_table_profiling_job(
request: TableProfilingJobRequest,
_gateway: LLMGateway,
client: httpx.AsyncClient,
) -> None:
"""Sequentially execute the four-step profiling pipeline and emit callbacks per action."""
timeout_seconds = _extract_timeout_seconds(request.extra_options)
if timeout_seconds is None:
timeout_seconds = DEFAULT_CHAT_TIMEOUT_SECONDS
base_payload = {
"table_id": request.table_id,
"version_ts": request.version_ts,
"callback_url": str(request.callback_url),
"table_schema": request.table_schema,
"table_schema_version_id": request.table_schema_version_id,
"llm_model": request.llm_model,
"llm_timeout_seconds": timeout_seconds,
"workspace_id": request.workspace_id,
"rag_item_type": request.rag_item_type,
}
logging_request_payload = _profiling_request_for_log(request)
try:
artifacts: GEProfilingArtifacts = await _run_action_with_callback(
action_type=PipelineActionType.GE_PROFILING,
runner=lambda: _run_ge_profiling(request),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=logging_request_payload,
model_spec=request.llm_model,
)
table_desc_json: Dict[str, Any] = await _run_action_with_callback(
action_type=PipelineActionType.GE_RESULT_DESC,
runner=lambda: _execute_result_desc(
artifacts.profiling_result,
request,
request.llm_model,
client,
timeout_seconds,
),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=artifacts.profiling_result,
model_spec=request.llm_model,
)
snippet_json: List[Dict[str, Any]] = await _run_action_with_callback(
action_type=PipelineActionType.SNIPPET,
runner=lambda: _execute_snippet_generation(
table_desc_json,
request,
request.llm_model,
client,
timeout_seconds,
),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=table_desc_json,
model_spec=request.llm_model,
)
await _run_action_with_callback(
action_type=PipelineActionType.SNIPPET_ALIAS,
runner=lambda: _execute_snippet_alias(
snippet_json,
request,
request.llm_model,
client,
timeout_seconds,
),
callback_base=base_payload,
client=client,
callback_url=str(request.callback_url),
input_payload=snippet_json,
model_spec=request.llm_model,
)
except Exception: # pragma: no cover - defensive catch
logger.exception(
"Table profiling pipeline failed for table_id=%s version_ts=%s",
request.table_id,
request.version_ts,
)
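# Illustrative wiring sketch, not part of the original module: the pipeline is designed to run as a
# fire-and-forget background task, with per-action progress delivered to request.callback_url, so a
# caller only needs to hand it a request, a gateway, and a shared httpx client.
async def _example_schedule_profiling(
    request: TableProfilingJobRequest, gateway: LLMGateway
) -> None:
    async with httpx.AsyncClient(timeout=DEFAULT_CHAT_TIMEOUT_SECONDS) as client:
        await process_table_profiling_job(request, gateway, client)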

View File

@ -1,640 +0,0 @@
from __future__ import annotations
import hashlib
import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional, Sequence, Tuple
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
from app.db import get_engine
from app.models import ActionType, TableSnippetUpsertRequest, TableSnippetUpsertResponse
from app.schemas.rag import RagItemPayload
from app.services.rag_client import RagAPIClient
logger = logging.getLogger(__name__)
def _serialize_json(value: Any) -> Tuple[str | None, int | None]:
logger.debug("Serializing JSON payload: %s", value)
if value is None:
return None, None
if isinstance(value, str):
encoded = value.encode("utf-8")
return value, len(encoded)
serialized = json.dumps(value, ensure_ascii=False)
encoded = serialized.encode("utf-8")
return serialized, len(encoded)
def _prepare_table_schema(value: Any) -> str:
logger.debug("Preparing table_schema payload.")
if isinstance(value, str):
return value
return json.dumps(value, ensure_ascii=False)
def _prepare_model_params(params: Dict[str, Any] | None) -> str | None:
if not params:
return None
serialized, _ = _serialize_json(params)
return serialized
def _collect_common_columns(request: TableSnippetUpsertRequest) -> Dict[str, Any]:
# Build the base column set shared by all action types; action-specific fields are populated later.
logger.debug(
"Collecting common columns for table_id=%s version_ts=%s action_type=%s",
request.table_id,
request.version_ts,
request.action_type,
)
payload: Dict[str, Any] = {
"table_id": request.table_id,
"version_ts": request.version_ts,
"action_type": request.action_type.value,
"status": request.status.value,
"callback_url": str(request.callback_url),
"table_schema_version_id": request.table_schema_version_id,
"table_schema": _prepare_table_schema(request.table_schema),
"model": request.model,
"model_provider": request.model_provider,
}
payload.update(
{
"ge_profiling_json": None,
"ge_profiling_json_size_bytes": None,
"ge_profiling_summary": None,
"ge_profiling_summary_size_bytes": None,
"ge_profiling_total_size_bytes": None,
"ge_profiling_html_report_url": None,
"ge_result_desc_json": None,
"ge_result_desc_json_size_bytes": None,
"snippet_json": None,
"snippet_json_size_bytes": None,
"snippet_alias_json": None,
"snippet_alias_json_size_bytes": None,
}
)
payload["model_params"] = _prepare_model_params(request.model_params)
if request.llm_usage is not None:
llm_usage_json, _ = _serialize_json(request.llm_usage)
if llm_usage_json is not None:
payload["llm_usage"] = llm_usage_json
if request.error_code is not None:
logger.debug("Adding error_code: %s", request.error_code)
payload["error_code"] = request.error_code
if request.error_message is not None:
logger.debug("Adding error_message: %s", request.error_message)
payload["error_message"] = request.error_message
if request.started_at is not None:
payload["started_at"] = request.started_at
if request.finished_at is not None:
payload["finished_at"] = request.finished_at
if request.duration_ms is not None:
payload["duration_ms"] = request.duration_ms
if request.result_checksum is not None:
payload["result_checksum"] = request.result_checksum
logger.debug("Collected common payload: %s", payload)
return payload
def _apply_action_payload(
request: TableSnippetUpsertRequest,
payload: Dict[str, Any],
) -> None:
logger.debug("Applying action-specific payload for action_type=%s", request.action_type)
if request.action_type == ActionType.GE_PROFILING:
full_json, full_size = _serialize_json(request.ge_profiling_json)
summary_json, summary_size = _serialize_json(request.ge_profiling_summary)
if full_json is not None:
payload["ge_profiling_json"] = full_json
payload["ge_profiling_json_size_bytes"] = full_size
if summary_json is not None:
payload["ge_profiling_summary"] = summary_json
payload["ge_profiling_summary_size_bytes"] = summary_size
if request.ge_profiling_total_size_bytes is not None:
payload["ge_profiling_total_size_bytes"] = request.ge_profiling_total_size_bytes
elif full_size is not None or summary_size is not None:
payload["ge_profiling_total_size_bytes"] = (full_size or 0) + (summary_size or 0)
if request.ge_profiling_html_report_url:
payload["ge_profiling_html_report_url"] = request.ge_profiling_html_report_url
elif request.action_type == ActionType.GE_RESULT_DESC:
full_json, full_size = _serialize_json(request.ge_result_desc_json)
if full_json is not None:
payload["ge_result_desc_json"] = full_json
payload["ge_result_desc_json_size_bytes"] = full_size
elif request.action_type == ActionType.SNIPPET:
full_json, full_size = _serialize_json(request.snippet_json)
if full_json is not None:
payload["snippet_json"] = full_json
payload["snippet_json_size_bytes"] = full_size
elif request.action_type == ActionType.SNIPPET_ALIAS:
full_json, full_size = _serialize_json(request.snippet_alias_json)
if full_json is not None:
payload["snippet_alias_json"] = full_json
payload["snippet_alias_json_size_bytes"] = full_size
else:
logger.error("Unsupported action type encountered: %s", request.action_type)
raise ValueError(f"Unsupported action type '{request.action_type}'.")
logger.debug("Payload after applying action-specific data: %s", payload)
def _build_insert_statement(columns: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
logger.debug("Building insert statement for columns: %s", list(columns.keys()))
column_names = list(columns.keys())
placeholders = [f":{name}" for name in column_names]
update_assignments = [
f"{name}=VALUES({name})"
for name in column_names
if name not in {"table_id", "version_ts", "action_type"}
]
update_assignments.append("updated_at=CURRENT_TIMESTAMP")
sql = (
"INSERT INTO action_results ({cols}) VALUES ({vals}) "
"ON DUPLICATE KEY UPDATE {updates}"
).format(
cols=", ".join(column_names),
vals=", ".join(placeholders),
updates=", ".join(update_assignments),
)
logger.debug("Generated SQL: %s", sql)
return sql, columns
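# Illustrative output shape: given columns table_id, version_ts, action_type, status, the generated
# statement is roughly
#   INSERT INTO action_results (table_id, version_ts, action_type, status)
#   VALUES (:table_id, :version_ts, :action_type, :status)
#   ON DUPLICATE KEY UPDATE status=VALUES(status), updated_at=CURRENT_TIMESTAMP
# i.e. the natural key (table_id, version_ts, action_type) is never overwritten on conflict.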
def _execute_upsert(engine: Engine, sql: str, params: Dict[str, Any]) -> int:
logger.info("Executing upsert for table_id=%s version_ts=%s action_type=%s", params.get("table_id"), params.get("version_ts"), params.get("action_type"))
with engine.begin() as conn:
result = conn.execute(text(sql), params)
logger.info("Rows affected: %s", result.rowcount)
return result.rowcount
def upsert_action_result(request: TableSnippetUpsertRequest) -> TableSnippetUpsertResponse:
logger.info(
"Received upsert request: table_id=%s version_ts=%s action_type=%s status=%s",
request.table_id,
request.version_ts,
request.action_type,
request.status,
)
logger.debug("Request payload: %s", request.model_dump())
columns = _collect_common_columns(request)
_apply_action_payload(request, columns)
sql, params = _build_insert_statement(columns)
logger.debug("Final SQL params: %s", params)
engine = get_engine()
try:
rowcount = _execute_upsert(engine, sql, params)
except SQLAlchemyError as exc:
logger.exception(
"Failed to upsert action result: table_id=%s version_ts=%s action_type=%s",
request.table_id,
request.version_ts,
request.action_type,
)
raise RuntimeError(f"Database operation failed: {exc}") from exc
# MySQL reports rowcount 1 for a fresh insert and 2 when ON DUPLICATE KEY UPDATE modifies an
# existing row, so rowcount > 1 flags that an existing record was updated rather than created.
updated = rowcount > 1
return TableSnippetUpsertResponse(
table_id=request.table_id,
version_ts=request.version_ts,
action_type=request.action_type,
status=request.status,
updated=updated,
)
def _decode_json_field(value: Any) -> Any:
"""Decode JSON columns that may be returned as str/bytes/dicts/lists."""
if value is None:
return None
if isinstance(value, (dict, list)):
return value
if isinstance(value, (bytes, bytearray)):
try:
value = value.decode("utf-8")
except Exception: # pragma: no cover - defensive
return None
if isinstance(value, str):
try:
return json.loads(value)
except json.JSONDecodeError:
logger.warning("Failed to decode JSON field: %s", value)
return None
return None
def _coerce_json_array(value: Any) -> List[Any]:
decoded = _decode_json_field(value)
return decoded if isinstance(decoded, list) else []
def _fetch_action_payload(
engine: Engine, table_id: int, version_ts: int, action_type: ActionType
) -> Optional[Dict[str, Any]]:
sql = text(
"""
SELECT id AS action_result_id, snippet_json, snippet_alias_json, updated_at, status
FROM action_results
WHERE table_id = :table_id
AND version_ts = :version_ts
AND action_type = :action_type
AND status IN ('success', 'partial')
ORDER BY CASE status WHEN 'success' THEN 0 ELSE 1 END, updated_at DESC
LIMIT 1
"""
)
with engine.connect() as conn:
row = conn.execute(
sql,
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": action_type.value,
},
).mappings().first()
return dict(row) if row else None
def _load_snippet_sources(
engine: Engine, table_id: int, version_ts: int
) -> Tuple[List[Any], List[Any], Optional[datetime], Optional[int], Optional[int]]:
alias_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET_ALIAS)
snippet_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET)
snippet_json = _coerce_json_array(alias_row.get("snippet_json") if alias_row else None)
alias_json = _coerce_json_array(alias_row.get("snippet_alias_json") if alias_row else None)
updated_at: Optional[datetime] = alias_row.get("updated_at") if alias_row else None
alias_action_id: Optional[int] = alias_row.get("action_result_id") if alias_row else None
snippet_action_id: Optional[int] = snippet_row.get("action_result_id") if snippet_row else None
if not snippet_json and snippet_row:
snippet_json = _coerce_json_array(snippet_row.get("snippet_json"))
if updated_at is None:
updated_at = snippet_row.get("updated_at")
if alias_action_id is None:
alias_action_id = snippet_action_id
if not updated_at and alias_row:
updated_at = alias_row.get("updated_at")
return snippet_json, alias_json, updated_at, alias_action_id, snippet_action_id
def _normalize_aliases(raw_aliases: Any) -> List[Dict[str, Any]]:
aliases: List[Dict[str, Any]] = []
seen: set[str] = set()
if not raw_aliases:
return aliases
if not isinstance(raw_aliases, list):
return aliases
for item in raw_aliases:
if isinstance(item, dict):
text_val = item.get("text")
if not text_val or text_val in seen:
continue
seen.add(text_val)
aliases.append({"text": text_val, "tone": item.get("tone")})
elif isinstance(item, str):
if item in seen:
continue
seen.add(item)
aliases.append({"text": item})
return aliases
def _normalize_str_list(values: Any) -> List[str]:
if not values:
return []
if not isinstance(values, list):
return []
seen: set[str] = set()
normalised: List[str] = []
for val in values:
if not isinstance(val, str):
continue
if val in seen:
continue
seen.add(val)
normalised.append(val)
return normalised
def _merge_alias_lists(primary: List[Dict[str, Any]], secondary: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
merged: List[Dict[str, Any]] = []
seen: set[str] = set()
for source in (primary, secondary):
for item in source:
if not isinstance(item, dict):
continue
text_val = item.get("text")
if not text_val or text_val in seen:
continue
seen.add(text_val)
merged.append({"text": text_val, "tone": item.get("tone")})
return merged
def _merge_str_lists(primary: List[str], secondary: List[str]) -> List[str]:
merged: List[str] = []
seen: set[str] = set()
for source in (primary, secondary):
for item in source:
if item in seen:
continue
seen.add(item)
merged.append(item)
return merged
def _build_alias_map(alias_payload: List[Any]) -> Dict[str, Dict[str, Any]]:
alias_map: Dict[str, Dict[str, Any]] = {}
for item in alias_payload:
if not isinstance(item, dict):
continue
alias_id = item.get("id")
if not alias_id:
continue
existing = alias_map.setdefault(
alias_id,
{"aliases": [], "keywords": [], "intent_tags": []},
)
existing["aliases"] = _merge_alias_lists(
existing["aliases"], _normalize_aliases(item.get("aliases"))
)
existing["keywords"] = _merge_str_lists(
existing["keywords"], _normalize_str_list(item.get("keywords"))
)
existing["intent_tags"] = _merge_str_lists(
existing["intent_tags"], _normalize_str_list(item.get("intent_tags"))
)
return alias_map
def merge_snippet_records_from_db(
table_id: int,
version_ts: int,
*,
engine: Optional[Engine] = None,
) -> List[Dict[str, Any]]:
"""
Load snippet + snippet_alias JSON from action_results after snippet_alias is stored,
then merge into a unified snippet object list ready for downstream RAG.
"""
engine = engine or get_engine()
snippets, aliases, updated_at, alias_action_id, snippet_action_id = _load_snippet_sources(
engine, table_id, version_ts
)
alias_map = _build_alias_map(aliases)
merged: List[Dict[str, Any]] = []
seen_ids: set[str] = set()
for snippet in snippets:
if not isinstance(snippet, dict):
continue
snippet_id = snippet.get("id")
if not snippet_id:
continue
alias_info = alias_map.get(snippet_id)
record = dict(snippet)
record_aliases = _normalize_aliases(record.get("aliases"))
record_keywords = _normalize_str_list(record.get("keywords"))
record_intents = _normalize_str_list(record.get("intent_tags"))
if alias_info:
record_aliases = _merge_alias_lists(record_aliases, alias_info["aliases"])
record_keywords = _merge_str_lists(record_keywords, alias_info["keywords"])
record_intents = _merge_str_lists(record_intents, alias_info["intent_tags"])
record["aliases"] = record_aliases
record["keywords"] = record_keywords
record["intent_tags"] = record_intents
record["table_id"] = table_id
record["version_ts"] = version_ts
record["updated_at_from_action"] = updated_at
record["source"] = "snippet"
record["action_result_id"] = alias_action_id or snippet_action_id
merged.append(record)
seen_ids.add(snippet_id)
for alias_id, alias_info in alias_map.items():
if alias_id in seen_ids:
continue
if alias_action_id is None and snippet_action_id is None:
continue
merged.append(
{
"id": alias_id,
"aliases": alias_info["aliases"],
"keywords": alias_info["keywords"],
"intent_tags": alias_info["intent_tags"],
"table_id": table_id,
"version_ts": version_ts,
"updated_at_from_action": updated_at,
"source": "alias_only",
"action_result_id": alias_action_id or snippet_action_id,
}
)
return merged
def _stable_rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
digest = hashlib.md5(f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")).hexdigest()
return int(digest[:16], 16) % 9_000_000_000_000_000_000
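# The id above is deterministic for a given (table_id, version_ts, snippet_id), so re-running the
# ingestion overwrites the same rag_snippet row (via the delete-then-insert in
# _upsert_rag_snippet_rows below) instead of accumulating duplicates.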
def _to_serializable(value: Any) -> Any:
if value is None or isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, datetime):
return value.isoformat()
if isinstance(value, dict):
return {k: _to_serializable(v) for k, v in value.items()}
if isinstance(value, list):
return [_to_serializable(v) for v in value]
return str(value)
def _build_rag_text(snippet: Dict[str, Any]) -> str:
# Deterministic text concatenation for embedding input.
parts: List[str] = []
def _add(label: str, value: Any) -> None:
if value is None:
return
if isinstance(value, list):
value = ", ".join([str(v) for v in value if v])
elif isinstance(value, dict):
value = json.dumps(value, ensure_ascii=False)
if value:
parts.append(f"{label}: {value}")
_add("Title", snippet.get("title") or snippet.get("id"))
_add("Description", snippet.get("desc"))
_add("Business", snippet.get("business_caliber"))
_add("Type", snippet.get("type"))
_add("Examples", snippet.get("examples") or [])
_add("Aliases", [a.get("text") for a in snippet.get("aliases") or [] if isinstance(a, dict)])
_add("Keywords", snippet.get("keywords") or [])
_add("IntentTags", snippet.get("intent_tags") or [])
_add("Applicability", snippet.get("applicability"))
_add("DialectSQL", snippet.get("dialect_sql"))
return "\n".join(parts)
def _prepare_rag_payloads(
snippets: List[Dict[str, Any]],
table_id: int,
version_ts: int,
workspace_id: int,
rag_item_type: str = "SNIPPET",
) -> Tuple[List[Dict[str, Any]], List[RagItemPayload]]:
rows: List[Dict[str, Any]] = []
payloads: List[RagItemPayload] = []
now = datetime.utcnow()
for snippet in snippets:
snippet_id = snippet.get("id")
if not snippet_id:
continue
action_result_id = snippet.get("action_result_id")
if action_result_id is None:
logger.warning(
"Skipping snippet without action_result_id for RAG ingestion (table_id=%s version_ts=%s snippet_id=%s)",
table_id,
version_ts,
snippet_id,
)
continue
rag_item_id = _stable_rag_item_id(table_id, version_ts, snippet_id)
rag_text = _build_rag_text(snippet)
serializable_snippet = _to_serializable(snippet)
merged_json = json.dumps(serializable_snippet, ensure_ascii=False)
updated_at_raw = snippet.get("updated_at_from_action") or now
if isinstance(updated_at_raw, str):
try:
updated_at = datetime.fromisoformat(updated_at_raw)
except ValueError:
updated_at = now
else:
updated_at = updated_at_raw if isinstance(updated_at_raw, datetime) else now
created_at = updated_at
row = {
"rag_item_id": rag_item_id,
"workspace_id": workspace_id,
"table_id": table_id,
"version_ts": version_ts,
"created_at": created_at,
"action_result_id": action_result_id,
"snippet_id": snippet_id,
"rag_text": rag_text,
"merged_json": merged_json,
"updated_at": updated_at,
}
rows.append(row)
payloads.append(
RagItemPayload(
id=rag_item_id,
workspaceId=workspace_id,
name=snippet.get("title") or snippet_id,
embeddingData=rag_text,
type=rag_item_type or "SNIPPET",
)
)
return rows, payloads
def _upsert_rag_snippet_rows(engine: Engine, rows: Sequence[Dict[str, Any]]) -> None:
if not rows:
return
delete_sql = text("DELETE FROM rag_snippet WHERE rag_item_id=:rag_item_id")
insert_sql = text(
"""
INSERT INTO rag_snippet (
rag_item_id,
workspace_id,
table_id,
version_ts,
created_at,
action_result_id,
snippet_id,
rag_text,
merged_json,
updated_at
) VALUES (
:rag_item_id,
:workspace_id,
:table_id,
:version_ts,
:created_at,
:action_result_id,
:snippet_id,
:rag_text,
:merged_json,
:updated_at
)
"""
)
with engine.begin() as conn:
for row in rows:
conn.execute(delete_sql, row)
conn.execute(insert_sql, row)
async def ingest_snippet_rag_from_db(
table_id: int,
version_ts: int,
*,
workspace_id: int,
rag_item_type: str = "SNIPPET",
client,
engine: Optional[Engine] = None,
rag_client: Optional[RagAPIClient] = None,
) -> List[int]:
"""
Merge snippet + alias JSON from action_results, persist to rag_snippet, then push to RAG via addBatch.
Returns list of rag_item_id ingested.
"""
engine = engine or get_engine()
snippets = merge_snippet_records_from_db(table_id, version_ts, engine=engine)
if not snippets:
logger.info(
"No snippets available for RAG ingestion (table_id=%s version_ts=%s)",
table_id,
version_ts,
)
return []
rows, payloads = _prepare_rag_payloads(
snippets,
table_id=table_id,
version_ts=version_ts,
workspace_id=workspace_id,
rag_item_type=rag_item_type,
)
_upsert_rag_snippet_rows(engine, rows)
rag_client = rag_client or RagAPIClient()
await rag_client.add_batch(client, payloads)
return [row["rag_item_id"] for row in rows]
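
A hedged usage sketch for `ingest_snippet_rag_from_db`. The import path, the IDs, and the choice of `httpx.AsyncClient` as the shared HTTP client are assumptions for illustration; `engine` and `rag_client` fall back to the module defaults shown above.

```python
import asyncio

import httpx

# Hypothetical import path; adjust to wherever this module actually lives.
from app.services.rag_ingest import ingest_snippet_rag_from_db


async def main() -> None:
    # Shared async HTTP client handed through to RagAPIClient.add_batch.
    async with httpx.AsyncClient(timeout=30.0) as client:
        rag_item_ids = await ingest_snippet_rag_from_db(
            table_id=42,             # example table id (assumption)
            version_ts=1761752207,   # example version timestamp (assumption)
            workspace_id=1,          # target RAG workspace (assumption)
            client=client,
        )
        print(f"Ingested {len(rag_item_ids)} rag_snippet rows")


if __name__ == "__main__":
    asyncio.run(main())
```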

View File

@ -20,11 +20,7 @@ PROVIDER_KEY_ENV_MAP: Dict[str, str] = {
}
DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "deepseek:deepseek-chat")
DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "openai:gpt-4.1-mini")
NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL")
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN")
RAG_API_BASE_URL = os.getenv("RAG_API_BASE_URL", "https://tchatbi.agentcarrier.cn/chatbi/api")
RAG_API_AUTH_TOKEN = os.getenv("RAG_API_AUTH_TOKEN")
@lru_cache(maxsize=1)

View File

@ -1,116 +0,0 @@
from __future__ import annotations
from typing import Any, Dict, Iterable, Optional
PROMPT_TOKEN_KEYS: tuple[str, ...] = ("prompt_tokens", "input_tokens", "promptTokenCount")
COMPLETION_TOKEN_KEYS: tuple[str, ...] = (
"completion_tokens",
"output_tokens",
"candidatesTokenCount",
)
TOTAL_TOKEN_KEYS: tuple[str, ...] = ("total_tokens", "totalTokenCount")
USAGE_CONTAINER_KEYS: tuple[str, ...] = ("usage", "usageMetadata", "usage_metadata")
def _normalize_usage_value(value: Any) -> Any:
if isinstance(value, (int, float)):
return int(value)
if isinstance(value, str):
stripped = value.strip()
if not stripped:
return None
try:
numeric = float(stripped)
except ValueError:
return None
return int(numeric)
if isinstance(value, dict):
normalized: Dict[str, Any] = {}
for key, nested_value in value.items():
normalized_value = _normalize_usage_value(nested_value)
if normalized_value is not None:
normalized[key] = normalized_value
return normalized or None
if isinstance(value, (list, tuple, set)):
normalized_list = [
item for item in (_normalize_usage_value(element) for element in value) if item is not None
]
return normalized_list or None
return None
def _first_numeric(payload: Dict[str, Any], keys: Iterable[str]) -> Optional[int]:
for key in keys:
value = payload.get(key)
if isinstance(value, (int, float)):
return int(value)
return None
def _canonicalize_counts(payload: Dict[str, Any]) -> None:
prompt = _first_numeric(payload, PROMPT_TOKEN_KEYS)
completion = _first_numeric(payload, COMPLETION_TOKEN_KEYS)
total = _first_numeric(payload, TOTAL_TOKEN_KEYS)
if prompt is not None:
payload["prompt_tokens"] = prompt
else:
payload.pop("prompt_tokens", None)
if completion is not None:
payload["completion_tokens"] = completion
else:
payload.pop("completion_tokens", None)
if total is not None:
payload["total_tokens"] = total
elif prompt is not None and completion is not None:
payload["total_tokens"] = prompt + completion
else:
payload.pop("total_tokens", None)
for alias in PROMPT_TOKEN_KEYS[1:]:
payload.pop(alias, None)
for alias in COMPLETION_TOKEN_KEYS[1:]:
payload.pop(alias, None)
for alias in TOTAL_TOKEN_KEYS[1:]:
payload.pop(alias, None)
def _extract_usage_container(candidate: Any) -> Optional[Dict[str, Any]]:
if not isinstance(candidate, dict):
return None
for key in USAGE_CONTAINER_KEYS:
value = candidate.get(key)
if isinstance(value, dict):
return value
return None
def extract_usage(payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Unified helper to parse token usage metadata from diverse provider responses."""
if not isinstance(payload, dict):
return None
usage_candidate = _extract_usage_container(payload)
if usage_candidate is None:
raw_section = payload.get("raw")
usage_candidate = _extract_usage_container(raw_section)
if usage_candidate is None:
return None
normalized = _normalize_usage_value(usage_candidate)
if not isinstance(normalized, dict):
return None
_canonicalize_counts(normalized)
return normalized or None
__all__ = ["extract_usage"]
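
For context, a small sketch of what the removed `extract_usage` helper returns for the DeepSeek `usage` block recorded in `deepseek-result.json` below; the import path is an assumption and the payload is abbreviated.

```python
# Hypothetical import path for the helper above (it was removed on this branch).
from app.llm.usage import extract_usage

response = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "raw": {
        "usage": {
            "prompt_tokens": 1078,
            "completion_tokens": 256,
            "total_tokens": 1334,
            "prompt_cache_hit_tokens": 1024,
            "prompt_cache_miss_tokens": 54,
        }
    },
}

# Falls back to the "raw" section when no top-level usage container is present,
# normalizes values to ints, and keeps provider-specific extras as-is.
print(extract_usage(response))
# {'prompt_tokens': 1078, 'completion_tokens': 256, 'total_tokens': 1334,
#  'prompt_cache_hit_tokens': 1024, 'prompt_cache_miss_tokens': 54}
```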

41
deepseek-result.json Normal file
View File

@ -0,0 +1,41 @@
{
"provider": "deepseek",
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
}
}
],
"raw": {
"id": "67f3cc80-38bc-4bb7-b336-48d4886722c4",
"object": "chat.completion",
"created": 1761752207,
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
},
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 1078,
"completion_tokens": 256,
"total_tokens": 1334,
"prompt_tokens_details": {
"cached_tokens": 1024
},
"prompt_cache_hit_tokens": 1024,
"prompt_cache_miss_tokens": 54
},
"system_fingerprint": "fp_ffc7281d48_prod0820_fp8_kvcache"
}
}

View File

@ -1 +0,0 @@
{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["service_point_id"], "table": "data-ge.water_meter_info", "columns": [{"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}, {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.03666666666666667, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.9, "metric_candidate_score": 0.0}, {"name": "service_point_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列只有一个唯一值 '有效'。", "enumish": true, "null_rate": 
0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}], "quality": {"warning_hints": ["列 'meter_status' 只有一个唯一值 '有效',可能为常量列。"], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["表角色(role)被推断为 'dimension'因为其列几乎完全由ID和类别属性构成且缺少数值指标或时间序列列。", "主键候选(primary_key_candidates) 'service_point_id' 和 'account_id' 是基于命名约定(包含'_id'推断的。其唯一性和非空性未在GE结果中直接度量因此这是一个高置信度的猜测。", "表粒度(grain)可能为 'service_point',与推断的主键 'service_point_id' 相对应。", "未根据列名或数据格式识别出时间列。"], "primary_key_candidates": [["service_point_id"], ["account_id"]]}

View File

@ -1,180 +0,0 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"aliases": [
{
"text": "各个区有多少水表",
"tone": "口语"
},
{
"text": "按维度统计用水点数",
"tone": "中性"
},
{
"text": "各维度用水点数量分布",
"tone": "专业"
}
],
"keywords": [
"用水点数",
"service_point_count",
"数量",
"统计",
"汇总",
"aggregate",
"维度",
"dimension",
"区域",
"district",
"供水所",
"分组统计",
"水表"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn-service-points-by-dimension",
"aliases": [
{
"text": "哪个地方水表最多",
"tone": "口语"
},
{
"text": "用水点数Top-N排名",
"tone": "中性"
},
{
"text": "Top-N用水点数维度排行",
"tone": "专业"
}
],
"keywords": [
"Top-N",
"top",
"排名",
"排行",
"ranking",
"最多",
"用水点数",
"service_point_count",
"维度",
"dimension",
"站点",
"station",
"水表"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_ratio-service-points-by-dimension",
"aliases": [
{
"text": "各种水表各占多少",
"tone": "口语"
},
{
"text": "各维度用水点数占比",
"tone": "中性"
},
{
"text": "用水点维度构成分析",
"tone": "专业"
}
],
"keywords": [
"占比",
"percentage",
"百分比",
"ratio",
"构成",
"分布",
"用水点数",
"水表类型",
"meter_type",
"维度",
"dimension",
"水表"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_quality-check-duplicate-spid",
"aliases": [
{
"text": "有没有重复的水表号",
"tone": "口语"
},
{
"text": "检查重复的用水点ID",
"tone": "中性"
},
{
"text": "用水点ID唯一性校验",
"tone": "专业"
}
],
"keywords": [
"数据质量",
"quality",
"检查",
"校验",
"重复",
"duplicate",
"唯一性",
"uniqueness",
"用水点ID",
"service_point_id",
"异常检测",
"主键"
],
"intent_tags": [
"quality",
"by_dimension"
]
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"aliases": [
{
"text": "给我看城区的机械表",
"tone": "口语"
},
{
"text": "按多维度筛选用水点",
"tone": "中性"
},
{
"text": "多维组合条件过滤用水点",
"tone": "专业"
}
],
"keywords": [
"筛选",
"过滤",
"filter",
"查询",
"明细",
"列表",
"sample",
"用水点",
"区域",
"district",
"水表类型",
"meter_type",
"条件查询"
],
"intent_tags": [
"sample",
"filter"
]
}
]

View File

@ -1,186 +0,0 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"desc": "按指定维度(如区域、供水所)分组,统计各分类下的用水点数量。",
"type": "aggregate",
"title": "按维度统计用水点数",
"examples": [
"按区域统计用水点数量",
"各个供水所分别有多少个用水点"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "district"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"适用于对水表档案信息进行分类汇总统计。",
"可将变量 ${dimension_column} 替换为任一维度列,如 district, supply_office, station, meter_type 等。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数,代表一个独立的服务点(通常对应一个水表)。统计粒度为“指定维度”。"
},
{
"id": "snpt_topn-service-points-by-dimension",
"desc": "按指定维度如区域、站点统计用水点数并展示数量最多的前N个分类。",
"type": "topn",
"title": "Top-N 用水点数维度排名",
"examples": [
"哪个区域的用水点最多",
"用水点数排名前5的站点是哪些"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "station"
},
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC\nLIMIT ${top_n};"
},
"applicability": {
"constraints": {
"notes": [
"维度 `station` 基数较高 (36),建议 Top-N 查询时结合业务场景合理设置 N 值。"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数。排名依据为各维度分类下的用水点总数。统计粒度为“指定维度”。"
},
{
"id": "snpt_ratio-service-points-by-dimension",
"desc": "计算在指定维度下,各分类的用水点数占总用水点数的百分比,以分析其分布构成。",
"type": "ratio",
"title": "各维度用水点数占比",
"examples": [
"不同水表类型meter_type的分布情况",
"各个区域的用水点占比是多少"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "meter_type"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count,\n COUNT(DISTINCT service_point_id) * 100.0 / SUM(COUNT(DISTINCT service_point_id)) OVER () AS percentage\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"SQL模板使用了窗口函数 SUM() OVER()请确保MySQL版本支持8.0+)。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数占比:某分类下的用水点数 / 总用水点数。用水点数以 `service_point_id` 去重计数。统计粒度为“指定维度”。"
},
{
"id": "snpt_quality-check-duplicate-spid",
"desc": "查找在用水点信息表中存在重复的 `service_point_id`,用于数据质量校验。",
"type": "quality",
"title": "检查重复的用水点ID",
"examples": [
"检查是否存在重复的水表档案",
"校验用水点ID的唯一性"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n COUNT(*) AS occurrences\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n service_point_id\nHAVING\n COUNT(*) > 1;"
},
"applicability": {
"constraints": {
"notes": [
"预期返回结果为空。若有返回,则表示数据存在一致性问题,`service_point_id` 未能作为唯一主键。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "重复项:指 `service_point_id` 出现次数大于1的记录。此ID应为表的主键理论上不应重复。"
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"desc": "根据区域、水表类型、供水所等多个维度组合条件,筛选出符合条件的用水点明细。",
"type": "sample",
"title": "多维度筛选用水点列表",
"examples": [
"查询城区的机械表有哪些",
"拉取某个供水所下特定口径水表的列表"
],
"variables": [
{
"name": "district_name",
"type": "string",
"default": "城区"
},
{
"name": "meter_type_name",
"type": "string",
"default": "机械表"
},
{
"name": "limit_num",
"type": "int",
"default": 100
}
],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n account_id,\n district,\n supply_office,\n meter_type,\n meter_subtype,\n meter_diameter\nFROM\n `data-ge.water_meter_info`\nWHERE\n district = '${district_name}'\n AND meter_type = '${meter_type_name}'\n -- AND meter_status = '有效' -- 可选:根据画像,该列为常量'有效',可不加\nLIMIT ${limit_num};"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id",
"account_id",
"district",
"supply_office",
"meter_type",
"meter_subtype",
"meter_diameter"
]
},
"business_caliber": "返回满足所有筛选条件的用水点明细信息。`meter_status` 列只有一个值 '有效',通常无需作为筛选条件。"
}
]

View File

@ -1,230 +0,0 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空11 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空36 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.1,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空13 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空8 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;单一取值(\"有效\"",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空9 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空5 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空4 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示标识列;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.6,
"metric_candidate_score": 0.05
},
{
"name": "account_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示账户标识;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.5,
"metric_candidate_score": 0.05
}
],
"quality": {
"warning_hints": [
"以下列未设置非空校验service_point_id, account_id空值情况未知",
"未识别到时间列"
],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role 判定为 dimension表内列均为枚举/分类或ID未发现数值型度量或时间列34/34 期望均为分类枚举/非空与去重比例。",
"grain 猜测为 service_point_id仅依据命名启发式缺少唯一性与非空度量佐证置信度较低。",
"未识别时间列:列名与期望均未涉及日期/时间,也无最小/最大时间范围可推断。"
],
"primary_key_candidates": []
}

View File

@ -1,372 +0,0 @@
[
{
"id": "snpt_topn_station",
"aliases": [
{
"text": "站点水表排行前N",
"tone": "中性"
},
{
"text": "哪个站点表最多",
"tone": "口语"
},
{
"text": "按站点水表TopN",
"tone": "专业"
}
],
"keywords": [
"TopN",
"排名",
"排行",
"station",
"站点",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"排序",
"榜单"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_district",
"aliases": [
{
"text": "各辖区水表占比",
"tone": "中性"
},
{
"text": "哪个辖区占比高",
"tone": "口语"
},
{
"text": "按辖区水表比例",
"tone": "专业"
}
],
"keywords": [
"占比",
"ratio",
"district",
"辖区",
"水表数",
"meter count",
"distinct",
"去重",
"百分比",
"份额",
"聚合",
"排序",
"分布"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_dist_diameter",
"aliases": [
{
"text": "表径水表数分布",
"tone": "中性"
},
{
"text": "不同口径有多少",
"tone": "口语"
},
{
"text": "按表径去重计数",
"tone": "专业"
}
],
"keywords": [
"分布",
"distribution",
"meter_diameter",
"表径",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"类别",
"category",
"条形图",
"饼图",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_type_subtype_matrix",
"aliases": [
{
"text": "类型×子类水表数",
"tone": "中性"
},
{
"text": "看各类型各子类",
"tone": "口语"
},
{
"text": "类型子类组合统计",
"tone": "专业"
}
],
"keywords": [
"类型",
"type",
"子类",
"subtype",
"组合",
"matrix",
"交叉分析",
"cross-tab",
"水表数",
"meter count",
"distinct",
"去重",
"分布",
"聚合",
"维度"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_quality_spid_uniq",
"aliases": [
{
"text": "服务点ID唯一性检",
"tone": "专业"
},
{
"text": "服务点ID有重复吗",
"tone": "口语"
},
{
"text": "服务点ID完整性评估",
"tone": "中性"
}
],
"keywords": [
"质量检查",
"quality",
"唯一性",
"uniqueness",
"重复",
"duplicate",
"空值",
"NULL",
"完整性",
"integrity",
"service_point_id",
"数据质量",
"统计",
"去重",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_quality_account_nulls",
"aliases": [
{
"text": "账户ID缺失明细",
"tone": "中性"
},
{
"text": "看看哪些账户为空",
"tone": "口语"
},
{
"text": "account_id空值样本",
"tone": "专业"
}
],
"keywords": [
"质量检查",
"缺失",
"missing",
"空值",
"NULL",
"account_id",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"排查",
"过滤",
"WHERE",
"LIMIT"
],
"intent_tags": [
"quality",
"sample"
]
},
{
"id": "snpt_sample_random_rows",
"aliases": [
{
"text": "随机抽样水表明细",
"tone": "中性"
},
{
"text": "随机取几条看看",
"tone": "口语"
},
{
"text": "RAND()样本抽取",
"tone": "专业"
}
],
"keywords": [
"随机",
"random",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"details",
"质检",
"QA",
"RAND()",
"LIMIT",
"抽取",
"数据验证"
],
"intent_tags": [
"sample"
]
},
{
"id": "snpt_filter_office_type_where",
"aliases": [
{
"text": "按所与类型过滤有效",
"tone": "专业"
},
{
"text": "筛选某所的指定类型",
"tone": "中性"
},
{
"text": "只看这所的这种表",
"tone": "口语"
}
],
"keywords": [
"过滤",
"filter",
"WHERE",
"supply_office",
"营业所",
"meter_type",
"类型",
"meter_status",
"有效",
"条件片段",
"筛选",
"查询拼接",
"字段",
"约束"
],
"intent_tags": [
"filter"
]
},
{
"id": "snpt_office_station_dist",
"aliases": [
{
"text": "所站组合水表数",
"tone": "中性"
},
{
"text": "各站在各所有多少",
"tone": "口语"
},
{
"text": "营业所×站点分布",
"tone": "专业"
}
],
"keywords": [
"supply_office",
"营业所",
"station",
"站点",
"层级",
"hierarchy",
"分布",
"distribution",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"交叉分析",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_total_meter_baseline",
"aliases": [
{
"text": "水表总量基线",
"tone": "中性"
},
{
"text": "现在有多少水表",
"tone": "口语"
},
{
"text": "全表去重总数",
"tone": "专业"
}
],
"keywords": [
"总量",
"total",
"baseline",
"基线",
"水表总数",
"meter total",
"service_point_id",
"distinct",
"去重",
"分母",
"denominator",
"占比",
"聚合",
"汇总",
"snapshot"
],
"intent_tags": [
"aggregate"
]
}
]

View File

@ -1,330 +0,0 @@
[
{
"id": "snpt_topn_station",
"desc": "按站点统计水表数量并取前N",
"type": "topn",
"title": "站点TopN水表数",
"examples": [
"各站点水表数量排名前10",
"站点水表覆盖情况排行"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY station\nORDER BY meter_cnt DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"TopN建议N<=36",
"以service_point_id去重计数",
"无时间列,无法做趋势"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=站点。仅统计当前表中的有效记录不含时间口径。安全限制用于分析排名避免扩大LIMIT造成全量导出。"
},
{
"id": "snpt_share_district",
"desc": "统计各辖区水表数及其占比",
"type": "ratio",
"title": "辖区水表占比",
"examples": [
"各辖区水表占比",
"哪个辖区水表最多"
],
"variables": [],
"dialect_sql": {
"mysql": "WITH by_district AS (\n SELECT district, COUNT(DISTINCT service_point_id) AS meter_cnt\n FROM `data-ge`.`water_meter_info`\n GROUP BY district\n), tot AS (\n SELECT COUNT(DISTINCT service_point_id) AS total_cnt\n FROM `data-ge`.`water_meter_info`\n)\nSELECT b.district,\n b.meter_cnt,\n ROUND(b.meter_cnt / NULLIF(t.total_cnt, 0) * 100, 2) AS pct\nFROM by_district b\nCROSS JOIN tot t\nORDER BY pct DESC, b.district;"
},
"applicability": {
"constraints": {
"notes": [
"占比分母为全表service_point_id去重总数",
"service_point_id为空将被忽略"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": null,
"required_columns": [
"district",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=辖区。占比=辖区水表数/全表水表总数。安全限制:仅基于本表,不代表全市/全网口径;无时间维度。"
},
{
"id": "snpt_dist_diameter",
"desc": "按表径统计水表数量分布",
"type": "aggregate",
"title": "表径分布统计",
"examples": [
"不同口径水表有多少",
"查看表径分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_diameter,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_diameter\nORDER BY meter_cnt DESC, meter_diameter;"
},
"applicability": {
"constraints": {
"notes": [
"以service_point_id去重计数",
"适合绘制条形图/饼图"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": null,
"required_columns": [
"meter_diameter",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=表径。安全限制:仅用于分布分析,不含时间过滤;避免用于明细导出。"
},
{
"id": "snpt_type_subtype_matrix",
"desc": "统计水表类型与子类组合的数量",
"type": "aggregate",
"title": "类型子类分布",
"examples": [
"不同类型与子类的水表数量",
"查看类型与子类的组合分布"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type,\n meter_subtype,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_type, meter_subtype\nORDER BY meter_cnt DESC, meter_type, meter_subtype;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=5×9=45",
"以service_point_id去重计数"
],
"fk_join_available": false,
"dim_cardinality_hint": 45
},
"time_column": null,
"required_columns": [
"meter_type",
"meter_subtype",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=类型×子类组合。安全限制:仅用于汇总分析,不包含时间或业务状态变化。"
},
{
"id": "snpt_quality_spid_uniq",
"desc": "评估service_point_id的空值与重复情况",
"type": "quality",
"title": "服务点唯一性检",
"examples": [
"检查服务点ID是否唯一",
"统计service_point_id空值与重复情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n COUNT(*) AS total_rows,\n SUM(service_point_id IS NULL) AS null_cnt,\n COUNT(DISTINCT service_point_id) AS distinct_cnt,\n (COUNT(*) - COUNT(DISTINCT service_point_id)) AS duplicate_rows_est,\n (\n SELECT COUNT(*) FROM (\n SELECT service_point_id\n FROM `data-ge`.`water_meter_info`\n GROUP BY service_point_id\n HAVING COUNT(*) > 1\n ) AS dup\n ) AS dup_key_groups\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"用于键完整性检查",
"重复行估算=总行数-去重数"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "质量检查口径在本表内评估service_point_id的非空与唯一性不代表跨表全局唯一。安全限制仅输出汇总指标不暴露明细重复值。"
},
{
"id": "snpt_quality_account_nulls",
"desc": "抽取account_id为空的记录用于排查",
"type": "quality",
"title": "账户ID缺失明细",
"examples": [
"列出account_id为空的水表",
"抽样查看账户缺失的数据行"
],
"variables": [
{
"name": "limit_n",
"type": "int",
"default": 50
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nWHERE account_id IS NULL\nLIMIT {{limit_n}};"
},
"applicability": {
"constraints": {
"notes": [
"明细仅限小样本抽取",
"建议LIMIT<=100避免全量导出"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"account_id"
]
},
"business_caliber": "质量抽样筛出账户ID缺失的水表记录便于核对。安全限制仅用于样本排查不建议在生产中全量导出如需口径统计请改为COUNT聚合。"
},
{
"id": "snpt_sample_random_rows",
"desc": "随机抽取水表信息用于人工核验",
"type": "sample",
"title": "随机抽样明细",
"examples": [
"抽样查看水表信息",
"随机抽取20条做质检"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"使用RAND()随机,样本不可复现",
"建议限制样本量"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "样本抽取从本表随机返回若干行明细。安全限制避免扩大LIMIT进行全量下载如需可复现样本请改用带种子的随机方法MySQL不原生支持。"
},
{
"id": "snpt_filter_office_type_where",
"desc": "常用WHERE筛选条件片段按营业所与类型且为有效",
"type": "sample",
"title": "机构类型筛选片",
"examples": [
"筛选A营业所的机械表",
"仅查看某营业所的指定类型水表"
],
"variables": [
{
"name": "supply_office",
"type": "string"
},
{
"name": "meter_type",
"type": "string"
}
],
"dialect_sql": {
"mysql": "WHERE supply_office = '{{supply_office}}'\n AND meter_type = '{{meter_type}}'\n AND meter_status = '有效'"
},
"applicability": {
"constraints": {
"notes": [
"这是条件片段,可拼接到其他查询",
"meter_status当前为单一值“有效”"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": null,
"required_columns": [
"supply_office",
"meter_type",
"meter_status"
]
},
"business_caliber": "过滤口径仅保留指定营业所与指定水表类型、且状态为“有效”的记录。安全限制为片段用途需拼接在SELECT…FROM之后使用。"
},
{
"id": "snpt_office_station_dist",
"desc": "按营业所与站点组合统计水表数",
"type": "aggregate",
"title": "所站层级分布",
"examples": [
"按营业所查看各站点水表数",
"所站两级的水表分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT supply_office,\n station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY supply_office, station\nORDER BY supply_office, meter_cnt DESC, station;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=11×36=396",
"以service_point_id去重计数",
"如结果过长可再按TopN筛选"
],
"fk_join_available": false,
"dim_cardinality_hint": 396
},
"time_column": null,
"required_columns": [
"supply_office",
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=营业所×站点。安全限制:结果行数可能较多,建议在可视化端增加筛选或分页。"
},
{
"id": "snpt_total_meter_baseline",
"desc": "获取全表水表去重总量基线",
"type": "aggregate",
"title": "水表总量基线",
"examples": [
"当前有多少只水表",
"作为占比分析的分母基线"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT COUNT(DISTINCT service_point_id) AS meter_total\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"作为其他占比/分摊分母基线",
"忽略service_point_id为空的记录"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "水表总量=按service_point_id去重计数基于当前表的全量记录。安全限制无时间维度无法反映存量随时间变化。"
}
]

View File

@ -1,415 +0,0 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"account_id",
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {},
"comment": "供水管理所名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "宝山供水管理所"
},
{
"pct": null,
"value": "黄浦供水管理所"
},
{
"pct": null,
"value": "青东供水管理所"
},
{
"pct": null,
"value": "虹口供水管理所"
},
{
"pct": null,
"value": "闸北供水管理所"
},
{
"pct": null,
"value": "松北供水管理所"
},
{
"pct": null,
"value": "杨浦供水管理所"
},
{
"pct": null,
"value": "长宁供水管理所"
},
{
"pct": null,
"value": "闵行供水管理所"
},
{
"pct": null,
"value": "徐汇供水管理所"
},
{
"pct": null,
"value": "普陀供水管理所"
}
],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.11,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {},
"comment": "站点名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "新闸站"
},
{
"pct": null,
"value": "宝杨站"
},
{
"pct": null,
"value": "江川站"
},
{
"pct": null,
"value": "长江站"
},
{
"pct": null,
"value": "市光站"
},
{
"pct": null,
"value": "徐泾站"
},
{
"pct": null,
"value": "真北站"
},
{
"pct": null,
"value": "半淞园站"
},
{
"pct": null,
"value": "芙蓉江站"
},
{
"pct": null,
"value": "密云站"
}
],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.36,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {},
"comment": "行政区划名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "普陀区"
},
{
"pct": null,
"value": "闵行区"
},
{
"pct": null,
"value": "嘉定区"
},
{
"pct": null,
"value": "杨浦区"
},
{
"pct": null,
"value": "徐汇区"
},
{
"pct": null,
"value": "黄浦区"
},
{
"pct": null,
"value": "松江区"
},
{
"pct": null,
"value": "长宁区"
},
{
"pct": null,
"value": "青浦区"
},
{
"pct": null,
"value": "虹口区"
}
],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.13,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {},
"comment": "水表直径规格,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "20mm"
},
{
"pct": null,
"value": "15mm"
},
{
"pct": null,
"value": "25mm"
},
{
"pct": null,
"value": "40mm"
},
{
"pct": null,
"value": "150mm"
},
{
"pct": null,
"value": "100mm"
},
{
"pct": null,
"value": "80mm"
},
{
"pct": null,
"value": "50mm"
}
],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.08,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {},
"comment": "水表状态,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "有效"
}
],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.01,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {},
"comment": "水表子类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "旋翼半液封式"
},
{
"pct": null,
"value": "超声波式"
},
{
"pct": null,
"value": "旋翼湿式(指针式)"
},
{
"pct": null,
"value": "旋翼湿式(数字指针式)"
},
{
"pct": null,
"value": "电磁式"
},
{
"pct": null,
"value": "无直管段要求超声波式"
},
{
"pct": null,
"value": "无直管段要求电磁式"
},
{
"pct": null,
"value": "垂直螺翼干式"
},
{
"pct": null,
"value": "机械容积式"
}
],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.09,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {},
"comment": "水表类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "容积式机械水表"
},
{
"pct": null,
"value": "速度式机械水表"
},
{
"pct": null,
"value": "电磁式远传水表"
},
{
"pct": null,
"value": "速度式机电远传水表"
},
{
"pct": null,
"value": "超声波式远传水表"
}
],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {},
"comment": "安装位置,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "嵌墙表"
},
{
"pct": null,
"value": "管道井表"
},
{
"pct": null,
"value": "地下表"
},
{
"pct": null,
"value": "龙头表"
}
],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.04,
"metric_candidate_score": 0.0
},
{
"name": "account_id",
"dtype": "string",
"stats": {},
"comment": "账户ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "string",
"stats": {},
"comment": "服务点ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
}
],
"quality": {
"warning_hints": [],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role判定为dimension因所有列均为枚举或ID类型无metric列",
"grain依据account_id和service_point_id为唯一标识推测",
"未发现时间列因此time字段为null"
],
"primary_key_candidates": [
[
"account_id"
],
[
"service_point_id"
]
]
}

View File

@ -1,286 +0,0 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"aliases": [
{
"text": "供水所水表排行",
"tone": "中性"
},
{
"text": "哪个供水所水表最多",
"tone": "口语"
},
{
"text": "供水管理所水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"供水管理所",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"供水所",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_station",
"aliases": [
{
"text": "站点水表数量排行",
"tone": "中性"
},
{
"text": "哪个站点水表最多",
"tone": "口语"
},
{
"text": "站点维度水表TopN分析",
"tone": "专业"
}
],
"keywords": [
"水表",
"站点",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"站点数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_district",
"aliases": [
{
"text": "区域水表数量排名",
"tone": "中性"
},
{
"text": "哪个区水表最多",
"tone": "口语"
},
{
"text": "行政区水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"区域",
"行政区",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"区",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_share_by_type",
"aliases": [
{
"text": "水表类型占比",
"tone": "中性"
},
{
"text": "哪种水表用得最多",
"tone": "口语"
},
{
"text": "水表类型分布比例",
"tone": "专业"
}
],
"keywords": [
"水表",
"类型",
"占比",
"比例",
"ratio",
"分布",
"meter_type",
"百分比",
"分类统计",
"水表类型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_water_meter_subtype_distribution",
"aliases": [
{
"text": "水表子类型分布",
"tone": "中性"
},
{
"text": "各种子类型水表情况",
"tone": "口语"
},
{
"text": "水表子类型计数与占比",
"tone": "专业"
}
],
"keywords": [
"水表",
"子类型",
"分布",
"数量",
"占比",
"meter_subtype",
"统计",
"count",
"百分比",
"分类统计",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_installation_position_stats",
"aliases": [
{
"text": "安装位置统计",
"tone": "中性"
},
{
"text": "哪种位置装表最多",
"tone": "口语"
},
{
"text": "水表安装位置分布",
"tone": "专业"
}
],
"keywords": [
"水表",
"安装位置",
"统计",
"分布",
"installation_position",
"数量",
"count",
"位置",
"安装点",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_grain_check",
"aliases": [
{
"text": "主键粒度校验",
"tone": "中性"
},
{
"text": "数据有没有重复",
"tone": "口语"
},
{
"text": "数据粒度一致性检查",
"tone": "专业"
}
],
"keywords": [
"主键",
"粒度",
"校验",
"质量",
"quality",
"重复",
"唯一性",
"account_id",
"service_point_id",
"数据校验",
"质量检查",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_water_meter_sample_records",
"aliases": [
{
"text": "水表数据抽样",
"tone": "中性"
},
{
"text": "给我看点水表数据",
"tone": "口语"
},
{
"text": "水表记录样本抽取",
"tone": "专业"
}
],
"keywords": [
"水表",
"样本",
"抽样",
"sample",
"随机",
"记录",
"抽查",
"limit",
"数据结构",
"数据示例",
"sample",
"limit_rows"
],
"intent_tags": [
"sample"
]
}
]

View File

@ -1,235 +0,0 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"desc": "统计各供水管理所下辖水表数量并排序",
"type": "topn",
"title": "供水管理所水表数量排行",
"examples": [
"列出水表最多的前10个供水管理所",
"各供水所水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT supply_office AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY supply_office ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office"
]
},
"business_caliber": "按供水管理所维度聚合水表总数,粒度=供水管理所"
},
{
"id": "snpt_water_meter_top_station",
"desc": "统计各个站点下辖水表数量并排序",
"type": "topn",
"title": "站点水表数量排行",
"examples": [
"列出水表最多的前10个站点",
"各站点水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY station ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [
"高基数维度建议LIMIT<=50"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station"
]
},
"business_caliber": "按站点维度聚合水表总数,粒度=站点"
},
{
"id": "snpt_water_meter_top_district",
"desc": "统计各区水表数量并排序",
"type": "topn",
"title": "区域水表数量排行",
"examples": [
"列出各区水表数量排名",
"哪个区的水表最多?"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT district AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY district ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district"
]
},
"business_caliber": "按行政区划维度聚合水表总数,粒度=区"
},
{
"id": "snpt_water_meter_share_by_type",
"desc": "计算各类水表占总水表的比例",
"type": "ratio",
"title": "水表类型占比分布",
"examples": [
"各类水表占比是多少?",
"哪种类型的水表使用最广泛?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type AS dim_value, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`) AS ratio_percent FROM `data-ge.water_meter_info` GROUP BY meter_type ORDER BY ratio_percent DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type"
]
},
"business_caliber": "按水表类型分类计算其占比,粒度=水表类型"
},
{
"id": "snpt_water_meter_subtype_distribution",
"desc": "展示不同水表子类型的数量及比例",
"type": "aggregate",
"title": "水表子类型分布情况",
"examples": [
"各种子类型水表的数量和占比",
"哪种子类型水表最多?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_subtype AS dim_value, COUNT(*) AS count_value, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`), 2) AS percentage FROM `data-ge.water_meter_info` GROUP BY meter_subtype ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 9
},
"time_column": "nullable",
"required_columns": [
"meter_subtype"
]
},
"business_caliber": "按水表子类型进行计数和百分比统计,粒度=水表子类型"
},
{
"id": "snpt_water_meter_installation_position_stats",
"desc": "统计不同安装位置下的水表数量",
"type": "aggregate",
"title": "安装位置分布统计",
"examples": [
"各种安装位置的水表数量",
"哪种安装位置最为常见?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT installation_position AS dim_value, COUNT(*) AS count_value FROM `data-ge.water_meter_info` GROUP BY installation_position ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 4
},
"time_column": "nullable",
"required_columns": [
"installation_position"
]
},
"business_caliber": "按安装位置对水表进行分组计数,粒度=安装位置"
},
{
"id": "snpt_water_meter_grain_check",
"desc": "验证 account_id 和 service_point_id 是否构成唯一组合",
"type": "quality",
"title": "主键粒度校验",
"examples": [
"这张表的数据粒度是否正确?",
"是否存在重复的服务点记录?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT IF(COUNT(*) = COUNT(DISTINCT account_id, service_point_id), 'PASS', 'FAIL') AS grain_check_result FROM `data-ge.water_meter_info`"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "检验数据是否符合预期的主键粒度account_id + service_point_id"
},
{
"id": "snpt_water_meter_sample_records",
"desc": "随机抽取部分水表信息用于查看结构",
"type": "sample",
"title": "样本抽取",
"examples": [
"给我看几条水表数据的例子",
"抽查一些原始数据看看格式"
],
"variables": [
{
"name": "limit_rows",
"type": "int",
"default": 5
}
],
"dialect_sql": {
"mysql": "SELECT * FROM `data-ge.water_meter_info` ORDER BY RAND() LIMIT {{limit_rows}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": []
},
"business_caliber": "从全量数据中随机采样若干条记录供参考"
}
]

File diff suppressed because one or more lines are too long

View File

@ -1,249 +0,0 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"aliases": [
{
"text": "哪个供水所用户最多?",
"tone": "口语"
},
{
"text": "按供应办公室统计账户数量",
"tone": "中性"
},
{
"text": "供应办公室账户数TopN排名",
"tone": "专业"
}
],
"keywords": [
"供应办公室",
"账户数",
"TopN",
"排行",
"统计",
"account_id",
"supply_office",
"去重",
"高占比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_station_by_account",
"aliases": [
{
"text": "哪些站点用户最多?",
"tone": "口语"
},
{
"text": "按站点统计账户分布",
"tone": "中性"
},
{
"text": "站点账户数Top20排名",
"tone": "专业"
}
],
"keywords": [
"站点",
"账户数",
"TopN",
"排行",
"统计",
"station",
"account_id",
"去重",
"高负载",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_district_by_account",
"aliases": [
{
"text": "哪个区用户最多?",
"tone": "口语"
},
{
"text": "按行政区统计账户数量",
"tone": "中性"
},
{
"text": "行政区账户数全量排名",
"tone": "专业"
}
],
"keywords": [
"行政区",
"账户数",
"TopN",
"排行",
"统计",
"district",
"account_id",
"去重",
"区域对比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_of_meter_type",
"aliases": [
{
"text": "各类水表占多少比例?",
"tone": "口语"
},
{
"text": "水表类型占比分析",
"tone": "中性"
},
{
"text": "水表类型占比分布",
"tone": "专业"
}
],
"keywords": [
"水表类型",
"占比",
"比例",
"meter_type",
"account_id",
"去重",
"分布",
"主流类型",
"技术选型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_sample_account_service_point",
"aliases": [
{
"text": "随机看10条账户信息",
"tone": "口语"
},
{
"text": "抽样账户与服务点明细",
"tone": "中性"
},
{
"text": "账户-服务点随机抽样验证",
"tone": "专业"
}
],
"keywords": [
"抽样",
"随机",
"样本",
"account_id",
"service_point_id",
"数据质量",
"验证",
"唯一性",
"格式检查",
"sample",
"quality"
],
"intent_tags": [
"sample",
"quality"
]
},
{
"id": "snpt_filter_meter_status_valid",
"aliases": [
{
"text": "只取有效的水表记录",
"tone": "口语"
},
{
"text": "筛选有效水表记录",
"tone": "中性"
},
{
"text": "水表状态有效性过滤",
"tone": "专业"
}
],
"keywords": [
"有效",
"过滤",
"筛选",
"meter_status",
"质量检查",
"断言",
"清洗",
"filter",
"quality"
],
"intent_tags": [
"filter",
"quality"
]
},
{
"id": "snpt_filter_meter_diameter_20mm",
"aliases": [
{
"text": "找出所有20mm水表用户",
"tone": "口语"
},
{
"text": "筛选20mm水表记录",
"tone": "中性"
},
{
"text": "20mm口径水表子集提取",
"tone": "专业"
}
],
"keywords": [
"20mm",
"水表直径",
"过滤",
"筛选",
"meter_diameter",
"子集",
"分析",
"住宅用水",
"规格",
"filter",
"by_dimension"
],
"intent_tags": [
"filter",
"by_dimension"
]
}
]

View File

@ -1,227 +0,0 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"desc": "统计各供应办公室对应的账户数量,识别高占比管理所",
"type": "topn",
"title": "按供应办公室统计账户数",
"examples": [
"哪个供水管理所服务的用户最多?",
"列出前5个账户数最多的供应办公室"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 11
}
],
"dialect_sql": {
"mysql": "SELECT supply_office, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY supply_office\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"供应办公室仅11个唯一值可安全展示全部建议LIMIT 11避免冗余排序"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office",
"account_id"
]
},
"business_caliber": "粒度=供应办公室,指标=去重账户数account_id仅统计水表信息表中有效账户不关联外部表"
},
{
"id": "snpt_topn_station_by_account",
"desc": "统计各站点服务的账户数量,识别高负载站点",
"type": "topn",
"title": "按站点统计账户分布",
"examples": [
"哪些站点服务的用户最多?",
"TOP10用户最多的站点是哪些"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT station, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY station\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"站点有36个唯一值建议LIMIT<=20以避免结果过长高基数维度可能影响查询性能"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station",
"account_id"
]
},
"business_caliber": "粒度=站点station指标=去重账户数account_id基于水表信息表直接聚合不涉及时间维度"
},
{
"id": "snpt_topn_district_by_account",
"desc": "统计各行政区的账户数量,辅助区域资源分配分析",
"type": "topn",
"title": "按行政区统计账户分布",
"examples": [
"哪个区的用水账户最多?",
"列出所有行政区的账户数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 13
}
],
"dialect_sql": {
"mysql": "SELECT district, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY district\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"行政区共13个可完整展示适合用于区域对比分析"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district",
"account_id"
]
},
"business_caliber": "粒度=行政区district指标=去重账户数account_id基于水表信息表聚合反映各区域用户规模"
},
{
"id": "snpt_share_of_meter_type",
"desc": "计算各类水表类型在总账户中的占比,识别主流类型",
"type": "ratio",
"title": "水表类型占比分析",
"examples": [
"各类水表在用户中的占比是多少?",
"电磁式远传水表占总用户比例多少?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type, \n COUNT(DISTINCT account_id) AS account_count,\n ROUND(COUNT(DISTINCT account_id) * 100.0 / SUM(COUNT(DISTINCT account_id)) OVER (), 2) AS percentage\nFROM water_meter_info\nGROUP BY meter_type\nORDER BY account_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"水表类型仅5种适合计算占比可直接展示全量分布"
],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type",
"account_id"
]
},
"business_caliber": "粒度=水表类型meter_type指标=去重账户数占比,分母为全表去重账户总数,反映技术选型分布"
},
{
"id": "snpt_sample_account_service_point",
"desc": "随机抽取部分账户与服务点ID的原始记录用于数据质量核查",
"type": "sample",
"title": "抽样账户与服务点明细",
"examples": [
"随机查看10条账户与服务点的详细信息",
"抽样检查水表信息是否符合预期格式"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT account_id, service_point_id, supply_office, station, district, meter_diameter, meter_type, meter_subtype, installation_position\nFROM water_meter_info\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"主键组合为account_id+service_point_id适合抽样验证唯一性建议样本量≤100"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "粒度=单条水表记录抽取样本用于验证account_id与service_point_id的组合唯一性及维度字段完整性"
},
{
"id": "snpt_filter_meter_status_valid",
"desc": "过滤出水表状态为'有效'的记录,用于后续分析",
"type": "quality",
"title": "筛选有效水表记录",
"examples": [
"只取状态为有效的水表记录",
"确认所有水表是否均为有效状态"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_status = '有效';"
},
"applicability": {
"constraints": {
"notes": [
"meter_status仅存在'有效'值,此条件恒成立;可用于数据清洗流程的显式过滤"
],
"fk_join_available": false,
"dim_cardinality_hint": 1
},
"time_column": "nullable",
"required_columns": [
"meter_status"
]
},
"business_caliber": "仅保留水表状态为'有效'的记录,因全表均为有效值,此过滤为冗余但可作为数据质量校验的显式断言"
},
{
"id": "snpt_filter_meter_diameter_20mm",
"desc": "筛选水表直径为20mm的记录用于特定口径设备分析",
"type": "quality",
"title": "筛选20mm水表记录",
"examples": [
"找出所有使用20mm水表的用户",
"20mm水表分布在哪些站点"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_diameter = '20mm';"
},
"applicability": {
"constraints": {
"notes": [
"水表直径共8种枚举值20mm为常见规格可作为子集分析的起点"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": "nullable",
"required_columns": [
"meter_diameter"
]
},
"business_caliber": "粒度=单条水表记录筛选条件为meter_diameter='20mm',用于分析标准住宅用水表的分布特征"
}
]

View File

@ -1,57 +0,0 @@
# Add a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/add' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}'
# Batch add RAG items
curl --location --request POST 'http://127.0.0.1:8000/rag/addBatch' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '[
{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}
]'
# Update a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/update' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}'
# Delete a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/delete' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"type": "METRIC"
}'
# Retrieve RAG items
curl --location --request POST 'http://127.0.0.1:8000/rag/retrieve' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"query": "string",
"num": 0,
"workspaceId": 0,
"type": "METRIC"
}'

View File

@ -1,49 +0,0 @@
# Create a session
curl -X POST "/api/v1/chat/sessions" \
-H "Content-Type: application/json" \
-d "{\"user_id\": $CHAT_USER_ID}"
# Get a session
curl "/api/v1/chat/sessions/{session_id}"
# List sessions by user
curl "/api/v1/chat/sessions?user_id=$CHAT_USER_ID"
# Update session status
curl -X POST "/api/v1/chat/sessions/{session_id}/update" \
-H "Content-Type: application/json" \
-d '{"status":"PAUSED"}'
# Close a session
curl -X POST "/api/v1/chat/sessions/{session_id}/close"
# Create a chat turn
curl -X POST "/api/v1/chat/sessions/{session_id}/turns" \
-H "Content-Type: application/json" \
-d '{
"user_id": '"$CHAT_USER_ID"',
"user_query": "展示昨天订单GMV",
"intent": "METRIC_QUERY",
"ast_json": {"select":["gmv"],"where":{"dt":"yesterday"}},
"main_metric_ids": [1234],
"created_metric_ids": []
}'
# Get a single chat turn
curl "/api/v1/chat/turns/{turn_id}"
# List turns in a session
curl "/api/v1/chat/sessions/{session_id}/turns"
# Write retrieval results
curl -X POST "/api/v1/chat/turns/{turn_id}/retrievals" \
-H "Content-Type: application/json" \
-d '{
"retrievals": [
{"item_type":"METRIC","item_id":"metric_foo","used_in_sql":true,"rank_no":1},
{"item_type":"SNIPPET","item_id":"snpt_bar","similarity_score":0.77,"rank_no":2}
]
}'
# List retrieval results for a turn
curl "/api/v1/chat/turns/{turn_id}/retrievals"

View File

@ -1,69 +0,0 @@
# Create a metric
curl -X POST "/api/v1/metrics" \
-H "Content-Type: application/json" \
-d '{
"metric_code": "metric_1234",
"metric_name": "订单数",
"biz_domain": "order",
"biz_desc": "订单总数",
"base_sql": "select count(*) as order_cnt from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"metric_aliases": ["订单量"],
"created_by": '"$METRIC_USER_ID"'
}'
# Update a metric
curl -X POST "/api/v1/metrics/{metric_id}" \
-H "Content-Type: application/json" \
-d '{"metric_name":"订单数-更新","is_active":false}'
# Get a metric
curl "/api/v1/metrics/{metric_id}"
# Create a schedule
curl -X POST "/api/v1/metric-schedules" \
-H "Content-Type: application/json" \
-d '{"metric_id":{metric_id},"cron_expr":"0 2 * * *","priority":5,"enabled":true}'
# Update a schedule
curl -X POST "/api/v1/metric-schedules/{schedule_id}" \
-H "Content-Type: application/json" \
-d '{"enabled":false,"retry_times":1}'
# List schedules for a metric
curl "/api/v1/metrics/{metric_id}/schedules"
# Trigger a run
curl -X POST "/api/v1/metric-runs/trigger" \
-H "Content-Type: application/json" \
-d '{
"metric_id": {metric_id},
"triggered_by": "API",
"data_time_from": "2024-05-01T00:00:00Z",
"data_time_to": "2024-05-02T00:00:00Z"
}'
# List runs
curl "/api/v1/metric-runs?metric_id={metric_id}"
# Get a single run
curl "/api/v1/metric-runs/{run_id}"
# Write metric results
curl -X POST "/api/v1/metric-results/{metric_id}" \
-H "Content-Type: application/json" \
-d '{
"metric_id": {metric_id},
"results": [
{"stat_time":"2024-05-01T00:00:00Z","metric_value":123.45,"data_version":"{run_id}"},
{"stat_time":"2024-05-02T00:00:00Z","metric_value":234.56,"data_version":"{run_id}"}
]
}'
# 查询指标结果
curl "/api/v1/metric-results?metric_id={metric_id}"
# 查询最新结果
curl "/api/v1/metric-results/latest?metric_id={metric_id}"

View File

@ -1,83 +0,0 @@
One user question → parsed into one chat_turn → which metrics / knowledge / sessions that turn used (chat_turn_retrieval)
→ whether a new metric was created (metric_def)
→ whether a scheduled metric run was triggered (metric_job_run.turn_id)
→ which metric results were finally produced (metric_result.metric_id + stat_time)
Chat domain
schema
Session table: chat_session
Session turn table: chat_turn
Turn-retrieval association table: chat_turn_retrieval
API
1. Create a session
POST /api/v1/chat/sessions
2. Update a session
POST /api/v1/chat/sessions/{session_id}/update
3. Close a session
POST /api/v1/chat/sessions/{session_id}/close
4. Get a session
GET /api/v1/chat/sessions/{session_id}
5. List sessions (by user, time)
GET /api/v1/chat/sessions
6. Create a Q&A turn (user issues a query)
POST /api/v1/chat/sessions/{session_id}/turns
7. List all turns of a session
GET /api/v1/chat/sessions/{session_id}/turns
8. Get the details of a single turn
GET /api/v1/chat/turns/{turn_id}
9. Batch-write a turn's retrieval results
POST /api/v1/chat/turns/{turn_id}/retrievals
10. List a turn's retrieval records
GET /api/v1/chat/turns/{turn_id}/retrievals
11. Update a turn's retrieval records (in future)
POST /api/v1/chat/turns/{turn_id}/retrievals/update
Metadata domain
schema
Metric definition table: metric_def
API
12. Create a metric (from a Q&A turn or a conventional definition)
POST /api/v1/metrics
13. Update a metric
POST /api/v1/metrics/{id}
14. Get metric details
GET /api/v1/metrics/{id}
Execution / scheduling domain (tentatively Airflow)
schema
Metric schedule configuration table: metric_schedule
Schedule run record table: metric_job_run
API
1. Create a schedule configuration
POST /api/v1/metric-schedules
2. Update a schedule configuration
POST /api/v1/metric-schedules/{id}
3. Get the schedule configurations of a metric
GET /api/v1/metrics/{metric_id}/schedules
4. Manually trigger one metric run (e.g. from a chat query)
POST /api/v1/metric-runs/trigger
5. List run records
GET /api/v1/metric-runs
6. Get the details of a single run
GET /api/v1/metric-runs/{run_id}
Data domain
schema
Metric result table (long format): metric_result
API
1. Query metric results (by time range & dimensions)
GET /api/v1/metric-results
2. Point lookup (latest value)
GET /api/v1/metric-results/latest
3. Batch-write metric results
POST /api/v1/metric-results/{metric_id}
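
To make the lineage above concrete, here is a rough Python sketch of one question flowing through the chat and metric domains; the base URL, user id, and metric id are illustrative placeholders and error handling is omitted.

import httpx

BASE_URL = "http://localhost:8000"  # assumed deployment address

with httpx.Client(base_url=BASE_URL, timeout=30) as client:
    # 1. A user question opens a chat session.
    session = client.post("/api/v1/chat/sessions", json={"user_id": 1001}).json()

    # 2. The parsed question becomes one chat_turn in that session.
    turn = client.post(
        f"/api/v1/chat/sessions/{session['id']}/turns",
        json={"user_id": 1001, "user_query": "展示昨天订单GMV", "intent": "METRIC_QUERY"},
    ).json()

    # 3. Retrieval hits (metrics / snippets) used by the turn go into chat_turn_retrieval.
    client.post(
        f"/api/v1/chat/turns/{turn['id']}/retrievals",
        json={"retrievals": [
            {"item_type": "METRIC", "item_id": "metric_gmv", "used_in_sql": True, "rank_no": 1},
        ]},
    )

    # 4. The turn can trigger a metric run, recorded in metric_job_run.
    run = client.post(
        "/api/v1/metric-runs/trigger",
        json={"metric_id": 1234, "triggered_by": "API"},  # 1234 is a placeholder metric id
    ).json()

    print("session", session["id"], "turn", turn["id"], "run", run["id"])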

View File

@ -1,13 +0,0 @@
services:
app:
build: .
ports:
- "8060:8000"
volumes:
- .:/app
environment:
- PYTHONUNBUFFERED=1
# 开发模式:启用 --reload
command: uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
# 生产模式:注释上面 command取消注释下面这行
# command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 4

File diff suppressed because it is too large

View File

@ -1,21 +0,0 @@
CREATE TABLE `ecommerce_orders` (
`order_id` char(36) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'UUID from CSV',
`customer_id` int NOT NULL,
`product_id` int NOT NULL,
`category` varchar(64) COLLATE utf8mb4_unicode_ci NOT NULL,
`price` decimal(10,2) NOT NULL,
`quantity` int NOT NULL,
`order_date` datetime(6) NOT NULL,
`shipping_date` datetime(6) NOT NULL,
`delivery_status` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`payment_method` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`device_type` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`channel` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`shipping_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
`billing_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
`customer_segment` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (`order_id`),
KEY `idx_customer` (`customer_id`),
KEY `idx_product` (`product_id`),
KEY `idx_order_date` (`order_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
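
As a quick illustration of how this table might be consumed, a SQLAlchemy sketch that computes daily GMV as price × quantity; the connection URL is a placeholder.

from sqlalchemy import create_engine, text

# Placeholder DSN; point it at the database that holds ecommerce_orders.
engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")

DAILY_GMV_SQL = text(
    """
    SELECT DATE(order_date) AS order_day,
           SUM(price * quantity) AS gmv,
           COUNT(*)              AS order_cnt
    FROM ecommerce_orders
    GROUP BY DATE(order_date)
    ORDER BY order_day
    """
)

with engine.connect() as conn:
    for row in conn.execute(DAILY_GMV_SQL):
        print(row.order_day, row.gmv, row.order_cnt)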

View File

@ -1,40 +0,0 @@
CREATE TABLE `action_results` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
`table_id` bigint NOT NULL COMMENT '表ID',
`version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
`action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
`status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
`llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
`error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
`error_message` text COLLATE utf8mb4_bin,
`started_at` datetime DEFAULT NULL,
`finished_at` datetime DEFAULT NULL,
`duration_ms` int DEFAULT NULL,
`table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
`table_schema` json NOT NULL,
`ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
`ge_profiling_json_size_bytes` bigint DEFAULT NULL,
`ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要剔除大value_set等',
`ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
`ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
`ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
`ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
`ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
`snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
`snippet_json_size_bytes` bigint DEFAULT NULL,
`snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
`snippet_alias_json_size_bytes` bigint DEFAULT NULL,
`callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
`model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
`model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
PRIMARY KEY (`id`),
UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
KEY `idx_status` (`status`),
KEY `idx_table` (`table_id`,`updated_at`),
KEY `idx_action_time` (`action_type`,`version_ts`),
KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=113 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';
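
Because of the uq_table_ver_action unique key, writes to this table can be idempotent upserts. The sketch below shows one such upsert; the column subset and values are illustrative only and not the service's exact statement.

import json
from datetime import datetime

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")  # placeholder DSN

UPSERT_SQL = text(
    """
    INSERT INTO action_results (
        table_id, version_ts, action_type, status,
        callback_url, table_schema_version_id, table_schema,
        snippet_json, updated_at
    ) VALUES (
        :table_id, :version_ts, :action_type, :status,
        :callback_url, :table_schema_version_id, :table_schema,
        :snippet_json, :updated_at
    )
    ON DUPLICATE KEY UPDATE
        status = VALUES(status),
        snippet_json = VALUES(snippet_json),
        updated_at = VALUES(updated_at)
    """
)

with engine.begin() as conn:
    conn.execute(
        UPSERT_SQL,
        {
            "table_id": 42,                      # illustrative ids
            "version_ts": 20251101200000,
            "action_type": "snippet",
            "status": "success",
            "callback_url": "http://localhost/callback",
            "table_schema_version_id": "1",
            "table_schema": json.dumps({"columns": []}),
            "snippet_json": json.dumps([{"id": "snpt_demo"}], ensure_ascii=False),
            "updated_at": datetime.utcnow(),
        },
    )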

View File

@ -1,103 +0,0 @@
CREATE TABLE IF NOT EXISTS chat_session (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
user_id BIGINT NOT NULL,
session_uuid CHAR(36) NOT NULL, -- 可用于对外展示的IDUUID
end_time DATETIME NULL,
status VARCHAR(16) NOT NULL DEFAULT 'OPEN', -- OPEN/CLOSED/ABANDONED
last_turn_id BIGINT NULL, -- 指向 chat_turn.id
ext_context JSON NULL, -- 业务上下文
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_session_uuid (session_uuid),
KEY idx_user_time (user_id, created_at),
KEY idx_status_time (status, created_at),
KEY idx_last_turn (last_turn_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE IF NOT EXISTS chat_turn (
id BIGINT AUTO_INCREMENT,
session_id BIGINT NOT NULL, -- 关联 chat_session.id
turn_no INT NOT NULL, -- 会话内轮次序号1,2,3...
user_id BIGINT NOT NULL,
user_query TEXT NOT NULL, -- 原始用户问句
intent VARCHAR(64) NULL, -- METRIC_QUERY/METRIC_EXPLAIN 等
ast_json JSON NULL, -- 解析出来的 AST
generated_sql MEDIUMTEXT NULL, -- 生成的最终SQL
sql_status VARCHAR(32) NULL, -- SUCCESS/FAILED/SKIPPED
error_msg TEXT NULL, -- SQL生成/执行错误信息
main_metric_ids JSON NULL, -- 本轮涉及的指标ID列表
created_metric_ids JSON NULL, -- 本轮新建指标ID列表
end_time DATETIME NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_session_turn (session_id, turn_no),
KEY idx_session_time (session_id, created_at),
KEY idx_intent_time (intent, created_at),
KEY idx_user_time (user_id, created_at)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
CREATE TABLE IF NOT EXISTS chat_turn_retrieval (
id BIGINT AUTO_INCREMENT,
turn_id BIGINT NOT NULL, -- 关联 qa_turn.id
item_type VARCHAR(32) NOT NULL, -- METRIC/SNIPPET/CHAT
item_id VARCHAR(128) NOT NULL, -- metric_id/snippet_id/table_name 等
item_extra JSON NULL, -- 附加信息,如字段名等
similarity_score DECIMAL(10,6) NULL, -- 相似度
rank_no INT NULL, -- 检索排名
used_in_reasoning TINYINT(1) NOT NULL DEFAULT 0, -- 是否参与推理
used_in_sql TINYINT(1) NOT NULL DEFAULT 0, -- 是否影响最终SQL
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_turn (turn_id),
KEY idx_turn_type (turn_id, item_type),
KEY idx_item (item_type, item_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
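
Since both partitioned tables end with a p_future catch-all, new monthly partitions have to be split out of it periodically. A possible maintenance snippet follows; the month shown is only an example and the DSN is a placeholder.

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")  # placeholder DSN

# Split p_future so that July 2026 rows land in their own partition; repeat per table/month.
REORGANIZE_SQL = text(
    """
    ALTER TABLE chat_turn
    REORGANIZE PARTITION p_future INTO (
        PARTITION p202607 VALUES LESS THAN ('2026-08-01'),
        PARTITION p_future VALUES LESS THAN (MAXVALUE)
    )
    """
)

with engine.begin() as conn:
    conn.execute(REORGANIZE_SQL)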

View File

@ -1,155 +0,0 @@
CREATE TABLE metric_def (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
metric_code VARCHAR(64) NOT NULL, -- 内部编码order_cnt_delivery
metric_name VARCHAR(128) NOT NULL, -- 中文名:外送订单数
metric_aliases JSON NULL, -- 别名列表
biz_domain VARCHAR(64) NOT NULL, -- 通过table tag获取支持人工配置
biz_desc TEXT NULL, -- 业务口径描述
chat_turn_id BIGINT NULL, -- 来自哪轮会话
tech_desc TEXT NULL, -- 技术口径描述
formula_expr TEXT NULL, -- 公式描述:"sum(pay_amount)"
base_sql MEDIUMTEXT NOT NULL, -- 标准计算SQL逻辑SQL/snippet
time_grain VARCHAR(32) NOT NULL, -- DAY/HOUR/WEEK/MONTH
dim_binding JSON NOT NULL, -- 维度绑定,如 ["dt","store_id","channel"]
update_strategy VARCHAR(32) NOT NULL, -- FULL/INCR/REALTIME
schedule_id BIGINT NULL, -- 调度ID
schedule_type INT NULL, -- 调度类型默认调度cron
version INT NOT NULL DEFAULT 1,
is_active TINYINT(1) NOT NULL DEFAULT 1,
sql_hash VARCHAR(64) NULL, -- base_sql hash 用于版本比较
created_by BIGINT NULL,
updated_by BIGINT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_metric_code (metric_code),
KEY idx_domain_active (biz_domain, is_active),
KEY idx_update_strategy (update_strategy),
KEY idx_name (metric_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE metric_schedule (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
metric_id BIGINT NOT NULL, -- 关联 metric_def.id
cron_expr VARCHAR(64) NOT NULL, -- 调度表达式
enabled TINYINT(1) NOT NULL DEFAULT 1, -- 是否启用
priority INT NOT NULL DEFAULT 10, -- 优先级
backfill_allowed TINYINT(1) NOT NULL DEFAULT 1, -- 是否允许补数
max_runtime_sec INT NULL, -- 最大运行时长(秒)
retry_times INT NOT NULL DEFAULT 0, -- 失败重试次数
owner_team VARCHAR(64) NULL,
owner_user_id BIGINT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY idx_metric_enabled (metric_id, enabled),
KEY idx_owner (owner_team, owner_user_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE metric_job_run (
id BIGINT AUTO_INCREMENT,
metric_id BIGINT NOT NULL, -- metric_def.id
schedule_id BIGINT NULL, -- metric_schedule.id手动触发则可为空
source_turn_id BIGINT NULL, -- 若本次运行由某次问答触发,关联 qa_turn.id
data_time_from DATETIME NULL, -- 指标统计时间窗口起
data_time_to DATETIME NULL, -- 指标统计时间窗口止
metric_version INT NOT NULL, -- 执行时使用的指标版本
base_sql_snapshot MEDIUMTEXT NOT NULL, -- 本次执行使用的SQL快照
status VARCHAR(32) NOT NULL, -- RUNNING/SUCCESS/FAILED/SKIPPED
error_msg TEXT NULL,
affected_rows BIGINT NULL, -- 写入行数
runtime_ms BIGINT NULL, -- 执行耗时
triggered_by VARCHAR(32) NOT NULL, -- SCHEDULER/MANUAL/API/QA_TURN
triggered_at DATETIME NOT NULL,
started_at DATETIME NULL,
finished_at DATETIME NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_metric_time (metric_id, data_time_from, data_time_to),
KEY idx_status_time (status, triggered_at),
KEY idx_schedule (schedule_id),
KEY idx_source_turn (source_turn_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
CREATE TABLE metric_result (
id BIGINT AUTO_INCREMENT,
metric_id BIGINT NOT NULL, -- metric_def.id
metric_version INT NOT NULL, -- metric_def.version
stat_time DATETIME NOT NULL, -- 按 time_grain 对齐后的时间
extra_dims JSON NULL, -- 其他维度JSON 存
metric_value DECIMAL(32,8) NOT NULL, -- 指标结果值
load_time DATETIME NOT NULL, -- 入库时间
data_version BIGINT NULL, -- 版本或 job_run id
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_metric_time (metric_id, stat_time),
KEY idx_load_time (load_time)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
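
For the "latest value" lookup this result table is meant to serve, a small query sketch; the metric id and DSN are placeholders.

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/data-ge?charset=utf8mb4")  # placeholder DSN

LATEST_SQL = text(
    """
    SELECT metric_id, metric_version, stat_time, metric_value
    FROM metric_result
    WHERE metric_id = :metric_id
    ORDER BY stat_time DESC
    LIMIT 1
    """
)

with engine.connect() as conn:
    latest = conn.execute(LATEST_SQL, {"metric_id": 1234}).first()  # 1234 is illustrative
    print(latest)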

View File

@ -1,24 +0,0 @@
CREATE TABLE `rag_snippet` (
`rag_item_id` bigint NOT NULL COMMENT 'RAG item id (stable hash of table/version/snippet_id)',
`workspace_id` bigint NOT NULL COMMENT 'RAG workspace scope',
`table_id` bigint NOT NULL COMMENT '来源表ID',
`version_ts` bigint NOT NULL COMMENT '表版本号',
`action_result_id` bigint NOT NULL COMMENT '来源 action_results 主键IDsnippet_alias 或 snippet 行)',
`snippet_id` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '原始 snippet id',
`rag_text` text COLLATE utf8mb4_bin NOT NULL COMMENT '用于向量化的拼接文本',
`merged_json` json NOT NULL COMMENT '合并后的 snippet 对象',
`created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '写入时间,用于分区',
`updated_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`rag_item_id`,`created_at`),
KEY `idx_action_result` (`action_result_id`),
KEY `idx_workspace` (`workspace_id`),
KEY `idx_table_version` (`table_id`,`version_ts`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
PARTITION BY RANGE COLUMNS (`created_at`) (
PARTITION p202401 VALUES LESS THAN ('2024-02-01'),
PARTITION p202402 VALUES LESS THAN ('2024-03-01'),
PARTITION p202403 VALUES LESS THAN ('2024-04-01'),
PARTITION p202404 VALUES LESS THAN ('2024-05-01'),
PARTITION p202405 VALUES LESS THAN ('2024-06-01'),
PARTITION p_future VALUES LESS THAN (MAXVALUE)
) COMMENT='RAG snippet 索引缓存';
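
The rag_item_id column is described as a stable hash of table/version/snippet_id; the exact scheme is not shown here, so the following is only an assumed construction that fits a signed BIGINT.

import hashlib


def rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
    """Derive a deterministic 63-bit id from (table_id, version_ts, snippet_id).

    Assumption: any stable digest truncated to 63 bits would do; this is not
    necessarily the hashing scheme the service itself uses.
    """
    key = f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")
    digest = hashlib.blake2b(key, digest_size=8).digest()
    return int.from_bytes(digest, "big") & 0x7FFF_FFFF_FFFF_FFFF


print(rag_item_id(321, 20240102000000, "snpt_topn"))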

View File

@ -1,40 +0,0 @@
CREATE TABLE `action_results` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
`table_id` bigint NOT NULL COMMENT '表ID',
`version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
`action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
`status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
`model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
`model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
`model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
`llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
`error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
`error_message` text COLLATE utf8mb4_bin,
`started_at` datetime DEFAULT NULL,
`finished_at` datetime DEFAULT NULL,
`duration_ms` int DEFAULT NULL,
`table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
`table_schema` json NOT NULL,
`ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
`ge_profiling_json_size_bytes` bigint DEFAULT NULL,
`ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要剔除大value_set等',
`ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
`ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
`ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
`ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
`ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
`snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
`snippet_json_size_bytes` bigint DEFAULT NULL,
`snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
`snippet_alias_json_size_bytes` bigint DEFAULT NULL,
`callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
KEY `idx_status` (`status`),
KEY `idx_table` (`table_id`,`updated_at`),
KEY `idx_action_time` (`action_type`,`version_ts`),
KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=53 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';

View File

@ -121,7 +121,7 @@ def clean_value(value: Any) -> Any:
    if isinstance(value, (np.generic,)):
        return value.item()
    if isinstance(value, pd.Timestamp):
-       return str(value)
+       return value.isoformat()
    if pd.isna(value):
        return None
    return value
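
The practical effect of this change is that pandas timestamps are serialized as ISO-8601 strings (with a 'T' separator) instead of pandas' default string form, for example:

import numpy as np
import pandas as pd

ts = pd.Timestamp("2024-05-01 12:30:00")

print(str(ts))         # '2024-05-01 12:30:00'  -> old behaviour
print(ts.isoformat())  # '2024-05-01T12:30:00'  -> new behaviour

# The numpy-scalar branch is unchanged: values are still unwrapped to plain Python types.
print(np.int64(7).item(), type(np.int64(7).item()))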

View File

@ -1,30 +0,0 @@
version: 1
formatters:
standard:
format: "%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s"
handlers:
console:
class: logging.StreamHandler
level: INFO
formatter: standard
stream: ext://sys.stdout
file:
class: logging.handlers.RotatingFileHandler
level: INFO
formatter: standard
filename: logs/app.log
maxBytes: 10485760 # 10 MB
backupCount: 5
encoding: utf-8
loggers:
app:
level: INFO
handlers:
- console
- file
propagate: no
root:
level: INFO
handlers:
- console
- file
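
This config would typically be applied with logging.config.dictConfig; a minimal loader is sketched below, assuming the file is saved as logging.yaml at the project root (the RotatingFileHandler needs logs/ to exist first).

import logging
import logging.config
from pathlib import Path

import yaml

Path("logs").mkdir(exist_ok=True)  # logs/app.log lives here

with open("logging.yaml", "r", encoding="utf-8") as fh:
    logging.config.dictConfig(yaml.safe_load(fh))

logging.getLogger("app").info("logging configured from logging.yaml")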

View File

@ -1,6 +0,0 @@
def main():
print("Hello from data-ge-new!")
if __name__ == "__main__":
main()

View File

@ -1,23 +0,0 @@
Project structure and logic
app/main.py: creates the FastAPI application and its lifespan, initializes the shared httpx.AsyncClient and LLMGateway, and, behind unified exception handling, exposes four endpoints: chat proxy, import analysis, table profiling pipeline, and table snippet persistence.
app/models.py: defines all request/response models and enums (LLM requests, import analysis jobs, table profiling jobs, snippet persistence, etc.), with field validation and defaults.
app/services: core business logic
gateway.py forwards /v1/chat/completions requests to NEW_API_BASE_URL (with an optional Bearer token) and normalizes the response.
import_analysis.py assembles the import prompt (prompt/data_import_analysis.md), parses/truncates the sample, calls the unified chat endpoint, extracts the JSON result and token usage, and finally calls back the business side.
table_profiling.py runs a four-step pipeline sequentially: Great Expectations profiling → LLM result description (prompt/ge_result_desc_prompt.md) → snippet generation (prompt/snippet_generator.md) → snippet aliases (prompt/snippet_alias_generator.md); every step reports its status and result via callback.
table_snippet.py upserts each step's results into the database table, automatically serializing JSON/size information and building INSERT ... ON DUPLICATE KEY UPDATE statements.
app/providers/*: direct clients for each vendor (OpenAI/Anthropic/OpenRouter/Gemini/Qwen/DeepSeek) implementing the unified chat interface; the current main flow forwards through new-api, but direct access is kept available.
prompt/ holds the prompt templates; scripts/ and test/ provide API call examples and regression samples; table_snippet.sql gives the action_results table structure (used to persist snippet and profiling results).
Features / requirements
LLM gateway: POST /v1/chat/completions accepts an LLMRequest (provider + model + messages, etc.) and passes the payload through to NEW_API_BASE_URL/v1/chat/completions (with optional NEW_API_AUTH_TOKEN authentication); on failure it returns 4xx/5xx and logs the raw response.
Import analysis (asynchronous): POST /v1/import/analyze accepts an import sample (rows/headers/raw_csv/table_schema), a target model llm_model (defaults to DEFAULT_IMPORT_MODEL, optionally restricted by the IMPORT_SUPPORTED_MODELS whitelist), a temperature, and a callback address. The service converts the sample to CSV, attaches the schema, assembles system + user messages, calls the unified chat endpoint, parses the JSON in the first choice as the analysis result, and returns it together with the LLM usage via callback; on failure it calls back with status=failed and the error message.
Table profiling pipeline (asynchronous): POST /v1/table/profiling accepts the table identifier, version, callback address, and GE/LLM configuration (datasource/batch_request, connection string template, LLM model and timeout). The pipeline executes in order:
Great Expectations profiling (profiler type, datasource, runtime SQL query/table can be specified), producing the full and summary JSON plus the Data Docs path;
calling the chat endpoint to generate the GE result description JSON;
generating an array of SQL snippets from that description;
generating snippet aliases/keywords.
Every step calls back on success or failure; the payload includes action_type, the result JSON, the model, llm_usage, error information, and so on.
Snippet persistence: POST /v1/table/snippet accepts a TableSnippetUpsertRequest (table/version, action type, status, schema, model info, per-stage JSON and sizes, error code, timestamps, etc.), assembles it into the action_results table as an UPSERT, and returns whether an existing record was updated.
Configuration and runtime requirements: the core environment variables live in app/settings.py (API keys, DEFAULT_IMPORT_MODEL, IMPORT_GATEWAY_BASE_URL/NEW_API_BASE_URL, model whitelist, database URL, etc.); logging uses logging.yaml and creates logs/ automatically; HTTP client timeout/proxy are controlled via HTTP_CLIENT_TIMEOUT, HTTP_CLIENT_TRUST_ENV, HTTP_CLIENT_PROXY. For debugging, run uvicorn app.main:app --reload; Docker support is provided by Dockerfile/docker-compose.yml.
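
For example, a rough client-side sketch of the asynchronous import analysis call; the field names follow the description above, but the exact request schema lives in app/models.py, and every value here is a placeholder.

import httpx

BASE_URL = "http://localhost:8000"  # assumed local deployment

payload = {
    "headers": ["order_id", "order_dt", "gmv"],
    "rows": [["a-1", "2024-05-01", "12.30"], ["a-2", "2024-05-02", "45.60"]],
    "table_schema": {"order_id": "char(36)", "order_dt": "date", "gmv": "decimal(18,2)"},
    "llm_model": "deepseek:deepseek-chat",
    "temperature": 0.2,
    "callback_url": "http://localhost:9999/import-callback",
}

resp = httpx.post(f"{BASE_URL}/v1/import/analyze", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())  # the analysis result itself is delivered asynchronously to callback_url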

View File

@ -1,47 +0,0 @@
系统角色System
你是“数据画像抽取器”。输入是一段 Great Expectations 的 profiling/validation 结果 JSON
可能包含列级期望expect_*)、统计、样例值、类型推断等;也可能带表级/批次元数据。
请将其归一化为一个可被程序消费的“表画像”JSON对不确定项给出置信度与理由。
禁止臆造不存在的列、时间范围或数值。
用户消息User
【输入GE结果JSON】
{{GE_RESULT_JSON}}
【输出要求只输出JSON不要解释文字
{
"table": "<库.表 表名>",
"row_count": <int|null>, // 若未知可为 null
"role": "fact|dimension|unknown", // 依据指标/维度占比与唯一性启发式
"grain": ["<列1>", "<列2>", ...], // 事实粒度猜测(如含 dt/店/类目)
"time": { "column": "<name>|null", "granularity": "day|week|month|unknown", "range": ["YYYY-MM-DD","YYYY-MM-DD"]|null, "has_gaps": true|false|null },
"columns": [
{
"name": "<col>",
"dtype": "<ge推断/物理类型>",
"semantic_type": "dimension|metric|time|text|id|unknown",
"null_rate": <0~1|null>,
"distinct_count": <int|null>,
"distinct_ratio": <0~1|null>,
"stats": { "min": <number|string|null>,"max": <number|string|null>,"mean": <number|null>,"std": <number|null>,"skewness": <number|null> },
"enumish": true|false|null, // 低熵/可枚举
"top_values": [{"value":"<v>","pct":<0~1>}, ...],// 取前K个≤10
"pk_candidate_score": <0~1>, // 唯一性+非空综合评分
"metric_candidate_score": <0~1>, // 数值/偏态/业务词命中
"comment": "<列注释或GE描述|可为空>"
}
],
"primary_key_candidates": [["colA","colB"], ...], // 依据 unique/compound unique 期望
"fk_candidates": [{"from":"<col>","to":"<dim_table(col)>","confidence":<0~1>}],
"quality": {
"failed_expectations": [{"name":"<expect_*>","column":"<col|table>","summary":"<一句话>"}],
"warning_hints": ["空值率>0.2的列: ...", "时间列存在缺口: ..."]
},
"confidence_notes": ["<为什么判定role/grain/time列>"]
}
【判定规则(简要)】
- time列类型为日期/时间 OR 命中 dt/date/day 等命名;若有 min/max 可给出 range若间隔缺口≥1天记 has_gaps=true。
- semantic_type数值+右偏/方差大→更偏 metric高唯一/ID命名→id高基数+文本→text低熵+有限取值→dimension。
- rolemetric列占比高且存在time列→倾向 fact几乎全是枚举/ID且少数值→dimension。
- 置信不高时给出 null 或 unknown并写入 confidence_notes。

View File

@ -1,52 +0,0 @@
系统角色System
你是“SQL片段别名生成器”。
输入为一个或多个 SQL 片段对象(来自 snippet.json输出为针对每个片段生成的多样化别名口语 / 中性 / 专业)、关键词与意图标签。
要求逐个处理所有片段对象,输出同样数量的 JSON 元素。
用户消息User
【上下文】
SQL片段对象数组{{SNIPPET_ARRAY}} // snippet.json中的一个或多个片段
【任务要求】
请针对输入数组中的 每个 SQL 片段,输出一个 JSON 对象,结构如下:
{
"id": "<与输入片段id一致>",
"aliases": [
{"text": "…", "tone": "口语|中性|专业"},
{"text": "…", "tone": "专业"}
],
"keywords": [
"GMV","销售额","TopN","category","类目","趋势","同比","客户","订单","质量","异常检测","join","过滤","sample"
],
"intent_tags": ["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
}
生成逻辑规范
1.逐条输出输入数组中每个片段对应一个输出对象id 保持一致)。
2.aliases生成
至少 3 个别名,分别覆盖语气类型:口语 / 中性 / 专业。
≤20字语义需等价不得添加不存在的字段或业务口径。
示例:
GMV趋势分析中性
每天卖多少钱(口语)
按日GMV曲线专业
3.keywords生成
8~15个关键词需涵盖片段核心维度、指标、分析类型和语义近义词。
中英文混合(如 "GMV"/"销售额"、"同比"/"YoY"、"类目"/"category" 等)。
包含用于匹配的分析意图关键词(如 “趋势”、“排行”、“占比”、“质量检查”、“过滤” 等)。
4.intent_tags生成
从以下集合中选取与片段type及用途一致
["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
若为条件片段WHERE句型补充 "filter";若含维度分组逻辑,补充 "by_dimension"。
5.语言与内容要求
保持正式书面风格,不添加解释说明。
只输出JSON数组不包含文字描述或额外文本。

View File

@ -1,46 +0,0 @@
系统角色System
你是“SQL片段生成器”。只能基于给定“表画像”生成可复用的分析片段。
为每个片段产出标题、用途描述、片段类型、变量、适用条件、SQL模板mysql方言并注明业务口径与安全限制。
不要发明画像里没有的列。时间/维度/指标须与画像匹配。
用户消息User
【表画像JSON】
{{TABLE_PROFILE_JSON}}
【输出要求只输出JSON数组
[
{
"id": "snpt_<slug>",
"title": "中文标题≤16字",
"desc": "一句话用途",
"type": "aggregate|trend|topn|ratio|quality|join|sample",
"applicability": {
"required_columns": ["<col>", ...],
"time_column": "<dt|nullable>",
"constraints": {
"dim_cardinality_hint": <int|null>, // 用于TopN限制与性能提示
"fk_join_available": true|false,
"notes": ["高基数维度建议LIMIT<=50", "..."]
}
},
"variables": [
{"name":"start_date","type":"date"},
{"name":"end_date","type":"date"},
{"name":"top_n","type":"int","default":10}
],
"dialect_sql": {
"mysql": ""
},
"business_caliber": "清晰口径说明,如 UV以device_id去重粒度=日-类目",
"examples": ["示例问法1","示例问法2"]
}
]
【片段选择建议】
- 若存在 time 列:生成 trend_by_day / yoy_qoq / moving_avg。
- 若存在 enumish 维度distinct 5~200生成 topn_by_dimension / share_of_total。
- 若 metric 列:生成 sum/avg/max、分位数/异常检测3σ/箱线)。
- 有主键/唯一:生成 去重/明细抽样/质量检查。
- 有 fk_candidates同时生成“join维表命名版”和“纯ID版”。
- 高枚举维度:在 constraints.notes 中强调 LIMIT 建议与可能的性能风险。
- 除了完整的sql片段还有sql里部分内容的sql片段比如 where payment_method = 'Credit Card' and delivery_status = 'Delivered' 的含义是支付方式为信用卡且配送状态是已送达

View File

@ -1,21 +0,0 @@
[project]
name = "data-ge-new"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.111.0",
"uvicorn[standard]>=0.29.0",
"pydantic>=2.6.0",
"sqlalchemy>=2.0.28",
"pymysql>=1.1.0",
"great-expectations[profilers]==0.18.19",
"pandas>=2.0",
"numpy>=1.24",
"openpyxl>=3.1",
"httpx==0.27.2",
"python-dotenv==1.0.1",
"requests>=2.31.0",
"PyYAML>=6.0.1",
]

11
requirements.txt Normal file
View File

@ -0,0 +1,11 @@
fastapi>=0.111.0
uvicorn[standard]>=0.29.0
pydantic>=2.6.0
sqlalchemy>=2.0.28
pymysql>=1.1.0
great_expectations>=0.18.0,<0.19.0
pandas>=2.0
numpy>=1.24
openpyxl>=3.1
httpx==0.27.2
python-dotenv==1.0.1

View File

@ -1,226 +0,0 @@
import argparse
import logging
import os
from typing import Dict, Iterable, List, Optional
import datasets
from datasets import DownloadConfig
from huggingface_hub import snapshot_download
# 批量下载 Hugging Face 上的数据集和模型
# 支持通过命令行参数配置代理和下载参数如超时和重试次数支持批量循环下载存储到file目录下dataset和model子目录
def _parse_id_list(values: Iterable[str]) -> List[str]:
"""将多次传入以及逗号分隔的标识整理为列表."""
ids: List[str] = []
for value in values:
value = value.strip()
if not value:
continue
if "," in value:
ids.extend(v.strip() for v in value.split(",") if v.strip())
else:
ids.append(value)
return ids
def _parse_proxy_args(proxy_args: Iterable[str]) -> Dict[str, str]:
"""解析命令行传入的代理设置,格式 scheme=url."""
proxies: Dict[str, str] = {}
for item in proxy_args:
raw = item.strip()
if not raw:
continue
if "=" not in raw:
logging.warning("代理参数 %s 缺少 '=' 分隔符,将忽略该项", raw)
continue
key, value = raw.split("=", 1)
key = key.strip()
value = value.strip()
if not key or not value:
logging.warning("代理参数 %s 解析失败,将忽略该项", raw)
continue
proxies[key] = value
return proxies
def _sanitize_dir_name(name: str) -> str:
return name.replace("/", "__")
def _ensure_dirs(root_dir: str) -> Dict[str, str]:
paths = {
"dataset": os.path.join(root_dir, "dataset"),
"model": os.path.join(root_dir, "model"),
}
for path in paths.values():
os.makedirs(path, exist_ok=True)
return paths
def _build_download_config(cache_dir: str, retries: Optional[int], proxies: Dict[str, str]) -> DownloadConfig:
config_kwargs = {"cache_dir": cache_dir}
if retries is not None:
config_kwargs["max_retries"] = retries
if proxies:
config_kwargs["proxies"] = proxies
return DownloadConfig(**config_kwargs)
def _apply_timeout(timeout: Optional[float]) -> None:
if timeout is None:
return
str_timeout = str(timeout)
os.environ.setdefault("HF_DATASETS_HTTP_TIMEOUT", str_timeout)
os.environ.setdefault("HF_HUB_HTTP_TIMEOUT", str_timeout)
def _resolve_log_level(level_name: str) -> int:
if isinstance(level_name, int):
return level_name
upper_name = str(level_name).upper()
return getattr(logging, upper_name, logging.INFO)
def _build_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="批量下载 Hugging Face 数据集和模型并存储到指定目录。"
)
parser.add_argument(
"-d",
"--dataset",
action="append",
default=[],
help="要下载的数据集 ID可重复使用或传入逗号分隔列表。",
)
parser.add_argument(
"-m",
"--model",
action="append",
default=[],
help="要下载的模型 ID可重复使用或传入逗号分隔列表。",
)
parser.add_argument(
"-r",
"--root",
default="file",
help="存储根目录,默认 file。",
)
parser.add_argument(
"--retries",
type=int,
default=None,
help="失败后的重试次数,默认不重试。",
)
parser.add_argument(
"--timeout",
type=float,
default=None,
help="HTTP 超时时间(秒),默认跟随库设置。",
)
parser.add_argument(
"-p",
"--proxy",
action="append",
default=[],
help="代理设置,格式 scheme=url可多次传入例如 --proxy http=http://127.0.0.1:7890",
)
parser.add_argument(
"--log-level",
default="INFO",
help="日志级别,默认 INFO。",
)
return parser
def download_datasets(dataset_ids: Iterable[str], root_dir: str, retries: Optional[int], proxies: Dict[str, str]) -> None:
if not dataset_ids:
return
cache_dir = root_dir
download_config = _build_download_config(cache_dir, retries, proxies)
for dataset_id in dataset_ids:
try:
logging.info("开始下载数据集 %s", dataset_id)
# 使用 load_dataset 触发缓存下载
dataset = datasets.load_dataset(
dataset_id,
cache_dir=cache_dir,
download_config=download_config,
download_mode="reuse_cache_if_exists",
)
target_path = os.path.join(root_dir, _sanitize_dir_name(dataset_id))
dataset.save_to_disk(target_path)
logging.info("数据集 %s 下载完成,存储于 %s", dataset_id, target_path)
except Exception as exc: # pylint: disable=broad-except
logging.error("下载数据集 %s 失败: %s", dataset_id, exc)
def download_models(
model_ids: Iterable[str],
target_dir: str,
retries: Optional[int],
proxies: Dict[str, str],
timeout: Optional[float],
) -> None:
if not model_ids:
return
max_attempts = (retries or 0) + 1
hub_kwargs = {
"local_dir": target_dir,
"local_dir_use_symlinks": False,
"max_workers": os.cpu_count() or 4,
}
if proxies:
hub_kwargs["proxies"] = proxies
if timeout is not None:
hub_kwargs["timeout"] = timeout
for model_id in model_ids:
attempt = 0
while attempt < max_attempts:
attempt += 1
try:
logging.info("开始下载模型 %s (尝试 %s/%s)", model_id, attempt, max_attempts)
snapshot_download(
repo_id=model_id,
**hub_kwargs,
)
logging.info("模型 %s 下载完成,存储于 %s", model_id, target_dir)
break
except Exception as exc: # pylint: disable=broad-except
logging.error("下载模型 %s 失败: %s", model_id, exc)
if attempt >= max_attempts:
logging.error("模型 %s 在重试后仍未成功下载", model_id)
def main() -> None:
parser = _build_argument_parser()
args = parser.parse_args()
logging.basicConfig(
level=_resolve_log_level(args.log_level),
format="%(asctime)s - %(levelname)s - %(message)s",
)
dataset_ids = _parse_id_list(args.dataset)
model_ids = _parse_id_list(args.model)
retries = args.retries
timeout = args.timeout
proxies = _parse_proxy_args(args.proxy)
_apply_timeout(timeout)
if not dataset_ids and not model_ids:
logging.warning(
"未配置任何数据集或模型,"
"请通过参数 --dataset / --model 指定 Hugging Face ID"
)
return
dirs = _ensure_dirs(args.root)
download_datasets(dataset_ids, dirs["dataset"], retries, proxies)
download_models(model_ids, dirs["model"], retries, proxies, timeout)
if __name__ == "__main__":
main()
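
Besides the CLI, the helpers can be driven programmatically. A usage sketch, assuming the script above is saved as hf_batch_download.py (the dataset/model ids and proxy address are just examples):

# Assumes the script above is saved as hf_batch_download.py next to this file.
from hf_batch_download import _ensure_dirs, download_datasets, download_models

dirs = _ensure_dirs("file")                      # creates file/dataset and file/model
proxies = {"https": "http://127.0.0.1:7890"}     # optional; drop if no proxy is needed

download_datasets(["squad"], dirs["dataset"], retries=2, proxies=proxies)
download_models(["bert-base-uncased"], dirs["model"], retries=2, proxies=proxies, timeout=60.0)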

View File

@ -1,80 +0,0 @@
from __future__ import annotations
import json
import os
import sys
from datetime import datetime
from typing import Any, Dict
import requests
def build_demo_payload() -> Dict[str, Any]:
now = datetime.utcnow()
started_at = now.replace(microsecond=0).isoformat() + "Z"
finished_at = now.replace(microsecond=0).isoformat() + "Z"
return {
"table_id": 42,
"version_ts": 20251101200000,
"action_type": "snippet",
"status": "success",
"callback_url": "http://localhost:9999/dummy-callback",
"table_schema_version_id": 7,
"table_schema": {
"columns": [
{"name": "order_id", "type": "bigint"},
{"name": "order_dt", "type": "date"},
{"name": "gmv", "type": "decimal(18,2)"},
]
},
"result_json": [
{
"id": "snpt_daily_gmv",
"title": "按日GMV",
"desc": "统计每日GMV总额",
"type": "trend",
"dialect_sql": {
"mysql": "SELECT order_dt, SUM(gmv) AS total_gmv FROM orders GROUP BY order_dt ORDER BY order_dt"
},
}
],
"result_summary_json": {"total_snippets": 1},
"html_report_url": None,
"error_code": None,
"error_message": None,
"started_at": started_at,
"finished_at": finished_at,
"duration_ms": 1234,
"result_checksum": "demo-checksum",
}
def main() -> int:
base_url = os.getenv("TABLE_SNIPPET_DEMO_BASE_URL", "http://localhost:8000")
endpoint = f"{base_url.rstrip('/')}/v1/table/snippet"
payload = build_demo_payload()
print(f"POST {endpoint}")
print(json.dumps(payload, ensure_ascii=False, indent=2))
try:
response = requests.post(endpoint, json=payload, timeout=30)
except requests.RequestException as exc:
print(f"Request failed: {exc}", file=sys.stderr)
return 1
print(f"\nStatus: {response.status_code}")
try:
data = response.json()
print("Response JSON:")
print(json.dumps(data, ensure_ascii=False, indent=2))
except ValueError:
print("Response Text:")
print(response.text)
return 0 if response.ok else 1
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -1,142 +0,0 @@
from __future__ import annotations
import os
import random
from pathlib import Path
from typing import Generator, List
import sys
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
TEST_USER_ID = 872341
SCHEMA_PATH = Path("file/tableschema/chat.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
# Quick connectivity check
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
#_ensure_chat_schema(engine)
app = create_app()
with TestClient(app) as test_client:
yield test_client
# cleanup test artifacts
with engine.begin() as conn:
# remove retrievals and turns tied to test sessions
conn.execute(
text(
"""
DELETE FROM chat_turn_retrieval
WHERE turn_id IN (
SELECT id FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)
)
"""
),
{"uid": TEST_USER_ID},
)
conn.execute(
text("DELETE FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)"),
{"uid": TEST_USER_ID},
)
conn.execute(text("DELETE FROM chat_session WHERE user_id=:uid"), {"uid": TEST_USER_ID})
db.get_engine.cache_clear()
def test_session_lifecycle_mysql(client: TestClient) -> None:
# Create a session
resp = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID})
assert resp.status_code == 200, resp.text
session = resp.json()
session_id = session["id"]
assert session["status"] == "OPEN"
# Get session
assert client.get(f"/api/v1/chat/sessions/{session_id}").status_code == 200
# List sessions (filter by user)
resp = client.get(f"/api/v1/chat/sessions", params={"user_id": TEST_USER_ID})
assert resp.status_code == 200
assert any(item["id"] == session_id for item in resp.json())
# Update status
resp = client.post(f"/api/v1/chat/sessions/{session_id}/update", json={"status": "PAUSED"})
assert resp.status_code == 200
assert resp.json()["status"] == "PAUSED"
# Close session
resp = client.post(f"/api/v1/chat/sessions/{session_id}/close")
assert resp.status_code == 200
assert resp.json()["status"] == "CLOSED"
def test_turns_and_retrievals_mysql(client: TestClient) -> None:
session_id = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID}).json()["id"]
turn_payload = {
"user_id": TEST_USER_ID,
"user_query": "展示昨天订单GMV",
"intent": "METRIC_QUERY",
"ast_json": {"select": ["gmv"], "where": {"dt": "yesterday"}},
"main_metric_ids": [random.randint(1000, 9999)],
"created_metric_ids": [],
}
resp = client.post(f"/api/v1/chat/sessions/{session_id}/turns", json=turn_payload)
assert resp.status_code == 200, resp.text
turn = resp.json()
turn_id = turn["id"]
assert turn["turn_no"] == 1
# Fetch turn
assert client.get(f"/api/v1/chat/turns/{turn_id}").status_code == 200
# List turns under session
resp = client.get(f"/api/v1/chat/sessions/{session_id}/turns")
assert resp.status_code == 200
assert any(t["id"] == turn_id for t in resp.json())
# Insert retrievals
retrievals_payload = {
"retrievals": [
{"item_type": "METRIC", "item_id": "metric_foo", "used_in_sql": True, "rank_no": 1},
{"item_type": "SNIPPET", "item_id": "snpt_bar", "similarity_score": 0.77, "rank_no": 2},
]
}
resp = client.post(f"/api/v1/chat/turns/{turn_id}/retrievals", json=retrievals_payload)
assert resp.status_code == 200
assert resp.json()["inserted"] == 2
# List retrievals
resp = client.get(f"/api/v1/chat/turns/{turn_id}/retrievals")
assert resp.status_code == 200
items = resp.json()
assert len(items) == 2
assert {item["item_type"] for item in items} == {"METRIC", "SNIPPET"}
if __name__ == "__main__":
import pytest as _pytest
raise SystemExit(_pytest.main([__file__]))

View File

@ -1,207 +0,0 @@
from __future__ import annotations
import os
import random
from datetime import datetime, timedelta
from pathlib import Path
from typing import Generator, List
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Ensure project root on path for direct execution
ROOT = Path(__file__).resolve().parents[1]
import sys
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
TEST_USER_ID = 98765
#SCHEMA_PATH = Path("file/tableschema/metrics.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
# def _run_sql_script(engine, sql_text: str) -> None:
# """Execute semicolon-terminated SQL statements sequentially."""
# statements: List[str] = []
# buffer: List[str] = []
# for line in sql_text.splitlines():
# stripped = line.strip()
# if not stripped or stripped.startswith("--"):
# continue
# buffer.append(line)
# if stripped.endswith(";"):
# statements.append("\n".join(buffer).rstrip(";"))
# buffer = []
# if buffer:
# statements.append("\n".join(buffer))
# with engine.begin() as conn:
# for stmt in statements:
# conn.execute(text(stmt))
# def _ensure_metric_schema(engine) -> None:
# if not SCHEMA_PATH.exists():
# pytest.skip("metrics.sql schema file not found.")
# raw_sql = SCHEMA_PATH.read_text(encoding="utf-8")
# raw_sql = raw_sql.replace("CREATE TABLE metric_def", "CREATE TABLE IF NOT EXISTS metric_def")
# raw_sql = raw_sql.replace("CREATE TABLE metric_schedule", "CREATE TABLE IF NOT EXISTS metric_schedule")
# raw_sql = raw_sql.replace("CREATE TABLE metric_job_run", "CREATE TABLE IF NOT EXISTS metric_job_run")
# raw_sql = raw_sql.replace("CREATE TABLE metric_result", "CREATE TABLE IF NOT EXISTS metric_result")
# _run_sql_script(engine, raw_sql)
@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
#_ensure_metric_schema(engine)
app = create_app()
with TestClient(app) as test_client:
yield test_client
# cleanup test artifacts
with engine.begin() as conn:
conn.execute(text("DELETE FROM metric_result WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_job_run WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_schedule WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_def WHERE created_by=:uid"), {"uid": TEST_USER_ID})
db.get_engine.cache_clear()
def test_metric_crud_and_schedule_mysql(client: TestClient) -> None:
code = f"metric_{random.randint(1000,9999)}"
create_payload = {
"metric_code": code,
"metric_name": "订单数",
"biz_domain": "order",
"biz_desc": "订单总数",
"base_sql": "select count(*) as order_cnt from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"metric_aliases": ["订单量"],
"created_by": TEST_USER_ID,
}
resp = client.post("/api/v1/metrics", json=create_payload)
assert resp.status_code == 200, resp.text
metric = resp.json()
metric_id = metric["id"]
assert metric["metric_code"] == code
# Update metric
resp = client.post(f"/api/v1/metrics/{metric_id}", json={"metric_name": "订单数-更新", "is_active": False})
assert resp.status_code == 200
assert resp.json()["is_active"] is False
# Get metric
resp = client.get(f"/api/v1/metrics/{metric_id}")
assert resp.status_code == 200
assert resp.json()["metric_name"] == "订单数-更新"
# Create schedule
resp = client.post(
"/api/v1/metric-schedules",
json={"metric_id": metric_id, "cron_expr": "0 2 * * *", "priority": 5, "enabled": True},
)
assert resp.status_code == 200, resp.text
schedule = resp.json()
schedule_id = schedule["id"]
# Update schedule
resp = client.post(f"/api/v1/metric-schedules/{schedule_id}", json={"enabled": False, "retry_times": 1})
assert resp.status_code == 200
assert resp.json()["enabled"] is False
# List schedules for metric
resp = client.get(f"/api/v1/metrics/{metric_id}/schedules")
assert resp.status_code == 200
assert any(s["id"] == schedule_id for s in resp.json())
def test_metric_runs_and_results_mysql(client: TestClient) -> None:
code = f"gmv_{random.randint(1000,9999)}"
metric_id = client.post(
"/api/v1/metrics",
json={
"metric_code": code,
"metric_name": "GMV",
"biz_domain": "order",
"base_sql": "select sum(pay_amount) as gmv from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"created_by": TEST_USER_ID,
},
).json()["id"]
# Trigger run
resp = client.post(
"/api/v1/metric-runs/trigger",
json={
"metric_id": metric_id,
"triggered_by": "API",
"data_time_from": (datetime.utcnow() - timedelta(days=1)).isoformat(),
"data_time_to": datetime.utcnow().isoformat(),
},
)
assert resp.status_code == 200, resp.text
run = resp.json()
run_id = run["id"]
assert run["status"] == "RUNNING"
# List runs
resp = client.get("/api/v1/metric-runs", params={"metric_id": metric_id})
assert resp.status_code == 200
assert any(r["id"] == run_id for r in resp.json())
# Get run
resp = client.get(f"/api/v1/metric-runs/{run_id}")
assert resp.status_code == 200
# Write results
now = datetime.utcnow()
resp = client.post(
f"/api/v1/metric-results/{metric_id}",
json={
"metric_id": metric_id,
"results": [
{"stat_time": (now - timedelta(days=1)).isoformat(), "metric_value": 123.45, "data_version": run_id},
{"stat_time": now.isoformat(), "metric_value": 234.56, "data_version": run_id},
],
},
)
assert resp.status_code == 200, resp.text
assert resp.json()["inserted"] == 2
# Query results
resp = client.get("/api/v1/metric-results", params={"metric_id": metric_id})
assert resp.status_code == 200
results = resp.json()
assert len(results) >= 2
# Latest result
resp = client.get("/api/v1/metric-results/latest", params={"metric_id": metric_id})
assert resp.status_code == 200
latest = resp.json()
assert float(latest["metric_value"]) in {123.45, 234.56}
if __name__ == "__main__":
import pytest as _pytest
raise SystemExit(_pytest.main([__file__]))

View File

@ -1,91 +0,0 @@
from __future__ import annotations
import json
import httpx
import pytest
from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.services.rag_client import RagAPIClient
@pytest.mark.asyncio
async def test_add_sends_payload_and_headers() -> None:
rag_client = RagAPIClient(base_url="http://rag.test", auth_token="secret-token")
def handler(request: httpx.Request) -> httpx.Response:
assert request.method == "POST"
assert str(request.url) == "http://rag.test/rag/add"
assert request.headers["Authorization"] == "Bearer secret-token"
payload = json.loads(request.content.decode())
assert payload == {
"id": 1,
"workspaceId": 2,
"name": "demo",
"embeddingData": "vector",
"type": "METRIC",
}
return httpx.Response(200, json={"ok": True, "echo": payload})
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.add(
client,
RagItemPayload(id=1, workspaceId=2, name="demo", embeddingData="vector", type="METRIC"),
)
assert result["ok"] is True
assert result["echo"]["name"] == "demo"
@pytest.mark.asyncio
async def test_add_batch_serializes_list() -> None:
rag_client = RagAPIClient(base_url="http://rag.test", auth_token=None)
def handler(request: httpx.Request) -> httpx.Response:
payload = json.loads(request.content.decode())
assert request.url.path == "/rag/addBatch"
assert isinstance(payload, list) and len(payload) == 2
return httpx.Response(200, json={"received": len(payload)})
items = [
RagItemPayload(id=1, workspaceId=2, name="a", embeddingData="vec-a", type="METRIC"),
RagItemPayload(id=2, workspaceId=2, name="b", embeddingData="vec-b", type="METRIC"),
]
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.add_batch(client, items)
assert result == {"received": 2}
@pytest.mark.asyncio
async def test_http_error_raises_provider_error() -> None:
rag_client = RagAPIClient(base_url="http://rag.test")
def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(500, text="boom")
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
with pytest.raises(ProviderAPICallError) as excinfo:
await rag_client.delete(client, RagDeleteRequest(id=1, type="METRIC"))
err = excinfo.value
assert err.status_code == 500
assert "boom" in (err.response_text or "")
@pytest.mark.asyncio
async def test_non_json_response_returns_raw_text() -> None:
rag_client = RagAPIClient(base_url="http://rag.test")
def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(200, text="plain-text-body")
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.retrieve(
client, RagRetrieveRequest(query="foo", num=1, workspaceId=1, type="METRIC")
)
assert result == {"raw": "plain-text-body"}

View File

@ -1,157 +0,0 @@
from __future__ import annotations
import json
from datetime import datetime
import httpx
import pytest
from sqlalchemy import create_engine, text
from app.services.table_snippet import ingest_snippet_rag_from_db
def _setup_sqlite_engine():
engine = create_engine("sqlite://")
with engine.begin() as conn:
conn.execute(
text(
"""
CREATE TABLE action_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
table_id INTEGER,
version_ts INTEGER,
action_type TEXT,
status TEXT,
snippet_json TEXT,
snippet_alias_json TEXT,
updated_at TEXT
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE rag_snippet (
rag_item_id INTEGER PRIMARY KEY,
action_result_id INTEGER NOT NULL,
workspace_id INTEGER,
table_id INTEGER,
version_ts INTEGER,
created_at TEXT,
snippet_id TEXT,
rag_text TEXT,
merged_json TEXT,
updated_at TEXT
)
"""
)
)
return engine
def _insert_action_row(engine, payload: dict) -> None:
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO action_results (table_id, version_ts, action_type, status, snippet_json, snippet_alias_json, updated_at)
VALUES (:table_id, :version_ts, :action_type, :status, :snippet_json, :snippet_alias_json, :updated_at)
"""
),
{
"table_id": payload["table_id"],
"version_ts": payload["version_ts"],
"action_type": payload["action_type"],
"status": payload.get("status", "success"),
"snippet_json": json.dumps(payload.get("snippet_json"), ensure_ascii=False)
if payload.get("snippet_json") is not None
else None,
"snippet_alias_json": json.dumps(payload.get("snippet_alias_json"), ensure_ascii=False)
if payload.get("snippet_alias_json") is not None
else None,
"updated_at": payload.get("updated_at") or datetime.utcnow().isoformat(),
},
)
class _StubRagClient:
def __init__(self) -> None:
self.received = None
async def add_batch(self, _client, items):
self.received = items
return {"count": len(items)}
@pytest.mark.asyncio
async def test_ingest_snippet_rag_from_db_persists_and_calls_rag_client() -> None:
engine = _setup_sqlite_engine()
table_id = 321
version_ts = 20240102000000
snippet_payload = [
{
"id": "snpt_topn",
"title": "TopN",
"aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
"keywords": ["TopN", "站点"],
}
]
alias_payload = [
{
"id": "snpt_topn",
"aliases": [
{"text": "站点水表排行前N", "tone": "中性"},
{"text": "按站点水表TopN", "tone": "专业"},
],
"keywords": ["TopN", "排行"],
"intent_tags": ["topn", "aggregate"],
},
{
"id": "snpt_extra",
"aliases": [{"text": "额外别名"}],
"keywords": ["extra"],
},
]
_insert_action_row(
engine,
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": "snippet_alias",
"snippet_json": snippet_payload,
"snippet_alias_json": alias_payload,
"updated_at": "2024-01-02T00:00:00",
},
)
rag_stub = _StubRagClient()
async with httpx.AsyncClient() as client:
rag_ids = await ingest_snippet_rag_from_db(
table_id=table_id,
version_ts=version_ts,
workspace_id=99,
rag_item_type="SNIPPET",
client=client,
engine=engine,
rag_client=rag_stub,
)
assert rag_stub.received is not None
assert len(rag_stub.received) == 2 # includes alias-only row
assert len(rag_ids) == 2
with engine.connect() as conn:
rows = list(
conn.execute(
text("SELECT snippet_id, action_result_id, rag_text, merged_json FROM rag_snippet ORDER BY snippet_id")
)
)
assert {row[0] for row in rows} == {"snpt_extra", "snpt_topn"}
assert all(row[1] is not None for row in rows)
topn_row = next(row for row in rows if row[0] == "snpt_topn")
assert "TopN" in topn_row[2]
assert "按站点水表TopN" in topn_row[2]
assert "排行" in topn_row[2]

View File

@ -1,74 +0,0 @@
from __future__ import annotations
from app.services.table_profiling import _parse_completion_payload
from app.utils.llm_usage import extract_usage
def test_parse_completion_payload_handles_array_with_trailing_text() -> None:
response_payload = {
"choices": [
{
"message": {
"content": """
结果如下:
[
{"id": "snpt_a"},
{"id": "snpt_b"}
]
附加说明:模型可能会输出额外文本。
""".strip()
}
}
]
}
parsed = _parse_completion_payload(response_payload)
assert isinstance(parsed, list)
assert [item["id"] for item in parsed] == ["snpt_a", "snpt_b"]
def test_extract_usage_info_normalizes_numeric_fields() -> None:
response_payload = {
"raw": {
"usage": {
"prompt_tokens": 12.7,
"completion_tokens": 3,
"total_tokens": 15.7,
"prompt_tokens_details": {"cached_tokens": 8.9, "other": None},
"non_numeric": "ignored",
}
}
}
usage = extract_usage(response_payload)
assert usage == {
"prompt_tokens": 12,
"completion_tokens": 3,
"total_tokens": 15,
"prompt_tokens_details": {"cached_tokens": 8},
}
def test_extract_usage_handles_alias_keys() -> None:
response_payload = {
"raw": {
"usageMetadata": {
"input_tokens": 20,
"output_tokens": 4,
}
}
}
usage = extract_usage(response_payload)
assert usage == {
"prompt_tokens": 20,
"completion_tokens": 4,
"total_tokens": 24,
}
def test_extract_usage_returns_none_when_missing() -> None:
assert extract_usage({"raw": {}}) is None

View File

@ -1,213 +0,0 @@
from __future__ import annotations
import json
import os
import random
from datetime import datetime, timedelta
from typing import List
from pathlib import Path
import sys
import pytest
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
from app.services.table_snippet import merge_snippet_records_from_db
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
@pytest.fixture()
def mysql_engine() -> Engine:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
exists = conn.execute(text("SHOW TABLES LIKE 'action_results'")).scalar()
if not exists:
pytest.skip("action_results table not found in test database.")
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
return engine
def _insert_action_row(
engine: Engine,
*,
table_id: int,
version_ts: int,
action_type: str,
status: str = "success",
snippet_json: List[dict] | None = None,
snippet_alias_json: List[dict] | None = None,
updated_at: datetime | None = None,
) -> None:
snippet_json_str = json.dumps(snippet_json, ensure_ascii=False) if snippet_json is not None else None
snippet_alias_json_str = (
json.dumps(snippet_alias_json, ensure_ascii=False) if snippet_alias_json is not None else None
)
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO action_results (
table_id, version_ts, action_type, status,
callback_url, table_schema_version_id, table_schema,
snippet_json, snippet_alias_json, updated_at
) VALUES (
:table_id, :version_ts, :action_type, :status,
:callback_url, :table_schema_version_id, :table_schema,
:snippet_json, :snippet_alias_json, :updated_at
)
ON DUPLICATE KEY UPDATE
status=VALUES(status),
snippet_json=VALUES(snippet_json),
snippet_alias_json=VALUES(snippet_alias_json),
updated_at=VALUES(updated_at)
"""
),
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": action_type,
"status": status,
"callback_url": "http://localhost/test-callback",
"table_schema_version_id": "1",
"table_schema": json.dumps({}, ensure_ascii=False),
"snippet_json": snippet_json_str,
"snippet_alias_json": snippet_alias_json_str,
"updated_at": updated_at or datetime.utcnow(),
},
)
def _cleanup(engine: Engine, table_id: int, version_ts: int) -> None:
with engine.begin() as conn:
conn.execute(
text("DELETE FROM action_results WHERE table_id=:table_id AND version_ts=:version_ts"),
{"table_id": table_id, "version_ts": version_ts},
)
def test_merge_prefers_alias_row_and_appends_alias_only_entries(mysql_engine: Engine) -> None:
table_id = 990000000 + random.randint(1, 9999)
version_ts = int(datetime.utcnow().strftime("%Y%m%d%H%M%S"))
alias_updated = datetime(2024, 1, 2, 0, 0, 0)
snippet_payload = [
{
"id": "snpt_topn",
"aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
"keywords": ["TopN", "站点"],
}
]
alias_payload = [
{
"id": "snpt_topn",
"aliases": [
{"text": "站点水表排行前N", "tone": "中性"},
{"text": "按站点水表TopN", "tone": "专业"},
],
"keywords": ["TopN", "排行"],
"intent_tags": ["topn", "aggregate"],
},
{
"id": "snpt_extra",
"aliases": [{"text": "额外别名"}],
"keywords": ["extra"],
},
]
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet_alias",
snippet_json=snippet_payload,
snippet_alias_json=alias_payload,
updated_at=alias_updated,
)
try:
merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
assert len(merged) == 2
topn = next(item for item in merged if item["id"] == "snpt_topn")
assert topn["source"] == "snippet"
assert topn["updated_at_from_action"] == alias_updated
assert {a["text"] for a in topn["aliases"]} == {"站点水表排行前N", "按站点水表TopN"}
assert set(topn["keywords"]) == {"TopN", "站点", "排行"}
assert set(topn["intent_tags"]) == {"topn", "aggregate"}
alias_only = next(item for item in merged if item["source"] == "alias_only")
assert alias_only["id"] == "snpt_extra"
assert alias_only["aliases"][0]["text"] == "额外别名"
finally:
_cleanup(mysql_engine, table_id, version_ts)
def test_merge_falls_back_to_snippet_row_when_alias_row_missing_snippet_json(mysql_engine: Engine) -> None:
table_id = 991000000 + random.randint(1, 9999)
version_ts = int((datetime.utcnow() + timedelta(seconds=1)).strftime("%Y%m%d%H%M%S"))
alias_updated = datetime(2024, 1, 3, 0, 0, 0)
alias_payload = [
{
"id": "snpt_quality",
"aliases": [{"text": "质量检查"}],
"keywords": ["quality"],
}
]
snippet_payload = [
{
"id": "snpt_quality",
"title": "质量检查",
"keywords": ["data-quality"],
"aliases": [{"text": "质量检查"}],
}
]
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet_alias",
snippet_json=None,
snippet_alias_json=alias_payload,
updated_at=alias_updated,
)
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet",
snippet_json=snippet_payload,
snippet_alias_json=None,
updated_at=datetime(2024, 1, 2, 0, 0, 0),
)
try:
merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
assert len(merged) == 1
record = merged[0]
assert record["id"] == "snpt_quality"
assert record["source"] == "snippet"
assert record["updated_at_from_action"] == alias_updated
assert set(record["keywords"]) == {"data-quality", "quality"}
assert {a["text"] for a in record["aliases"]} == {"质量检查"}
finally:
_cleanup(mysql_engine, table_id, version_ts)

13
uv.lock generated
View File

@ -1,13 +0,0 @@
version = 1
revision = 1
requires-python = ">=3.11"
resolution-markers = [
"python_full_version >= '3.14'",
"python_full_version >= '3.12' and python_full_version < '3.14'",
"python_full_version < '3.12'",
]
[[package]]
name = "data-ge-new"
version = "0.1.0"
source = { virtual = "." }