Compare commits: 0b765e6719...main (12 commits)

Commits (SHA1; author and date columns were empty in the captured view): daf951d81f, 248492d68e, 3218e51bad, ebd79b75bd, 0bc26ef4a1, 509dae3270, f261121845, eefaf91ed1, abe3d479a4, 368ffaaaae, a72ca3593e, 7eb3c059a1
.env (7 changes)

@@ -16,8 +16,11 @@ DEFAULT_IMPORT_MODEL=deepseek:deepseek-chat
 # Service configuration
 IMPORT_GATEWAY_BASE_URL=http://localhost:8000
 
+# prod nbackend base url
+NBACKEND_BASE_URL=https://chatbi.agentcarrier.cn/chatbi/api
+
 # HTTP client configuration
-HTTP_CLIENT_TIMEOUT=60
+HTTP_CLIENT_TIMEOUT=120
 HTTP_CLIENT_TRUST_ENV=false
 # HTTP_CLIENT_PROXY=
 
@@ -27,3 +30,5 @@ IMPORT_CHAT_TIMEOUT_SECONDS=120
 # Logging
 LOG_LEVEL=INFO
 # LOG_FORMAT=%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s
+
+NEW_API_BASE_URL=http://localhost:3000
+NEW_API_AUTH_TOKEN="sk-Q79KGFJRs5Vk9HsfFqoiJk948uLMDhAVe037AeCb31URyWGL"
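The two NEW_API_* variables are consumed via `app.settings` elsewhere in this changeset (see the gateway and import_analysis diffs below). A minimal sketch of how that module presumably reads them; the actual `app/settings.py` is not part of this diff:

```python
# Hypothetical excerpt of app/settings.py; the real file is not shown in this comparison.
import os

# Base URL of the internal new-api component; falls back to the local default.
NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://localhost:3000")

# Optional bearer token; unset means new-api runs without authentication.
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN") or None
```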
README (file path not captured in this view)

@@ -2,7 +2,7 @@
 
 This project exposes a FastAPI-based microservice that provides:
 
-- A unified chat completions gateway supporting multiple LLM providers (OpenAI, Anthropic, OpenRouter, Gemini, Qwen, DeepSeek, etc.)
+- A unified chat completions gateway that now forwards requests to the internal `new-api` service (default `http://localhost:3000`) while preserving the same client-facing schema.
 - An asynchronous data import analysis pipeline that orchestrates LLM calls to produce structured metadata and processing recommendations
 
 The following instructions cover environment setup, dependency installation, and running the backend service.
@@ -56,6 +56,7 @@ Copy `.env.example` to `.env` (if provided) or edit `.env` to supply API keys an
 - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`, etc.
 - `HTTP_CLIENT_TIMEOUT`, `IMPORT_CHAT_TIMEOUT_SECONDS`
 - `LOG_LEVEL`, `LOG_FORMAT` for logging
+- `NEW_API_BASE_URL` (defaults to `http://localhost:3000`) and optional `NEW_API_AUTH_TOKEN` if the new-api component enforces authentication.
 
 
 ## Run the Backend Service
@@ -84,4 +85,4 @@ Or use a process manager such as `pm2`, `supervisor`, or systemd for production
 
 - Run the data import analysis example: `python test/data_import_analysis_example.py`
 - Test the OpenRouter demo: `python test/openrouter_chat_example.py`
 - Send a DeepSeek chat request script: `python scripts/deepseek_request.py`
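Because the gateway keeps the client-facing schema while forwarding to new-api, existing clients should not need changes. A hypothetical smoke test against a locally running service; the `provider`/`model`/`messages` field names are assumptions based on the LLMRequest usage visible later in this diff:

```python
# Hypothetical smoke test; endpoint follows the /v1/chat/completions
# forwarding shown in app/services/gateway.py below.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "provider": "deepseek",
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": "ping"}],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```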
app/main.py (69 changes)

@@ -16,19 +16,24 @@ from fastapi.responses import JSONResponse
 
 from app.exceptions import ProviderAPICallError, ProviderConfigurationError
 from app.models import (
+    ActionStatus,
+    ActionType,
     DataImportAnalysisJobAck,
     DataImportAnalysisJobRequest,
     LLMRequest,
     LLMResponse,
     TableProfilingJobAck,
     TableProfilingJobRequest,
+    TableSnippetRagIngestRequest,
+    TableSnippetRagIngestResponse,
     TableSnippetUpsertRequest,
     TableSnippetUpsertResponse,
 )
+from app.routers import chat_router, metrics_router
 from app.services import LLMGateway
 from app.services.import_analysis import process_import_analysis_job
 from app.services.table_profiling import process_table_profiling_job
-from app.services.table_snippet import upsert_action_result
+from app.services.table_snippet import ingest_snippet_rag_from_db, upsert_action_result
 
 
 def _ensure_log_directories(config: dict[str, Any]) -> None:
@@ -135,6 +140,9 @@ def create_app() -> FastAPI:
         version="0.1.0",
         lifespan=lifespan,
     )
+    # Chat/metric management APIs
+    application.include_router(chat_router)
+    application.include_router(metrics_router)
 
     @application.exception_handler(RequestValidationError)
     async def request_validation_exception_handler(
@@ -230,11 +238,12 @@ def create_app() -> FastAPI:
     )
     async def upsert_table_snippet(
         payload: TableSnippetUpsertRequest,
+        client: httpx.AsyncClient = Depends(get_http_client),
     ) -> TableSnippetUpsertResponse:
         request_copy = payload.model_copy(deep=True)
 
         try:
-            return await asyncio.to_thread(upsert_action_result, request_copy)
+            response = await asyncio.to_thread(upsert_action_result, request_copy)
         except Exception as exc:
             logger.error(
                 "Failed to upsert table snippet: table_id=%s version_ts=%s action_type=%s",
@@ -244,6 +253,62 @@ def create_app() -> FastAPI:
                 exc_info=True,
             )
             raise HTTPException(status_code=500, detail=str(exc)) from exc
+        else:
+            # After snippet_alias is stored, automatically trigger RAG ingest when configured.
+            # Note: the captured diff read `payload.rag_workspace_id` here, but the request
+            # model in app/models.py gains a `workspace_id` field, so that spelling is used.
+            if (
+                payload.action_type == ActionType.SNIPPET_ALIAS
+                and payload.status == ActionStatus.SUCCESS
+                and payload.workspace_id is not None
+            ):
+                try:
+                    await ingest_snippet_rag_from_db(
+                        table_id=payload.table_id,
+                        version_ts=payload.version_ts,
+                        workspace_id=payload.workspace_id,
+                        rag_item_type=payload.rag_item_type or "SNIPPET",
+                        client=client,
+                    )
+                except Exception:
+                    logger.exception(
+                        "Failed to ingest snippet RAG artifacts after snippet_alias upsert",
+                        extra={
+                            "table_id": payload.table_id,
+                            "version_ts": payload.version_ts,
+                            "workspace_id": payload.workspace_id,
+                        },
+                    )
+
+        return response
+
+    @application.post(
+        "/v1/table/snippet/rag_ingest",
+        response_model=TableSnippetRagIngestResponse,
+        summary="Merge snippet+alias results from action_results and ingest into RAG.",
+    )
+    async def ingest_snippet_rag(
+        payload: TableSnippetRagIngestRequest,
+        client: httpx.AsyncClient = Depends(get_http_client),
+    ) -> TableSnippetRagIngestResponse:
+        try:
+            rag_item_ids = await ingest_snippet_rag_from_db(
+                table_id=payload.table_id,
+                version_ts=payload.version_ts,
+                workspace_id=payload.workspace_id,
+                rag_item_type=payload.rag_item_type or "SNIPPET",
+                client=client,
+            )
+        except Exception as exc:
+            logger.exception(
+                "Failed to ingest snippet RAG artifacts",
+                extra={
+                    "table_id": payload.table_id,
+                    "version_ts": payload.version_ts,
+                    "workspace_id": payload.workspace_id,
+                },
+            )
+            raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+        return TableSnippetRagIngestResponse(rag_item_ids=rag_item_ids)
 
     @application.post("/__mock__/import-callback")
     async def mock_import_callback(payload: dict[str, Any]) -> dict[str, str]:
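For illustration, a hypothetical call to the new `/v1/table/snippet/rag_ingest` endpoint; all values are invented, and the field names follow TableSnippetRagIngestRequest defined in the models diff below:

```python
# Illustrative request; port and payload values are made up for the example.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/table/snippet/rag_ingest",
    json={
        "table_id": 1,
        "version_ts": 20240101120000,
        "workspace_id": 42,
        "rag_item_type": "SNIPPET",
    },
)
resp.raise_for_status()
print(resp.json())  # expected shape: {"rag_item_ids": [...]}
```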
app/models.py (file header missing in the captured view; these classes are imported from app.models)

@@ -232,6 +232,15 @@ class TableProfilingJobRequest(BaseModel):
         None,
         description="Miscellaneous execution flags applied across pipeline steps.",
     )
+    workspace_id: Optional[int] = Field(
+        None,
+        ge=0,
+        description="Optional workspace identifier forwarded to snippet_alias callback for RAG ingestion.",
+    )
+    rag_item_type: Optional[str] = Field(
+        "SNIPPET",
+        description="Optional RAG item type forwarded to snippet_alias callback.",
+    )
 
 
 class TableProfilingJobAck(BaseModel):
@@ -247,6 +256,16 @@ class TableSnippetUpsertRequest(BaseModel):
         ge=0,
         description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
     )
+    workspace_id: Optional[int] = Field(
+        None,
+        ge=0,
+        description="Optional workspace identifier for RAG ingestion; when provided and action_type=snippet_alias "
+        "with status=success, merged snippets will be written to rag_snippet and pushed to RAG.",
+    )
+    rag_item_type: Optional[str] = Field(
+        "SNIPPET",
+        description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
+    )
     action_type: ActionType = Field(..., description="Pipeline action type for this record.")
     status: ActionStatus = Field(
         ActionStatus.SUCCESS, description="Execution status for the action."
@@ -254,17 +273,57 @@ class TableSnippetUpsertRequest(BaseModel):
     callback_url: HttpUrl = Field(..., description="Callback URL associated with the action run.")
     table_schema_version_id: int = Field(..., ge=0, description="Identifier for the schema snapshot.")
     table_schema: Any = Field(..., description="Schema snapshot payload for the table.")
-    result_json: Optional[Any] = Field(
-        None,
-        description="Primary result payload for the action (e.g., profiling output, snippet array).",
-    )
-    result_summary_json: Optional[Any] = Field(
-        None,
-        description="Optional summary payload (e.g., profiling summary) for the action.",
-    )
-    html_report_url: Optional[str] = Field(
-        None,
-        description="Optional HTML report URL generated by the action.",
-    )
+    model: Optional[str] = Field(
+        None,
+        description="LLM model identifier (can be provider alias) used for this action, when applicable.",
+    )
+    model_provider: Optional[str] = Field(
+        None,
+        description="LLM provider responsible for executing the action's model.",
+    )
+    model_params: Optional[Dict[str, Any]] = Field(
+        None,
+        description="Optional model parameter overrides (e.g., temperature) associated with the action.",
+    )
+    llm_usage: Optional[Any] = Field(
+        None,
+        description="Optional token usage metrics reported by the LLM provider.",
+    )
+    ge_profiling_json: Optional[Any] = Field(
+        None, description="Full GE profiling result payload for the profiling action."
+    )
+    ge_profiling_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the GE profiling result JSON."
+    )
+    ge_profiling_summary: Optional[Any] = Field(
+        None, description="Sanitised GE profiling summary payload."
+    )
+    ge_profiling_summary_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the GE profiling summary JSON."
+    )
+    ge_profiling_total_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Combined size (bytes) of profiling result + summary."
+    )
+    ge_profiling_html_report_url: Optional[str] = Field(
+        None, description="Optional URL to the generated GE profiling HTML report."
+    )
+    ge_result_desc_json: Optional[Any] = Field(
+        None, description="Result JSON for the GE result description action."
+    )
+    ge_result_desc_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the GE result description JSON."
+    )
+    snippet_json: Optional[Any] = Field(
+        None, description="Snippet generation action result JSON."
+    )
+    snippet_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the snippet result JSON."
+    )
+    snippet_alias_json: Optional[Any] = Field(
+        None, description="Snippet alias expansion result JSON."
+    )
+    snippet_alias_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the snippet alias result JSON."
+    )
     error_code: Optional[str] = Field(None, description="Optional error code when status indicates a failure.")
     error_message: Optional[str] = Field(None, description="Optional error message when status indicates a failure.")
@@ -279,6 +338,24 @@ class TableSnippetUpsertRequest(BaseModel):
         ge=0,
         description="Optional execution duration in milliseconds.",
     )
+
+
+class TableSnippetRagIngestRequest(BaseModel):
+    table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
+    version_ts: int = Field(
+        ...,
+        ge=0,
+        description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
+    )
+    workspace_id: int = Field(..., ge=0, description="Workspace id used when pushing snippets to RAG.")
+    rag_item_type: Optional[str] = Field(
+        "SNIPPET",
+        description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
+    )
+
+
+class TableSnippetRagIngestResponse(BaseModel):
+    rag_item_ids: List[int] = Field(..., description="List of ingested rag_item_ids.")
     result_checksum: Optional[str] = Field(
         None,
         description="Optional checksum for the result payload (e.g., MD5).",
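To show how the new fields interact, a hypothetical snippet_alias upsert payload that would satisfy the auto-ingest condition in `app/main.py` above; all values are invented, and the lowercase `action_type`/`status` literals are assumed from the wording of the field descriptions:

```python
# Hypothetical payload for POST /v1/table/snippet (upsert_table_snippet).
payload = {
    "table_id": 101,
    "version_ts": 20240601090000,
    "workspace_id": 7,           # presence of workspace_id enables auto RAG ingest
    "rag_item_type": "SNIPPET",
    "action_type": "snippet_alias",   # assumed enum spelling
    "status": "success",              # assumed enum spelling
    "callback_url": "http://localhost:8000/__mock__/import-callback",
    "table_schema_version_id": 3,
    "table_schema": {"columns": ["id", "name"]},
    "snippet_alias_json": [{"snippet": "SELECT ...", "aliases": ["gmv"]}],
}
```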
app/routers/__init__.py (new file, 4 lines)

@@ -0,0 +1,4 @@
+from .chat import router as chat_router
+from .metrics import router as metrics_router
+
+__all__ = ["chat_router", "metrics_router"]
app/routers/chat.py (new file, 102 lines)

@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, List, Optional
+
+from fastapi import APIRouter, HTTPException, Query
+
+from app.schemas.chat import (
+    ChatSessionCreate,
+    ChatSessionUpdate,
+    ChatTurnCreate,
+    ChatTurnRetrievalBatch,
+)
+from app.services import metric_store
+
+
+router = APIRouter(prefix="/api/v1/chat", tags=["chat"])
+
+
+@router.post("/sessions")
+def create_session(payload: ChatSessionCreate) -> Any:
+    """Create a chat session."""
+    return metric_store.create_chat_session(payload)
+
+
+@router.post("/sessions/{session_id}/update")
+def update_session(session_id: int, payload: ChatSessionUpdate) -> Any:
+    try:
+        return metric_store.update_chat_session(session_id, payload)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+
+@router.post("/sessions/{session_id}/close")
+def close_session(session_id: int) -> Any:
+    """Close a chat session and stamp end_time."""
+    try:
+        return metric_store.close_chat_session(session_id)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+
+@router.get("/sessions/{session_id}")
+def get_session(session_id: int) -> Any:
+    """Fetch one session."""
+    session = metric_store.get_chat_session(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail="Session not found")
+    return session
+
+
+@router.get("/sessions")
+def list_sessions(
+    user_id: Optional[int] = None,
+    status: Optional[str] = None,
+    start_from: Optional[datetime] = Query(None, description="Filter by start time lower bound."),
+    start_to: Optional[datetime] = Query(None, description="Filter by start time upper bound."),
+    limit: int = Query(50, ge=1, le=500),
+    offset: int = Query(0, ge=0),
+) -> List[Any]:
+    return metric_store.list_chat_sessions(
+        user_id=user_id,
+        status=status,
+        start_from=start_from,
+        start_to=start_to,
+        limit=limit,
+        offset=offset,
+    )
+
+
+@router.post("/sessions/{session_id}/turns")
+def create_turn(session_id: int, payload: ChatTurnCreate) -> Any:
+    """Create a turn under a session."""
+    try:
+        return metric_store.create_chat_turn(session_id, payload)
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.get("/sessions/{session_id}/turns")
+def list_turns(session_id: int) -> List[Any]:
+    return metric_store.list_chat_turns(session_id)
+
+
+@router.get("/turns/{turn_id}")
+def get_turn(turn_id: int) -> Any:
+    turn = metric_store.get_chat_turn(turn_id)
+    if not turn:
+        raise HTTPException(status_code=404, detail="Turn not found")
+    return turn
+
+
+@router.post("/turns/{turn_id}/retrievals")
+def write_retrievals(turn_id: int, payload: ChatTurnRetrievalBatch) -> Any:
+    """Batch write retrieval records for a turn."""
+    count = metric_store.create_retrievals(turn_id, payload.retrievals)
+    return {"turn_id": turn_id, "inserted": count}
+
+
+@router.get("/turns/{turn_id}/retrievals")
+def list_retrievals(turn_id: int) -> List[Any]:
+    return metric_store.list_retrievals(turn_id)
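A sketch of the intended client flow for these endpoints, assuming the service runs on its default port and that rows come back with their database `id` column (consistent with the `SELECT *` queries in `metric_store` below):

```python
# Illustrative session -> turn -> retrievals -> close flow; values invented.
import httpx

base = "http://localhost:8000/api/v1/chat"
session = httpx.post(f"{base}/sessions", json={"user_id": 1}).json()
turn = httpx.post(
    f"{base}/sessions/{session['id']}/turns",
    json={"user_id": 1, "user_query": "monthly GMV trend", "intent": "METRIC_QUERY"},
).json()
httpx.post(
    f"{base}/turns/{turn['id']}/retrievals",
    json={"retrievals": [{"item_type": "METRIC", "item_id": "123", "rank_no": 1}]},
)
httpx.post(f"{base}/sessions/{session['id']}/close")
```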
app/routers/metrics.py (new file, 166 lines)

@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, List, Optional
+
+from fastapi import APIRouter, HTTPException, Query
+
+from app.schemas.metrics import (
+    MetricCreate,
+    MetricResultsWriteRequest,
+    MetricRunTrigger,
+    MetricScheduleCreate,
+    MetricScheduleUpdate,
+    MetricUpdate,
+)
+from app.services import metric_store
+
+
+router = APIRouter(prefix="/api/v1", tags=["metrics"])
+
+
+@router.post("/metrics")
+def create_metric(payload: MetricCreate) -> Any:
+    """Create a metric definition."""
+    try:
+        return metric_store.create_metric(payload)
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.post("/metrics/{metric_id}")
+def update_metric(metric_id: int, payload: MetricUpdate) -> Any:
+    """Update fields of a metric definition."""
+    try:
+        return metric_store.update_metric(metric_id, payload)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Metric not found")
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.get("/metrics/{metric_id}")
+def get_metric(metric_id: int) -> Any:
+    """Fetch a metric definition by id."""
+    metric = metric_store.get_metric(metric_id)
+    if not metric:
+        raise HTTPException(status_code=404, detail="Metric not found")
+    return metric
+
+
+@router.get("/metrics")
+def list_metrics(
+    biz_domain: Optional[str] = None,
+    is_active: Optional[bool] = None,
+    keyword: Optional[str] = Query(None, description="Search by code/name"),
+    limit: int = Query(100, ge=1, le=500),
+    offset: int = Query(0, ge=0),
+) -> List[Any]:
+    """List metrics with optional filters."""
+    return metric_store.list_metrics(
+        biz_domain=biz_domain,
+        is_active=is_active,
+        keyword=keyword,
+        limit=limit,
+        offset=offset,
+    )
+
+
+@router.post("/metric-schedules")
+def create_schedule(payload: MetricScheduleCreate) -> Any:
+    """Create a metric schedule."""
+    try:
+        return metric_store.create_metric_schedule(payload)
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.post("/metric-schedules/{schedule_id}")
+def update_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Any:
+    """Update a metric schedule."""
+    try:
+        return metric_store.update_metric_schedule(schedule_id, payload)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Schedule not found")
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.get("/metrics/{metric_id}/schedules")
+def list_schedules(metric_id: int) -> List[Any]:
+    """List schedules for one metric."""
+    return metric_store.list_schedules_for_metric(metric_id=metric_id)
+
+
+@router.post("/metric-runs/trigger")
+def trigger_run(payload: MetricRunTrigger) -> Any:
+    """Insert a run record (execution handled externally)."""
+    try:
+        return metric_store.trigger_metric_run(payload)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.get("/metric-runs")
+def list_runs(
+    metric_id: Optional[int] = None,
+    status: Optional[str] = None,
+    limit: int = Query(100, ge=1, le=500),
+    offset: int = Query(0, ge=0),
+) -> List[Any]:
+    """List run records."""
+    return metric_store.list_metric_runs(
+        metric_id=metric_id, status=status, limit=limit, offset=offset
+    )
+
+
+@router.get("/metric-runs/{run_id}")
+def get_run(run_id: int) -> Any:
+    """Fetch run details."""
+    run = metric_store.get_metric_run(run_id)
+    if not run:
+        raise HTTPException(status_code=404, detail="Run not found")
+    return run
+
+
+@router.post("/metric-results/{metric_id}")
+def write_results(metric_id: int, payload: MetricResultsWriteRequest) -> Any:
+    # Align path metric_id with payload to avoid mismatch.
+    if payload.metric_id != metric_id:
+        raise HTTPException(status_code=400, detail="metric_id in path/body mismatch")
+    try:
+        inserted = metric_store.write_metric_results(payload)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    return {"metric_id": metric_id, "inserted": inserted}
+
+
+@router.get("/metric-results")
+def query_results(
+    metric_id: int,
+    stat_from: Optional[datetime] = None,
+    stat_to: Optional[datetime] = None,
+    limit: int = Query(200, ge=1, le=1000),
+    offset: int = Query(0, ge=0),
+) -> List[Any]:
+    """Query metric results by time range."""
+    return metric_store.query_metric_results(
+        metric_id=metric_id,
+        stat_from=stat_from,
+        stat_to=stat_to,
+        limit=limit,
+        offset=offset,
+    )
+
+
+@router.get("/metric-results/latest")
+def latest_result(metric_id: int) -> Any:
+    """Fetch the latest metric result."""
+    result = metric_store.latest_metric_result(metric_id)
+    if not result:
+        raise HTTPException(status_code=404, detail="Metric result not found")
+    return result
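Similarly, a hypothetical metric lifecycle against these endpoints; payload values are invented, and the required fields follow MetricCreate in `app/schemas/metrics.py` below. The created row is assumed to carry its database `id`:

```python
# Illustrative create-metric + write-results flow; values invented.
import httpx

base = "http://localhost:8000/api/v1"
metric = httpx.post(f"{base}/metrics", json={
    "metric_code": "gmv_daily",
    "metric_name": "Daily GMV",
    "biz_domain": "sales",
    "base_sql": "SELECT order_date, SUM(amount) FROM orders GROUP BY order_date",
    "time_grain": "DAY",
    "dim_binding": ["order_date"],
    "update_strategy": "FULL",
}).json()
httpx.post(f"{base}/metric-results/{metric['id']}", json={
    "metric_id": metric["id"],
    "results": [{"stat_time": "2024-06-01T00:00:00", "metric_value": 1234.5}],
})
```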
app/schemas/chat.py (new file, 53 lines)

@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class ChatSessionCreate(BaseModel):
+    """Create a chat session to group multiple turns for a user."""
+    user_id: int = Field(..., description="User ID owning the session.")
+    session_uuid: Optional[str] = Field(None, description="Optional externally provided UUID.")
+    status: Optional[str] = Field("OPEN", description="Session status, default OPEN.")
+    end_time: Optional[datetime] = Field(None, description="Optional end time.")
+    ext_context: Optional[dict[str, Any]] = Field(None, description="Arbitrary business context.")
+
+
+class ChatSessionUpdate(BaseModel):
+    """Partial update for a chat session."""
+    status: Optional[str] = Field(None, description="New session status.")
+    end_time: Optional[datetime] = Field(None, description="Close time override.")
+    last_turn_id: Optional[int] = Field(None, description="Pointer to last chat turn.")
+    ext_context: Optional[dict[str, Any]] = Field(None, description="Context patch.")
+
+
+class ChatTurnCreate(BaseModel):
+    """Create a single chat turn with intent/SQL context."""
+    user_id: int = Field(..., description="User ID for this turn.")
+    user_query: str = Field(..., description="Raw user query content.")
+    intent: Optional[str] = Field(None, description="Intent tag such as METRIC_QUERY.")
+    ast_json: Optional[dict[str, Any]] = Field(None, description="Parsed AST payload.")
+    generated_sql: Optional[str] = Field(None, description="Final SQL text, if generated.")
+    sql_status: Optional[str] = Field(None, description="SQL generation/execution status.")
+    error_msg: Optional[str] = Field(None, description="Error message when SQL failed.")
+    main_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs referenced in this turn.")
+    created_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs created in this turn.")
+    end_time: Optional[datetime] = Field(None, description="Turn end time.")
+
+
+class ChatTurnRetrievalItem(BaseModel):
+    """Record of one retrieved item contributing to a turn."""
+    item_type: str = Field(..., description="METRIC/SNIPPET/CHAT etc.")
+    item_id: str = Field(..., description="Identifier such as metric_id or snippet_id.")
+    item_extra: Optional[dict[str, Any]] = Field(None, description="Additional context like column name.")
+    similarity_score: Optional[float] = Field(None, description="Similarity score.")
+    rank_no: Optional[int] = Field(None, description="Ranking position.")
+    used_in_reasoning: Optional[bool] = Field(False, description="Flag if used in reasoning.")
+    used_in_sql: Optional[bool] = Field(False, description="Flag if used in final SQL.")
+
+
+class ChatTurnRetrievalBatch(BaseModel):
+    """Batch insert wrapper for retrieval records."""
+    retrievals: List[ChatTurnRetrievalItem]
app/schemas/metrics.py (new file, 99 lines)

@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class MetricCreate(BaseModel):
+    """Create a metric definition with business and technical metadata."""
+    metric_code: str = Field(..., description="Internal metric code, unique.")
+    metric_name: str = Field(..., description="Display name.")
+    metric_aliases: Optional[List[str]] = Field(None, description="Optional alias list.")
+    biz_domain: str = Field(..., description="Business domain identifier.")
+    biz_desc: Optional[str] = Field(None, description="Business definition.")
+    chat_turn_id: Optional[int] = Field(None, description="Source chat turn ID.")
+    tech_desc: Optional[str] = Field(None, description="Technical definition.")
+    formula_expr: Optional[str] = Field(None, description="Formula expression text.")
+    base_sql: str = Field(..., description="Canonical SQL used to compute the metric.")
+    time_grain: str = Field(..., description="DAY/HOUR/WEEK/MONTH etc.")
+    dim_binding: List[str] = Field(..., description="Dimension columns bound to the metric.")
+    update_strategy: str = Field(..., description="FULL/INCR/REALTIME.")
+    schedule_id: Optional[int] = Field(None, description="Linked schedule id if any.")
+    schedule_type: Optional[int] = Field(None, description="Scheduler type identifier.")
+    is_active: bool = Field(True, description="Whether the metric is enabled.")
+    created_by: Optional[int] = Field(None, description="Creator user id.")
+    updated_by: Optional[int] = Field(None, description="Updater user id.")
+
+
+class MetricUpdate(BaseModel):
+    """Partial update for an existing metric definition."""
+    metric_name: Optional[str] = None
+    metric_aliases: Optional[List[str]] = None
+    biz_domain: Optional[str] = None
+    biz_desc: Optional[str] = None
+    tech_desc: Optional[str] = None
+    formula_expr: Optional[str] = None
+    base_sql: Optional[str] = None
+    time_grain: Optional[str] = None
+    dim_binding: Optional[List[str]] = None
+    update_strategy: Optional[str] = None
+    schedule_id: Optional[int] = None
+    schedule_type: Optional[int] = None
+    is_active: Optional[bool] = None
+    updated_by: Optional[int] = None
+
+
+class MetricScheduleCreate(BaseModel):
+    """Create a cron-based schedule for a metric."""
+    metric_id: int
+    cron_expr: str
+    enabled: bool = True
+    priority: int = 10
+    backfill_allowed: bool = True
+    max_runtime_sec: Optional[int] = None
+    retry_times: int = 0
+    owner_team: Optional[str] = None
+    owner_user_id: Optional[int] = None
+
+
+class MetricScheduleUpdate(BaseModel):
+    """Update fields of an existing metric schedule."""
+    cron_expr: Optional[str] = None
+    enabled: Optional[bool] = None
+    priority: Optional[int] = None
+    backfill_allowed: Optional[bool] = None
+    max_runtime_sec: Optional[int] = None
+    retry_times: Optional[int] = None
+    owner_team: Optional[str] = None
+    owner_user_id: Optional[int] = None
+
+
+class MetricRunTrigger(BaseModel):
+    """Trigger a metric run, optionally linking to a chat turn or schedule."""
+    metric_id: int
+    schedule_id: Optional[int] = None
+    source_turn_id: Optional[int] = None
+    data_time_from: Optional[datetime] = None
+    data_time_to: Optional[datetime] = None
+    metric_version: Optional[int] = None
+    base_sql_snapshot: Optional[str] = None
+    triggered_by: str = Field("API", description="SCHEDULER/MANUAL/API/QA_TURN")
+    triggered_at: Optional[datetime] = None
+
+
+class MetricResultItem(BaseModel):
+    """Single metric result row to be persisted."""
+    stat_time: datetime
+    metric_value: float
+    metric_version: Optional[int] = None
+    extra_dims: Optional[dict[str, Any]] = None
+    load_time: Optional[datetime] = None
+    data_version: Optional[int] = None
+
+
+class MetricResultsWriteRequest(BaseModel):
+    """Batch write request for metric results."""
+    metric_id: int
+    results: List[MetricResultItem]
app/schemas/rag.py (new file, 46 lines)

@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from typing import Any, List
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class RagItemPayload(BaseModel):
+    """Payload for creating or updating a single RAG item."""
+
+    model_config = ConfigDict(populate_by_name=True, extra="ignore")
+
+    id: int = Field(..., description="Unique identifier for the RAG item.")
+    workspace_id: int = Field(..., alias="workspaceId", description="Workspace identifier.")
+    name: str = Field(..., description="Readable name of the item.")
+    embedding_data: str = Field(..., alias="embeddingData", description="Serialized embedding payload.")
+    type: str = Field(..., description='Item type, e.g. "METRIC".')
+
+
+class RagDeleteRequest(BaseModel):
+    """Payload for deleting a single RAG item."""
+
+    model_config = ConfigDict(populate_by_name=True, extra="ignore")
+
+    id: int = Field(..., description="Identifier of the item to delete.")
+    type: str = Field(..., description="Item type matching the stored record.")
+
+
+class RagRetrieveRequest(BaseModel):
+    """Payload for retrieving RAG items by semantic query."""
+
+    model_config = ConfigDict(populate_by_name=True, extra="ignore")
+
+    query: str = Field(..., description="Search query text.")
+    num: int = Field(..., description="Number of items to return.")
+    workspace_id: int = Field(..., alias="workspaceId", description="Workspace scope for the search.")
+    type: str = Field(..., description="Item type to search, e.g. METRIC.")
+
+
+class RagRetrieveResponse(BaseModel):
+    """Generic RAG retrieval response wrapper."""
+
+    model_config = ConfigDict(extra="allow")
+
+    data: List[Any] = Field(default_factory=list, description="Retrieved items.")
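A small usage sketch showing the camelCase aliases these schemas expose when talking to the RAG service; this relies on pydantic v2 semantics, consistent with the ConfigDict and model_dump usage elsewhere in this changeset:

```python
# Illustrative round-trip; values invented.
payload = RagItemPayload(
    id=1,
    workspace_id=42,  # accepted by field name thanks to populate_by_name=True
    name="gmv_daily",
    embedding_data="GMV daily total by order_date",
    type="METRIC",
)
print(payload.model_dump(by_alias=True))
# {'id': 1, 'workspaceId': 42, 'name': 'gmv_daily', 'embeddingData': '...', 'type': 'METRIC'}
```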
app/services/__init__.py

@@ -1,3 +1,4 @@
 from .gateway import LLMGateway
+from .rag_client import RagAPIClient
 
-__all__ = ["LLMGateway"]
+__all__ = ["LLMGateway", "RagAPIClient"]
app/services/gateway.py (file header missing in the captured view; this is the module providing LLMGateway)

@@ -1,53 +1,93 @@
 from __future__ import annotations
 
-import os
-from typing import Dict, Type
+import logging
 
 import httpx
+from pydantic import ValidationError
 
-from app.exceptions import ProviderConfigurationError
-from app.models import LLMProvider, LLMRequest, LLMResponse
-from app.providers import (
-    AnthropicProvider,
-    DeepSeekProvider,
-    GeminiProvider,
-    LLMProviderClient,
-    OpenAIProvider,
-    OpenRouterProvider,
-    QwenProvider,
-)
+from app.exceptions import ProviderAPICallError
+from app.models import LLMChoice, LLMMessage, LLMRequest, LLMResponse
+from app.settings import NEW_API_AUTH_TOKEN, NEW_API_BASE_URL
+
+logger = logging.getLogger(__name__)
 
 
 class LLMGateway:
-    """Simple registry that dispatches chat requests to provider clients."""
+    """Forward chat requests to the configured new-api component."""
 
-    def __init__(self) -> None:
-        self._providers: Dict[LLMProvider, LLMProviderClient] = {}
-        self._factory: Dict[LLMProvider, Type[LLMProviderClient]] = {
-            LLMProvider.OPENAI: OpenAIProvider,
-            LLMProvider.ANTHROPIC: AnthropicProvider,
-            LLMProvider.OPENROUTER: OpenRouterProvider,
-            LLMProvider.GEMINI: GeminiProvider,
-            LLMProvider.QWEN: QwenProvider,
-            LLMProvider.DEEPSEEK: DeepSeekProvider,
-        }
-
-    def get_provider(self, provider: LLMProvider) -> LLMProviderClient:
-        if provider not in self._factory:
-            raise ProviderConfigurationError(f"Unsupported provider '{provider.value}'.")
-
-        if provider not in self._providers:
-            self._providers[provider] = self._build_provider(provider)
-        return self._providers[provider]
-
-    def _build_provider(self, provider: LLMProvider) -> LLMProviderClient:
-        provider_cls = self._factory[provider]
-        api_key_env = getattr(provider_cls, "api_key_env", None)
-        api_key = os.getenv(api_key_env) if api_key_env else None
-        return provider_cls(api_key)
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        auth_token: str | None = None,
+    ) -> None:
+        resolved_base = base_url or NEW_API_BASE_URL
+        self._base_url = resolved_base.rstrip("/")
+        self._auth_token = auth_token or NEW_API_AUTH_TOKEN
 
     async def chat(
         self, request: LLMRequest, client: httpx.AsyncClient
     ) -> LLMResponse:
-        provider_client = self.get_provider(request.provider)
-        return await provider_client.chat(request, client)
+        url = f"{self._base_url}/v1/chat/completions"
+        payload = request.model_dump(mode="json", exclude_none=True)
+        headers = {"Content-Type": "application/json"}
+        if self._auth_token:
+            headers["Authorization"] = f"Bearer {self._auth_token}"
+        logger.info("Forwarding chat request to new-api at %s", url)
+        try:
+            response = await client.post(url, json=payload, headers=headers)
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            status_code = exc.response.status_code if exc.response else None
+            response_text = exc.response.text if exc.response else ""
+            logger.error(
+                "new-api upstream returned %s: %s",
+                status_code,
+                response_text,
+                exc_info=True,
+            )
+            raise ProviderAPICallError(
+                "Chat completion request failed.",
+                status_code=status_code,
+                response_text=response_text,
+            ) from exc
+        except httpx.HTTPError as exc:
+            logger.error("new-api transport error: %s", exc, exc_info=True)
+            raise ProviderAPICallError(f"Chat completion request failed: {exc}") from exc
+
+        try:
+            data = response.json()
+        except ValueError as exc:
+            logger.error("new-api responded with invalid JSON.", exc_info=True)
+            raise ProviderAPICallError(
+                "Chat completion response was not valid JSON."
+            ) from exc
+
+        logger.info("new-api payload: %s", data)
+        normalized_choices: list[LLMChoice] = []
+        for idx, choice in enumerate(data.get("choices", []) or []):
+            message_payload = choice.get("message") or {}
+            message = LLMMessage(
+                role=message_payload.get("role", "assistant"),
+                content=message_payload.get("content", ""),
+            )
+            normalized_choices.append(
+                LLMChoice(index=choice.get("index", idx), message=message)
+            )
+
+        try:
+            normalized_response = LLMResponse(
+                provider=request.provider,
+                model=data.get("model", request.model),
+                choices=normalized_choices,
+                raw=data,
+            )
+            return normalized_response
+        except ValidationError as exc:
+            logger.error(
+                "new-api response did not match expected schema: %s", data, exc_info=True
+            )
+            raise ProviderAPICallError(
+                "Chat completion response was not in the expected format."
+            ) from exc
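A minimal sketch of driving the rewritten gateway directly, assuming the LLMRequest model accepts provider/model/messages as used elsewhere in this diff:

```python
# Illustrative direct use of LLMGateway; field names on LLMRequest are assumptions.
import asyncio
import httpx

from app.models import LLMRequest
from app.services import LLMGateway


async def main() -> None:
    gateway = LLMGateway()  # picks up NEW_API_BASE_URL / NEW_API_AUTH_TOKEN from settings
    async with httpx.AsyncClient() as client:
        response = await gateway.chat(
            LLMRequest(
                provider="deepseek",
                model="deepseek-chat",
                messages=[{"role": "user", "content": "ping"}],
            ),
            client,
        )
    print(response.choices[0].message.content)


asyncio.run(main())
```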
app/services/import_analysis.py (file header missing in the captured view)

@@ -22,13 +22,24 @@ from app.models import (
     LLMResponse,
     LLMRole,
 )
-from app.settings import DEFAULT_IMPORT_MODEL, get_supported_import_models
+from app.settings import (
+    DEFAULT_IMPORT_MODEL,
+    NEW_API_AUTH_TOKEN,
+    NEW_API_BASE_URL,
+    get_supported_import_models,
+)
+from app.utils.llm_usage import extract_usage
 
 logger = logging.getLogger(__name__)
 
-IMPORT_GATEWAY_BASE_URL = os.getenv(
-    "IMPORT_GATEWAY_BASE_URL", "http://localhost:8000"
-)
+IMPORT_GATEWAY_BASE_URL = os.getenv("IMPORT_GATEWAY_BASE_URL", NEW_API_BASE_URL)
+
+
+def build_import_gateway_headers() -> dict[str, str]:
+    headers = {"Content-Type": "application/json"}
+    if NEW_API_AUTH_TOKEN:
+        headers["Authorization"] = f"Bearer {NEW_API_AUTH_TOKEN}"
+    return headers
 
 
 def _env_float(name: str, default: float) -> float:
@@ -313,16 +324,18 @@ async def dispatch_import_analysis_job(
     url = f"{IMPORT_GATEWAY_BASE_URL.rstrip('/')}/v1/chat/completions"
 
     logger.info(
-        "Dispatching import %s to %s: %s",
+        "Dispatching import %s to %s using provider=%s model=%s",
         request.import_record_id,
         url,
-        json.dumps(payload, ensure_ascii=False),
+        payload.get("provider"),
+        payload.get("model"),
     )
 
     timeout = httpx.Timeout(IMPORT_CHAT_TIMEOUT_SECONDS)
+    headers = build_import_gateway_headers()
 
     try:
-        response = await client.post(url, json=payload, timeout=timeout)
+        response = await client.post(url, json=payload, timeout=timeout, headers=headers)
         response.raise_for_status()
     except httpx.HTTPStatusError as exc:
         body_preview = ""
@@ -347,9 +360,10 @@ async def dispatch_import_analysis_job(
         response.status_code,
     )
     logger.info(
-        "LLM response for %s: %s",
+        "LLM response received for %s (status %s, choices=%s)",
         request.import_record_id,
-        json.dumps(response_data, ensure_ascii=False),
+        response.status_code,
+        len(response_data.get("choices") or []),
     )
 
     try:
@@ -375,18 +389,6 @@ async def dispatch_import_analysis_job(
 
     return result
 
-# Compatibility shim: extract usage fields across different model providers.
-def extract_usage(resp_json: dict) -> dict:
-    usage = resp_json.get("usage") or resp_json.get("usageMetadata") or {}
-    return {
-        "prompt_tokens": usage.get("prompt_tokens") or usage.get("input_tokens") or usage.get("promptTokenCount"),
-        "completion_tokens": usage.get("completion_tokens") or usage.get("output_tokens") or usage.get("candidatesTokenCount"),
-        "total_tokens": usage.get("total_tokens") or usage.get("totalTokenCount") or (
-            (usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
-            + (usage.get("completion_tokens") or usage.get("output_tokens") or 0)
-        ),
-    }
-
 async def notify_import_analysis_callback(
     callback_url: str,
     payload: Dict[str, Any],
@@ -415,6 +417,7 @@ async def process_import_analysis_job(
     request: DataImportAnalysisJobRequest,
     client: httpx.AsyncClient,
 ) -> None:
+    # Run the import analysis and ensure the callback fires regardless of success/failure.
    try:
         payload = await dispatch_import_analysis_job(request, client)
     except ProviderAPICallError as exc:
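The removed `extract_usage` helper does not disappear: the new `from app.utils.llm_usage import extract_usage` import indicates it moved into a shared utility module. Presumably `app/utils/llm_usage.py` contains essentially the removed code (that file itself is not shown in this comparison):

```python
# Presumed contents of app/utils/llm_usage.py, reconstructed verbatim from the
# lines removed above; the real module is not part of this diff.
def extract_usage(resp_json: dict) -> dict:
    """Normalize token-usage fields across OpenAI-, Anthropic-, and Gemini-style payloads."""
    usage = resp_json.get("usage") or resp_json.get("usageMetadata") or {}
    return {
        "prompt_tokens": usage.get("prompt_tokens") or usage.get("input_tokens") or usage.get("promptTokenCount"),
        "completion_tokens": usage.get("completion_tokens") or usage.get("output_tokens") or usage.get("candidatesTokenCount"),
        "total_tokens": usage.get("total_tokens") or usage.get("totalTokenCount") or (
            (usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
            + (usage.get("completion_tokens") or usage.get("output_tokens") or 0)
        ),
    }
```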
842
app/services/metric_store.py
Normal file
842
app/services/metric_store.py
Normal file
@ -0,0 +1,842 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from sqlalchemy import text
|
||||||
|
from sqlalchemy.engine import Row
|
||||||
|
|
||||||
|
from app.db import get_engine
|
||||||
|
from app.schemas.chat import (
|
||||||
|
ChatSessionCreate,
|
||||||
|
ChatSessionUpdate,
|
||||||
|
ChatTurnCreate,
|
||||||
|
ChatTurnRetrievalItem,
|
||||||
|
)
|
||||||
|
from app.schemas.metrics import (
|
||||||
|
MetricCreate,
|
||||||
|
MetricResultItem,
|
||||||
|
MetricResultsWriteRequest,
|
||||||
|
MetricRunTrigger,
|
||||||
|
MetricScheduleCreate,
|
||||||
|
MetricScheduleUpdate,
|
||||||
|
MetricUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Common helpers
|
||||||
|
def _json_dump(value: Any) -> Optional[str]:
|
||||||
|
"""Safe JSON dumper; returns None on failure to keep DB writes simple."""
|
||||||
|
if value is None:
|
||||||
|
return None
|
||||||
|
if isinstance(value, str):
|
||||||
|
return value
|
||||||
|
try:
|
||||||
|
return json.dumps(value, ensure_ascii=False)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json_fields(payload: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
|
||||||
|
"""Parse select fields from JSON strings into dict/list for responses."""
|
||||||
|
for field in fields:
|
||||||
|
raw = payload.get(field)
|
||||||
|
if raw is None or isinstance(raw, (dict, list)):
|
||||||
|
continue
|
||||||
|
if isinstance(raw, (bytes, bytearray)):
|
||||||
|
raw = raw.decode("utf-8", errors="ignore")
|
||||||
|
if isinstance(raw, str):
|
||||||
|
try:
|
||||||
|
payload[field] = json.loads(raw)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return payload
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_dict(row: Row[Any]) -> Dict[str, Any]:
|
||||||
|
return dict(row._mapping)
|
||||||
|
|
||||||
|
|
||||||
|
# Chat sessions & turns
|
||||||
|
def create_chat_session(payload: ChatSessionCreate) -> Dict[str, Any]:
|
||||||
|
"""Create a chat session row with optional external UUID."""
|
||||||
|
engine = get_engine()
|
||||||
|
session_uuid = payload.session_uuid or str(uuid4())
|
||||||
|
now = datetime.utcnow()
|
||||||
|
params = {
|
||||||
|
"user_id": payload.user_id,
|
||||||
|
"session_uuid": session_uuid,
|
||||||
|
"end_time": payload.end_time,
|
||||||
|
"status": payload.status or "OPEN",
|
||||||
|
"ext_context": _json_dump(payload.ext_context),
|
||||||
|
}
|
||||||
|
with engine.begin() as conn:
|
||||||
|
result = conn.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
INSERT INTO chat_session (user_id, session_uuid, end_time, status, ext_context)
|
||||||
|
VALUES (:user_id, :session_uuid, :end_time, :status, :ext_context)
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
params,
|
||||||
|
)
|
||||||
|
session_id = result.lastrowid
|
||||||
|
row = conn.execute(
|
||||||
|
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
|
||||||
|
).first()
|
||||||
|
if not row:
|
||||||
|
raise RuntimeError("Failed to create chat session.")
|
||||||
|
data = _row_to_dict(row)
|
||||||
|
_parse_json_fields(data, ["ext_context"])
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def update_chat_session(session_id: int, payload: ChatSessionUpdate) -> Dict[str, Any]:
|
||||||
|
"""Patch selected chat session fields."""
|
||||||
|
updates = {}
|
||||||
|
if payload.status is not None:
|
||||||
|
updates["status"] = payload.status
|
||||||
|
if payload.end_time is not None:
|
||||||
|
updates["end_time"] = payload.end_time
|
||||||
|
if payload.last_turn_id is not None:
|
||||||
|
updates["last_turn_id"] = payload.last_turn_id
|
||||||
|
if payload.ext_context is not None:
|
||||||
|
updates["ext_context"] = _json_dump(payload.ext_context)
|
||||||
|
|
||||||
|
if not updates:
|
||||||
|
current = get_chat_session(session_id)
|
||||||
|
if not current:
|
||||||
|
raise KeyError(f"Session {session_id} not found.")
|
||||||
|
return current
|
||||||
|
|
||||||
|
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
|
||||||
|
params = dict(updates)
|
||||||
|
params["id"] = session_id
|
||||||
|
|
||||||
|
engine = get_engine()
|
||||||
|
with engine.begin() as conn:
|
||||||
|
conn.execute(
|
||||||
|
text(f"UPDATE chat_session SET {set_clause} WHERE id=:id"),
|
||||||
|
params,
|
||||||
|
)
|
||||||
|
row = conn.execute(
|
||||||
|
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
|
||||||
|
).first()
|
||||||
|
if not row:
|
||||||
|
raise KeyError(f"Session {session_id} not found.")
|
||||||
|
data = _row_to_dict(row)
|
||||||
|
_parse_json_fields(data, ["ext_context"])
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def close_chat_session(session_id: int) -> Dict[str, Any]:
|
||||||
|
"""Mark a chat session as CLOSED with end_time."""
|
||||||
|
now = datetime.utcnow()
|
||||||
|
return update_chat_session(
|
||||||
|
session_id,
|
||||||
|
ChatSessionUpdate(status="CLOSED", end_time=now),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_chat_session(session_id: int) -> Optional[Dict[str, Any]]:
|
||||||
|
engine = get_engine()
|
||||||
|
with engine.begin() as conn:
|
||||||
|
row = conn.execute(
|
||||||
|
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
|
||||||
|
).first()
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
data = _row_to_dict(row)
|
||||||
|
_parse_json_fields(data, ["ext_context"])
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def list_chat_sessions(
|
||||||
|
*,
|
||||||
|
user_id: Optional[int] = None,
|
||||||
|
status: Optional[str] = None,
|
||||||
|
start_from: Optional[datetime] = None,
|
||||||
|
start_to: Optional[datetime] = None,
|
||||||
|
limit: int = 50,
|
||||||
|
offset: int = 0,
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""List chat sessions with optional filters and pagination."""
|
||||||
|
conditions = []
|
||||||
|
params: Dict[str, Any] = {"limit": limit, "offset": offset}
|
||||||
|
if user_id is not None:
|
||||||
|
conditions.append("user_id=:user_id")
|
||||||
|
params["user_id"] = user_id
|
||||||
|
if status is not None:
|
||||||
|
conditions.append("status=:status")
|
||||||
|
params["status"] = status
|
||||||
|
if start_from is not None:
|
||||||
|
conditions.append("created_at>=:start_from")
|
||||||
|
params["start_from"] = start_from
|
||||||
|
if start_to is not None:
|
||||||
|
conditions.append("created_at<=:start_to")
|
||||||
|
params["start_to"] = start_to
|
||||||
|
|
||||||
|
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
|
||||||
|
engine = get_engine()
|
||||||
|
with engine.begin() as conn:
|
||||||
|
rows = conn.execute(
|
||||||
|
text(
|
||||||
|
f"SELECT * FROM chat_session {where_clause} "
|
||||||
|
"ORDER BY created_at DESC LIMIT :limit OFFSET :offset"
|
||||||
|
),
|
||||||
|
params,
|
||||||
|
).fetchall()
|
||||||
|
results: List[Dict[str, Any]] = []
|
||||||
|
for row in rows:
|
||||||
|
data = _row_to_dict(row)
|
||||||
|
_parse_json_fields(data, ["ext_context"])
|
||||||
|
results.append(data)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def _next_turn_no(conn, session_id: int) -> int:
|
||||||
|
row = conn.execute(
|
||||||
|
text("SELECT COALESCE(MAX(turn_no), 0) + 1 AS next_no FROM chat_turn WHERE session_id=:sid"),
|
||||||
|
{"sid": session_id},
|
||||||
|
).first()
|
||||||
|
if not row:
|
||||||
|
return 1
|
||||||
|
return int(row._mapping["next_no"])
|
||||||
|
|
||||||
|
|
||||||
|
def create_chat_turn(session_id: int, payload: ChatTurnCreate) -> Dict[str, Any]:
    """Insert a chat turn and auto-increment turn number within the session."""
    engine = get_engine()
    now = datetime.utcnow()
    params = {
        "session_id": session_id,
        "user_id": payload.user_id,
        "user_query": payload.user_query,
        "intent": payload.intent,
        "ast_json": _json_dump(payload.ast_json),
        "generated_sql": payload.generated_sql,
        "sql_status": payload.sql_status,
        "error_msg": payload.error_msg,
        "main_metric_ids": _json_dump(payload.main_metric_ids),
        "created_metric_ids": _json_dump(payload.created_metric_ids),
        "end_time": payload.end_time,
    }
    with engine.begin() as conn:
        turn_no = _next_turn_no(conn, session_id)
        params["turn_no"] = turn_no
        result = conn.execute(
            text(
                """
                INSERT INTO chat_turn (
                    session_id, turn_no, user_id,
                    user_query, intent, ast_json,
                    generated_sql, sql_status, error_msg,
                    main_metric_ids, created_metric_ids,
                    end_time
                )
                VALUES (
                    :session_id, :turn_no, :user_id,
                    :user_query, :intent, :ast_json,
                    :generated_sql, :sql_status, :error_msg,
                    :main_metric_ids, :created_metric_ids,
                    :end_time
                )
                """
            ),
            params,
        )
        turn_id = result.lastrowid
        row = conn.execute(
            text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
        ).first()
        if not row:
            raise RuntimeError("Failed to create chat turn.")
        data = _row_to_dict(row)
        _parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
        return data


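A minimal usage sketch of the turn-number auto-increment, assuming the remaining `ChatTurnCreate` fields are optional (the model definition is not part of this diff):

```python
turn = create_chat_turn(
    session_id=42,
    payload=ChatTurnCreate(
        user_id=7,
        user_query="How many orders shipped last week?",
        intent="metric_query",
        sql_status="PENDING",
    ),
)
assert turn["turn_no"] == 1  # then 2, 3, ... on subsequent turns of session 42
```
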
def get_chat_turn(turn_id: int) -> Optional[Dict[str, Any]]:
    engine = get_engine()
    with engine.begin() as conn:
        row = conn.execute(
            text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
        ).first()
        if not row:
            return None
        data = _row_to_dict(row)
        _parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
        return data


def list_chat_turns(session_id: int) -> List[Dict[str, Any]]:
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                "SELECT * FROM chat_turn WHERE session_id=:session_id ORDER BY turn_no ASC"
            ),
            {"session_id": session_id},
        ).fetchall()
        results: List[Dict[str, Any]] = []
        for row in rows:
            data = _row_to_dict(row)
            _parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
            results.append(data)
        return results


def create_retrievals(turn_id: int, retrievals: List[ChatTurnRetrievalItem]) -> int:
    """Batch insert retrieval records for a turn."""
    if not retrievals:
        return 0
    engine = get_engine()
    params_list = []
    for item in retrievals:
        params_list.append(
            {
                "turn_id": turn_id,
                "item_type": item.item_type,
                "item_id": item.item_id,
                "item_extra": _json_dump(item.item_extra),
                "similarity_score": item.similarity_score,
                "rank_no": item.rank_no,
                "used_in_reasoning": 1 if item.used_in_reasoning else 0,
                "used_in_sql": 1 if item.used_in_sql else 0,
            }
        )
    with engine.begin() as conn:
        conn.execute(
            text(
                """
                INSERT INTO chat_turn_retrieval (
                    turn_id, item_type, item_id, item_extra,
                    similarity_score, rank_no, used_in_reasoning, used_in_sql
                )
                VALUES (
                    :turn_id, :item_type, :item_id, :item_extra,
                    :similarity_score, :rank_no, :used_in_reasoning, :used_in_sql
                )
                """
            ),
            params_list,
        )
    return len(retrievals)


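Because `conn.execute()` receives a list of parameter dicts, SQLAlchemy runs the INSERT in executemany mode, one statement for the whole batch. A usage sketch, assuming the remaining `ChatTurnRetrievalItem` fields are optional:

```python
inserted = create_retrievals(
    turn_id=turn["id"],
    retrievals=[
        ChatTurnRetrievalItem(
            item_type="SNIPPET",
            item_id=1001,
            similarity_score=0.87,
            rank_no=1,
            used_in_reasoning=True,
            used_in_sql=False,
        )
    ],
)
assert inserted == 1
```
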
def list_retrievals(turn_id: int) -> List[Dict[str, Any]]:
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                "SELECT * FROM chat_turn_retrieval WHERE turn_id=:turn_id ORDER BY created_at ASC, rank_no ASC"
            ),
            {"turn_id": turn_id},
        ).fetchall()
        results: List[Dict[str, Any]] = []
        for row in rows:
            data = _row_to_dict(row)
            _parse_json_fields(data, ["item_extra"])
            data["used_in_reasoning"] = bool(data.get("used_in_reasoning"))
            data["used_in_sql"] = bool(data.get("used_in_sql"))
            results.append(data)
        return results


# Metric registry
def _metric_sql_hash(sql_text: str) -> str:
    """Compute a stable hash to detect SQL definition changes."""
    return hashlib.md5(sql_text.encode("utf-8")).hexdigest()


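The hash is a pure function of the SQL text, so any edit to `base_sql` yields a new `sql_hash` that can be detected by comparing stored values:

```python
old_hash = _metric_sql_hash("SELECT COUNT(*) FROM orders")
new_hash = _metric_sql_hash("SELECT COUNT(*) FROM orders WHERE status = 'PAID'")
assert old_hash != new_hash  # definition changed
assert old_hash == _metric_sql_hash("SELECT COUNT(*) FROM orders")  # stable across runs
```
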
def create_metric(payload: MetricCreate) -> Dict[str, Any]:
    """Insert a new metric definition; version starts at 1."""
    engine = get_engine()
    now = datetime.utcnow()
    sql_hash = _metric_sql_hash(payload.base_sql)
    params = {
        "metric_code": payload.metric_code,
        "metric_name": payload.metric_name,
        "metric_aliases": _json_dump(payload.metric_aliases),
        "biz_domain": payload.biz_domain,
        "biz_desc": payload.biz_desc,
        "chat_turn_id": payload.chat_turn_id,
        "tech_desc": payload.tech_desc,
        "formula_expr": payload.formula_expr,
        "base_sql": payload.base_sql,
        "time_grain": payload.time_grain,
        "dim_binding": _json_dump(payload.dim_binding),
        "update_strategy": payload.update_strategy,
        "schedule_id": payload.schedule_id,
        "schedule_type": payload.schedule_type,
        "version": 1,
        "is_active": 1 if payload.is_active else 0,
        "sql_hash": sql_hash,
        "created_by": payload.created_by,
        "updated_by": payload.updated_by,
        "created_at": now,
        "updated_at": now,
    }
    with engine.begin() as conn:
        result = conn.execute(
            text(
                """
                INSERT INTO metric_def (
                    metric_code, metric_name, metric_aliases, biz_domain, biz_desc,
                    chat_turn_id, tech_desc, formula_expr, base_sql,
                    time_grain, dim_binding, update_strategy,
                    schedule_id, schedule_type, version, is_active,
                    sql_hash, created_by, updated_by, created_at, updated_at
                )
                VALUES (
                    :metric_code, :metric_name, :metric_aliases, :biz_domain, :biz_desc,
                    :chat_turn_id, :tech_desc, :formula_expr, :base_sql,
                    :time_grain, :dim_binding, :update_strategy,
                    :schedule_id, :schedule_type, :version, :is_active,
                    :sql_hash, :created_by, :updated_by, :created_at, :updated_at
                )
                """
            ),
            params,
        )
        metric_id = result.lastrowid
        row = conn.execute(
            text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
        ).first()
        if not row:
            raise RuntimeError("Failed to create metric definition.")
        data = _row_to_dict(row)
        _parse_json_fields(data, ["metric_aliases", "dim_binding"])
        data["is_active"] = bool(data.get("is_active"))
        return data


def update_metric(metric_id: int, payload: MetricUpdate) -> Dict[str, Any]:
    """Update mutable fields of a metric definition and refresh sql_hash when needed."""
    updates: Dict[str, Any] = {}
    for field in (
        "metric_name",
        "biz_domain",
        "biz_desc",
        "tech_desc",
        "formula_expr",
        "base_sql",
        "time_grain",
        "update_strategy",
        "schedule_id",
        "schedule_type",
        "updated_by",
    ):
        value = getattr(payload, field)
        if value is not None:
            updates[field] = value

    if payload.metric_aliases is not None:
        updates["metric_aliases"] = _json_dump(payload.metric_aliases)
    if payload.dim_binding is not None:
        updates["dim_binding"] = _json_dump(payload.dim_binding)
    if payload.is_active is not None:
        updates["is_active"] = 1 if payload.is_active else 0
    if payload.base_sql is not None:
        updates["sql_hash"] = _metric_sql_hash(payload.base_sql)

    if not updates:
        current = get_metric(metric_id)
        if not current:
            raise KeyError(f"Metric {metric_id} not found.")
        return current

    updates["updated_at"] = datetime.utcnow()
    set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
    params = dict(updates)
    params["id"] = metric_id

    engine = get_engine()
    with engine.begin() as conn:
        conn.execute(
            text(f"UPDATE metric_def SET {set_clause} WHERE id=:id"),
            params,
        )
        row = conn.execute(
            text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
        ).first()
        if not row:
            raise KeyError(f"Metric {metric_id} not found.")
        data = _row_to_dict(row)
        _parse_json_fields(data, ["metric_aliases", "dim_binding"])
        data["is_active"] = bool(data.get("is_active"))
        return data


def get_metric(metric_id: int) -> Optional[Dict[str, Any]]:
    engine = get_engine()
    with engine.begin() as conn:
        row = conn.execute(
            text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
        ).first()
        if not row:
            return None
        data = _row_to_dict(row)
        _parse_json_fields(data, ["metric_aliases", "dim_binding"])
        data["is_active"] = bool(data.get("is_active"))
        return data


def list_metrics(
    *,
    biz_domain: Optional[str] = None,
    is_active: Optional[bool] = None,
    keyword: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """List metric definitions with simple filters and pagination."""
    conditions = []
    params: Dict[str, Any] = {"limit": limit, "offset": offset}
    if biz_domain:
        conditions.append("biz_domain=:biz_domain")
        params["biz_domain"] = biz_domain
    if is_active is not None:
        conditions.append("is_active=:is_active")
        params["is_active"] = 1 if is_active else 0
    if keyword:
        conditions.append("(metric_code LIKE :kw OR metric_name LIKE :kw)")
        params["kw"] = f"%{keyword}%"

    where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                f"SELECT * FROM metric_def {where_clause} "
                "ORDER BY updated_at DESC LIMIT :limit OFFSET :offset"
            ),
            params,
        ).fetchall()
        results: List[Dict[str, Any]] = []
        for row in rows:
            data = _row_to_dict(row)
            _parse_json_fields(data, ["metric_aliases", "dim_binding"])
            data["is_active"] = bool(data.get("is_active"))
            results.append(data)
        return results


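Note that `keyword` is wrapped in `%...%` without escaping, so `%` and `_` in user input act as LIKE wildcards. A hedged sketch of an escaping helper (hypothetical, not in this diff; MySQL's default LIKE escape character is backslash):

```python
def _escape_like(term: str) -> str:
    # Treat user input literally inside LIKE patterns.
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

# params["kw"] = f"%{_escape_like(keyword)}%"
```
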
# Metric schedules
def create_metric_schedule(payload: MetricScheduleCreate) -> Dict[str, Any]:
    """Create a schedule record for a metric."""
    engine = get_engine()
    params = {
        "metric_id": payload.metric_id,
        "cron_expr": payload.cron_expr,
        "enabled": 1 if payload.enabled else 0,
        "priority": payload.priority,
        "backfill_allowed": 1 if payload.backfill_allowed else 0,
        "max_runtime_sec": payload.max_runtime_sec,
        "retry_times": payload.retry_times,
        "owner_team": payload.owner_team,
        "owner_user_id": payload.owner_user_id,
    }
    with engine.begin() as conn:
        result = conn.execute(
            text(
                """
                INSERT INTO metric_schedule (
                    metric_id, cron_expr, enabled, priority,
                    backfill_allowed, max_runtime_sec, retry_times,
                    owner_team, owner_user_id
                ) VALUES (
                    :metric_id, :cron_expr, :enabled, :priority,
                    :backfill_allowed, :max_runtime_sec, :retry_times,
                    :owner_team, :owner_user_id
                )
                """
            ),
            params,
        )
        schedule_id = result.lastrowid
        row = conn.execute(
            text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
        ).first()
        if not row:
            raise RuntimeError("Failed to create metric schedule.")
        data = _row_to_dict(row)
        data["enabled"] = bool(data.get("enabled"))
        data["backfill_allowed"] = bool(data.get("backfill_allowed"))
        return data


def update_metric_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Dict[str, Any]:
    updates: Dict[str, Any] = {}
    for field in (
        "cron_expr",
        "priority",
        "max_runtime_sec",
        "retry_times",
        "owner_team",
        "owner_user_id",
    ):
        value = getattr(payload, field)
        if value is not None:
            updates[field] = value
    if payload.enabled is not None:
        updates["enabled"] = 1 if payload.enabled else 0
    if payload.backfill_allowed is not None:
        updates["backfill_allowed"] = 1 if payload.backfill_allowed else 0

    if not updates:
        current = list_schedules_for_metric(schedule_id=schedule_id)
        if current:
            return current[0]
        raise KeyError(f"Schedule {schedule_id} not found.")

    set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
    params = dict(updates)
    params["id"] = schedule_id

    engine = get_engine()
    with engine.begin() as conn:
        conn.execute(
            text(f"UPDATE metric_schedule SET {set_clause} WHERE id=:id"),
            params,
        )
        row = conn.execute(
            text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
        ).first()
        if not row:
            raise KeyError(f"Schedule {schedule_id} not found.")
        data = _row_to_dict(row)
        data["enabled"] = bool(data.get("enabled"))
        data["backfill_allowed"] = bool(data.get("backfill_allowed"))
        return data


def list_schedules_for_metric(metric_id: Optional[int] = None, schedule_id: Optional[int] = None) -> List[Dict[str, Any]]:
    conditions = []
    params: Dict[str, Any] = {}
    if metric_id is not None:
        conditions.append("metric_id=:metric_id")
        params["metric_id"] = metric_id
    if schedule_id is not None:
        conditions.append("id=:id")
        params["id"] = schedule_id
    where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(
            text(f"SELECT * FROM metric_schedule {where_clause} ORDER BY id DESC"),
            params,
        ).fetchall()
        results: List[Dict[str, Any]] = []
        for row in rows:
            data = _row_to_dict(row)
            data["enabled"] = bool(data.get("enabled"))
            data["backfill_allowed"] = bool(data.get("backfill_allowed"))
            results.append(data)
        return results


# Metric runs
def trigger_metric_run(payload: MetricRunTrigger) -> Dict[str, Any]:
    """Create a metric_job_run entry; execution is orchestrated elsewhere."""
    metric = get_metric(payload.metric_id)
    if not metric:
        raise KeyError(f"Metric {payload.metric_id} not found.")
    metric_version = payload.metric_version or metric.get("version", 1)
    base_sql_snapshot = payload.base_sql_snapshot or metric.get("base_sql")
    triggered_at = payload.triggered_at or datetime.utcnow()

    params = {
        "metric_id": payload.metric_id,
        "schedule_id": payload.schedule_id,
        "source_turn_id": payload.source_turn_id,
        "data_time_from": payload.data_time_from,
        "data_time_to": payload.data_time_to,
        "metric_version": metric_version,
        "base_sql_snapshot": base_sql_snapshot,
        "status": "RUNNING",
        "error_msg": None,
        "affected_rows": None,
        "runtime_ms": None,
        "triggered_by": payload.triggered_by,
        "triggered_at": triggered_at,
        "started_at": None,
        "finished_at": None,
    }
    engine = get_engine()
    with engine.begin() as conn:
        result = conn.execute(
            text(
                """
                INSERT INTO metric_job_run (
                    metric_id, schedule_id, source_turn_id,
                    data_time_from, data_time_to, metric_version,
                    base_sql_snapshot, status, error_msg,
                    affected_rows, runtime_ms,
                    triggered_by, triggered_at, started_at, finished_at
                ) VALUES (
                    :metric_id, :schedule_id, :source_turn_id,
                    :data_time_from, :data_time_to, :metric_version,
                    :base_sql_snapshot, :status, :error_msg,
                    :affected_rows, :runtime_ms,
                    :triggered_by, :triggered_at, :started_at, :finished_at
                )
                """
            ),
            params,
        )
        run_id = result.lastrowid
        row = conn.execute(
            text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
        ).first()
        if not row:
            raise RuntimeError("Failed to create metric job run.")
        return _row_to_dict(row)


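A minimal trigger sketch, assuming the unspecified `MetricRunTrigger` fields are optional. The row starts in `RUNNING`; an external worker is expected to execute `base_sql_snapshot` and update status, runtime, and row counts afterwards:

```python
run = trigger_metric_run(
    MetricRunTrigger(metric_id=metric["id"], triggered_by="scheduler")
)
assert run["status"] == "RUNNING"
assert run["base_sql_snapshot"] == metric["base_sql"]  # snapshot pins the definition
```
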
def get_metric_run(run_id: int) -> Optional[Dict[str, Any]]:
    engine = get_engine()
    with engine.begin() as conn:
        row = conn.execute(
            text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
        ).first()
        if not row:
            return None
        return _row_to_dict(row)


def list_metric_runs(
    *,
    metric_id: Optional[int] = None,
    status: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    conditions = []
    params: Dict[str, Any] = {"limit": limit, "offset": offset}
    if metric_id is not None:
        conditions.append("metric_id=:metric_id")
        params["metric_id"] = metric_id
    if status is not None:
        conditions.append("status=:status")
        params["status"] = status
    where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                f"SELECT * FROM metric_job_run {where_clause} "
                "ORDER BY triggered_at DESC LIMIT :limit OFFSET :offset"
            ),
            params,
        ).fetchall()
        return [_row_to_dict(row) for row in rows]


# Metric results
def write_metric_results(payload: MetricResultsWriteRequest) -> int:
    """Bulk insert metric_result rows for a metric/version."""
    metric = get_metric(payload.metric_id)
    if not metric:
        raise KeyError(f"Metric {payload.metric_id} not found.")
    default_version = metric.get("version", 1)
    now = datetime.utcnow()
    rows: List[Dict[str, Any]] = []
    for item in payload.results:
        rows.append(
            {
                "metric_id": payload.metric_id,
                "metric_version": item.metric_version or default_version,
                "stat_time": item.stat_time,
                "extra_dims": _json_dump(item.extra_dims),
                "metric_value": item.metric_value,
                "load_time": item.load_time or now,
                "data_version": item.data_version,
            }
        )
    if not rows:
        return 0
    engine = get_engine()
    with engine.begin() as conn:
        conn.execute(
            text(
                """
                INSERT INTO metric_result (
                    metric_id, metric_version, stat_time,
                    extra_dims, metric_value, load_time, data_version
                ) VALUES (
                    :metric_id, :metric_version, :stat_time,
                    :extra_dims, :metric_value, :load_time, :data_version
                )
                """
            ),
            rows,
        )
    return len(rows)


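A write sketch; the result item model is not shown in this diff, so `MetricResultItem` below is a hypothetical stand-in for whatever `MetricResultsWriteRequest.results` contains (its fields follow the loop above):

```python
from datetime import datetime

written = write_metric_results(
    MetricResultsWriteRequest(
        metric_id=metric["id"],
        results=[
            MetricResultItem(  # hypothetical class name
                stat_time=datetime(2024, 1, 1),
                metric_value=123.45,
            )
        ],
    )
)
assert written == 1  # metric_version defaults to the metric's current version
```
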
def query_metric_results(
    *,
    metric_id: int,
    stat_from: Optional[datetime] = None,
    stat_to: Optional[datetime] = None,
    limit: int = 200,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    conditions = ["metric_id=:metric_id"]
    params: Dict[str, Any] = {
        "metric_id": metric_id,
        "limit": limit,
        "offset": offset,
    }
    if stat_from is not None:
        conditions.append("stat_time>=:stat_from")
        params["stat_from"] = stat_from
    if stat_to is not None:
        conditions.append("stat_time<=:stat_to")
        params["stat_to"] = stat_to

    where_clause = "WHERE " + " AND ".join(conditions)
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                f"SELECT * FROM metric_result {where_clause} "
                "ORDER BY stat_time DESC LIMIT :limit OFFSET :offset"
            ),
            params,
        ).fetchall()
        results: List[Dict[str, Any]] = []
        for row in rows:
            data = _row_to_dict(row)
            _parse_json_fields(data, ["extra_dims"])
            results.append(data)
        return results


def latest_metric_result(metric_id: int) -> Optional[Dict[str, Any]]:
    engine = get_engine()
    with engine.begin() as conn:
        row = conn.execute(
            text(
                """
                SELECT * FROM metric_result
                WHERE metric_id=:metric_id
                ORDER BY stat_time DESC
                LIMIT 1
                """
            ),
            {"metric_id": metric_id},
        ).first()
        if not row:
            return None
        data = _row_to_dict(row)
        _parse_json_fields(data, ["extra_dims"])
        return data

83  app/services/rag_client.py  Normal file
@@ -0,0 +1,83 @@
from __future__ import annotations

import logging
from typing import Any, Sequence

import httpx

from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.settings import RAG_API_AUTH_TOKEN, RAG_API_BASE_URL


logger = logging.getLogger(__name__)


class RagAPIClient:
    """Thin async client wrapper around the RAG endpoints described in doc/rag-api.md."""

    def __init__(self, *, base_url: str | None = None, auth_token: str | None = None) -> None:
        resolved_base = base_url or RAG_API_BASE_URL
        self._base_url = resolved_base.rstrip("/")
        self._auth_token = auth_token or RAG_API_AUTH_TOKEN

    def _headers(self) -> dict[str, str]:
        headers = {"Content-Type": "application/json"}
        if self._auth_token:
            headers["Authorization"] = f"Bearer {self._auth_token}"
        return headers

    async def _post(
        self,
        client: httpx.AsyncClient,
        path: str,
        payload: Any,
    ) -> Any:
        url = f"{self._base_url}{path}"
        try:
            response = await client.post(url, json=payload, headers=self._headers())
            response.raise_for_status()
        except httpx.HTTPStatusError as exc:
            status_code = exc.response.status_code if exc.response else None
            response_text = exc.response.text if exc.response else ""
            logger.error(
                "RAG API responded with an error (%s) for %s: %s",
                status_code,
                url,
                response_text,
                exc_info=True,
            )
            raise ProviderAPICallError(
                "RAG API call failed.",
                status_code=status_code,
                response_text=response_text,
            ) from exc
        except httpx.HTTPError as exc:
            logger.error("Transport error calling RAG API %s: %s", url, exc, exc_info=True)
            raise ProviderAPICallError(f"RAG API call failed: {exc}") from exc

        try:
            return response.json()
        except ValueError:
            logger.warning("RAG API returned non-JSON response for %s; returning raw text.", url)
            return {"raw": response.text}

    async def add(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
        body = payload.model_dump(by_alias=True, exclude_none=True)
        return await self._post(client, "/rag/add", body)

    async def add_batch(self, client: httpx.AsyncClient, items: Sequence[RagItemPayload]) -> Any:
        body = [item.model_dump(by_alias=True, exclude_none=True) for item in items]
        return await self._post(client, "/rag/addBatch", body)

    async def update(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
        body = payload.model_dump(by_alias=True, exclude_none=True)
        return await self._post(client, "/rag/update", body)

    async def delete(self, client: httpx.AsyncClient, payload: RagDeleteRequest) -> Any:
        body = payload.model_dump(by_alias=True, exclude_none=True)
        return await self._post(client, "/rag/delete", body)

    async def retrieve(self, client: httpx.AsyncClient, payload: RagRetrieveRequest) -> Any:
        body = payload.model_dump(by_alias=True, exclude_none=True)
        return await self._post(client, "/rag/retrieve", body)

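A usage sketch for the client, assuming `RagRetrieveRequest` follows the same camelCase-alias convention as `RagItemPayload` (its fields are not shown in this diff):

```python
import asyncio

import httpx

from app.schemas.rag import RagRetrieveRequest
from app.services.rag_client import RagAPIClient


async def main() -> None:
    rag = RagAPIClient()  # falls back to RAG_API_BASE_URL / RAG_API_AUTH_TOKEN
    async with httpx.AsyncClient(timeout=30) as client:
        result = await rag.retrieve(
            client,
            RagRetrieveRequest(workspaceId=1, query="daily active users", topK=5),
        )
        print(result)


asyncio.run(main())
```
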
@@ -26,6 +26,7 @@ from app.services.import_analysis import (
     IMPORT_GATEWAY_BASE_URL,
     resolve_provider_from_model,
 )
+from app.utils.llm_usage import extract_usage as extract_llm_usage


 logger = logging.getLogger(__name__)

@@ -37,7 +38,7 @@ PROMPT_FILENAMES = {
     "snippet_generator": "snippet_generator.md",
     "snippet_alias": "snippet_alias_generator.md",
 }
-DEFAULT_CHAT_TIMEOUT_SECONDS = 90.0
+DEFAULT_CHAT_TIMEOUT_SECONDS = 180.0


 @dataclass

@@ -47,6 +48,12 @@ class GEProfilingArtifacts:
     docs_path: str


+@dataclass
+class LLMCallResult:
+    data: Any
+    usage: Optional[Dict[str, Any]] = None
+
+
 class PipelineActionType:
     GE_PROFILING = "ge_profiling"
     GE_RESULT_DESC = "ge_result_desc"

@@ -124,11 +131,16 @@ def _extract_json_payload(content: str) -> str:
     if not stripped:
         raise ValueError("Empty LLM content.")

-    for opener, closer in (("{", "}"), ("[", "]")):
-        start = stripped.find(opener)
-        end = stripped.rfind(closer)
-        if start != -1 and end != -1 and end > start:
-            candidate = stripped[start : end + 1].strip()
-            return candidate
+    decoder = json.JSONDecoder()
+    for idx, char in enumerate(stripped):
+        if char not in {"{", "["}:
+            continue
+        try:
+            _, end = decoder.raw_decode(stripped[idx:])
+        except json.JSONDecodeError:
+            continue
+        candidate = stripped[idx : idx + end].strip()
+        if candidate:
+            return candidate

     return stripped

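The old heuristic sliced from the first opener to the last closer, so trailing chatter containing a stray `}` corrupted the candidate. The new loop asks `json.JSONDecoder.raw_decode` to parse a complete value at each opener and report where it ends. A self-contained illustration:

```python
import json

decoder = json.JSONDecoder()
content = 'Sure! Here you go: {"a": 1} (ignore this {not json})'

for idx, char in enumerate(content):
    if char not in {"{", "["}:
        continue
    try:
        _, end = decoder.raw_decode(content[idx:])
    except json.JSONDecodeError:
        continue  # this opener did not start a valid JSON value; keep scanning
    print(content[idx : idx + end])  # {"a": 1}
    break
```
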
@@ -559,7 +571,9 @@ async def _call_chat_completions(
     except ValueError as exc:
         raise ProviderAPICallError("Chat completions response was not valid JSON.") from exc

-    return _parse_completion_payload(response_payload)
+    parsed_payload = _parse_completion_payload(response_payload)
+    usage_info = extract_llm_usage(response_payload)
+    return LLMCallResult(data=parsed_payload, usage=usage_info)


 def _normalize_for_json(value: Any) -> Any:

@@ -628,7 +642,7 @@ async def _execute_result_desc(
         client=client,
         timeout_seconds=timeout_seconds,
     )
-    if not isinstance(llm_output, dict):
+    if not isinstance(llm_output.data, dict):
         raise ProviderAPICallError("GE result description payload must be a JSON object.")
     return llm_output

@@ -651,7 +665,7 @@ async def _execute_snippet_generation(
         client=client,
         timeout_seconds=timeout_seconds,
     )
-    if not isinstance(llm_output, list):
+    if not isinstance(llm_output.data, list):
         raise ProviderAPICallError("Snippet generator must return a JSON array.")
     return llm_output

@@ -674,7 +688,7 @@ async def _execute_snippet_alias(
         client=client,
         timeout_seconds=timeout_seconds,
     )
-    if not isinstance(llm_output, list):
+    if not isinstance(llm_output.data, list):
        raise ProviderAPICallError("Snippet alias generator must return a JSON array.")
     return llm_output

@@ -711,6 +725,12 @@ async def _run_action_with_callback(
         await _post_callback(callback_url, failure_payload, client)
         raise

+    usage_info: Optional[Dict[str, Any]] = None
+    result_payload = result
+    if isinstance(result, LLMCallResult):
+        usage_info = result.usage
+        result_payload = result.data
+
     success_payload = dict(callback_base)
     success_payload.update(
         {

@@ -724,23 +744,26 @@ async def _run_action_with_callback(
     logger.info(
         "Pipeline action %s output: %s",
         action_type,
-        _preview_for_log(result),
+        _preview_for_log(result_payload),
     )

     if action_type == PipelineActionType.GE_PROFILING:
-        artifacts: GEProfilingArtifacts = result
-        success_payload["profiling_json"] = artifacts.profiling_result
-        success_payload["profiling_summary"] = artifacts.profiling_summary
+        artifacts: GEProfilingArtifacts = result_payload
+        success_payload["ge_profiling_json"] = artifacts.profiling_result
+        success_payload["ge_profiling_summary"] = artifacts.profiling_summary
         success_payload["ge_report_path"] = artifacts.docs_path
     elif action_type == PipelineActionType.GE_RESULT_DESC:
-        success_payload["table_desc_json"] = result
+        success_payload["ge_result_desc_json"] = result_payload
     elif action_type == PipelineActionType.SNIPPET:
-        success_payload["snippet_json"] = result
+        success_payload["snippet_json"] = result_payload
     elif action_type == PipelineActionType.SNIPPET_ALIAS:
-        success_payload["snippet_alias_json"] = result
+        success_payload["snippet_alias_json"] = result_payload

+    if usage_info:
+        success_payload["llm_usage"] = usage_info
+
     await _post_callback(callback_url, success_payload, client)
-    return result
+    return result_payload


 async def process_table_profiling_job(

@@ -762,6 +785,8 @@ async def process_table_profiling_job(
         "table_schema_version_id": request.table_schema_version_id,
         "llm_model": request.llm_model,
         "llm_timeout_seconds": timeout_seconds,
+        "workspace_id": request.workspace_id,
+        "rag_item_type": request.rag_item_type,
     }

     logging_request_payload = _profiling_request_for_log(request)

@@ -1,19 +1,19 @@
 from __future__ import annotations

+import hashlib
 import json
 import logging
-from typing import Any, Dict, Tuple
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Sequence, Tuple

 from sqlalchemy import text
 from sqlalchemy.engine import Engine
 from sqlalchemy.exc import SQLAlchemyError

 from app.db import get_engine
-from app.models import (
-    ActionType,
-    TableSnippetUpsertRequest,
-    TableSnippetUpsertResponse,
-)
+from app.models import ActionType, TableSnippetUpsertRequest, TableSnippetUpsertResponse
+from app.schemas.rag import RagItemPayload
+from app.services.rag_client import RagAPIClient


 logger = logging.getLogger(__name__)

@@ -38,7 +38,15 @@ def _prepare_table_schema(value: Any) -> str:
     return json.dumps(value, ensure_ascii=False)


+def _prepare_model_params(params: Dict[str, Any] | None) -> str | None:
+    if not params:
+        return None
+    serialized, _ = _serialize_json(params)
+    return serialized
+
+
 def _collect_common_columns(request: TableSnippetUpsertRequest) -> Dict[str, Any]:
+    # Build the base column set shared by all action types; action-specific fields are populated later.
     logger.debug(
         "Collecting common columns for table_id=%s version_ts=%s action_type=%s",
         request.table_id,

@@ -53,8 +61,34 @@ def _collect_common_columns(request: TableSnippetUpsertRequest) -> Dict[str, Any]:
         "callback_url": str(request.callback_url),
         "table_schema_version_id": request.table_schema_version_id,
         "table_schema": _prepare_table_schema(request.table_schema),
+        "model": request.model,
+        "model_provider": request.model_provider,
     }

+    payload.update(
+        {
+            "ge_profiling_json": None,
+            "ge_profiling_json_size_bytes": None,
+            "ge_profiling_summary": None,
+            "ge_profiling_summary_size_bytes": None,
+            "ge_profiling_total_size_bytes": None,
+            "ge_profiling_html_report_url": None,
+            "ge_result_desc_json": None,
+            "ge_result_desc_json_size_bytes": None,
+            "snippet_json": None,
+            "snippet_json_size_bytes": None,
+            "snippet_alias_json": None,
+            "snippet_alias_json_size_bytes": None,
+        }
+    )
+
+    payload["model_params"] = _prepare_model_params(request.model_params)
+
+    if request.llm_usage is not None:
+        llm_usage_json, _ = _serialize_json(request.llm_usage)
+        if llm_usage_json is not None:
+            payload["llm_usage"] = llm_usage_json
+
     if request.error_code is not None:
         logger.debug("Adding error_code: %s", request.error_code)
         payload["error_code"] = request.error_code

@@ -80,35 +114,35 @@ def _apply_action_payload(
 ) -> None:
     logger.debug("Applying action-specific payload for action_type=%s", request.action_type)
     if request.action_type == ActionType.GE_PROFILING:
-        full_json, full_size = _serialize_json(request.result_json)
-        summary_json, summary_size = _serialize_json(request.result_summary_json)
+        full_json, full_size = _serialize_json(request.ge_profiling_json)
+        summary_json, summary_size = _serialize_json(request.ge_profiling_summary)
         if full_json is not None:
-            payload["ge_profiling_full"] = full_json
-            payload["ge_profiling_full_size_bytes"] = full_size
+            payload["ge_profiling_json"] = full_json
+            payload["ge_profiling_json_size_bytes"] = full_size
         if summary_json is not None:
             payload["ge_profiling_summary"] = summary_json
             payload["ge_profiling_summary_size_bytes"] = summary_size
-        if full_size is not None or summary_size is not None:
-            payload["ge_profiling_total_size_bytes"] = (full_size or 0) + (
-                summary_size or 0
-            )
-        if request.html_report_url:
-            payload["ge_profiling_html_report_url"] = request.html_report_url
+        if request.ge_profiling_total_size_bytes is not None:
+            payload["ge_profiling_total_size_bytes"] = request.ge_profiling_total_size_bytes
+        elif full_size is not None or summary_size is not None:
+            payload["ge_profiling_total_size_bytes"] = (full_size or 0) + (summary_size or 0)
+        if request.ge_profiling_html_report_url:
+            payload["ge_profiling_html_report_url"] = request.ge_profiling_html_report_url
     elif request.action_type == ActionType.GE_RESULT_DESC:
-        full_json, full_size = _serialize_json(request.result_json)
+        full_json, full_size = _serialize_json(request.ge_result_desc_json)
         if full_json is not None:
-            payload["ge_result_desc_full"] = full_json
-            payload["ge_result_desc_full_size_bytes"] = full_size
+            payload["ge_result_desc_json"] = full_json
+            payload["ge_result_desc_json_size_bytes"] = full_size
     elif request.action_type == ActionType.SNIPPET:
-        full_json, full_size = _serialize_json(request.result_json)
+        full_json, full_size = _serialize_json(request.snippet_json)
         if full_json is not None:
-            payload["snippet_full"] = full_json
-            payload["snippet_full_size_bytes"] = full_size
+            payload["snippet_json"] = full_json
+            payload["snippet_json_size_bytes"] = full_size
     elif request.action_type == ActionType.SNIPPET_ALIAS:
-        full_json, full_size = _serialize_json(request.result_json)
+        full_json, full_size = _serialize_json(request.snippet_alias_json)
         if full_json is not None:
-            payload["snippet_alias_full"] = full_json
-            payload["snippet_alias_full_size_bytes"] = full_size
+            payload["snippet_alias_json"] = full_json
+            payload["snippet_alias_json_size_bytes"] = full_size
     else:
         logger.error("Unsupported action type encountered: %s", request.action_type)
         raise ValueError(f"Unsupported action type '{request.action_type}'.")

@@ -182,3 +216,425 @@ def upsert_action_result(request: TableSnippetUpsertRequest) -> TableSnippetUpsertResponse:
        status=request.status,
        updated=updated,
    )


def _decode_json_field(value: Any) -> Any:
    """Decode JSON columns that may be returned as str/bytes/dicts/lists."""
    if value is None:
        return None
    if isinstance(value, (dict, list)):
        return value
    if isinstance(value, (bytes, bytearray)):
        try:
            value = value.decode("utf-8")
        except Exception:  # pragma: no cover - defensive
            return None
    if isinstance(value, str):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            logger.warning("Failed to decode JSON field: %s", value)
            return None
    return None


def _coerce_json_array(value: Any) -> List[Any]:
    decoded = _decode_json_field(value)
    return decoded if isinstance(decoded, list) else []


def _fetch_action_payload(
    engine: Engine, table_id: int, version_ts: int, action_type: ActionType
) -> Optional[Dict[str, Any]]:
    sql = text(
        """
        SELECT id AS action_result_id, snippet_json, snippet_alias_json, updated_at, status
        FROM action_results
        WHERE table_id = :table_id
          AND version_ts = :version_ts
          AND action_type = :action_type
          AND status IN ('success', 'partial')
        ORDER BY CASE status WHEN 'success' THEN 0 ELSE 1 END, updated_at DESC
        LIMIT 1
        """
    )
    with engine.connect() as conn:
        row = conn.execute(
            sql,
            {
                "table_id": table_id,
                "version_ts": version_ts,
                "action_type": action_type.value,
            },
        ).mappings().first()
    return dict(row) if row else None


def _load_snippet_sources(
    engine: Engine, table_id: int, version_ts: int
) -> Tuple[List[Any], List[Any], Optional[datetime], Optional[int], Optional[int]]:
    alias_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET_ALIAS)
    snippet_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET)

    snippet_json = _coerce_json_array(alias_row.get("snippet_json") if alias_row else None)
    alias_json = _coerce_json_array(alias_row.get("snippet_alias_json") if alias_row else None)
    updated_at: Optional[datetime] = alias_row.get("updated_at") if alias_row else None
    alias_action_id: Optional[int] = alias_row.get("action_result_id") if alias_row else None
    snippet_action_id: Optional[int] = snippet_row.get("action_result_id") if snippet_row else None

    if not snippet_json and snippet_row:
        snippet_json = _coerce_json_array(snippet_row.get("snippet_json"))
        if updated_at is None:
            updated_at = snippet_row.get("updated_at")
        if alias_action_id is None:
            alias_action_id = snippet_action_id

    if not updated_at and alias_row:
        updated_at = alias_row.get("updated_at")

    return snippet_json, alias_json, updated_at, alias_action_id, snippet_action_id


def _normalize_aliases(raw_aliases: Any) -> List[Dict[str, Any]]:
    aliases: List[Dict[str, Any]] = []
    seen: set[str] = set()
    if not raw_aliases:
        return aliases
    if not isinstance(raw_aliases, list):
        return aliases
    for item in raw_aliases:
        if isinstance(item, dict):
            text_val = item.get("text")
            if not text_val or text_val in seen:
                continue
            seen.add(text_val)
            aliases.append({"text": text_val, "tone": item.get("tone")})
        elif isinstance(item, str):
            if item in seen:
                continue
            seen.add(item)
            aliases.append({"text": item})
    return aliases


def _normalize_str_list(values: Any) -> List[str]:
    if not values:
        return []
    if not isinstance(values, list):
        return []
    seen: set[str] = set()
    normalised: List[str] = []
    for val in values:
        if not isinstance(val, str):
            continue
        if val in seen:
            continue
        seen.add(val)
        normalised.append(val)
    return normalised


def _merge_alias_lists(primary: List[Dict[str, Any]], secondary: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    merged: List[Dict[str, Any]] = []
    seen: set[str] = set()
    for source in (primary, secondary):
        for item in source:
            if not isinstance(item, dict):
                continue
            text_val = item.get("text")
            if not text_val or text_val in seen:
                continue
            seen.add(text_val)
            merged.append({"text": text_val, "tone": item.get("tone")})
    return merged


def _merge_str_lists(primary: List[str], secondary: List[str]) -> List[str]:
    merged: List[str] = []
    seen: set[str] = set()
    for source in (primary, secondary):
        for item in source:
            if item in seen:
                continue
            seen.add(item)
            merged.append(item)
    return merged


def _build_alias_map(alias_payload: List[Any]) -> Dict[str, Dict[str, Any]]:
    alias_map: Dict[str, Dict[str, Any]] = {}
    for item in alias_payload:
        if not isinstance(item, dict):
            continue
        alias_id = item.get("id")
        if not alias_id:
            continue
        existing = alias_map.setdefault(
            alias_id,
            {"aliases": [], "keywords": [], "intent_tags": []},
        )
        existing["aliases"] = _merge_alias_lists(
            existing["aliases"], _normalize_aliases(item.get("aliases"))
        )
        existing["keywords"] = _merge_str_lists(
            existing["keywords"], _normalize_str_list(item.get("keywords"))
        )
        existing["intent_tags"] = _merge_str_lists(
            existing["intent_tags"], _normalize_str_list(item.get("intent_tags"))
        )
    return alias_map


def merge_snippet_records_from_db(
    table_id: int,
    version_ts: int,
    *,
    engine: Optional[Engine] = None,
) -> List[Dict[str, Any]]:
    """
    Load snippet + snippet_alias JSON from action_results after snippet_alias is stored,
    then merge into a unified snippet object list ready for downstream RAG.
    """
    engine = engine or get_engine()
    snippets, aliases, updated_at, alias_action_id, snippet_action_id = _load_snippet_sources(
        engine, table_id, version_ts
    )
    alias_map = _build_alias_map(aliases)

    merged: List[Dict[str, Any]] = []
    seen_ids: set[str] = set()

    for snippet in snippets:
        if not isinstance(snippet, dict):
            continue
        snippet_id = snippet.get("id")
        if not snippet_id:
            continue
        alias_info = alias_map.get(snippet_id)
        record = dict(snippet)
        record_aliases = _normalize_aliases(record.get("aliases"))
        record_keywords = _normalize_str_list(record.get("keywords"))
        record_intents = _normalize_str_list(record.get("intent_tags"))

        if alias_info:
            record_aliases = _merge_alias_lists(record_aliases, alias_info["aliases"])
            record_keywords = _merge_str_lists(record_keywords, alias_info["keywords"])
            record_intents = _merge_str_lists(record_intents, alias_info["intent_tags"])

        record["aliases"] = record_aliases
        record["keywords"] = record_keywords
        record["intent_tags"] = record_intents
        record["table_id"] = table_id
        record["version_ts"] = version_ts
        record["updated_at_from_action"] = updated_at
        record["source"] = "snippet"
        record["action_result_id"] = alias_action_id or snippet_action_id
        merged.append(record)
        seen_ids.add(snippet_id)

    for alias_id, alias_info in alias_map.items():
        if alias_id in seen_ids:
            continue
        if alias_action_id is None and snippet_action_id is None:
            continue
        merged.append(
            {
                "id": alias_id,
                "aliases": alias_info["aliases"],
                "keywords": alias_info["keywords"],
                "intent_tags": alias_info["intent_tags"],
                "table_id": table_id,
                "version_ts": version_ts,
                "updated_at_from_action": updated_at,
                "source": "alias_only",
                "action_result_id": alias_action_id or snippet_action_id,
            }
        )

    return merged


def _stable_rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
    digest = hashlib.md5(f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")).hexdigest()
    return int(digest[:16], 16) % 9_000_000_000_000_000_000


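The RAG item ID is a pure function of `(table_id, version_ts, snippet_id)`, kept below 9×10^18 so it fits a signed 64-bit column; re-ingesting the same snippet therefore replaces its row instead of duplicating it:

```python
a = _stable_rag_item_id(12, 1_700_000_000, "snp_orders_daily")
b = _stable_rag_item_id(12, 1_700_000_000, "snp_orders_daily")
assert a == b                              # deterministic across runs
assert 0 <= a < 9_000_000_000_000_000_000  # safe for a BIGINT column
```
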
def _to_serializable(value: Any) -> Any:
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, dict):
        return {k: _to_serializable(v) for k, v in value.items()}
    if isinstance(value, list):
        return [_to_serializable(v) for v in value]
    return str(value)


def _build_rag_text(snippet: Dict[str, Any]) -> str:
    # Deterministic text concatenation for embedding input.
    parts: List[str] = []

    def _add(label: str, value: Any) -> None:
        if value is None:
            return
        if isinstance(value, list):
            value = ", ".join([str(v) for v in value if v])
        elif isinstance(value, dict):
            value = json.dumps(value, ensure_ascii=False)
        if value:
            parts.append(f"{label}: {value}")

    _add("Title", snippet.get("title") or snippet.get("id"))
    _add("Description", snippet.get("desc"))
    _add("Business", snippet.get("business_caliber"))
    _add("Type", snippet.get("type"))
    _add("Examples", snippet.get("examples") or [])
    _add("Aliases", [a.get("text") for a in snippet.get("aliases") or [] if isinstance(a, dict)])
    _add("Keywords", snippet.get("keywords") or [])
    _add("IntentTags", snippet.get("intent_tags") or [])
    _add("Applicability", snippet.get("applicability"))
    _add("DialectSQL", snippet.get("dialect_sql"))
    return "\n".join(parts)


def _prepare_rag_payloads(
    snippets: List[Dict[str, Any]],
    table_id: int,
    version_ts: int,
    workspace_id: int,
    rag_item_type: str = "SNIPPET",
) -> Tuple[List[Dict[str, Any]], List[RagItemPayload]]:
    rows: List[Dict[str, Any]] = []
    payloads: List[RagItemPayload] = []
    now = datetime.utcnow()

    for snippet in snippets:
        snippet_id = snippet.get("id")
        if not snippet_id:
            continue
        action_result_id = snippet.get("action_result_id")
        if action_result_id is None:
            logger.warning(
                "Skipping snippet without action_result_id for RAG ingestion (table_id=%s version_ts=%s snippet_id=%s)",
                table_id,
                version_ts,
                snippet_id,
            )
            continue
        rag_item_id = _stable_rag_item_id(table_id, version_ts, snippet_id)
        rag_text = _build_rag_text(snippet)
        serializable_snippet = _to_serializable(snippet)
        merged_json = json.dumps(serializable_snippet, ensure_ascii=False)
        updated_at_raw = snippet.get("updated_at_from_action") or now
        if isinstance(updated_at_raw, str):
            try:
                updated_at = datetime.fromisoformat(updated_at_raw)
            except ValueError:
                updated_at = now
        else:
            updated_at = updated_at_raw if isinstance(updated_at_raw, datetime) else now

        created_at = updated_at

        row = {
            "rag_item_id": rag_item_id,
            "workspace_id": workspace_id,
            "table_id": table_id,
            "version_ts": version_ts,
            "created_at": created_at,
            "action_result_id": action_result_id,
            "snippet_id": snippet_id,
            "rag_text": rag_text,
            "merged_json": merged_json,
            "updated_at": updated_at,
        }
        rows.append(row)

        payloads.append(
            RagItemPayload(
                id=rag_item_id,
                workspaceId=workspace_id,
                name=snippet.get("title") or snippet_id,
                embeddingData=rag_text,
                type=rag_item_type or "SNIPPET",
            )
        )

    return rows, payloads


def _upsert_rag_snippet_rows(engine: Engine, rows: Sequence[Dict[str, Any]]) -> None:
|
||||||
|
if not rows:
|
||||||
|
return
|
||||||
|
delete_sql = text("DELETE FROM rag_snippet WHERE rag_item_id=:rag_item_id")
|
||||||
|
insert_sql = text(
|
||||||
|
"""
|
||||||
|
INSERT INTO rag_snippet (
|
||||||
|
rag_item_id,
|
||||||
|
workspace_id,
|
||||||
|
table_id,
|
||||||
|
version_ts,
|
||||||
|
created_at,
|
||||||
|
action_result_id,
|
||||||
|
snippet_id,
|
||||||
|
rag_text,
|
||||||
|
merged_json,
|
||||||
|
updated_at
|
||||||
|
) VALUES (
|
||||||
|
:rag_item_id,
|
||||||
|
:workspace_id,
|
||||||
|
:table_id,
|
||||||
|
:version_ts,
|
||||||
|
:created_at,
|
||||||
|
:action_result_id,
|
||||||
|
:snippet_id,
|
||||||
|
:rag_text,
|
||||||
|
:merged_json,
|
||||||
|
:updated_at
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
with engine.begin() as conn:
|
||||||
|
for row in rows:
|
||||||
|
conn.execute(delete_sql, row)
|
||||||
|
conn.execute(insert_sql, row)
|
||||||
|
|
||||||
|
|
||||||
|
async def ingest_snippet_rag_from_db(
|
||||||
|
table_id: int,
|
||||||
|
version_ts: int,
|
||||||
|
*,
|
||||||
|
workspace_id: int,
|
||||||
|
rag_item_type: str = "SNIPPET",
|
||||||
|
client,
|
||||||
|
engine: Optional[Engine] = None,
|
||||||
|
rag_client: Optional[RagAPIClient] = None,
|
||||||
|
) -> List[int]:
|
||||||
|
"""
|
||||||
|
Merge snippet + alias JSON from action_results, persist to rag_snippet, then push to RAG via addBatch.
|
||||||
|
Returns list of rag_item_id ingested.
|
||||||
|
"""
|
||||||
|
engine = engine or get_engine()
|
||||||
|
snippets = merge_snippet_records_from_db(table_id, version_ts, engine=engine)
|
||||||
|
if not snippets:
|
||||||
|
logger.info(
|
||||||
|
"No snippets available for RAG ingestion (table_id=%s version_ts=%s)",
|
||||||
|
table_id,
|
||||||
|
version_ts,
|
||||||
|
)
|
||||||
|
return []
|
||||||
|
|
||||||
|
rows, payloads = _prepare_rag_payloads(
|
||||||
|
snippets,
|
||||||
|
table_id=table_id,
|
||||||
|
version_ts=version_ts,
|
||||||
|
workspace_id=workspace_id,
|
||||||
|
rag_item_type=rag_item_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
_upsert_rag_snippet_rows(engine, rows)
|
||||||
|
|
||||||
|
rag_client = rag_client or RagAPIClient()
|
||||||
|
await rag_client.add_batch(client, payloads)
|
||||||
|
return [row["rag_item_id"] for row in rows]
|
||||||
|
|||||||
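A minimal invocation sketch for the entry point above (not part of this diff): it assumes httpx as the shared async HTTP client and lets engine and rag_client fall back to their environment-driven defaults; the id values are illustrative only.

import asyncio

import httpx


async def demo_ingest() -> None:
    # One shared client serves both the snippet merge and the RAG addBatch push.
    async with httpx.AsyncClient(timeout=120) as client:
        rag_item_ids = await ingest_snippet_rag_from_db(
            table_id=42,             # illustrative
            version_ts=1761752207,   # illustrative
            workspace_id=1,          # illustrative
            client=client,
        )
        print(f"ingested {len(rag_item_ids)} rag items")


asyncio.run(demo_ingest())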
@ -20,7 +20,11 @@ PROVIDER_KEY_ENV_MAP: Dict[str, str] = {
 }


-DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "openai:gpt-4.1-mini")
+DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "deepseek:deepseek-chat")
+NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL")
+NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN")
+RAG_API_BASE_URL = os.getenv("RAG_API_BASE_URL", "https://tchatbi.agentcarrier.cn/chatbi/api")
+RAG_API_AUTH_TOKEN = os.getenv("RAG_API_AUTH_TOKEN")


 @lru_cache(maxsize=1)
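A minimal sketch of how these settings might be consumed when constructing the upstream client; the Bearer header scheme and the use of httpx are assumptions, since this hunk only shows the environment lookups:

import httpx


def build_new_api_client() -> httpx.AsyncClient:
    # NEW_API_BASE_URL has no default above, so fail fast when it is unset.
    if not NEW_API_BASE_URL:
        raise RuntimeError("NEW_API_BASE_URL is not configured")
    headers = {}
    if NEW_API_AUTH_TOKEN:
        # Assumed auth scheme; adjust if the new-api component expects a different header.
        headers["Authorization"] = f"Bearer {NEW_API_AUTH_TOKEN}"
    return httpx.AsyncClient(base_url=NEW_API_BASE_URL, headers=headers)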
116  app/utils/llm_usage.py  Normal file
@ -0,0 +1,116 @@
from __future__ import annotations

from typing import Any, Dict, Iterable, Optional


PROMPT_TOKEN_KEYS: tuple[str, ...] = ("prompt_tokens", "input_tokens", "promptTokenCount")
COMPLETION_TOKEN_KEYS: tuple[str, ...] = (
    "completion_tokens",
    "output_tokens",
    "candidatesTokenCount",
)
TOTAL_TOKEN_KEYS: tuple[str, ...] = ("total_tokens", "totalTokenCount")
USAGE_CONTAINER_KEYS: tuple[str, ...] = ("usage", "usageMetadata", "usage_metadata")


def _normalize_usage_value(value: Any) -> Any:
    if isinstance(value, (int, float)):
        return int(value)

    if isinstance(value, str):
        stripped = value.strip()
        if not stripped:
            return None
        try:
            numeric = float(stripped)
        except ValueError:
            return None
        return int(numeric)

    if isinstance(value, dict):
        normalized: Dict[str, Any] = {}
        for key, nested_value in value.items():
            normalized_value = _normalize_usage_value(nested_value)
            if normalized_value is not None:
                normalized[key] = normalized_value
        return normalized or None

    if isinstance(value, (list, tuple, set)):
        normalized_list = [
            item for item in (_normalize_usage_value(element) for element in value) if item is not None
        ]
        return normalized_list or None

    return None


def _first_numeric(payload: Dict[str, Any], keys: Iterable[str]) -> Optional[int]:
    for key in keys:
        value = payload.get(key)
        if isinstance(value, (int, float)):
            return int(value)
    return None


def _canonicalize_counts(payload: Dict[str, Any]) -> None:
    prompt = _first_numeric(payload, PROMPT_TOKEN_KEYS)
    completion = _first_numeric(payload, COMPLETION_TOKEN_KEYS)
    total = _first_numeric(payload, TOTAL_TOKEN_KEYS)

    if prompt is not None:
        payload["prompt_tokens"] = prompt
    else:
        payload.pop("prompt_tokens", None)

    if completion is not None:
        payload["completion_tokens"] = completion
    else:
        payload.pop("completion_tokens", None)

    if total is not None:
        payload["total_tokens"] = total
    elif prompt is not None and completion is not None:
        payload["total_tokens"] = prompt + completion
    else:
        payload.pop("total_tokens", None)

    for alias in PROMPT_TOKEN_KEYS[1:]:
        payload.pop(alias, None)
    for alias in COMPLETION_TOKEN_KEYS[1:]:
        payload.pop(alias, None)
    for alias in TOTAL_TOKEN_KEYS[1:]:
        payload.pop(alias, None)


def _extract_usage_container(candidate: Any) -> Optional[Dict[str, Any]]:
    if not isinstance(candidate, dict):
        return None
    for key in USAGE_CONTAINER_KEYS:
        value = candidate.get(key)
        if isinstance(value, dict):
            return value
    return None


def extract_usage(payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Unified helper to parse token usage metadata from diverse provider responses."""
    if not isinstance(payload, dict):
        return None

    usage_candidate = _extract_usage_container(payload)
    if usage_candidate is None:
        raw_section = payload.get("raw")
        usage_candidate = _extract_usage_container(raw_section)

    if usage_candidate is None:
        return None

    normalized = _normalize_usage_value(usage_candidate)
    if not isinstance(normalized, dict):
        return None

    _canonicalize_counts(normalized)
    return normalized or None


__all__ = ["extract_usage"]
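A quick behavioural sketch of extract_usage against a DeepSeek-shaped response (the same shape as the demo file removed below); note the fallback into payload["raw"] and that non-alias extras such as prompt_cache_hit_tokens survive canonicalization:

from app.utils.llm_usage import extract_usage

response = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "raw": {
        "usage": {
            "prompt_tokens": 1078,
            "completion_tokens": 256,
            "total_tokens": 1334,
            "prompt_cache_hit_tokens": 1024,
        }
    },
}

# No usage container at the top level, so the helper falls back to response["raw"].
print(extract_usage(response))
# {'prompt_tokens': 1078, 'completion_tokens': 256, 'total_tokens': 1334, 'prompt_cache_hit_tokens': 1024}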
@ -1,41 +0,0 @@
{
  "provider": "deepseek",
  "model": "deepseek-chat",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
      }
    }
  ],
  "raw": {
    "id": "67f3cc80-38bc-4bb7-b336-48d4886722c4",
    "object": "chat.completion",
    "created": 1761752207,
    "model": "deepseek-chat",
    "choices": [
      {
        "index": 0,
        "message": {
          "role": "assistant",
          "content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
        },
        "logprobs": null,
        "finish_reason": "length"
      }
    ],
    "usage": {
      "prompt_tokens": 1078,
      "completion_tokens": 256,
      "total_tokens": 1334,
      "prompt_tokens_details": {
        "cached_tokens": 1024
      },
      "prompt_cache_hit_tokens": 1024,
      "prompt_cache_miss_tokens": 54
    },
    "system_fingerprint": "fp_ffc7281d48_prod0820_fp8_kvcache"
  }
}
1  demo/水务/水务-gemini2.5-ge-result.json  Normal file
@ -0,0 +1 @@
{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["service_point_id"], "table": "data-ge.water_meter_info", "columns": [{"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}, {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.03666666666666667, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标(如空值率、唯一性)缺失,但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.9, "metric_candidate_score": 0.0}, {"name": "service_point_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标(如空值率、唯一性)缺失,但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列只有一个唯一值 '有效'。", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}], "quality": {"warning_hints": ["列 'meter_status' 只有一个唯一值 '有效',可能为常量列。"], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["表角色(role)被推断为 'dimension',因为其列几乎完全由ID和类别属性构成,且缺少数值指标或时间序列列。", "主键候选(primary_key_candidates) 'service_point_id' 和 'account_id' 是基于命名约定(包含'_id')推断的。其唯一性和非空性未在GE结果中直接度量,因此这是一个高置信度的猜测。", "表粒度(grain)可能为 'service_point',与推断的主键 'service_point_id' 相对应。", "未根据列名或数据格式识别出时间列。"], "primary_key_candidates": [["service_point_id"], ["account_id"]]}
180  demo/水务/水务-gemini2.5-snippet-alias.json  Normal file
@ -0,0 +1,180 @@
[
  {
    "id": "snpt_count-service-points-by-dimension",
    "aliases": [
      {"text": "各个区有多少水表", "tone": "口语"},
      {"text": "按维度统计用水点数", "tone": "中性"},
      {"text": "各维度用水点数量分布", "tone": "专业"}
    ],
    "keywords": ["用水点数", "service_point_count", "数量", "统计", "汇总", "aggregate", "维度", "dimension", "区域", "district", "供水所", "分组统计", "水表"],
    "intent_tags": ["aggregate", "by_dimension"]
  },
  {
    "id": "snpt_topn-service-points-by-dimension",
    "aliases": [
      {"text": "哪个地方水表最多", "tone": "口语"},
      {"text": "用水点数Top-N排名", "tone": "中性"},
      {"text": "Top-N用水点数维度排行", "tone": "专业"}
    ],
    "keywords": ["Top-N", "top", "排名", "排行", "ranking", "最多", "用水点数", "service_point_count", "维度", "dimension", "站点", "station", "水表"],
    "intent_tags": ["topn", "by_dimension"]
  },
  {
    "id": "snpt_ratio-service-points-by-dimension",
    "aliases": [
      {"text": "各种水表各占多少", "tone": "口语"},
      {"text": "各维度用水点数占比", "tone": "中性"},
      {"text": "用水点维度构成分析", "tone": "专业"}
    ],
    "keywords": ["占比", "percentage", "百分比", "ratio", "构成", "分布", "用水点数", "水表类型", "meter_type", "维度", "dimension", "水表"],
    "intent_tags": ["ratio", "by_dimension"]
  },
  {
    "id": "snpt_quality-check-duplicate-spid",
    "aliases": [
      {"text": "有没有重复的水表号", "tone": "口语"},
      {"text": "检查重复的用水点ID", "tone": "中性"},
      {"text": "用水点ID唯一性校验", "tone": "专业"}
    ],
    "keywords": ["数据质量", "quality", "检查", "校验", "重复", "duplicate", "唯一性", "uniqueness", "用水点ID", "service_point_id", "异常检测", "主键"],
    "intent_tags": ["quality", "by_dimension"]
  },
  {
    "id": "snpt_sample-filter-service-points-by-dims",
    "aliases": [
      {"text": "给我看城区的机械表", "tone": "口语"},
      {"text": "按多维度筛选用水点", "tone": "中性"},
      {"text": "多维组合条件过滤用水点", "tone": "专业"}
    ],
    "keywords": ["筛选", "过滤", "filter", "查询", "明细", "列表", "sample", "用水点", "区域", "district", "水表类型", "meter_type", "条件查询"],
    "intent_tags": ["sample", "filter"]
  }
]
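These alias records share ids with the snippet file that follows; a minimal sketch of the id-keyed merge, shown here for the gemini pair (the production merge lives in merge_snippet_records_from_db, whose implementation is not part of this hunk):

import json


def merge_by_id(snippets: list, aliases: list) -> list:
    # Attach aliases/keywords/intent_tags onto the snippet carrying the same id.
    alias_map = {a["id"]: a for a in aliases}
    merged = []
    for snippet in snippets:
        extra = alias_map.get(snippet["id"], {})
        merged.append({**snippet, **{k: v for k, v in extra.items() if k != "id"}})
    return merged


with open("demo/水务/水务-gemini2.5-snippet.json", encoding="utf-8") as f_s, \
        open("demo/水务/水务-gemini2.5-snippet-alias.json", encoding="utf-8") as f_a:
    records = merge_by_id(json.load(f_s), json.load(f_a))
print(records[0]["title"], len(records[0]["aliases"]))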
186  demo/水务/水务-gemini2.5-snippet.json  Normal file
@ -0,0 +1,186 @@
[
  {
    "id": "snpt_count-service-points-by-dimension",
    "desc": "按指定维度(如区域、供水所)分组,统计各分类下的用水点数量。",
    "type": "aggregate",
    "title": "按维度统计用水点数",
    "examples": ["按区域统计用水点数量", "各个供水所分别有多少个用水点"],
    "variables": [{"name": "dimension_column", "type": "column", "default": "district"}],
    "dialect_sql": {
      "mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
    },
    "applicability": {
      "constraints": {
        "notes": [
          "适用于对水表档案信息进行分类汇总统计。",
          "可将变量 ${dimension_column} 替换为任一维度列,如 district, supply_office, station, meter_type 等。"
        ],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "用水点数:对 `service_point_id` 进行去重计数,代表一个独立的服务点(通常对应一个水表)。统计粒度为“指定维度”。"
  },
  {
    "id": "snpt_topn-service-points-by-dimension",
    "desc": "按指定维度(如区域、站点)统计用水点数,并展示数量最多的前N个分类。",
    "type": "topn",
    "title": "Top-N 用水点数维度排名",
    "examples": ["哪个区域的用水点最多", "用水点数排名前5的站点是哪些"],
    "variables": [
      {"name": "dimension_column", "type": "column", "default": "station"},
      {"name": "top_n", "type": "int", "default": 10}
    ],
    "dialect_sql": {
      "mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC\nLIMIT ${top_n};"
    },
    "applicability": {
      "constraints": {
        "notes": ["维度 `station` 基数较高 (36),建议 Top-N 查询时结合业务场景合理设置 N 值。"],
        "fk_join_available": false,
        "dim_cardinality_hint": 36
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "用水点数:对 `service_point_id` 进行去重计数。排名依据为各维度分类下的用水点总数。统计粒度为“指定维度”。"
  },
  {
    "id": "snpt_ratio-service-points-by-dimension",
    "desc": "计算在指定维度下,各分类的用水点数占总用水点数的百分比,以分析其分布构成。",
    "type": "ratio",
    "title": "各维度用水点数占比",
    "examples": ["不同水表类型(meter_type)的分布情况", "各个区域的用水点占比是多少"],
    "variables": [{"name": "dimension_column", "type": "column", "default": "meter_type"}],
    "dialect_sql": {
      "mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count,\n COUNT(DISTINCT service_point_id) * 100.0 / SUM(COUNT(DISTINCT service_point_id)) OVER () AS percentage\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
    },
    "applicability": {
      "constraints": {
        "notes": ["SQL模板使用了窗口函数 SUM() OVER(),请确保MySQL版本支持(8.0+)。"],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "用水点数占比:某分类下的用水点数 / 总用水点数。用水点数以 `service_point_id` 去重计数。统计粒度为“指定维度”。"
  },
  {
    "id": "snpt_quality-check-duplicate-spid",
    "desc": "查找在用水点信息表中存在重复的 `service_point_id`,用于数据质量校验。",
    "type": "quality",
    "title": "检查重复的用水点ID",
    "examples": ["检查是否存在重复的水表档案", "校验用水点ID的唯一性"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT\n service_point_id,\n COUNT(*) AS occurrences\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n service_point_id\nHAVING\n COUNT(*) > 1;"
    },
    "applicability": {
      "constraints": {
        "notes": ["预期返回结果为空。若有返回,则表示数据存在一致性问题,`service_point_id` 未能作为唯一主键。"],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "重复项:指 `service_point_id` 出现次数大于1的记录。此ID应为表的主键,理论上不应重复。"
  },
  {
    "id": "snpt_sample-filter-service-points-by-dims",
    "desc": "根据区域、水表类型、供水所等多个维度组合条件,筛选出符合条件的用水点明细。",
    "type": "sample",
    "title": "多维度筛选用水点列表",
    "examples": ["查询城区的机械表有哪些", "拉取某个供水所下特定口径水表的列表"],
    "variables": [
      {"name": "district_name", "type": "string", "default": "城区"},
      {"name": "meter_type_name", "type": "string", "default": "机械表"},
      {"name": "limit_num", "type": "int", "default": 100}
    ],
    "dialect_sql": {
      "mysql": "SELECT\n service_point_id,\n account_id,\n district,\n supply_office,\n meter_type,\n meter_subtype,\n meter_diameter\nFROM\n `data-ge.water_meter_info`\nWHERE\n district = '${district_name}'\n AND meter_type = '${meter_type_name}'\n -- AND meter_status = '有效' -- 可选:根据画像,该列为常量'有效',可不加\nLIMIT ${limit_num};"
    },
    "applicability": {
      "constraints": {
        "notes": [],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": null,
      "required_columns": ["service_point_id", "account_id", "district", "supply_office", "meter_type", "meter_subtype", "meter_diameter"]
    },
    "business_caliber": "返回满足所有筛选条件的用水点明细信息。`meter_status` 列只有一个值 '有效',通常无需作为筛选条件。"
  }
]
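The dialect_sql templates above use ${...} placeholders, which line up with Python's string.Template; a minimal rendering sketch that fills each declared variable with its default value:

import json
from string import Template

with open("demo/水务/水务-gemini2.5-snippet.json", encoding="utf-8") as f:
    snippet = json.load(f)[0]  # "按维度统计用水点数"

# Map declared variables to their defaults, then substitute into the template.
defaults = {v["name"]: str(v["default"]) for v in snippet["variables"]}
print(Template(snippet["dialect_sql"]["mysql"]).substitute(defaults))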
230  demo/水务/水务-gpt5-ge-desc.json  Normal file
@ -0,0 +1,230 @@
{
  "role": "dimension",
  "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"},
  "grain": ["service_point_id"],
  "table": "data-ge.water_meter_info",
  "columns": [
    {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;11 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.05, "metric_candidate_score": 0.0},
    {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;36 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.1, "metric_candidate_score": 0.0},
    {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;13 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.05, "metric_candidate_score": 0.0},
    {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;8 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0},
    {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;单一取值(\"有效\")", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0, "metric_candidate_score": 0.0},
    {"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;9 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0},
    {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;5 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.02, "metric_candidate_score": 0.0},
    {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "非空;4 个枚举值(GE 约束)", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.02, "metric_candidate_score": 0.0},
    {"name": "service_point_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "命名指示标识列;未提供唯一性或非空验证", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.6, "metric_candidate_score": 0.05},
    {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "命名指示账户标识;未提供唯一性或非空验证", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.5, "metric_candidate_score": 0.05}
  ],
  "quality": {
    "warning_hints": [
      "以下列未设置非空校验:service_point_id, account_id(空值情况未知)",
      "未识别到时间列"
    ],
    "failed_expectations": []
  },
  "row_count": 300,
  "fk_candidates": [],
  "confidence_notes": [
    "role 判定为 dimension:表内列均为枚举/分类或ID,未发现数值型度量或时间列;34/34 期望均为分类枚举/非空与去重比例。",
    "grain 猜测为 service_point_id:仅依据命名启发式,缺少唯一性与非空度量佐证(置信度较低)。",
    "未识别时间列:列名与期望均未涉及日期/时间,也无最小/最大时间范围可推断。"
  ],
  "primary_key_candidates": []
}
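The distinct_ratio fields in these profiles are simply distinct_count / row_count; a one-line sanity check against the numbers above:

row_count = 300
for name, distinct_count in [("station", 36), ("district", 13), ("meter_diameter", 8)]:
    print(name, distinct_count / row_count)
# station 0.12, district 0.043333..., meter_diameter 0.026666... -- matching the profile values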
372  demo/水务/水务-gpt5-snippet-alias.json  Normal file
@ -0,0 +1,372 @@
[
  {
    "id": "snpt_topn_station",
    "aliases": [
      {"text": "站点水表排行前N", "tone": "中性"},
      {"text": "哪个站点表最多", "tone": "口语"},
      {"text": "按站点水表TopN", "tone": "专业"}
    ],
    "keywords": ["TopN", "排名", "排行", "station", "站点", "水表数", "meter count", "distinct", "去重", "聚合", "排序", "榜单"],
    "intent_tags": ["topn", "aggregate", "by_dimension"]
  },
  {
    "id": "snpt_share_district",
    "aliases": [
      {"text": "各辖区水表占比", "tone": "中性"},
      {"text": "哪个辖区占比高", "tone": "口语"},
      {"text": "按辖区水表比例", "tone": "专业"}
    ],
    "keywords": ["占比", "ratio", "district", "辖区", "水表数", "meter count", "distinct", "去重", "百分比", "份额", "聚合", "排序", "分布"],
    "intent_tags": ["ratio", "aggregate", "by_dimension"]
  },
  {
    "id": "snpt_dist_diameter",
    "aliases": [
      {"text": "表径水表数分布", "tone": "中性"},
      {"text": "不同口径有多少", "tone": "口语"},
      {"text": "按表径去重计数", "tone": "专业"}
    ],
    "keywords": ["分布", "distribution", "meter_diameter", "表径", "水表数", "meter count", "distinct", "去重", "聚合", "类别", "category", "条形图", "饼图", "排行"],
    "intent_tags": ["aggregate", "by_dimension"]
  },
  {
    "id": "snpt_type_subtype_matrix",
    "aliases": [
      {"text": "类型×子类水表数", "tone": "中性"},
      {"text": "看各类型各子类", "tone": "口语"},
      {"text": "类型子类组合统计", "tone": "专业"}
    ],
    "keywords": ["类型", "type", "子类", "subtype", "组合", "matrix", "交叉分析", "cross-tab", "水表数", "meter count", "distinct", "去重", "分布", "聚合", "维度"],
    "intent_tags": ["aggregate", "by_dimension"]
  },
  {
    "id": "snpt_quality_spid_uniq",
    "aliases": [
      {"text": "服务点ID唯一性检", "tone": "专业"},
      {"text": "服务点ID有重复吗", "tone": "口语"},
      {"text": "服务点ID完整性评估", "tone": "中性"}
    ],
    "keywords": ["质量检查", "quality", "唯一性", "uniqueness", "重复", "duplicate", "空值", "NULL", "完整性", "integrity", "service_point_id", "数据质量", "统计", "去重", "异常检测"],
    "intent_tags": ["quality"]
  },
  {
    "id": "snpt_quality_account_nulls",
    "aliases": [
      {"text": "账户ID缺失明细", "tone": "中性"},
      {"text": "看看哪些账户为空", "tone": "口语"},
      {"text": "account_id空值样本", "tone": "专业"}
    ],
    "keywords": ["质量检查", "缺失", "missing", "空值", "NULL", "account_id", "样本", "sample", "抽样", "sampling", "明细", "排查", "过滤", "WHERE", "LIMIT"],
    "intent_tags": ["quality", "sample"]
  },
  {
    "id": "snpt_sample_random_rows",
    "aliases": [
      {"text": "随机抽样水表明细", "tone": "中性"},
      {"text": "随机取几条看看", "tone": "口语"},
      {"text": "RAND()样本抽取", "tone": "专业"}
    ],
    "keywords": ["随机", "random", "样本", "sample", "抽样", "sampling", "明细", "details", "质检", "QA", "RAND()", "LIMIT", "抽取", "数据验证"],
    "intent_tags": ["sample"]
  },
  {
    "id": "snpt_filter_office_type_where",
    "aliases": [
      {"text": "按所与类型过滤有效", "tone": "专业"},
      {"text": "筛选某所的指定类型", "tone": "中性"},
      {"text": "只看这所的这种表", "tone": "口语"}
    ],
    "keywords": ["过滤", "filter", "WHERE", "supply_office", "营业所", "meter_type", "类型", "meter_status", "有效", "条件片段", "筛选", "查询拼接", "字段", "约束"],
    "intent_tags": ["filter"]
  },
  {
    "id": "snpt_office_station_dist",
    "aliases": [
      {"text": "所站组合水表数", "tone": "中性"},
      {"text": "各站在各所有多少", "tone": "口语"},
      {"text": "营业所×站点分布", "tone": "专业"}
    ],
    "keywords": ["supply_office", "营业所", "station", "站点", "层级", "hierarchy", "分布", "distribution", "水表数", "meter count", "distinct", "去重", "聚合", "交叉分析", "排行"],
    "intent_tags": ["aggregate", "by_dimension"]
  },
  {
    "id": "snpt_total_meter_baseline",
    "aliases": [
      {"text": "水表总量基线", "tone": "中性"},
      {"text": "现在有多少水表", "tone": "口语"},
      {"text": "全表去重总数", "tone": "专业"}
    ],
    "keywords": ["总量", "total", "baseline", "基线", "水表总数", "meter total", "service_point_id", "distinct", "去重", "分母", "denominator", "占比", "聚合", "汇总", "snapshot"],
    "intent_tags": ["aggregate"]
  }
]
330  demo/水务/水务-gpt5-snippet.json  Normal file
@ -0,0 +1,330 @@
[
  {
    "id": "snpt_topn_station",
    "desc": "按站点统计水表数量并取前N",
    "type": "topn",
    "title": "站点TopN水表数",
    "examples": ["各站点水表数量排名前10", "站点水表覆盖情况排行"],
    "variables": [{"name": "top_n", "type": "int", "default": 10}],
    "dialect_sql": {"mysql": "SELECT station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY station\nORDER BY meter_cnt DESC\nLIMIT {{top_n}};"},
    "applicability": {
      "constraints": {
        "notes": ["TopN建议N<=36", "以service_point_id去重计数", "无时间列,无法做趋势"],
        "fk_join_available": false,
        "dim_cardinality_hint": 36
      },
      "time_column": null,
      "required_columns": ["station", "service_point_id"]
    },
    "business_caliber": "水表数=按service_point_id去重计数;粒度=站点。仅统计当前表中的有效记录(不含时间口径)。安全限制:用于分析排名,避免扩大LIMIT造成全量导出。"
  },
  {
    "id": "snpt_share_district",
    "desc": "统计各辖区水表数及其占比",
    "type": "ratio",
    "title": "辖区水表占比",
    "examples": ["各辖区水表占比", "哪个辖区水表最多"],
    "variables": [],
    "dialect_sql": {"mysql": "WITH by_district AS (\n SELECT district, COUNT(DISTINCT service_point_id) AS meter_cnt\n FROM `data-ge`.`water_meter_info`\n GROUP BY district\n), tot AS (\n SELECT COUNT(DISTINCT service_point_id) AS total_cnt\n FROM `data-ge`.`water_meter_info`\n)\nSELECT b.district,\n b.meter_cnt,\n ROUND(b.meter_cnt / NULLIF(t.total_cnt, 0) * 100, 2) AS pct\nFROM by_district b\nCROSS JOIN tot t\nORDER BY pct DESC, b.district;"},
    "applicability": {
      "constraints": {
        "notes": ["占比分母为全表service_point_id去重总数", "service_point_id为空将被忽略"],
        "fk_join_available": false,
        "dim_cardinality_hint": 13
      },
      "time_column": null,
      "required_columns": ["district", "service_point_id"]
    },
    "business_caliber": "水表数=按service_point_id去重计数;粒度=辖区。占比=辖区水表数/全表水表总数。安全限制:仅基于本表,不代表全市/全网口径;无时间维度。"
  },
  {
    "id": "snpt_dist_diameter",
    "desc": "按表径统计水表数量分布",
    "type": "aggregate",
    "title": "表径分布统计",
    "examples": ["不同口径水表有多少", "查看表径分布情况"],
    "variables": [],
    "dialect_sql": {"mysql": "SELECT meter_diameter,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_diameter\nORDER BY meter_cnt DESC, meter_diameter;"},
    "applicability": {
      "constraints": {
        "notes": ["以service_point_id去重计数", "适合绘制条形图/饼图"],
        "fk_join_available": false,
        "dim_cardinality_hint": 8
      },
      "time_column": null,
      "required_columns": ["meter_diameter", "service_point_id"]
    },
    "business_caliber": "水表数=按service_point_id去重计数;粒度=表径。安全限制:仅用于分布分析,不含时间过滤;避免用于明细导出。"
  },
  {
    "id": "snpt_type_subtype_matrix",
    "desc": "统计水表类型与子类组合的数量",
    "type": "aggregate",
    "title": "类型子类分布",
    "examples": ["不同类型与子类的水表数量", "查看类型与子类的组合分布"],
    "variables": [],
    "dialect_sql": {"mysql": "SELECT meter_type,\n meter_subtype,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_type, meter_subtype\nORDER BY meter_cnt DESC, meter_type, meter_subtype;"},
    "applicability": {
      "constraints": {
        "notes": ["组合基数<=5×9=45", "以service_point_id去重计数"],
        "fk_join_available": false,
        "dim_cardinality_hint": 45
      },
      "time_column": null,
      "required_columns": ["meter_type", "meter_subtype", "service_point_id"]
    },
    "business_caliber": "水表数=按service_point_id去重计数;粒度=类型×子类组合。安全限制:仅用于汇总分析,不包含时间或业务状态变化。"
  },
  {
    "id": "snpt_quality_spid_uniq",
    "desc": "评估service_point_id的空值与重复情况",
    "type": "quality",
    "title": "服务点唯一性检",
    "examples": ["检查服务点ID是否唯一", "统计service_point_id空值与重复情况"],
    "variables": [],
    "dialect_sql": {"mysql": "SELECT\n COUNT(*) AS total_rows,\n SUM(service_point_id IS NULL) AS null_cnt,\n COUNT(DISTINCT service_point_id) AS distinct_cnt,\n (COUNT(*) - COUNT(DISTINCT service_point_id)) AS duplicate_rows_est,\n (\n SELECT COUNT(*) FROM (\n SELECT service_point_id\n FROM `data-ge`.`water_meter_info`\n GROUP BY service_point_id\n HAVING COUNT(*) > 1\n ) AS dup\n ) AS dup_key_groups\nFROM `data-ge`.`water_meter_info`;"},
    "applicability": {
      "constraints": {
        "notes": ["用于键完整性检查", "重复行估算=总行数-去重数"],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "质量检查口径:在本表内评估service_point_id的非空与唯一性,不代表跨表全局唯一。安全限制:仅输出汇总指标,不暴露明细重复值。"
  },
  {
    "id": "snpt_quality_account_nulls",
    "desc": "抽取account_id为空的记录用于排查",
    "type": "quality",
    "title": "账户ID缺失明细",
    "examples": ["列出account_id为空的水表", "抽样查看账户缺失的数据行"],
    "variables": [{"name": "limit_n", "type": "int", "default": 50}],
    "dialect_sql": {"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nWHERE account_id IS NULL\nLIMIT {{limit_n}};"},
    "applicability": {
      "constraints": {
        "notes": ["明细仅限小样本抽取", "建议LIMIT<=100,避免全量导出"],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": null,
      "required_columns": ["account_id"]
    },
    "business_caliber": "质量抽样:筛出账户ID缺失的水表记录,便于核对。安全限制:仅用于样本排查,不建议在生产中全量导出;如需口径统计请改为COUNT聚合。"
  },
  {
    "id": "snpt_sample_random_rows",
    "desc": "随机抽取水表信息用于人工核验",
    "type": "sample",
    "title": "随机抽样明细",
    "examples": ["抽样查看水表信息", "随机抽取20条做质检"],
    "variables": [{"name": "sample_size", "type": "int", "default": 20}],
    "dialect_sql": {"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nORDER BY RAND()\nLIMIT {{sample_size}};"},
    "applicability": {
      "constraints": {
        "notes": ["使用RAND()随机,样本不可复现", "建议限制样本量"],
        "fk_join_available": false,
        "dim_cardinality_hint": 300
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "样本抽取:从本表随机返回若干行明细。安全限制:避免扩大LIMIT进行全量下载;如需可复现样本,请改用带种子的随机方法(MySQL不原生支持)。"
  },
  {
    "id": "snpt_filter_office_type_where",
    "desc": "常用WHERE筛选条件片段:按营业所与类型且为有效",
    "type": "sample",
    "title": "机构类型筛选片",
    "examples": ["筛选A营业所的机械表", "仅查看某营业所的指定类型水表"],
    "variables": [
      {"name": "supply_office", "type": "string"},
      {"name": "meter_type", "type": "string"}
    ],
    "dialect_sql": {"mysql": "WHERE supply_office = '{{supply_office}}'\n AND meter_type = '{{meter_type}}'\n AND meter_status = '有效'"},
    "applicability": {
      "constraints": {
        "notes": ["这是条件片段,可拼接到其他查询", "meter_status当前为单一值“有效”"],
        "fk_join_available": false,
        "dim_cardinality_hint": 11
      },
      "time_column": null,
      "required_columns": ["supply_office", "meter_type", "meter_status"]
    },
    "business_caliber": "过滤口径:仅保留指定营业所与指定水表类型、且状态为“有效”的记录。安全限制:为片段用途,需拼接在SELECT…FROM之后使用。"
  },
  {
    "id": "snpt_office_station_dist",
    "desc": "按营业所与站点组合统计水表数",
    "type": "aggregate",
    "title": "所站层级分布",
    "examples": ["按营业所查看各站点水表数", "所站两级的水表分布情况"],
    "variables": [],
    "dialect_sql": {"mysql": "SELECT supply_office,\n station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY supply_office, station\nORDER BY supply_office, meter_cnt DESC, station;"},
    "applicability": {
      "constraints": {
        "notes": ["组合基数<=11×36=396", "以service_point_id去重计数", "如结果过长可再按TopN筛选"],
        "fk_join_available": false,
        "dim_cardinality_hint": 396
      },
      "time_column": null,
      "required_columns": ["supply_office", "station", "service_point_id"]
    },
    "business_caliber": "水表数=按service_point_id去重计数;粒度=营业所×站点。安全限制:结果行数可能较多,建议在可视化端增加筛选或分页。"
  },
  {
    "id": "snpt_total_meter_baseline",
    "desc": "获取全表水表去重总量基线",
    "type": "aggregate",
    "title": "水表总量基线",
    "examples": ["当前有多少只水表", "作为占比分析的分母基线"],
    "variables": [],
    "dialect_sql": {"mysql": "SELECT COUNT(DISTINCT service_point_id) AS meter_total\nFROM `data-ge`.`water_meter_info`;"},
    "applicability": {
      "constraints": {
        "notes": ["作为其他占比/分摊分母基线", "忽略service_point_id为空的记录"],
        "fk_join_available": false,
        "dim_cardinality_hint": 300
      },
      "time_column": null,
      "required_columns": ["service_point_id"]
    },
    "business_caliber": "水表总量=按service_point_id去重计数;基于当前表的全量记录。安全限制:无时间维度,无法反映存量随时间变化。"
  }
]
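Unlike the gemini snippets, the gpt5 templates use {{...}} placeholders; a minimal substitution sketch with plain string replacement (no templating library assumed, and values are assumed to be SQL-safe):

def render(sql: str, variables: dict) -> str:
    # Naive {{name}} substitution; quote/escape values before any real use.
    for name, value in variables.items():
        sql = sql.replace("{{" + name + "}}", str(value))
    return sql


topn_sql = "SELECT station, COUNT(DISTINCT service_point_id) AS meter_cnt FROM `data-ge`.`water_meter_info` GROUP BY station ORDER BY meter_cnt DESC LIMIT {{top_n}};"
print(render(topn_sql, {"top_n": 10}))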
415  demo/水务/水务-qwen3-coder-480b-ge-desc.json  Normal file
@ -0,0 +1,415 @@
{
  "role": "dimension",
  "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"},
  "grain": ["account_id", "service_point_id"],
  "table": "data-ge.water_meter_info",
  "columns": [
    {"name": "supply_office", "dtype": "string", "stats": {}, "comment": "供水管理所名称,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "宝山供水管理所"}, {"pct": null, "value": "黄浦供水管理所"}, {"pct": null, "value": "青东供水管理所"}, {"pct": null, "value": "虹口供水管理所"}, {"pct": null, "value": "闸北供水管理所"}, {"pct": null, "value": "松北供水管理所"}, {"pct": null, "value": "杨浦供水管理所"}, {"pct": null, "value": "长宁供水管理所"}, {"pct": null, "value": "闵行供水管理所"}, {"pct": null, "value": "徐汇供水管理所"}, {"pct": null, "value": "普陀供水管理所"}], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.11, "metric_candidate_score": 0.0},
    {"name": "station", "dtype": "string", "stats": {}, "comment": "站点名称,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "新闸站"}, {"pct": null, "value": "宝杨站"}, {"pct": null, "value": "江川站"}, {"pct": null, "value": "长江站"}, {"pct": null, "value": "市光站"}, {"pct": null, "value": "徐泾站"}, {"pct": null, "value": "真北站"}, {"pct": null, "value": "半淞园站"}, {"pct": null, "value": "芙蓉江站"}, {"pct": null, "value": "密云站"}], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.36, "metric_candidate_score": 0.0},
    {"name": "district", "dtype": "string", "stats": {}, "comment": "行政区划名称,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "普陀区"}, {"pct": null, "value": "闵行区"}, {"pct": null, "value": "嘉定区"}, {"pct": null, "value": "杨浦区"}, {"pct": null, "value": "徐汇区"}, {"pct": null, "value": "黄浦区"}, {"pct": null, "value": "松江区"}, {"pct": null, "value": "长宁区"}, {"pct": null, "value": "青浦区"}, {"pct": null, "value": "虹口区"}], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.13, "metric_candidate_score": 0.0},
    {"name": "meter_diameter", "dtype": "string", "stats": {}, "comment": "水表直径规格,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "20mm"}, {"pct": null, "value": "15mm"}, {"pct": null, "value": "25mm"}, {"pct": null, "value": "40mm"}, {"pct": null, "value": "150mm"}, {"pct": null, "value": "100mm"}, {"pct": null, "value": "80mm"}, {"pct": null, "value": "50mm"}], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.08, "metric_candidate_score": 0.0},
    {"name": "meter_status", "dtype": "string", "stats": {}, "comment": "水表状态,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "有效"}], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.01, "metric_candidate_score": 0.0},
    {"name": "meter_subtype", "dtype": "string", "stats": {}, "comment": "水表子类型,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "旋翼半液封式"}, {"pct": null, "value": "超声波式"}, {"pct": null, "value": "旋翼湿式(指针式)"}, {"pct": null, "value": "旋翼湿式(数字指针式)"}, {"pct": null, "value": "电磁式"}, {"pct": null, "value": "无直管段要求超声波式"}, {"pct": null, "value": "无直管段要求电磁式"}, {"pct": null, "value": "垂直螺翼干式"}, {"pct": null, "value": "机械容积式"}], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.09, "metric_candidate_score": 0.0},
    {"name": "meter_type", "dtype": "string", "stats": {}, "comment": "水表类型,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "容积式机械水表"}, {"pct": null, "value": "速度式机械水表"}, {"pct": null, "value": "电磁式远传水表"}, {"pct": null, "value": "速度式机电远传水表"}, {"pct": null, "value": "超声波式远传水表"}], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.05, "metric_candidate_score": 0.0},
    {"name": "installation_position", "dtype": "string", "stats": {}, "comment": "安装位置,枚举值", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": null, "value": "嵌墙表"}, {"pct": null, "value": "管道井表"}, {"pct": null, "value": "地下表"}, {"pct": null, "value": "龙头表"}], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.04, "metric_candidate_score": 0.0},
    {"name": "account_id", "dtype": "string", "stats": {}, "comment": "账户ID", "enumish": false, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0},
    {"name": "service_point_id", "dtype": "string", "stats": {}, "comment": "服务点ID", "enumish": false, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}
  ],
  "quality": {
|
||||||
|
"warning_hints": [],
|
||||||
|
"failed_expectations": []
|
||||||
|
},
|
||||||
|
"row_count": 300,
|
||||||
|
"fk_candidates": [],
|
||||||
|
"confidence_notes": [
|
||||||
|
"role判定为dimension,因所有列均为枚举或ID类型,无metric列",
|
||||||
|
"grain依据account_id和service_point_id为唯一标识推测",
|
||||||
|
"未发现时间列,因此time字段为null"
|
||||||
|
],
|
||||||
|
"primary_key_candidates": [
|
||||||
|
[
|
||||||
|
"account_id"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"service_point_id"
|
||||||
|
]
|
||||||
|
]
|
||||||
|
}
|
||||||
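The per-column blocks above follow a regular pattern: `distinct_ratio` is `distinct_count / row_count` (300 rows here), and for enum-like dimension columns `pk_candidate_score` tracks `distinct_count / 100` (36 → 0.36, 13 → 0.13, 1 → 0.01). A minimal sketch of that profiling step in Python; the heuristics are inferred from the numbers in this file, not taken from the service's actual implementation:

```python
from collections import Counter

def profile_column(name: str, values: list, comment: str = "", top_k: int = 10) -> dict:
    """Profile one string column into the shape of the blocks above (a sketch)."""
    non_null = [v for v in values if v is not None]
    counts = Counter(non_null)
    distinct = len(counts)
    rows = len(values)
    return {
        "name": name,
        "dtype": "string",
        "stats": {},
        "comment": comment,
        "enumish": distinct <= 50,                       # assumed cutoff for enum-like columns
        "null_rate": 1 - len(non_null) / rows,
        "top_values": [{"pct": None, "value": v} for v, _ in counts.most_common(top_k)],
        "semantic_type": "dimension",
        "distinct_count": distinct,
        "distinct_ratio": distinct / rows,               # station: 36 / 300 = 0.12
        "pk_candidate_score": min(distinct / 100, 1.0),  # station: 0.36, district: 0.13
        "metric_candidate_score": 0.0,                   # string dimensions never score as metrics here
    }
```

ID columns such as `account_id` are evidently scored by a different rule (a flat 0.95 with counts left null), so this sketch covers only the dimension path.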
286 demo/水务/水务-qwen3-coder-480b-snippet-alias.json Normal file
@ -0,0 +1,286 @@
[
  {
    "id": "snpt_water_meter_top_supply_office",
    "aliases": [
      {"text": "供水所水表排行", "tone": "中性"},
      {"text": "哪个供水所水表最多", "tone": "口语"},
      {"text": "供水管理所水表TopN统计", "tone": "专业"}
    ],
    "keywords": ["水表", "供水管理所", "排行", "TopN", "数量", "统计", "count", "排名", "前N", "供水所", "水表数", "维度聚合", "by_dimension", "topn"],
    "intent_tags": ["topn", "by_dimension"]
  },
  {
    "id": "snpt_water_meter_top_station",
    "aliases": [
      {"text": "站点水表数量排行", "tone": "中性"},
      {"text": "哪个站点水表最多", "tone": "口语"},
      {"text": "站点维度水表TopN分析", "tone": "专业"}
    ],
    "keywords": ["水表", "站点", "排行", "TopN", "数量", "统计", "count", "排名", "前N", "站点数", "维度聚合", "by_dimension", "topn"],
    "intent_tags": ["topn", "by_dimension"]
  },
  {
    "id": "snpt_water_meter_top_district",
    "aliases": [
      {"text": "区域水表数量排名", "tone": "中性"},
      {"text": "哪个区水表最多", "tone": "口语"},
      {"text": "行政区水表TopN统计", "tone": "专业"}
    ],
    "keywords": ["水表", "区域", "行政区", "排行", "TopN", "数量", "统计", "count", "排名", "前N", "区", "水表数", "维度聚合", "by_dimension", "topn"],
    "intent_tags": ["topn", "by_dimension"]
  },
  {
    "id": "snpt_water_meter_share_by_type",
    "aliases": [
      {"text": "水表类型占比", "tone": "中性"},
      {"text": "哪种水表用得最多", "tone": "口语"},
      {"text": "水表类型分布比例", "tone": "专业"}
    ],
    "keywords": ["水表", "类型", "占比", "比例", "ratio", "分布", "meter_type", "百分比", "分类统计", "水表类型", "aggregate", "by_dimension"],
    "intent_tags": ["ratio", "by_dimension"]
  },
  {
    "id": "snpt_water_meter_subtype_distribution",
    "aliases": [
      {"text": "水表子类型分布", "tone": "中性"},
      {"text": "各种子类型水表情况", "tone": "口语"},
      {"text": "水表子类型计数与占比", "tone": "专业"}
    ],
    "keywords": ["水表", "子类型", "分布", "数量", "占比", "meter_subtype", "统计", "count", "百分比", "分类统计", "aggregate", "by_dimension"],
    "intent_tags": ["aggregate", "by_dimension"]
  },
  {
    "id": "snpt_water_meter_installation_position_stats",
    "aliases": [
      {"text": "安装位置统计", "tone": "中性"},
      {"text": "哪种位置装表最多", "tone": "口语"},
      {"text": "水表安装位置分布", "tone": "专业"}
    ],
    "keywords": ["水表", "安装位置", "统计", "分布", "installation_position", "数量", "count", "位置", "安装点", "aggregate", "by_dimension"],
    "intent_tags": ["aggregate", "by_dimension"]
  },
  {
    "id": "snpt_water_meter_grain_check",
    "aliases": [
      {"text": "主键粒度校验", "tone": "中性"},
      {"text": "数据有没有重复", "tone": "口语"},
      {"text": "数据粒度一致性检查", "tone": "专业"}
    ],
    "keywords": ["主键", "粒度", "校验", "质量", "quality", "重复", "唯一性", "account_id", "service_point_id", "数据校验", "质量检查", "异常检测"],
    "intent_tags": ["quality"]
  },
  {
    "id": "snpt_water_meter_sample_records",
    "aliases": [
      {"text": "水表数据抽样", "tone": "中性"},
      {"text": "给我看点水表数据", "tone": "口语"},
      {"text": "水表记录样本抽取", "tone": "专业"}
    ],
    "keywords": ["水表", "样本", "抽样", "sample", "随机", "记录", "抽查", "limit", "数据结构", "数据示例", "limit_rows"],
    "intent_tags": ["sample"]
  }
]
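The alias records above carry no SQL of their own; they enrich the base snippets in the companion `水务-qwen3-coder-480b-snippet.json` by `id`. A sketch of how the two files might be merged and flattened into the `rag_text` string that `file/tableschema/rag_snippet.sql` stores for vectorization; the join and the concatenation order are assumptions, not the service's shipped logic:

```python
import json

def build_rag_text(snippet: dict, alias: dict) -> str:
    """Flatten a snippet plus its alias record into one embeddable string (a sketch)."""
    parts = [
        snippet.get("title", ""),
        snippet.get("desc", ""),
        " / ".join(a["text"] for a in alias.get("aliases", [])),
        " ".join(alias.get("keywords", [])),
        " ".join(alias.get("intent_tags", [])),
    ]
    return "\n".join(p for p in parts if p)

with open("demo/水务/水务-qwen3-coder-480b-snippet.json", encoding="utf-8") as f:
    snippets = {s["id"]: s for s in json.load(f)}
with open("demo/水务/水务-qwen3-coder-480b-snippet-alias.json", encoding="utf-8") as f:
    aliases = json.load(f)

for alias in aliases:
    print(build_rag_text(snippets[alias["id"]], alias))  # one rag_text per snippet id
```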
235 demo/水务/水务-qwen3-coder-480b-snippet.json Normal file
@ -0,0 +1,235 @@
[
  {
    "id": "snpt_water_meter_top_supply_office",
    "desc": "统计各供水管理所下辖水表数量并排序",
    "type": "topn",
    "title": "供水管理所水表数量排行",
    "examples": ["列出水表最多的前10个供水管理所", "各供水所水表数量排名"],
    "variables": [{"name": "top_n", "type": "int", "default": 10}],
    "dialect_sql": {
      "mysql": "SELECT supply_office AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY supply_office ORDER BY metric_value DESC LIMIT {{top_n}}"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": 11},
      "time_column": "nullable",
      "required_columns": ["supply_office"]
    },
    "business_caliber": "按供水管理所维度聚合水表总数,粒度=供水管理所"
  },
  {
    "id": "snpt_water_meter_top_station",
    "desc": "统计各个站点下辖水表数量并排序",
    "type": "topn",
    "title": "站点水表数量排行",
    "examples": ["列出水表最多的前10个站点", "各站点水表数量排名"],
    "variables": [{"name": "top_n", "type": "int", "default": 10}],
    "dialect_sql": {
      "mysql": "SELECT station AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY station ORDER BY metric_value DESC LIMIT {{top_n}}"
    },
    "applicability": {
      "constraints": {"notes": ["高基数维度建议LIMIT<=50"], "fk_join_available": false, "dim_cardinality_hint": 36},
      "time_column": "nullable",
      "required_columns": ["station"]
    },
    "business_caliber": "按站点维度聚合水表总数,粒度=站点"
  },
  {
    "id": "snpt_water_meter_top_district",
    "desc": "统计各区水表数量并排序",
    "type": "topn",
    "title": "区域水表数量排行",
    "examples": ["列出各区水表数量排名", "哪个区的水表最多?"],
    "variables": [{"name": "top_n", "type": "int", "default": 10}],
    "dialect_sql": {
      "mysql": "SELECT district AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY district ORDER BY metric_value DESC LIMIT {{top_n}}"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": 13},
      "time_column": "nullable",
      "required_columns": ["district"]
    },
    "business_caliber": "按行政区划维度聚合水表总数,粒度=区"
  },
  {
    "id": "snpt_water_meter_share_by_type",
    "desc": "计算各类水表占总水表的比例",
    "type": "ratio",
    "title": "水表类型占比分布",
    "examples": ["各类水表占比是多少?", "哪种类型的水表使用最广泛?"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT meter_type AS dim_value, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`) AS ratio_percent FROM `data-ge.water_meter_info` GROUP BY meter_type ORDER BY ratio_percent DESC"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": 5},
      "time_column": "nullable",
      "required_columns": ["meter_type"]
    },
    "business_caliber": "按水表类型分类计算其占比,粒度=水表类型"
  },
  {
    "id": "snpt_water_meter_subtype_distribution",
    "desc": "展示不同水表子类型的数量及比例",
    "type": "aggregate",
    "title": "水表子类型分布情况",
    "examples": ["各种子类型水表的数量和占比", "哪种子类型水表最多?"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT meter_subtype AS dim_value, COUNT(*) AS count_value, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`), 2) AS percentage FROM `data-ge.water_meter_info` GROUP BY meter_subtype ORDER BY count_value DESC"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": 9},
      "time_column": "nullable",
      "required_columns": ["meter_subtype"]
    },
    "business_caliber": "按水表子类型进行计数和百分比统计,粒度=水表子类型"
  },
  {
    "id": "snpt_water_meter_installation_position_stats",
    "desc": "统计不同安装位置下的水表数量",
    "type": "aggregate",
    "title": "安装位置分布统计",
    "examples": ["各种安装位置的水表数量", "哪种安装位置最为常见?"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT installation_position AS dim_value, COUNT(*) AS count_value FROM `data-ge.water_meter_info` GROUP BY installation_position ORDER BY count_value DESC"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": 4},
      "time_column": "nullable",
      "required_columns": ["installation_position"]
    },
    "business_caliber": "按安装位置对水表进行分组计数,粒度=安装位置"
  },
  {
    "id": "snpt_water_meter_grain_check",
    "desc": "验证 account_id 和 service_point_id 是否构成唯一组合",
    "type": "quality",
    "title": "主键粒度校验",
    "examples": ["这张表的数据粒度是否正确?", "是否存在重复的服务点记录?"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT IF(COUNT(*) = COUNT(DISTINCT account_id, service_point_id), 'PASS', 'FAIL') AS grain_check_result FROM `data-ge.water_meter_info`"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": null},
      "time_column": "nullable",
      "required_columns": ["account_id", "service_point_id"]
    },
    "business_caliber": "检验数据是否符合预期的主键粒度(account_id + service_point_id)"
  },
  {
    "id": "snpt_water_meter_sample_records",
    "desc": "随机抽取部分水表信息用于查看结构",
    "type": "sample",
    "title": "样本抽取",
    "examples": ["给我看几条水表数据的例子", "抽查一些原始数据看看格式"],
    "variables": [{"name": "limit_rows", "type": "int", "default": 5}],
    "dialect_sql": {
      "mysql": "SELECT * FROM `data-ge.water_meter_info` ORDER BY RAND() LIMIT {{limit_rows}}"
    },
    "applicability": {
      "constraints": {"notes": [], "fk_join_available": false, "dim_cardinality_hint": null},
      "time_column": "nullable",
      "required_columns": []
    },
    "business_caliber": "从全量数据中随机采样若干条记录供参考"
  }
]
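Each snippet's `dialect_sql` embeds `{{variable}}` placeholders whose defaults live in `variables`. A sketch of one plausible substitution step (the gateway's real renderer is not shown in this diff); the numeric-only guard is an assumption added so template slots cannot smuggle arbitrary SQL:

```python
import json
import re

def render_snippet_sql(snippet: dict, dialect: str = "mysql", **overrides) -> str:
    """Fill {{name}} slots from variable defaults plus caller overrides (a sketch)."""
    sql = snippet["dialect_sql"][dialect]
    params = {v["name"]: v.get("default") for v in snippet.get("variables", [])}
    params.update(overrides)

    def substitute(match: re.Match) -> str:
        value = params[match.group(1)]
        if not isinstance(value, (int, float)):
            raise ValueError(f"refusing to inline non-numeric variable {match.group(1)!r}")
        return str(value)

    return re.sub(r"\{\{(\w+)\}\}", substitute, sql)

with open("demo/水务/水务-qwen3-coder-480b-snippet.json", encoding="utf-8") as f:
    snippets = json.load(f)

print(render_snippet_sql(snippets[0], top_n=5))  # ... ORDER BY metric_value DESC LIMIT 5
```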
1 demo/水务/水务-qwen3-next-80b-ge-desc.json Normal file
File diff suppressed because one or more lines are too long
249 demo/水务/水务-qwen3-next-80b-snippet-alias.json Normal file
@ -0,0 +1,249 @@
[
  {
    "id": "snpt_topn_supply_office_by_account",
    "aliases": [
      {"text": "哪个供水所用户最多?", "tone": "口语"},
      {"text": "按供应办公室统计账户数量", "tone": "中性"},
      {"text": "供应办公室账户数TopN排名", "tone": "专业"}
    ],
    "keywords": ["供应办公室", "账户数", "TopN", "排行", "统计", "account_id", "supply_office", "去重", "高占比", "维度分析", "by_dimension", "aggregate", "topn"],
    "intent_tags": ["topn", "aggregate", "by_dimension"]
  },
  {
    "id": "snpt_topn_station_by_account",
    "aliases": [
      {"text": "哪些站点用户最多?", "tone": "口语"},
      {"text": "按站点统计账户分布", "tone": "中性"},
      {"text": "站点账户数Top20排名", "tone": "专业"}
    ],
    "keywords": ["站点", "账户数", "TopN", "排行", "统计", "station", "account_id", "去重", "高负载", "维度分析", "by_dimension", "aggregate", "topn"],
    "intent_tags": ["topn", "aggregate", "by_dimension"]
  },
  {
    "id": "snpt_topn_district_by_account",
    "aliases": [
      {"text": "哪个区用户最多?", "tone": "口语"},
      {"text": "按行政区统计账户数量", "tone": "中性"},
      {"text": "行政区账户数全量排名", "tone": "专业"}
    ],
    "keywords": ["行政区", "账户数", "TopN", "排行", "统计", "district", "account_id", "去重", "区域对比", "维度分析", "by_dimension", "aggregate", "topn"],
    "intent_tags": ["topn", "aggregate", "by_dimension"]
  },
  {
    "id": "snpt_share_of_meter_type",
    "aliases": [
      {"text": "各类水表占多少比例?", "tone": "口语"},
      {"text": "水表类型占比分析", "tone": "中性"},
      {"text": "水表类型占比分布", "tone": "专业"}
    ],
    "keywords": ["水表类型", "占比", "比例", "meter_type", "account_id", "去重", "分布", "主流类型", "技术选型", "ratio", "aggregate", "by_dimension"],
    "intent_tags": ["ratio", "aggregate", "by_dimension"]
  },
  {
    "id": "snpt_sample_account_service_point",
    "aliases": [
      {"text": "随机看10条账户信息", "tone": "口语"},
      {"text": "抽样账户与服务点明细", "tone": "中性"},
      {"text": "账户-服务点随机抽样验证", "tone": "专业"}
    ],
    "keywords": ["抽样", "随机", "样本", "account_id", "service_point_id", "数据质量", "验证", "唯一性", "格式检查", "sample", "quality"],
    "intent_tags": ["sample", "quality"]
  },
  {
    "id": "snpt_filter_meter_status_valid",
    "aliases": [
      {"text": "只取有效的水表记录", "tone": "口语"},
      {"text": "筛选有效水表记录", "tone": "中性"},
      {"text": "水表状态有效性过滤", "tone": "专业"}
    ],
    "keywords": ["有效", "过滤", "筛选", "meter_status", "质量检查", "断言", "清洗", "filter", "quality"],
    "intent_tags": ["filter", "quality"]
  },
  {
    "id": "snpt_filter_meter_diameter_20mm",
    "aliases": [
      {"text": "找出所有20mm水表用户", "tone": "口语"},
      {"text": "筛选20mm水表记录", "tone": "中性"},
      {"text": "20mm口径水表子集提取", "tone": "专业"}
    ],
    "keywords": ["20mm", "水表直径", "过滤", "筛选", "meter_diameter", "子集", "分析", "住宅用水", "规格", "filter", "by_dimension"],
    "intent_tags": ["filter", "by_dimension"]
  }
]
227 demo/水务/水务-qwen3-next-80b-snippet.json Normal file
@ -0,0 +1,227 @@
[
  {
    "id": "snpt_topn_supply_office_by_account",
    "desc": "统计各供应办公室对应的账户数量,识别高占比管理所",
    "type": "topn",
    "title": "按供应办公室统计账户数",
    "examples": ["哪个供水管理所服务的用户最多?", "列出前5个账户数最多的供应办公室"],
    "variables": [{"name": "top_n", "type": "int", "default": 11}],
    "dialect_sql": {
      "mysql": "SELECT supply_office, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY supply_office\nORDER BY account_count DESC\nLIMIT {{top_n}};"
    },
    "applicability": {
      "constraints": {
        "notes": ["供应办公室仅11个唯一值,可安全展示全部;建议LIMIT 11避免冗余排序"],
        "fk_join_available": false,
        "dim_cardinality_hint": 11
      },
      "time_column": "nullable",
      "required_columns": ["supply_office", "account_id"]
    },
    "business_caliber": "粒度=供应办公室,指标=去重账户数(account_id),仅统计水表信息表中有效账户,不关联外部表"
  },
  {
    "id": "snpt_topn_station_by_account",
    "desc": "统计各站点服务的账户数量,识别高负载站点",
    "type": "topn",
    "title": "按站点统计账户分布",
    "examples": ["哪些站点服务的用户最多?", "TOP10用户最多的站点是哪些?"],
    "variables": [{"name": "top_n", "type": "int", "default": 20}],
    "dialect_sql": {
      "mysql": "SELECT station, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY station\nORDER BY account_count DESC\nLIMIT {{top_n}};"
    },
    "applicability": {
      "constraints": {
        "notes": ["站点有36个唯一值,建议LIMIT<=20以避免结果过长;高基数维度可能影响查询性能"],
        "fk_join_available": false,
        "dim_cardinality_hint": 36
      },
      "time_column": "nullable",
      "required_columns": ["station", "account_id"]
    },
    "business_caliber": "粒度=站点(station),指标=去重账户数(account_id),基于水表信息表直接聚合,不涉及时间维度"
  },
  {
    "id": "snpt_topn_district_by_account",
    "desc": "统计各行政区的账户数量,辅助区域资源分配分析",
    "type": "topn",
    "title": "按行政区统计账户分布",
    "examples": ["哪个区的用水账户最多?", "列出所有行政区的账户数量排名"],
    "variables": [{"name": "top_n", "type": "int", "default": 13}],
    "dialect_sql": {
      "mysql": "SELECT district, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY district\nORDER BY account_count DESC\nLIMIT {{top_n}};"
    },
    "applicability": {
      "constraints": {
        "notes": ["行政区共13个,可完整展示;适合用于区域对比分析"],
        "fk_join_available": false,
        "dim_cardinality_hint": 13
      },
      "time_column": "nullable",
      "required_columns": ["district", "account_id"]
    },
    "business_caliber": "粒度=行政区(district),指标=去重账户数(account_id),基于水表信息表聚合,反映各区域用户规模"
  },
  {
    "id": "snpt_share_of_meter_type",
    "desc": "计算各类水表类型在总账户中的占比,识别主流类型",
    "type": "ratio",
    "title": "水表类型占比分析",
    "examples": ["各类水表在用户中的占比是多少?", "电磁式远传水表占总用户比例多少?"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT meter_type, \n       COUNT(DISTINCT account_id) AS account_count,\n       ROUND(COUNT(DISTINCT account_id) * 100.0 / SUM(COUNT(DISTINCT account_id)) OVER (), 2) AS percentage\nFROM water_meter_info\nGROUP BY meter_type\nORDER BY account_count DESC;"
    },
    "applicability": {
      "constraints": {
        "notes": ["水表类型仅5种,适合计算占比;可直接展示全量分布"],
        "fk_join_available": false,
        "dim_cardinality_hint": 5
      },
      "time_column": "nullable",
      "required_columns": ["meter_type", "account_id"]
    },
    "business_caliber": "粒度=水表类型(meter_type),指标=去重账户数占比,分母为全表去重账户总数,反映技术选型分布"
  },
  {
    "id": "snpt_sample_account_service_point",
    "desc": "随机抽取部分账户与服务点ID的原始记录,用于数据质量核查",
    "type": "sample",
    "title": "抽样账户与服务点明细",
    "examples": ["随机查看10条账户与服务点的详细信息", "抽样检查水表信息是否符合预期格式"],
    "variables": [{"name": "sample_size", "type": "int", "default": 10}],
    "dialect_sql": {
      "mysql": "SELECT account_id, service_point_id, supply_office, station, district, meter_diameter, meter_type, meter_subtype, installation_position\nFROM water_meter_info\nORDER BY RAND()\nLIMIT {{sample_size}};"
    },
    "applicability": {
      "constraints": {
        "notes": ["主键组合为account_id+service_point_id,适合抽样验证唯一性;建议样本量≤100"],
        "fk_join_available": false,
        "dim_cardinality_hint": null
      },
      "time_column": "nullable",
      "required_columns": ["account_id", "service_point_id"]
    },
    "business_caliber": "粒度=单条水表记录,抽取样本用于验证account_id与service_point_id的组合唯一性及维度字段完整性"
  },
  {
    "id": "snpt_filter_meter_status_valid",
    "desc": "过滤出水表状态为'有效'的记录,用于后续分析",
    "type": "quality",
    "title": "筛选有效水表记录",
    "examples": ["只取状态为有效的水表记录", "确认所有水表是否均为有效状态"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_status = '有效';"
    },
    "applicability": {
      "constraints": {
        "notes": ["meter_status仅存在'有效'值,此条件恒成立;可用于数据清洗流程的显式过滤"],
        "fk_join_available": false,
        "dim_cardinality_hint": 1
      },
      "time_column": "nullable",
      "required_columns": ["meter_status"]
    },
    "business_caliber": "仅保留水表状态为'有效'的记录,因全表均为有效值,此过滤为冗余但可作为数据质量校验的显式断言"
  },
  {
    "id": "snpt_filter_meter_diameter_20mm",
    "desc": "筛选水表直径为20mm的记录,用于特定口径设备分析",
    "type": "quality",
    "title": "筛选20mm水表记录",
    "examples": ["找出所有使用20mm水表的用户", "20mm水表分布在哪些站点?"],
    "variables": [],
    "dialect_sql": {
      "mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_diameter = '20mm';"
    },
    "applicability": {
      "constraints": {
        "notes": ["水表直径共8种枚举值,20mm为常见规格;可作为子集分析的起点"],
        "fk_join_available": false,
        "dim_cardinality_hint": 8
      },
      "time_column": "nullable",
      "required_columns": ["meter_diameter"]
    },
    "business_caliber": "粒度=单条水表记录,筛选条件为meter_diameter='20mm',用于分析标准住宅用水表的分布特征"
  }
]
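The `applicability` block is what makes a snippet safe to offer at query time: `required_columns` must exist on the target table, and `dim_cardinality_hint` drives the LIMIT advice seen in the notes above. A sketch of a pre-retrieval check; the cardinality cutoff of 20 is an assumption loosely following the "建议LIMIT<=20" style notes, not a documented rule:

```python
def check_applicability(snippet: dict, table_columns: set) -> tuple:
    """Return (applicable, warnings) for one snippet against a table schema (a sketch)."""
    app = snippet.get("applicability", {})
    missing = [c for c in app.get("required_columns", []) if c not in table_columns]
    warnings = list(app.get("constraints", {}).get("notes", []))
    hint = app.get("constraints", {}).get("dim_cardinality_hint")
    if hint is not None and hint > 20:
        warnings.append(f"high-cardinality dimension ({hint} values): keep LIMIT small")
    return (not missing, warnings)

# Columns of water_meter_info, as profiled in the ge-desc file earlier in this diff.
WATER_METER_COLUMNS = {
    "account_id", "service_point_id", "supply_office", "station", "district",
    "meter_diameter", "meter_type", "meter_subtype", "installation_position",
    "meter_status",
}
```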
57 doc/rag-api.md Normal file
@ -0,0 +1,57 @@
# Add a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/add' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
    "id": 0,
    "workspaceId": 0,
    "name": "string",
    "embeddingData": "string",
    "type": "METRIC"
}'

# Add RAG items in batch
curl --location --request POST 'http://127.0.0.1:8000/rag/addBatch' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '[
    {
        "id": 0,
        "workspaceId": 0,
        "name": "string",
        "embeddingData": "string",
        "type": "METRIC"
    }
]'

# Update a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/update' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
    "id": 0,
    "workspaceId": 0,
    "name": "string",
    "embeddingData": "string",
    "type": "METRIC"
}'

# Delete a RAG item
curl --location --request POST 'http://127.0.0.1:8000/rag/delete' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
    "id": 0,
    "type": "METRIC"
}'

# Retrieve RAG items
curl --location --request POST 'http://127.0.0.1:8000/rag/retrieve' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
    "query": "string",
    "num": 0,
    "workspaceId": 0,
    "type": "METRIC"
}'
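The same calls from Python, mirroring the curl examples above. Endpoint paths and payload fields come straight from this file; whether a bearer token is actually required (the examples send an empty one) is left open, so the token here is a placeholder:

```python
import os
import requests

BASE = "http://127.0.0.1:8000"
HEADERS = {"Authorization": f"Bearer {os.getenv('RAG_AUTH_TOKEN', '')}"}  # token optional/unknown

item = {
    "id": 1,
    "workspaceId": 1,
    "name": "order_cnt",
    "embeddingData": "订单数 订单量 order count",
    "type": "METRIC",
}
requests.post(f"{BASE}/rag/add", json=item, headers=HEADERS, timeout=30).raise_for_status()

hits = requests.post(
    f"{BASE}/rag/retrieve",
    json={"query": "订单量", "num": 5, "workspaceId": 1, "type": "METRIC"},
    headers=HEADERS,
    timeout=30,
).json()
print(hits)
```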
49 doc/会话api.md Normal file
@ -0,0 +1,49 @@
# Create a session
curl -X POST "/api/v1/chat/sessions" \
  -H "Content-Type: application/json" \
  -d "{\"user_id\": $CHAT_USER_ID}"

# Get a session
curl "/api/v1/chat/sessions/{session_id}"

# List sessions by user
curl "/api/v1/chat/sessions?user_id=$CHAT_USER_ID"

# Update session status
curl -X POST "/api/v1/chat/sessions/{session_id}/update" \
  -H "Content-Type: application/json" \
  -d '{"status":"PAUSED"}'

# Close a session
curl -X POST "/api/v1/chat/sessions/{session_id}/close"

# Create a conversation turn
curl -X POST "/api/v1/chat/sessions/{session_id}/turns" \
  -H "Content-Type: application/json" \
  -d '{
    "user_id": '"$CHAT_USER_ID"',
    "user_query": "展示昨天订单GMV",
    "intent": "METRIC_QUERY",
    "ast_json": {"select":["gmv"],"where":{"dt":"yesterday"}},
    "main_metric_ids": [1234],
    "created_metric_ids": []
  }'

# Get a single turn
curl "/api/v1/chat/turns/{turn_id}"

# List a session's turns
curl "/api/v1/chat/sessions/{session_id}/turns"

# Write retrieval results
curl -X POST "/api/v1/chat/turns/{turn_id}/retrievals" \
  -H "Content-Type: application/json" \
  -d '{
    "retrievals": [
      {"item_type":"METRIC","item_id":"metric_foo","used_in_sql":true,"rank_no":1},
      {"item_type":"SNIPPET","item_id":"snpt_bar","similarity_score":0.77,"rank_no":2}
    ]
  }'

# List a turn's retrieval results
curl "/api/v1/chat/turns/{turn_id}/retrievals"
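The same flow end to end in Python, chaining session → turn → retrievals. Paths and payloads mirror the curl examples above, but the response field names (`id`) are assumptions, since the response schemas are not documented in this file:

```python
import requests

BASE = "http://localhost:8000"  # assumed service root in front of /api/v1
USER_ID = 42                    # placeholder user

session = requests.post(
    f"{BASE}/api/v1/chat/sessions", json={"user_id": USER_ID}, timeout=30
).json()
sid = session["id"]  # assumed response field

turn = requests.post(
    f"{BASE}/api/v1/chat/sessions/{sid}/turns",
    json={"user_id": USER_ID, "user_query": "展示昨天订单GMV", "intent": "METRIC_QUERY"},
    timeout=30,
).json()
tid = turn["id"]  # assumed response field

requests.post(
    f"{BASE}/api/v1/chat/turns/{tid}/retrievals",
    json={"retrievals": [
        {"item_type": "METRIC", "item_id": "metric_foo", "used_in_sql": True, "rank_no": 1},
    ]},
    timeout=30,
).raise_for_status()
```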
69 doc/指标api.md Normal file
@ -0,0 +1,69 @@
# Create a metric
curl -X POST "/api/v1/metrics" \
  -H "Content-Type: application/json" \
  -d '{
    "metric_code": "metric_1234",
    "metric_name": "订单数",
    "biz_domain": "order",
    "biz_desc": "订单总数",
    "base_sql": "select count(*) as order_cnt from orders",
    "time_grain": "DAY",
    "dim_binding": ["dt"],
    "update_strategy": "FULL",
    "metric_aliases": ["订单量"],
    "created_by": '"$METRIC_USER_ID"'
  }'

# Update a metric
curl -X POST "/api/v1/metrics/{metric_id}" \
  -H "Content-Type: application/json" \
  -d '{"metric_name":"订单数-更新","is_active":false}'

# Get a metric
curl "/api/v1/metrics/{metric_id}"

# Create a schedule
curl -X POST "/api/v1/metric-schedules" \
  -H "Content-Type: application/json" \
  -d '{"metric_id":{metric_id},"cron_expr":"0 2 * * *","priority":5,"enabled":true}'

# Update a schedule
curl -X POST "/api/v1/metric-schedules/{schedule_id}" \
  -H "Content-Type: application/json" \
  -d '{"enabled":false,"retry_times":1}'

# List a metric's schedules
curl "/api/v1/metrics/{metric_id}/schedules"

# Trigger a run
curl -X POST "/api/v1/metric-runs/trigger" \
  -H "Content-Type: application/json" \
  -d '{
    "metric_id": {metric_id},
    "triggered_by": "API",
    "data_time_from": "2024-05-01T00:00:00Z",
    "data_time_to": "2024-05-02T00:00:00Z"
  }'

# List runs
curl "/api/v1/metric-runs?metric_id={metric_id}"

# Get a single run
curl "/api/v1/metric-runs/{run_id}"

# Write metric results
curl -X POST "/api/v1/metric-results/{metric_id}" \
  -H "Content-Type: application/json" \
  -d '{
    "metric_id": {metric_id},
    "results": [
      {"stat_time":"2024-05-01T00:00:00Z","metric_value":123.45,"data_version":"{run_id}"},
      {"stat_time":"2024-05-02T00:00:00Z","metric_value":234.56,"data_version":"{run_id}"}
    ]
  }'

# Query metric results
curl "/api/v1/metric-results?metric_id={metric_id}"

# Query the latest result
curl "/api/v1/metric-results/latest?metric_id={metric_id}"
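A short Python sketch of the trigger-then-write-back loop the last three endpoints imply, using `data_version` to tie results to the run that produced them (as the curl bodies above do). The run response shape (`id` field) is an assumption:

```python
import requests

BASE = "http://localhost:8000"  # assumed service root
METRIC_ID = 1                   # placeholder metric

run = requests.post(
    f"{BASE}/api/v1/metric-runs/trigger",
    json={
        "metric_id": METRIC_ID,
        "triggered_by": "API",
        "data_time_from": "2024-05-01T00:00:00Z",
        "data_time_to": "2024-05-02T00:00:00Z",
    },
    timeout=30,
).json()

requests.post(
    f"{BASE}/api/v1/metric-results/{METRIC_ID}",
    json={
        "metric_id": METRIC_ID,
        "results": [
            {"stat_time": "2024-05-01T00:00:00Z", "metric_value": 123.45,
             "data_version": run["id"]},  # assumed response field
        ],
    },
    timeout=30,
).raise_for_status()
```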
83 doc/指标生成.md Normal file
@ -0,0 +1,83 @@
A single user question → parsed into a chat_turn →
which metrics / knowledge / sessions that turn retrieved (chat_turn_retrieval) →
whether it produced new metrics (metric_def) →
whether it triggered a scheduled metric run (metric_job_run.source_turn_id) →
and finally which metric results it produced (metric_result.metric_id + stat_time).

Session domain
schema
Session table: chat_session
Turn table: chat_turn
Turn–retrieval link table: chat_turn_retrieval

API
1. Create a session
POST /api/v1/chat/sessions
2. Update session status
POST /api/v1/chat/sessions/{session_id}/update
3. Close a session
POST /api/v1/chat/sessions/{session_id}/close
4. Get a session
GET /api/v1/chat/sessions/{session_id}
5. List sessions (by user, by time)
GET /api/v1/chat/sessions
6. Create a Q&A turn (user issues a query)
POST /api/v1/chat/sessions/{session_id}/turns
7. List all turns of a session
GET /api/v1/chat/sessions/{session_id}/turns
8. Get a single turn's details
GET /api/v1/chat/turns/{turn_id}
9. Batch-write a turn's retrieval results
POST /api/v1/chat/turns/{turn_id}/retrievals
10. Query a turn's retrieval records
GET /api/v1/chat/turns/{turn_id}/retrievals
11. Update a turn's retrieval records (future work)
POST /api/v1/chat/turns/{turn_id}/retrievals/update

Metadata domain
schema
Metric definition table: metric_def

API
12. Create a metric (from Q&A or a conventional definition)
POST /api/v1/metrics
13. Update a metric
POST /api/v1/metrics/{id}
14. Get metric details
GET /api/v1/metrics/{id}

Execution / scheduling domain (tentatively Airflow)
schema
Metric schedule config table: metric_schedule
Job run record table: metric_job_run

API
1. Create a schedule config
POST /api/v1/metric-schedules
2. Update a schedule config
POST /api/v1/metric-schedules/{id}
3. Get a metric's schedule configs
GET /api/v1/metrics/{metric_id}/schedules
4. Manually trigger a metric run (e.g. from a chat query)
POST /api/v1/metric-runs/trigger
5. List run records
GET /api/v1/metric-runs
6. Get a single run's details
GET /api/v1/metric-runs/{run_id}

Data domain
schema
Metric result table (long format): metric_result

API
1. Query metric results (by time range & dimensions)
GET /api/v1/metric-results
2. Point query (latest value)
GET /api/v1/metric-results/latest
3. Batch-write metric results
POST /api/v1/metric-results/{metric_id}
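The lineage chain at the top of this doc can be walked with a single join over the tables defined in `file/tableschema/*.sql` later in this diff (`metric_def.chat_turn_id`, `metric_job_run.source_turn_id`, and `metric_result.data_version` as the job-run id). The query below is an illustration of that walk, not shipped code:

```python
# Sketch: trace one chat turn to the metric results it produced.
# Placeholder style %(turn_id)s assumes a PyMySQL-compatible driver.
LINEAGE_SQL = """
SELECT t.id AS turn_id,
       t.user_query,
       r.item_type, r.item_id, r.used_in_sql,
       m.id AS metric_id, m.metric_code,
       j.id AS run_id, j.status AS run_status,
       res.stat_time, res.metric_value
FROM chat_turn t
LEFT JOIN chat_turn_retrieval r ON r.turn_id = t.id
LEFT JOIN metric_def m          ON m.chat_turn_id = t.id
LEFT JOIN metric_job_run j      ON j.source_turn_id = t.id
LEFT JOIN metric_result res     ON res.metric_id = j.metric_id
                               AND res.data_version = j.id  -- data_version holds the run id
WHERE t.id = %(turn_id)s
"""
```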
21 file/ecommerce_orders.sql Normal file
@ -0,0 +1,21 @@
CREATE TABLE `ecommerce_orders` (
  `order_id` char(36) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'UUID from CSV',
  `customer_id` int NOT NULL,
  `product_id` int NOT NULL,
  `category` varchar(64) COLLATE utf8mb4_unicode_ci NOT NULL,
  `price` decimal(10,2) NOT NULL,
  `quantity` int NOT NULL,
  `order_date` datetime(6) NOT NULL,
  `shipping_date` datetime(6) NOT NULL,
  `delivery_status` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `payment_method` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `device_type` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `channel` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `shipping_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
  `billing_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
  `customer_segment` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  PRIMARY KEY (`order_id`),
  KEY `idx_customer` (`customer_id`),
  KEY `idx_product` (`product_id`),
  KEY `idx_order_date` (`order_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
40 file/tableschema/action_results.sql Normal file
@ -0,0 +1,40 @@
CREATE TABLE `action_results` (
  `id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
  `table_id` bigint NOT NULL COMMENT '表ID',
  `version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
  `action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
  `status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
  `llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
  `error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
  `error_message` text COLLATE utf8mb4_bin,
  `started_at` datetime DEFAULT NULL,
  `finished_at` datetime DEFAULT NULL,
  `duration_ms` int DEFAULT NULL,
  `table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
  `table_schema` json NOT NULL,
  `ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
  `ge_profiling_json_size_bytes` bigint DEFAULT NULL,
  `ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要(剔除大value_set等)',
  `ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
  `ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
  `ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
  `ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
  `ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
  `snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
  `snippet_json_size_bytes` bigint DEFAULT NULL,
  `snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
  `snippet_alias_json_size_bytes` bigint DEFAULT NULL,
  `callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
  `result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
  `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  `model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
  `model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
  `model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
  PRIMARY KEY (`id`),
  UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
  KEY `idx_status` (`status`),
  KEY `idx_table` (`table_id`,`updated_at`),
  KEY `idx_action_time` (`action_type`,`version_ts`),
  KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=113 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';
103 file/tableschema/chat.sql Normal file
@ -0,0 +1,103 @@
CREATE TABLE IF NOT EXISTS chat_session (
  id BIGINT AUTO_INCREMENT PRIMARY KEY,
  user_id BIGINT NOT NULL,
  session_uuid CHAR(36) NOT NULL,                      -- externally visible ID (UUID)
  end_time DATETIME NULL,
  status VARCHAR(16) NOT NULL DEFAULT 'OPEN',          -- OPEN/CLOSED/ABANDONED
  last_turn_id BIGINT NULL,                            -- points to chat_turn.id
  ext_context JSON NULL,                               -- business context
  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  UNIQUE KEY uk_session_uuid (session_uuid),
  KEY idx_user_time (user_id, created_at),
  KEY idx_status_time (status, created_at),
  KEY idx_last_turn (last_turn_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;


CREATE TABLE IF NOT EXISTS chat_turn (
  id BIGINT AUTO_INCREMENT,
  session_id BIGINT NOT NULL,                          -- references chat_session.id
  turn_no INT NOT NULL,                                -- turn number within the session (1, 2, 3, ...)
  user_id BIGINT NOT NULL,

  user_query TEXT NOT NULL,                            -- raw user question
  intent VARCHAR(64) NULL,                             -- METRIC_QUERY/METRIC_EXPLAIN etc.
  ast_json JSON NULL,                                  -- parsed AST

  generated_sql MEDIUMTEXT NULL,                       -- final generated SQL
  sql_status VARCHAR(32) NULL,                         -- SUCCESS/FAILED/SKIPPED
  error_msg TEXT NULL,                                 -- SQL generation/execution error

  main_metric_ids JSON NULL,                           -- metric IDs involved in this turn
  created_metric_ids JSON NULL,                        -- metric IDs created in this turn

  end_time DATETIME NULL,

  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- composite primary key: must include created_at (the partitioning column)
  PRIMARY KEY (id, created_at),
  KEY idx_session_turn (session_id, turn_no),
  KEY idx_session_time (session_id, created_at),
  KEY idx_intent_time (intent, created_at),
  KEY idx_user_time (user_id, created_at)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
  -- historical partitions (adjust to actual needs)
  PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
  PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
  -- 2026 monthly partitions
  PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
  PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
  PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
  PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
  PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
  PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
  -- ... further months can be pre-created ...

  -- catch-all partition for future data, prevents insert failures
  PARTITION p_future VALUES LESS THAN (MAXVALUE)
);


CREATE TABLE IF NOT EXISTS chat_turn_retrieval (
  id BIGINT AUTO_INCREMENT,
  turn_id BIGINT NOT NULL,                             -- references chat_turn.id

  item_type VARCHAR(32) NOT NULL,                      -- METRIC/SNIPPET/CHAT
  item_id VARCHAR(128) NOT NULL,                       -- metric_id / snippet_id / table_name etc.
  item_extra JSON NULL,                                -- extra info, e.g. field names

  similarity_score DECIMAL(10,6) NULL,                 -- similarity
  rank_no INT NULL,                                    -- retrieval rank
  used_in_reasoning TINYINT(1) NOT NULL DEFAULT 0,     -- whether it fed the reasoning step
  used_in_sql TINYINT(1) NOT NULL DEFAULT 0,           -- whether it influenced the final SQL

  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- composite primary key: must include created_at (the partitioning column)
  PRIMARY KEY (id, created_at),
  KEY idx_turn (turn_id),
  KEY idx_turn_type (turn_id, item_type),
  KEY idx_item (item_type, item_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
  -- historical partitions (adjust to actual needs)
  PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
  PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
  -- 2026 monthly partitions
  PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
  PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
  PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
  PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
  PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
  PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
  -- ... further months can be pre-created ...

  -- catch-all partition for future data, prevents insert failures
  PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
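Both partitioned tables above pre-create monthly partitions and rely on `p_future` as a catch-all. One common way to roll partitions forward is to split `p_future` each month; `ALTER TABLE ... REORGANIZE PARTITION` is standard MySQL DDL for RANGE-partitioned tables, but the scheduling around it is an assumption, not part of this repo:

```python
from datetime import date

def next_partition_ddl(table: str, month_start: date) -> str:
    """Build DDL that carves one monthly partition out of p_future (a sketch)."""
    nxt = date(month_start.year + month_start.month // 12,
               month_start.month % 12 + 1, 1)  # first day of the following month
    name = f"p{month_start:%Y%m}"
    return (
        f"ALTER TABLE {table} REORGANIZE PARTITION p_future INTO ("
        f"PARTITION {name} VALUES LESS THAN ('{nxt:%Y-%m-%d}'), "
        f"PARTITION p_future VALUES LESS THAN (MAXVALUE));"
    )

print(next_partition_ddl("chat_turn", date(2026, 7, 1)))
# ALTER TABLE chat_turn REORGANIZE PARTITION p_future INTO (
#   PARTITION p202607 VALUES LESS THAN ('2026-08-01'),
#   PARTITION p_future VALUES LESS THAN (MAXVALUE));
```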
155 file/tableschema/metrics.sql Normal file
@ -0,0 +1,155 @@
CREATE TABLE metric_def (
  id BIGINT AUTO_INCREMENT PRIMARY KEY,

  metric_code VARCHAR(64) NOT NULL,                    -- internal code, e.g. order_cnt_delivery
  metric_name VARCHAR(128) NOT NULL,                   -- display name, e.g. 外送订单数
  metric_aliases JSON NULL,                            -- alias list

  biz_domain VARCHAR(64) NOT NULL,                     -- derived from table tags; manual override supported
  biz_desc TEXT NULL,                                  -- business-caliber description

  chat_turn_id BIGINT NULL,                            -- the chat turn this metric originated from

  tech_desc TEXT NULL,                                 -- technical-caliber description
  formula_expr TEXT NULL,                              -- formula expression, e.g. "sum(pay_amount)"
  base_sql MEDIUMTEXT NOT NULL,                        -- canonical computation SQL (logical SQL / snippet)

  time_grain VARCHAR(32) NOT NULL,                     -- DAY/HOUR/WEEK/MONTH
  dim_binding JSON NOT NULL,                           -- bound dimensions, e.g. ["dt","store_id","channel"]

  update_strategy VARCHAR(32) NOT NULL,                -- FULL/INCR/REALTIME
  schedule_id BIGINT NULL,                             -- schedule ID
  schedule_type INT NULL,                              -- schedule type, defaults to cron

  version INT NOT NULL DEFAULT 1,
  is_active TINYINT(1) NOT NULL DEFAULT 1,

  sql_hash VARCHAR(64) NULL,                           -- hash of base_sql for version comparison
  created_by BIGINT NULL,
  updated_by BIGINT NULL,

  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

  UNIQUE KEY uk_metric_code (metric_code),
  KEY idx_domain_active (biz_domain, is_active),
  KEY idx_update_strategy (update_strategy),
  KEY idx_name (metric_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;


CREATE TABLE metric_schedule (
  id BIGINT AUTO_INCREMENT PRIMARY KEY,
  metric_id BIGINT NOT NULL,                           -- references metric_def.id

  cron_expr VARCHAR(64) NOT NULL,                      -- cron expression
  enabled TINYINT(1) NOT NULL DEFAULT 1,               -- enabled flag
  priority INT NOT NULL DEFAULT 10,                    -- priority

  backfill_allowed TINYINT(1) NOT NULL DEFAULT 1,      -- whether backfill is allowed
  max_runtime_sec INT NULL,                            -- max runtime (seconds)
  retry_times INT NOT NULL DEFAULT 0,                  -- retries on failure

  owner_team VARCHAR(64) NULL,
  owner_user_id BIGINT NULL,

  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

  KEY idx_metric_enabled (metric_id, enabled),
  KEY idx_owner (owner_team, owner_user_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;


CREATE TABLE metric_job_run (
  id BIGINT AUTO_INCREMENT,

  metric_id BIGINT NOT NULL,                           -- metric_def.id
  schedule_id BIGINT NULL,                             -- metric_schedule.id; NULL for manual triggers
  source_turn_id BIGINT NULL,                          -- set when the run was triggered by a Q&A turn; references chat_turn.id

  data_time_from DATETIME NULL,                        -- stat window start
  data_time_to DATETIME NULL,                          -- stat window end

  metric_version INT NOT NULL,                         -- metric version used at execution time
  base_sql_snapshot MEDIUMTEXT NOT NULL,               -- SQL snapshot used for this run

  status VARCHAR(32) NOT NULL,                         -- RUNNING/SUCCESS/FAILED/SKIPPED
  error_msg TEXT NULL,

  affected_rows BIGINT NULL,                           -- rows written
  runtime_ms BIGINT NULL,                              -- execution time (ms)

  triggered_by VARCHAR(32) NOT NULL,                   -- SCHEDULER/MANUAL/API/QA_TURN
  triggered_at DATETIME NOT NULL,
  started_at DATETIME NULL,
  finished_at DATETIME NULL,

  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- composite primary key: must include created_at (the partitioning column)
  PRIMARY KEY (id, created_at),
  KEY idx_metric_time (metric_id, data_time_from, data_time_to),
  KEY idx_status_time (status, triggered_at),
  KEY idx_schedule (schedule_id),
  KEY idx_source_turn (source_turn_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
  -- historical partitions (adjust to actual needs)
  PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
  PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
  -- 2026 monthly partitions
  PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
  PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
  PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
  PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
  PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
  PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
  -- ... further months can be pre-created ...

  -- catch-all partition for future data, prevents insert failures
  PARTITION p_future VALUES LESS THAN (MAXVALUE)
);


CREATE TABLE metric_result (
  id BIGINT AUTO_INCREMENT,

  metric_id BIGINT NOT NULL,                           -- metric_def.id
  metric_version INT NOT NULL,                         -- metric_def.version
  stat_time DATETIME NOT NULL,                         -- time aligned to time_grain

  extra_dims JSON NULL,                                -- other dimensions, stored as JSON

  metric_value DECIMAL(32,8) NOT NULL,                 -- metric value

  load_time DATETIME NOT NULL,                         -- load timestamp
  data_version BIGINT NULL,                            -- version, or the job_run id

  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- composite primary key: must include created_at (the partitioning column)
  PRIMARY KEY (id, created_at),
  KEY idx_metric_time (metric_id, stat_time),
  KEY idx_load_time (load_time)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
  -- historical partitions (adjust to actual needs)
  PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
  PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
  -- 2026 monthly partitions
  PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
  PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
  PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
  PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
  PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
  PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
  -- ... further months can be pre-created ...

  -- catch-all partition for future data, prevents insert failures
  PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
file/tableschema/rag_snippet.sql (new file)
@@ -0,0 +1,24 @@

CREATE TABLE `rag_snippet` (
  `rag_item_id` bigint NOT NULL COMMENT 'RAG item id (stable hash of table/version/snippet_id)',
  `workspace_id` bigint NOT NULL COMMENT 'RAG workspace scope',
  `table_id` bigint NOT NULL COMMENT 'Source table ID',
  `version_ts` bigint NOT NULL COMMENT 'Table version number',
  `action_result_id` bigint NOT NULL COMMENT 'Source action_results primary key (snippet_alias or snippet row)',
  `snippet_id` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT 'Original snippet id',
  `rag_text` text COLLATE utf8mb4_bin NOT NULL COMMENT 'Concatenated text used for embedding',
  `merged_json` json NOT NULL COMMENT 'Merged snippet object',
  `created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'Insert time, used for partitioning',
  `updated_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`rag_item_id`,`created_at`),
  KEY `idx_action_result` (`action_result_id`),
  KEY `idx_workspace` (`workspace_id`),
  KEY `idx_table_version` (`table_id`,`version_ts`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
PARTITION BY RANGE COLUMNS (`created_at`) (
  PARTITION p202401 VALUES LESS THAN ('2024-02-01'),
  PARTITION p202402 VALUES LESS THAN ('2024-03-01'),
  PARTITION p202403 VALUES LESS THAN ('2024-04-01'),
  PARTITION p202404 VALUES LESS THAN ('2024-05-01'),
  PARTITION p202405 VALUES LESS THAN ('2024-06-01'),
  PARTITION p_future VALUES LESS THAN (MAXVALUE)
) COMMENT='RAG snippet index cache';
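The `rag_item_id` column is documented as a stable hash of table/version/snippet_id, but the hashing scheme itself is not part of this diff. One plausible sketch, assuming a truncated BLAKE2 digest masked to a positive signed 64-bit value so it fits the `bigint` primary key (the function name is hypothetical):

```python
import hashlib


def stable_rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
    """Deterministic 63-bit id derived from (table_id, version_ts, snippet_id)."""
    key = f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")
    digest = hashlib.blake2b(key, digest_size=8).digest()
    # Mask off the sign bit so the value always fits a signed BIGINT.
    return int.from_bytes(digest, "big") & 0x7FFF_FFFF_FFFF_FFFF


# The same inputs always map to the same rag_item_id, which is what makes
# re-ingesting a snippet an upsert rather than a duplicate row.
assert stable_rag_item_id(321, 20240102000000, "snpt_topn") == stable_rag_item_id(
    321, 20240102000000, "snpt_topn"
)
```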
file/tableschema/table_snippet.sql (new file)
@@ -0,0 +1,40 @@

CREATE TABLE `action_results` (
  `id` bigint NOT NULL AUTO_INCREMENT COMMENT 'Primary key',
  `table_id` bigint NOT NULL COMMENT 'Table ID',
  `version_ts` bigint NOT NULL COMMENT 'Version timestamp (version number)',
  `action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT 'Action type',
  `status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT 'Execution status',
  `model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'Model name',
  `model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'Model provider/channel',
  `model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'Model parameters, e.g. temperature',
  `llm_usage` json DEFAULT NULL COMMENT 'LLM token usage stats',
  `error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
  `error_message` text COLLATE utf8mb4_bin,
  `started_at` datetime DEFAULT NULL,
  `finished_at` datetime DEFAULT NULL,
  `duration_ms` int DEFAULT NULL,
  `table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
  `table_schema` json NOT NULL,
  `ge_profiling_json` json DEFAULT NULL COMMENT 'Full profiling result JSON',
  `ge_profiling_json_size_bytes` bigint DEFAULT NULL,
  `ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling summary (large value_sets etc. removed)',
  `ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
  `ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT 'Sum of the two sizes above',
  `ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE HTML report path/URL',
  `ge_result_desc_json` json DEFAULT NULL COMMENT 'Table description result JSON',
  `ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
  `snippet_json` json DEFAULT NULL COMMENT 'SQL knowledge snippet result JSON',
  `snippet_json_size_bytes` bigint DEFAULT NULL,
  `snippet_alias_json` json DEFAULT NULL COMMENT 'SQL snippet rewrite/enrichment result JSON',
  `snippet_alias_json_size_bytes` bigint DEFAULT NULL,
  `callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
  `result_checksum` varbinary(32) DEFAULT NULL COMMENT 'MD5/xxhash over the current action payload',
  `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`),
  UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
  KEY `idx_status` (`status`),
  KEY `idx_table` (`table_id`,`updated_at`),
  KEY `idx_action_time` (`action_type`,`version_ts`),
  KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=53 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='Data analysis knowledge snippet table';
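Because of `uq_table_ver_action`, exactly one row exists per (table_id, version_ts, action_type), so writes can be idempotent upserts. A reduced sketch of the INSERT ... ON DUPLICATE KEY UPDATE pattern the project notes describe for table_snippet.py (the column subset and helper name are illustrative, not the service's actual code):

```python
from sqlalchemy import text
from sqlalchemy.engine import Engine

UPSERT_SQL = text(
    """
    INSERT INTO action_results
        (table_id, version_ts, action_type, status,
         callback_url, table_schema_version_id, table_schema, snippet_json)
    VALUES
        (:table_id, :version_ts, :action_type, :status,
         :callback_url, :table_schema_version_id, :table_schema, :snippet_json)
    ON DUPLICATE KEY UPDATE
        status = VALUES(status),
        snippet_json = VALUES(snippet_json)
    """
)


def upsert_snippet_result(engine: Engine, params: dict) -> bool:
    """Upsert one action row; True means an existing record was updated.

    MySQL reports 1 affected row for a fresh insert and 2 for a row changed
    via ON DUPLICATE KEY UPDATE, which is how the caller can tell whether
    the (table_id, version_ts, action_type) record already existed.
    """
    with engine.begin() as conn:
        result = conn.execute(UPSERT_SQL, params)
        return result.rowcount == 2
```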
main.py (new file)
@@ -0,0 +1,6 @@

def main():
    print("Hello from data-ge-new!")


if __name__ == "__main__":
    main()
project.md (new file)
@@ -0,0 +1,23 @@

Project structure and logic

app/main.py: creates the FastAPI app and its lifespan, initializes the shared httpx.AsyncClient and LLMGateway, and, behind unified exception handling, exposes four endpoints: chat proxy, import analysis, table profiling pipeline, and table snippet persistence.
app/models.py: defines all request/response models and enums (LLM requests, import analysis jobs, table profiling jobs, snippet persistence, etc.) with field validation and defaults.
app/services: core business logic
gateway.py forwards /v1/chat/completions requests to NEW_API_BASE_URL (with an optional Bearer token) and normalizes the response.
import_analysis.py assembles the import prompt (prompt/data_import_analysis.md), parses/truncates samples, calls the unified chat endpoint, extracts the JSON result and token usage, and finally calls back the business side.
table_profiling.py runs a four-step pipeline serially: Great Expectations profiling → LLM result description (prompt/ge_result_desc_prompt.md) → snippet generation (prompt/snippet_generator.md) → snippet aliases (prompt/snippet_alias_generator.md), posting a status/result callback after every step.
table_snippet.py upserts each step's results into the database table, auto-serializing JSON/size info and building INSERT ... ON DUPLICATE KEY UPDATE statements.
app/providers/*: direct clients for each vendor (OpenAI/Anthropic/OpenRouter/Gemini/Qwen/DeepSeek) implementing the unified chat interface; the main flow currently forwards through new-api, but direct access is kept.
prompt/ holds the prompt templates; scripts/ and test/ provide API call examples and regression samples; table_snippet.sql gives the action_results schema (used to persist snippets and profiling results).

Feature/requirement notes

LLM gateway: POST /v1/chat/completions accepts an LLMRequest (provider+model+messages, etc.) and passes the payload through to NEW_API_BASE_URL/v1/chat/completions with optional NEW_API_AUTH_TOKEN auth; on failure it returns 4xx/5xx and logs the raw response.
Import analysis (async): POST /v1/import/analyze accepts an import sample (rows/headers/raw_csv/table_schema), a target llm_model (default DEFAULT_IMPORT_MODEL, optionally restricted by the IMPORT_SUPPORTED_MODELS whitelist), a temperature, and a callback URL. The service converts the sample to CSV, appends the schema, concatenates system+user messages, calls the unified chat endpoint, parses the JSON in the first choice as the analysis result, and returns it together with the LLM usage via callback; on failure it calls back with status=failed and the error message.
Table profiling pipeline (async): POST /v1/table/profiling accepts a table identifier, version number, callback URL, and GE/LLM configuration (datasource/batch_request, connection-string template, LLM model and timeout). The pipeline runs, in order:
Great Expectations profiling (profiler type, datasource, and runtime SQL query/table are configurable), producing full and summary JSON plus the Data Docs path;
a chat call that generates the GE result description JSON;
generation of the SQL snippet array from that description;
generation of snippet aliases/keywords.
Every step calls back on success or failure with a payload containing the action_type, result JSON, model, llm_usage, error info, and so on.
Snippet persistence: POST /v1/table/snippet accepts a TableSnippetUpsertRequest (table/version, action type, status, schema, model info, per-stage JSON and sizes, error codes, timestamps, etc.), maps it onto the action_results table as an UPSERT, and reports whether an existing record was updated.
Configuration and runtime: core environment variables live in app/settings.py (API keys, DEFAULT_IMPORT_MODEL, IMPORT_GATEWAY_BASE_URL/NEW_API_BASE_URL, model whitelist, database URL, etc.); logging is configured via logging.yaml and auto-creates logs/; HTTP client timeout/proxy are controlled through HTTP_CLIENT_TIMEOUT, HTTP_CLIENT_TRUST_ENV, HTTP_CLIENT_PROXY. For debugging run uvicorn app.main:app --reload; Docker support comes from Dockerfile/docker-compose.yml.
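The gateway described above is a thin pass-through. A minimal sketch of that forwarding step, assuming httpx and FastAPI as used elsewhere in the project (the handler body is illustrative, not the project's actual gateway.py, which reuses a shared AsyncClient from the app lifespan):

```python
import os

import httpx
from fastapi import FastAPI, HTTPException, Request

app = FastAPI()

NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://localhost:3000")
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN")


@app.post("/v1/chat/completions")
async def chat_completions(request: Request) -> dict:
    payload = await request.json()
    headers = {"Authorization": f"Bearer {NEW_API_AUTH_TOKEN}"} if NEW_API_AUTH_TOKEN else {}
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(
            f"{NEW_API_BASE_URL}/v1/chat/completions",
            json=payload,
            headers=headers,
        )
    if resp.status_code >= 400:
        # Surface upstream failures as-is; the real service also logs resp.text.
        raise HTTPException(status_code=resp.status_code, detail=resp.text)
    return resp.json()
```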
pyproject.toml (new file)
@@ -0,0 +1,21 @@

[project]
name = "data-ge-new"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "fastapi>=0.111.0",
    "uvicorn[standard]>=0.29.0",
    "pydantic>=2.6.0",
    "sqlalchemy>=2.0.28",
    "pymysql>=1.1.0",
    "great-expectations[profilers]==0.18.19",
    "pandas>=2.0",
    "numpy>=1.24",
    "openpyxl>=3.1",
    "httpx==0.27.2",
    "python-dotenv==1.0.1",
    "requests>=2.31.0",
    "PyYAML>=6.0.1",
]
(deleted file)
@@ -1,13 +0,0 @@

fastapi>=0.111.0
uvicorn[standard]>=0.29.0
pydantic>=2.6.0
sqlalchemy>=2.0.28
pymysql>=1.1.0
great_expectations>=0.18.0,<0.19.0
pandas>=2.0
numpy>=1.24
openpyxl>=3.1
httpx==0.27.2
python-dotenv==1.0.1
requests>=2.31.0
PyYAML>=6.0.1
(deleted file)
@@ -1,54 +0,0 @@

CREATE TABLE IF NOT EXISTS action_results (
    id BIGINT NOT NULL AUTO_INCREMENT COMMENT 'Primary key',
    table_id BIGINT NOT NULL COMMENT 'Table ID',
    version_ts BIGINT NOT NULL COMMENT 'Version timestamp (version number)',
    action_type ENUM('ge_profiling','ge_result_desc','snippet','snippet_alias') NOT NULL COMMENT 'Action type',

    status ENUM('pending','running','success','failed','partial') NOT NULL DEFAULT 'pending' COMMENT 'Execution status',
    error_code VARCHAR(128) NULL,
    error_message TEXT NULL,

    -- Callbacks & observability
    callback_url VARCHAR(1024) NOT NULL,
    started_at DATETIME NULL,
    finished_at DATETIME NULL,
    duration_ms INT NULL,

    -- Schema info for this run
    table_schema_version_id BIGINT NOT NULL,
    table_schema JSON NOT NULL,

    -- ===== Action 1: GE Profiling =====
    ge_profiling_full JSON NULL COMMENT 'Full profiling result JSON',
    ge_profiling_full_size_bytes BIGINT NULL,
    ge_profiling_summary JSON NULL COMMENT 'Profiling summary (large value_sets etc. removed)',
    ge_profiling_summary_size_bytes BIGINT NULL,
    ge_profiling_total_size_bytes BIGINT NULL COMMENT 'Sum of the two sizes above',
    ge_profiling_html_report_url VARCHAR(1024) NULL COMMENT 'GE HTML report path/URL',

    -- ===== Action 2: GE Result Desc =====
    ge_result_desc_full JSON NULL COMMENT 'Table description result JSON',
    ge_result_desc_full_size_bytes BIGINT NULL,

    -- ===== Action 3: Snippet generation =====
    snippet_full JSON NULL COMMENT 'SQL knowledge snippet result JSON',
    snippet_full_size_bytes BIGINT NULL,

    -- ===== Action 4: Snippet alias rewrite =====
    snippet_alias_full JSON NULL COMMENT 'SQL snippet rewrite/enrichment result JSON',
    snippet_alias_full_size_bytes BIGINT NULL,

    -- Generic optional fields
    result_checksum VARBINARY(32) NULL COMMENT 'MD5/xxhash over the current action payload',
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    PRIMARY KEY (id),
    UNIQUE KEY uq_table_ver_action (table_id, version_ts, action_type),
    KEY idx_status (status),
    KEY idx_table (table_id, updated_at),
    KEY idx_action_time (action_type, version_ts),
    KEY idx_schema_version (table_schema_version_id)
) ENGINE=InnoDB
ROW_FORMAT=DYNAMIC
COMMENT='Data analysis knowledge snippet table';
test/test_chat_api_mysql.py (new file)
@@ -0,0 +1,142 @@

from __future__ import annotations

import os
import random
import sys
from pathlib import Path
from typing import Generator

import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from app import db
from app.main import create_app


TEST_USER_ID = 872341
SCHEMA_PATH = Path("file/tableschema/chat.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"


@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
    mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
    os.environ["DATABASE_URL"] = mysql_url
    db.get_engine.cache_clear()
    engine = db.get_engine()
    try:
        # Quick connectivity check
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
    except SQLAlchemyError:
        pytest.skip(f"Cannot connect to MySQL at {mysql_url}")

    #_ensure_chat_schema(engine)

    app = create_app()
    with TestClient(app) as test_client:
        yield test_client

    # cleanup test artifacts
    with engine.begin() as conn:
        # remove retrievals and turns tied to test sessions
        conn.execute(
            text(
                """
                DELETE FROM chat_turn_retrieval
                WHERE turn_id IN (
                    SELECT id FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)
                )
                """
            ),
            {"uid": TEST_USER_ID},
        )
        conn.execute(
            text("DELETE FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)"),
            {"uid": TEST_USER_ID},
        )
        conn.execute(text("DELETE FROM chat_session WHERE user_id=:uid"), {"uid": TEST_USER_ID})
    db.get_engine.cache_clear()


def test_session_lifecycle_mysql(client: TestClient) -> None:
    # Create a session
    resp = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID})
    assert resp.status_code == 200, resp.text
    session = resp.json()
    session_id = session["id"]
    assert session["status"] == "OPEN"

    # Get session
    assert client.get(f"/api/v1/chat/sessions/{session_id}").status_code == 200

    # List sessions (filter by user)
    resp = client.get("/api/v1/chat/sessions", params={"user_id": TEST_USER_ID})
    assert resp.status_code == 200
    assert any(item["id"] == session_id for item in resp.json())

    # Update status
    resp = client.post(f"/api/v1/chat/sessions/{session_id}/update", json={"status": "PAUSED"})
    assert resp.status_code == 200
    assert resp.json()["status"] == "PAUSED"

    # Close session
    resp = client.post(f"/api/v1/chat/sessions/{session_id}/close")
    assert resp.status_code == 200
    assert resp.json()["status"] == "CLOSED"


def test_turns_and_retrievals_mysql(client: TestClient) -> None:
    session_id = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID}).json()["id"]
    turn_payload = {
        "user_id": TEST_USER_ID,
        "user_query": "展示昨天订单GMV",
        "intent": "METRIC_QUERY",
        "ast_json": {"select": ["gmv"], "where": {"dt": "yesterday"}},
        "main_metric_ids": [random.randint(1000, 9999)],
        "created_metric_ids": [],
    }
    resp = client.post(f"/api/v1/chat/sessions/{session_id}/turns", json=turn_payload)
    assert resp.status_code == 200, resp.text
    turn = resp.json()
    turn_id = turn["id"]
    assert turn["turn_no"] == 1

    # Fetch turn
    assert client.get(f"/api/v1/chat/turns/{turn_id}").status_code == 200

    # List turns under session
    resp = client.get(f"/api/v1/chat/sessions/{session_id}/turns")
    assert resp.status_code == 200
    assert any(t["id"] == turn_id for t in resp.json())

    # Insert retrievals
    retrievals_payload = {
        "retrievals": [
            {"item_type": "METRIC", "item_id": "metric_foo", "used_in_sql": True, "rank_no": 1},
            {"item_type": "SNIPPET", "item_id": "snpt_bar", "similarity_score": 0.77, "rank_no": 2},
        ]
    }
    resp = client.post(f"/api/v1/chat/turns/{turn_id}/retrievals", json=retrievals_payload)
    assert resp.status_code == 200
    assert resp.json()["inserted"] == 2

    # List retrievals
    resp = client.get(f"/api/v1/chat/turns/{turn_id}/retrievals")
    assert resp.status_code == 200
    items = resp.json()
    assert len(items) == 2
    assert {item["item_type"] for item in items} == {"METRIC", "SNIPPET"}


if __name__ == "__main__":
    import pytest as _pytest

    raise SystemExit(_pytest.main([__file__]))
test/test_metrics_api_mysql.py (new file)
@@ -0,0 +1,207 @@

from __future__ import annotations

import os
import random
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Generator, List

import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

# Ensure project root on path for direct execution
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from app import db
from app.main import create_app


TEST_USER_ID = 98765
#SCHEMA_PATH = Path("file/tableschema/metrics.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"


# def _run_sql_script(engine, sql_text: str) -> None:
#     """Execute semicolon-terminated SQL statements sequentially."""
#     statements: List[str] = []
#     buffer: List[str] = []
#     for line in sql_text.splitlines():
#         stripped = line.strip()
#         if not stripped or stripped.startswith("--"):
#             continue
#         buffer.append(line)
#         if stripped.endswith(";"):
#             statements.append("\n".join(buffer).rstrip(";"))
#             buffer = []
#     if buffer:
#         statements.append("\n".join(buffer))
#     with engine.begin() as conn:
#         for stmt in statements:
#             conn.execute(text(stmt))


# def _ensure_metric_schema(engine) -> None:
#     if not SCHEMA_PATH.exists():
#         pytest.skip("metrics.sql schema file not found.")
#     raw_sql = SCHEMA_PATH.read_text(encoding="utf-8")
#     raw_sql = raw_sql.replace("CREATE TABLE metric_def", "CREATE TABLE IF NOT EXISTS metric_def")
#     raw_sql = raw_sql.replace("CREATE TABLE metric_schedule", "CREATE TABLE IF NOT EXISTS metric_schedule")
#     raw_sql = raw_sql.replace("CREATE TABLE metric_job_run", "CREATE TABLE IF NOT EXISTS metric_job_run")
#     raw_sql = raw_sql.replace("CREATE TABLE metric_result", "CREATE TABLE IF NOT EXISTS metric_result")
#     _run_sql_script(engine, raw_sql)


@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
    mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
    os.environ["DATABASE_URL"] = mysql_url
    db.get_engine.cache_clear()
    engine = db.get_engine()
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
    except SQLAlchemyError:
        pytest.skip(f"Cannot connect to MySQL at {mysql_url}")

    #_ensure_metric_schema(engine)

    app = create_app()
    with TestClient(app) as test_client:
        yield test_client

    # cleanup test artifacts
    with engine.begin() as conn:
        conn.execute(text("DELETE FROM metric_result WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
        conn.execute(text("DELETE FROM metric_job_run WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
        conn.execute(text("DELETE FROM metric_schedule WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
        conn.execute(text("DELETE FROM metric_def WHERE created_by=:uid"), {"uid": TEST_USER_ID})
    db.get_engine.cache_clear()


def test_metric_crud_and_schedule_mysql(client: TestClient) -> None:
    code = f"metric_{random.randint(1000, 9999)}"
    create_payload = {
        "metric_code": code,
        "metric_name": "订单数",
        "biz_domain": "order",
        "biz_desc": "订单总数",
        "base_sql": "select count(*) as order_cnt from orders",
        "time_grain": "DAY",
        "dim_binding": ["dt"],
        "update_strategy": "FULL",
        "metric_aliases": ["订单量"],
        "created_by": TEST_USER_ID,
    }
    resp = client.post("/api/v1/metrics", json=create_payload)
    assert resp.status_code == 200, resp.text
    metric = resp.json()
    metric_id = metric["id"]
    assert metric["metric_code"] == code

    # Update metric
    resp = client.post(f"/api/v1/metrics/{metric_id}", json={"metric_name": "订单数-更新", "is_active": False})
    assert resp.status_code == 200
    assert resp.json()["is_active"] is False

    # Get metric
    resp = client.get(f"/api/v1/metrics/{metric_id}")
    assert resp.status_code == 200
    assert resp.json()["metric_name"] == "订单数-更新"

    # Create schedule
    resp = client.post(
        "/api/v1/metric-schedules",
        json={"metric_id": metric_id, "cron_expr": "0 2 * * *", "priority": 5, "enabled": True},
    )
    assert resp.status_code == 200, resp.text
    schedule = resp.json()
    schedule_id = schedule["id"]

    # Update schedule
    resp = client.post(f"/api/v1/metric-schedules/{schedule_id}", json={"enabled": False, "retry_times": 1})
    assert resp.status_code == 200
    assert resp.json()["enabled"] is False

    # List schedules for metric
    resp = client.get(f"/api/v1/metrics/{metric_id}/schedules")
    assert resp.status_code == 200
    assert any(s["id"] == schedule_id for s in resp.json())


def test_metric_runs_and_results_mysql(client: TestClient) -> None:
    code = f"gmv_{random.randint(1000, 9999)}"
    metric_id = client.post(
        "/api/v1/metrics",
        json={
            "metric_code": code,
            "metric_name": "GMV",
            "biz_domain": "order",
            "base_sql": "select sum(pay_amount) as gmv from orders",
            "time_grain": "DAY",
            "dim_binding": ["dt"],
            "update_strategy": "FULL",
            "created_by": TEST_USER_ID,
        },
    ).json()["id"]

    # Trigger run
    resp = client.post(
        "/api/v1/metric-runs/trigger",
        json={
            "metric_id": metric_id,
            "triggered_by": "API",
            "data_time_from": (datetime.utcnow() - timedelta(days=1)).isoformat(),
            "data_time_to": datetime.utcnow().isoformat(),
        },
    )
    assert resp.status_code == 200, resp.text
    run = resp.json()
    run_id = run["id"]
    assert run["status"] == "RUNNING"

    # List runs
    resp = client.get("/api/v1/metric-runs", params={"metric_id": metric_id})
    assert resp.status_code == 200
    assert any(r["id"] == run_id for r in resp.json())

    # Get run
    resp = client.get(f"/api/v1/metric-runs/{run_id}")
    assert resp.status_code == 200

    # Write results
    now = datetime.utcnow()
    resp = client.post(
        f"/api/v1/metric-results/{metric_id}",
        json={
            "metric_id": metric_id,
            "results": [
                {"stat_time": (now - timedelta(days=1)).isoformat(), "metric_value": 123.45, "data_version": run_id},
                {"stat_time": now.isoformat(), "metric_value": 234.56, "data_version": run_id},
            ],
        },
    )
    assert resp.status_code == 200, resp.text
    assert resp.json()["inserted"] == 2

    # Query results
    resp = client.get("/api/v1/metric-results", params={"metric_id": metric_id})
    assert resp.status_code == 200
    results = resp.json()
    assert len(results) >= 2

    # Latest result
    resp = client.get("/api/v1/metric-results/latest", params={"metric_id": metric_id})
    assert resp.status_code == 200
    latest = resp.json()
    assert float(latest["metric_value"]) in {123.45, 234.56}


if __name__ == "__main__":
    import pytest as _pytest

    raise SystemExit(_pytest.main([__file__]))
test/test_rag_client.py (new file)
@@ -0,0 +1,91 @@

from __future__ import annotations

import json

import httpx
import pytest

from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.services.rag_client import RagAPIClient


@pytest.mark.asyncio
async def test_add_sends_payload_and_headers() -> None:
    rag_client = RagAPIClient(base_url="http://rag.test", auth_token="secret-token")

    def handler(request: httpx.Request) -> httpx.Response:
        assert request.method == "POST"
        assert str(request.url) == "http://rag.test/rag/add"
        assert request.headers["Authorization"] == "Bearer secret-token"
        payload = json.loads(request.content.decode())
        assert payload == {
            "id": 1,
            "workspaceId": 2,
            "name": "demo",
            "embeddingData": "vector",
            "type": "METRIC",
        }
        return httpx.Response(200, json={"ok": True, "echo": payload})

    transport = httpx.MockTransport(handler)
    async with httpx.AsyncClient(transport=transport) as client:
        result = await rag_client.add(
            client,
            RagItemPayload(id=1, workspaceId=2, name="demo", embeddingData="vector", type="METRIC"),
        )
    assert result["ok"] is True
    assert result["echo"]["name"] == "demo"


@pytest.mark.asyncio
async def test_add_batch_serializes_list() -> None:
    rag_client = RagAPIClient(base_url="http://rag.test", auth_token=None)

    def handler(request: httpx.Request) -> httpx.Response:
        payload = json.loads(request.content.decode())
        assert request.url.path == "/rag/addBatch"
        assert isinstance(payload, list) and len(payload) == 2
        return httpx.Response(200, json={"received": len(payload)})

    items = [
        RagItemPayload(id=1, workspaceId=2, name="a", embeddingData="vec-a", type="METRIC"),
        RagItemPayload(id=2, workspaceId=2, name="b", embeddingData="vec-b", type="METRIC"),
    ]
    transport = httpx.MockTransport(handler)
    async with httpx.AsyncClient(transport=transport) as client:
        result = await rag_client.add_batch(client, items)
    assert result == {"received": 2}


@pytest.mark.asyncio
async def test_http_error_raises_provider_error() -> None:
    rag_client = RagAPIClient(base_url="http://rag.test")

    def handler(request: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="boom")

    transport = httpx.MockTransport(handler)
    async with httpx.AsyncClient(transport=transport) as client:
        with pytest.raises(ProviderAPICallError) as excinfo:
            await rag_client.delete(client, RagDeleteRequest(id=1, type="METRIC"))

    err = excinfo.value
    assert err.status_code == 500
    assert "boom" in (err.response_text or "")


@pytest.mark.asyncio
async def test_non_json_response_returns_raw_text() -> None:
    rag_client = RagAPIClient(base_url="http://rag.test")

    def handler(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text="plain-text-body")

    transport = httpx.MockTransport(handler)
    async with httpx.AsyncClient(transport=transport) as client:
        result = await rag_client.retrieve(
            client, RagRetrieveRequest(query="foo", num=1, workspaceId=1, type="METRIC")
        )
    assert result == {"raw": "plain-text-body"}
test/test_snippet_rag_ingest.py (new file)
@@ -0,0 +1,157 @@

from __future__ import annotations

import json
from datetime import datetime

import httpx
import pytest
from sqlalchemy import create_engine, text

from app.services.table_snippet import ingest_snippet_rag_from_db


def _setup_sqlite_engine():
    engine = create_engine("sqlite://")
    with engine.begin() as conn:
        conn.execute(
            text(
                """
                CREATE TABLE action_results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    table_id INTEGER,
                    version_ts INTEGER,
                    action_type TEXT,
                    status TEXT,
                    snippet_json TEXT,
                    snippet_alias_json TEXT,
                    updated_at TEXT
                )
                """
            )
        )
        conn.execute(
            text(
                """
                CREATE TABLE rag_snippet (
                    rag_item_id INTEGER PRIMARY KEY,
                    action_result_id INTEGER NOT NULL,
                    workspace_id INTEGER,
                    table_id INTEGER,
                    version_ts INTEGER,
                    created_at TEXT,
                    snippet_id TEXT,
                    rag_text TEXT,
                    merged_json TEXT,
                    updated_at TEXT
                )
                """
            )
        )
    return engine


def _insert_action_row(engine, payload: dict) -> None:
    with engine.begin() as conn:
        conn.execute(
            text(
                """
                INSERT INTO action_results (table_id, version_ts, action_type, status, snippet_json, snippet_alias_json, updated_at)
                VALUES (:table_id, :version_ts, :action_type, :status, :snippet_json, :snippet_alias_json, :updated_at)
                """
            ),
            {
                "table_id": payload["table_id"],
                "version_ts": payload["version_ts"],
                "action_type": payload["action_type"],
                "status": payload.get("status", "success"),
                "snippet_json": json.dumps(payload.get("snippet_json"), ensure_ascii=False)
                if payload.get("snippet_json") is not None
                else None,
                "snippet_alias_json": json.dumps(payload.get("snippet_alias_json"), ensure_ascii=False)
                if payload.get("snippet_alias_json") is not None
                else None,
                "updated_at": payload.get("updated_at") or datetime.utcnow().isoformat(),
            },
        )


class _StubRagClient:
    def __init__(self) -> None:
        self.received = None

    async def add_batch(self, _client, items):
        self.received = items
        return {"count": len(items)}


@pytest.mark.asyncio
async def test_ingest_snippet_rag_from_db_persists_and_calls_rag_client() -> None:
    engine = _setup_sqlite_engine()
    table_id = 321
    version_ts = 20240102000000

    snippet_payload = [
        {
            "id": "snpt_topn",
            "title": "TopN",
            "aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
            "keywords": ["TopN", "站点"],
        }
    ]
    alias_payload = [
        {
            "id": "snpt_topn",
            "aliases": [
                {"text": "站点水表排行前N", "tone": "中性"},
                {"text": "按站点水表TopN", "tone": "专业"},
            ],
            "keywords": ["TopN", "排行"],
            "intent_tags": ["topn", "aggregate"],
        },
        {
            "id": "snpt_extra",
            "aliases": [{"text": "额外别名"}],
            "keywords": ["extra"],
        },
    ]

    _insert_action_row(
        engine,
        {
            "table_id": table_id,
            "version_ts": version_ts,
            "action_type": "snippet_alias",
            "snippet_json": snippet_payload,
            "snippet_alias_json": alias_payload,
            "updated_at": "2024-01-02T00:00:00",
        },
    )

    rag_stub = _StubRagClient()
    async with httpx.AsyncClient() as client:
        rag_ids = await ingest_snippet_rag_from_db(
            table_id=table_id,
            version_ts=version_ts,
            workspace_id=99,
            rag_item_type="SNIPPET",
            client=client,
            engine=engine,
            rag_client=rag_stub,
        )

    assert rag_stub.received is not None
    assert len(rag_stub.received) == 2  # includes the alias-only row
    assert len(rag_ids) == 2

    with engine.connect() as conn:
        rows = list(
            conn.execute(
                text("SELECT snippet_id, action_result_id, rag_text, merged_json FROM rag_snippet ORDER BY snippet_id")
            )
        )
    assert {row[0] for row in rows} == {"snpt_extra", "snpt_topn"}
    assert all(row[1] is not None for row in rows)
    topn_row = next(row for row in rows if row[0] == "snpt_topn")
    assert "TopN" in topn_row[2]
    assert "按站点水表TopN" in topn_row[2]
    assert "排行" in topn_row[2]
test/test_table_profiling_parsing.py (new file)
@@ -0,0 +1,74 @@

from __future__ import annotations

from app.services.table_profiling import _parse_completion_payload
from app.utils.llm_usage import extract_usage


def test_parse_completion_payload_handles_array_with_trailing_text() -> None:
    response_payload = {
        "choices": [
            {
                "message": {
                    "content": """
结果如下:
[
  {"id": "snpt_a"},
  {"id": "snpt_b"}
]
附加说明:模型可能会输出额外文本。
""".strip()
                }
            }
        ]
    }

    parsed = _parse_completion_payload(response_payload)

    assert isinstance(parsed, list)
    assert [item["id"] for item in parsed] == ["snpt_a", "snpt_b"]


def test_extract_usage_info_normalizes_numeric_fields() -> None:
    response_payload = {
        "raw": {
            "usage": {
                "prompt_tokens": 12.7,
                "completion_tokens": 3,
                "total_tokens": 15.7,
                "prompt_tokens_details": {"cached_tokens": 8.9, "other": None},
                "non_numeric": "ignored",
            }
        }
    }

    usage = extract_usage(response_payload)

    assert usage == {
        "prompt_tokens": 12,
        "completion_tokens": 3,
        "total_tokens": 15,
        "prompt_tokens_details": {"cached_tokens": 8},
    }


def test_extract_usage_handles_alias_keys() -> None:
    response_payload = {
        "raw": {
            "usageMetadata": {
                "input_tokens": 20,
                "output_tokens": 4,
            }
        }
    }

    usage = extract_usage(response_payload)

    assert usage == {
        "prompt_tokens": 20,
        "completion_tokens": 4,
        "total_tokens": 24,
    }


def test_extract_usage_returns_none_when_missing() -> None:
    assert extract_usage({"raw": {}}) is None
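The first test above pins down that `_parse_completion_payload` must recover a JSON array even when the model wraps it in leading and trailing prose. The actual implementation is not part of this diff; a minimal sketch of one way to do it, assuming a bracket-balancing scan (string literals containing brackets are ignored here for brevity):

```python
import json
from typing import Any, Optional


def extract_first_json_array(content: str) -> Optional[Any]:
    """Return the first parseable JSON array embedded in free-form text."""
    start = content.find("[")
    while start != -1:
        depth = 0
        for end in range(start, len(content)):
            if content[end] == "[":
                depth += 1
            elif content[end] == "]":
                depth -= 1
                if depth == 0:
                    try:
                        return json.loads(content[start : end + 1])
                    except json.JSONDecodeError:
                        break  # balanced but not valid JSON; try the next '['
        start = content.find("[", start + 1)
    return None


sample = '结果如下:\n[\n  {"id": "snpt_a"},\n  {"id": "snpt_b"}\n]\n附加说明。'
assert [item["id"] for item in extract_first_json_array(sample)] == ["snpt_a", "snpt_b"]
```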
test/test_table_snippet_merge.py (new file)
@@ -0,0 +1,213 @@

from __future__ import annotations

import json
import os
import random
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import List

import pytest
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError

# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from app import db
from app.services.table_snippet import merge_snippet_records_from_db


DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"


@pytest.fixture()
def mysql_engine() -> Engine:
    mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
    os.environ["DATABASE_URL"] = mysql_url
    db.get_engine.cache_clear()
    engine = db.get_engine()
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
            exists = conn.execute(text("SHOW TABLES LIKE 'action_results'")).scalar()
            if not exists:
                pytest.skip("action_results table not found in test database.")
    except SQLAlchemyError:
        pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
    return engine


def _insert_action_row(
    engine: Engine,
    *,
    table_id: int,
    version_ts: int,
    action_type: str,
    status: str = "success",
    snippet_json: List[dict] | None = None,
    snippet_alias_json: List[dict] | None = None,
    updated_at: datetime | None = None,
) -> None:
    snippet_json_str = json.dumps(snippet_json, ensure_ascii=False) if snippet_json is not None else None
    snippet_alias_json_str = (
        json.dumps(snippet_alias_json, ensure_ascii=False) if snippet_alias_json is not None else None
    )
    with engine.begin() as conn:
        conn.execute(
            text(
                """
                INSERT INTO action_results (
                    table_id, version_ts, action_type, status,
                    callback_url, table_schema_version_id, table_schema,
                    snippet_json, snippet_alias_json, updated_at
                ) VALUES (
                    :table_id, :version_ts, :action_type, :status,
                    :callback_url, :table_schema_version_id, :table_schema,
                    :snippet_json, :snippet_alias_json, :updated_at
                )
                ON DUPLICATE KEY UPDATE
                    status=VALUES(status),
                    snippet_json=VALUES(snippet_json),
                    snippet_alias_json=VALUES(snippet_alias_json),
                    updated_at=VALUES(updated_at)
                """
            ),
            {
                "table_id": table_id,
                "version_ts": version_ts,
                "action_type": action_type,
                "status": status,
                "callback_url": "http://localhost/test-callback",
                "table_schema_version_id": "1",
                "table_schema": json.dumps({}, ensure_ascii=False),
                "snippet_json": snippet_json_str,
                "snippet_alias_json": snippet_alias_json_str,
                "updated_at": updated_at or datetime.utcnow(),
            },
        )


def _cleanup(engine: Engine, table_id: int, version_ts: int) -> None:
    with engine.begin() as conn:
        conn.execute(
            text("DELETE FROM action_results WHERE table_id=:table_id AND version_ts=:version_ts"),
            {"table_id": table_id, "version_ts": version_ts},
        )


def test_merge_prefers_alias_row_and_appends_alias_only_entries(mysql_engine: Engine) -> None:
    table_id = 990000000 + random.randint(1, 9999)
    version_ts = int(datetime.utcnow().strftime("%Y%m%d%H%M%S"))
    alias_updated = datetime(2024, 1, 2, 0, 0, 0)

    snippet_payload = [
        {
            "id": "snpt_topn",
            "aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
            "keywords": ["TopN", "站点"],
        }
    ]
    alias_payload = [
        {
            "id": "snpt_topn",
            "aliases": [
                {"text": "站点水表排行前N", "tone": "中性"},
                {"text": "按站点水表TopN", "tone": "专业"},
            ],
            "keywords": ["TopN", "排行"],
            "intent_tags": ["topn", "aggregate"],
        },
        {
            "id": "snpt_extra",
            "aliases": [{"text": "额外别名"}],
            "keywords": ["extra"],
        },
    ]

    _insert_action_row(
        mysql_engine,
        table_id=table_id,
        version_ts=version_ts,
        action_type="snippet_alias",
        snippet_json=snippet_payload,
        snippet_alias_json=alias_payload,
        updated_at=alias_updated,
    )

    try:
        merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
        assert len(merged) == 2
        topn = next(item for item in merged if item["id"] == "snpt_topn")
        assert topn["source"] == "snippet"
        assert topn["updated_at_from_action"] == alias_updated
        assert {a["text"] for a in topn["aliases"]} == {"站点水表排行前N", "按站点水表TopN"}
        assert set(topn["keywords"]) == {"TopN", "站点", "排行"}
        assert set(topn["intent_tags"]) == {"topn", "aggregate"}

        alias_only = next(item for item in merged if item["source"] == "alias_only")
        assert alias_only["id"] == "snpt_extra"
        assert alias_only["aliases"][0]["text"] == "额外别名"
    finally:
        _cleanup(mysql_engine, table_id, version_ts)


def test_merge_falls_back_to_snippet_row_when_alias_row_missing_snippet_json(mysql_engine: Engine) -> None:
    table_id = 991000000 + random.randint(1, 9999)
    version_ts = int((datetime.utcnow() + timedelta(seconds=1)).strftime("%Y%m%d%H%M%S"))

    alias_updated = datetime(2024, 1, 3, 0, 0, 0)
    alias_payload = [
        {
            "id": "snpt_quality",
            "aliases": [{"text": "质量检查"}],
            "keywords": ["quality"],
        }
    ]
    snippet_payload = [
        {
            "id": "snpt_quality",
            "title": "质量检查",
            "keywords": ["data-quality"],
            "aliases": [{"text": "质量检查"}],
        }
    ]

    _insert_action_row(
        mysql_engine,
        table_id=table_id,
        version_ts=version_ts,
        action_type="snippet_alias",
        snippet_json=None,
        snippet_alias_json=alias_payload,
        updated_at=alias_updated,
    )
    _insert_action_row(
        mysql_engine,
        table_id=table_id,
        version_ts=version_ts,
        action_type="snippet",
        snippet_json=snippet_payload,
        snippet_alias_json=None,
        updated_at=datetime(2024, 1, 2, 0, 0, 0),
    )

    try:
        merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)

        assert len(merged) == 1
        record = merged[0]
        assert record["id"] == "snpt_quality"
        assert record["source"] == "snippet"
        assert record["updated_at_from_action"] == alias_updated
        assert set(record["keywords"]) == {"data-quality", "quality"}
        assert {a["text"] for a in record["aliases"]} == {"质量检查"}
    finally:
        _cleanup(mysql_engine, table_id, version_ts)
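Taken together, the two tests above fix the merge contract: prefer the alias row's snippet list, fall back to the snippet row when the alias row lacks snippet_json, union aliases/keywords from the alias entries, and append alias-only ids with source='alias_only'. A condensed pure-Python sketch of that contract, reduced to the fields the tests assert (the real merge_snippet_records_from_db also reads the rows from the database and tracks updated_at_from_action):

```python
def merge_snippets(snippets: list[dict], aliases: list[dict]) -> list[dict]:
    """Union alias data into base snippets; keep unmatched alias entries."""
    alias_by_id = {entry["id"]: entry for entry in aliases}
    merged: list[dict] = []
    for snippet in snippets:
        record = {**snippet, "source": "snippet"}
        extra = alias_by_id.pop(snippet["id"], None)
        if extra is not None:
            # Deduplicate aliases by text while preserving order.
            seen = {alias["text"] for alias in record.get("aliases", [])}
            record["aliases"] = record.get("aliases", []) + [
                alias for alias in extra.get("aliases", []) if alias["text"] not in seen
            ]
            record["keywords"] = sorted(
                set(record.get("keywords", [])) | set(extra.get("keywords", []))
            )
            if extra.get("intent_tags"):
                record["intent_tags"] = list(extra["intent_tags"])
        merged.append(record)
    # Alias entries that never matched a base snippet become alias-only rows.
    merged.extend({**entry, "source": "alias_only"} for entry in alias_by_id.values())
    return merged
```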
uv.lock (generated, new file)
@@ -0,0 +1,13 @@

version = 1
revision = 1
requires-python = ">=3.11"
resolution-markers = [
    "python_full_version >= '3.14'",
    "python_full_version >= '3.12' and python_full_version < '3.14'",
    "python_full_version < '3.12'",
]

[[package]]
name = "data-ge-new"
version = "0.1.0"
source = { virtual = "." }