Compare commits

...

12 Commits

55 changed files with 6605 additions and 225 deletions

.env

@@ -16,8 +16,11 @@ DEFAULT_IMPORT_MODEL=deepseek:deepseek-chat
 # Service configuration
 IMPORT_GATEWAY_BASE_URL=http://localhost:8000
+# prod nbackend base url
+NBACKEND_BASE_URL=https://chatbi.agentcarrier.cn/chatbi/api
 # HTTP client configuration
-HTTP_CLIENT_TIMEOUT=60
+HTTP_CLIENT_TIMEOUT=120
 HTTP_CLIENT_TRUST_ENV=false
 # HTTP_CLIENT_PROXY=
@@ -27,3 +30,5 @@ IMPORT_CHAT_TIMEOUT_SECONDS=120
 # Logging
 LOG_LEVEL=INFO
 # LOG_FORMAT=%(asctime)s %(levelname)s %(name)s:%(lineno)d %(message)s
+NEW_API_BASE_URL=http://localhost:3000
+NEW_API_AUTH_TOKEN="sk-Q79KGFJRs5Vk9HsfFqoiJk948uLMDhAVe037AeCb31URyWGL"
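A minimal sketch of how these variables might be resolved in `app/settings.py` (the names `NEW_API_BASE_URL` and `NEW_API_AUTH_TOKEN` appear in later hunks; the exact defaults and the module layout shown here are assumptions):

```python
# Illustrative only: one way the settings module could expose the new values.
import os

NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL", "http://localhost:3000")
NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN", "")  # empty string disables the Authorization header
NBACKEND_BASE_URL = os.getenv("NBACKEND_BASE_URL", "")
HTTP_CLIENT_TIMEOUT = float(os.getenv("HTTP_CLIENT_TIMEOUT", "120"))
```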

View File

@@ -2,7 +2,7 @@
 This project exposes a FastAPI-based microservice that provides:
-- A unified chat completions gateway supporting multiple LLM providers (OpenAI, Anthropic, OpenRouter, Gemini, Qwen, DeepSeek, etc.)
+- A unified chat completions gateway that now forwards requests to the internal `new-api` service (default `http://localhost:3000`) while preserving the same client-facing schema.
 - An asynchronous data import analysis pipeline that orchestrates LLM calls to produce structured metadata and processing recommendations
 The following instructions cover environment setup, dependency installation, and running the backend service.
@@ -56,6 +56,7 @@ Copy `.env.example` to `.env` (if provided) or edit `.env` to supply API keys an
 - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY`, etc.
 - `HTTP_CLIENT_TIMEOUT`, `IMPORT_CHAT_TIMEOUT_SECONDS`
 - `LOG_LEVEL`, `LOG_FORMAT` for logging
+- `NEW_API_BASE_URL` (defaults to `http://localhost:3000`) and optional `NEW_API_AUTH_TOKEN` if the new-api component enforces authentication.
 ## Run the Backend Service
@@ -84,4 +85,4 @@ Or use a process manager such as `pm2`, `supervisor`, or systemd for production
 - Run the data import analysis example: `python test/data_import_analysis_example.py`
 - Test the OpenRouter demo: `python test/openrouter_chat_example.py`
 - Send a DeepSeek chat request script: `python scripts/deepseek_request.py`
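A hedged example of exercising the gateway described above: the `/v1/chat/completions` path and the `provider`/`model` fields are visible elsewhere in this diff, while the OpenAI-style `messages` list and the local port are assumptions.

```python
# Send one chat completion through the local gateway (illustrative request shape).
import httpx

payload = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "messages": [{"role": "user", "content": "Say hello"}],
}
with httpx.Client(timeout=120) as client:
    resp = client.post("http://localhost:8000/v1/chat/completions", json=payload)
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])
```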

View File

@@ -16,19 +16,24 @@ from fastapi.responses import JSONResponse
 from app.exceptions import ProviderAPICallError, ProviderConfigurationError
 from app.models import (
+    ActionStatus,
+    ActionType,
     DataImportAnalysisJobAck,
     DataImportAnalysisJobRequest,
     LLMRequest,
     LLMResponse,
     TableProfilingJobAck,
     TableProfilingJobRequest,
+    TableSnippetRagIngestRequest,
+    TableSnippetRagIngestResponse,
     TableSnippetUpsertRequest,
     TableSnippetUpsertResponse,
 )
+from app.routers import chat_router, metrics_router
 from app.services import LLMGateway
 from app.services.import_analysis import process_import_analysis_job
 from app.services.table_profiling import process_table_profiling_job
-from app.services.table_snippet import upsert_action_result
+from app.services.table_snippet import ingest_snippet_rag_from_db, upsert_action_result
 
 
 def _ensure_log_directories(config: dict[str, Any]) -> None:
@@ -135,6 +140,9 @@ def create_app() -> FastAPI:
         version="0.1.0",
         lifespan=lifespan,
     )
+    # Chat/metric management APIs
+    application.include_router(chat_router)
+    application.include_router(metrics_router)
 
     @application.exception_handler(RequestValidationError)
     async def request_validation_exception_handler(
@@ -230,11 +238,12 @@ def create_app() -> FastAPI:
     )
     async def upsert_table_snippet(
         payload: TableSnippetUpsertRequest,
+        client: httpx.AsyncClient = Depends(get_http_client),
     ) -> TableSnippetUpsertResponse:
         request_copy = payload.model_copy(deep=True)
         try:
-            return await asyncio.to_thread(upsert_action_result, request_copy)
+            response = await asyncio.to_thread(upsert_action_result, request_copy)
         except Exception as exc:
             logger.error(
                 "Failed to upsert table snippet: table_id=%s version_ts=%s action_type=%s",
@@ -244,6 +253,62 @@ def create_app() -> FastAPI:
                 exc_info=True,
             )
             raise HTTPException(status_code=500, detail=str(exc)) from exc
+        else:
+            # After snippet_alias is stored, automatically trigger RAG ingest when configured.
+            if (
+                payload.action_type == ActionType.SNIPPET_ALIAS
+                and payload.status == ActionStatus.SUCCESS
+                and payload.rag_workspace_id is not None
+            ):
+                try:
+                    await ingest_snippet_rag_from_db(
+                        table_id=payload.table_id,
+                        version_ts=payload.version_ts,
+                        workspace_id=payload.rag_workspace_id,
+                        rag_item_type=payload.rag_item_type or "SNIPPET",
+                        client=client,
+                    )
+                except Exception:
+                    logger.exception(
+                        "Failed to ingest snippet RAG artifacts after snippet_alias upsert",
+                        extra={
+                            "table_id": payload.table_id,
+                            "version_ts": payload.version_ts,
+                            "workspace_id": payload.rag_workspace_id,
+                        },
+                    )
+        return response
+
+    @application.post(
+        "/v1/table/snippet/rag_ingest",
+        response_model=TableSnippetRagIngestResponse,
+        summary="Merge snippet+alias results from action_results and ingest into RAG.",
+    )
+    async def ingest_snippet_rag(
+        payload: TableSnippetRagIngestRequest,
+        client: httpx.AsyncClient = Depends(get_http_client),
+    ) -> TableSnippetRagIngestResponse:
+        try:
+            rag_item_ids = await ingest_snippet_rag_from_db(
+                table_id=payload.table_id,
+                version_ts=payload.version_ts,
+                workspace_id=payload.workspace_id,
+                rag_item_type=payload.rag_item_type or "SNIPPET",
+                client=client,
+            )
+        except Exception as exc:
+            logger.exception(
+                "Failed to ingest snippet RAG artifacts",
+                extra={
+                    "table_id": payload.table_id,
+                    "version_ts": payload.version_ts,
+                    "workspace_id": payload.workspace_id,
+                },
+            )
+            raise HTTPException(status_code=500, detail=str(exc)) from exc
+        return TableSnippetRagIngestResponse(rag_item_ids=rag_item_ids)
+
     @application.post("/__mock__/import-callback")
     async def mock_import_callback(payload: dict[str, Any]) -> dict[str, str]:
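A hedged example of driving the new manual ingest endpoint; the field names follow `TableSnippetRagIngestRequest` in the models hunk below, while the host/port and the ID values are illustrative assumptions.

```python
# Re-run RAG ingestion for a table version whose snippet/alias results are already stored.
import httpx

req = {
    "table_id": 42,                 # illustrative IDs
    "version_ts": 20240101120000,
    "workspace_id": 7,
    "rag_item_type": "SNIPPET",
}
resp = httpx.post("http://localhost:8000/v1/table/snippet/rag_ingest", json=req, timeout=120)
resp.raise_for_status()
print(resp.json())  # expected shape: {"rag_item_ids": [...]}
```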

View File

@@ -232,6 +232,15 @@ class TableProfilingJobRequest(BaseModel):
         None,
         description="Miscellaneous execution flags applied across pipeline steps.",
     )
+    workspace_id: Optional[int] = Field(
+        None,
+        ge=0,
+        description="Optional workspace identifier forwarded to snippet_alias callback for RAG ingestion.",
+    )
+    rag_item_type: Optional[str] = Field(
+        "SNIPPET",
+        description="Optional RAG item type forwarded to snippet_alias callback.",
+    )
 
 
 class TableProfilingJobAck(BaseModel):
@@ -247,6 +256,16 @@ class TableSnippetUpsertRequest(BaseModel):
         ge=0,
         description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
     )
+    workspace_id: Optional[int] = Field(
+        None,
+        ge=0,
+        description="Optional workspace identifier for RAG ingestion; when provided and action_type=snippet_alias "
+        "with status=success, merged snippets will be written to rag_snippet and pushed to RAG.",
+    )
+    rag_item_type: Optional[str] = Field(
+        "SNIPPET",
+        description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
+    )
     action_type: ActionType = Field(..., description="Pipeline action type for this record.")
     status: ActionStatus = Field(
         ActionStatus.SUCCESS, description="Execution status for the action."
@@ -254,17 +273,57 @@ class TableSnippetUpsertRequest(BaseModel):
     callback_url: HttpUrl = Field(..., description="Callback URL associated with the action run.")
     table_schema_version_id: int = Field(..., ge=0, description="Identifier for the schema snapshot.")
     table_schema: Any = Field(..., description="Schema snapshot payload for the table.")
-    result_json: Optional[Any] = Field(
-        None,
-        description="Primary result payload for the action (e.g., profiling output, snippet array).",
-    )
-    result_summary_json: Optional[Any] = Field(
-        None,
-        description="Optional summary payload (e.g., profiling summary) for the action.",
-    )
-    html_report_url: Optional[str] = Field(
-        None,
-        description="Optional HTML report URL generated by the action.",
-    )
+    model: Optional[str] = Field(
+        None,
+        description="LLM model identifier (can be provider alias) used for this action, when applicable.",
+    )
+    model_provider: Optional[str] = Field(
+        None,
+        description="LLM provider responsible for executing the action's model.",
+    )
+    model_params: Optional[Dict[str, Any]] = Field(
+        None,
+        description="Optional model parameter overrides (e.g., temperature) associated with the action.",
+    )
+    llm_usage: Optional[Any] = Field(
+        None,
+        description="Optional token usage metrics reported by the LLM provider.",
+    )
+    ge_profiling_json: Optional[Any] = Field(
+        None, description="Full GE profiling result payload for the profiling action."
+    )
+    ge_profiling_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the GE profiling result JSON."
+    )
+    ge_profiling_summary: Optional[Any] = Field(
+        None, description="Sanitised GE profiling summary payload."
+    )
+    ge_profiling_summary_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the GE profiling summary JSON."
+    )
+    ge_profiling_total_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Combined size (bytes) of profiling result + summary."
+    )
+    ge_profiling_html_report_url: Optional[str] = Field(
+        None, description="Optional URL to the generated GE profiling HTML report."
+    )
+    ge_result_desc_json: Optional[Any] = Field(
+        None, description="Result JSON for the GE result description action."
+    )
+    ge_result_desc_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the GE result description JSON."
+    )
+    snippet_json: Optional[Any] = Field(
+        None, description="Snippet generation action result JSON."
+    )
+    snippet_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the snippet result JSON."
+    )
+    snippet_alias_json: Optional[Any] = Field(
+        None, description="Snippet alias expansion result JSON."
+    )
+    snippet_alias_json_size_bytes: Optional[int] = Field(
+        None, ge=0, description="Size in bytes of the snippet alias result JSON."
+    )
     error_code: Optional[str] = Field(None, description="Optional error code when status indicates a failure.")
     error_message: Optional[str] = Field(None, description="Optional error message when status indicates a failure.")
@@ -279,6 +338,24 @@ class TableSnippetUpsertRequest(BaseModel):
         ge=0,
         description="Optional execution duration in milliseconds.",
     )
+
+
+class TableSnippetRagIngestRequest(BaseModel):
+    table_id: int = Field(..., ge=1, description="Unique identifier for the table.")
+    version_ts: int = Field(
+        ...,
+        ge=0,
+        description="Version timestamp aligned with the pipeline (yyyyMMddHHmmss as integer).",
+    )
+    workspace_id: int = Field(..., ge=0, description="Workspace id used when pushing snippets to RAG.")
+    rag_item_type: Optional[str] = Field(
+        "SNIPPET",
+        description="Optional RAG item type used when pushing snippets to RAG. Defaults to 'SNIPPET'.",
+    )
+
+
+class TableSnippetRagIngestResponse(BaseModel):
+    rag_item_ids: List[int] = Field(..., description="List of ingested rag_item_ids.")
+
     result_checksum: Optional[str] = Field(
         None,
         description="Optional checksum for the result payload (e.g., MD5).",

app/routers/__init__.py Normal file

@@ -0,0 +1,4 @@
from .chat import router as chat_router
from .metrics import router as metrics_router
__all__ = ["chat_router", "metrics_router"]

app/routers/chat.py Normal file

@@ -0,0 +1,102 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, HTTPException, Query
from app.schemas.chat import (
ChatSessionCreate,
ChatSessionUpdate,
ChatTurnCreate,
ChatTurnRetrievalBatch,
)
from app.services import metric_store
router = APIRouter(prefix="/api/v1/chat", tags=["chat"])
@router.post("/sessions")
def create_session(payload: ChatSessionCreate) -> Any:
"""Create a chat session."""
return metric_store.create_chat_session(payload)
@router.post("/sessions/{session_id}/update")
def update_session(session_id: int, payload: ChatSessionUpdate) -> Any:
try:
return metric_store.update_chat_session(session_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Session not found")
@router.post("/sessions/{session_id}/close")
def close_session(session_id: int) -> Any:
"""Close a chat session and stamp end_time."""
try:
return metric_store.close_chat_session(session_id)
except KeyError:
raise HTTPException(status_code=404, detail="Session not found")
@router.get("/sessions/{session_id}")
def get_session(session_id: int) -> Any:
"""Fetch one session."""
session = metric_store.get_chat_session(session_id)
if not session:
raise HTTPException(status_code=404, detail="Session not found")
return session
@router.get("/sessions")
def list_sessions(
user_id: Optional[int] = None,
status: Optional[str] = None,
start_from: Optional[datetime] = Query(None, description="Filter by start time lower bound."),
start_to: Optional[datetime] = Query(None, description="Filter by start time upper bound."),
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
return metric_store.list_chat_sessions(
user_id=user_id,
status=status,
start_from=start_from,
start_to=start_to,
limit=limit,
offset=offset,
)
@router.post("/sessions/{session_id}/turns")
def create_turn(session_id: int, payload: ChatTurnCreate) -> Any:
"""Create a turn under a session."""
try:
return metric_store.create_chat_turn(session_id, payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/sessions/{session_id}/turns")
def list_turns(session_id: int) -> List[Any]:
return metric_store.list_chat_turns(session_id)
@router.get("/turns/{turn_id}")
def get_turn(turn_id: int) -> Any:
turn = metric_store.get_chat_turn(turn_id)
if not turn:
raise HTTPException(status_code=404, detail="Turn not found")
return turn
@router.post("/turns/{turn_id}/retrievals")
def write_retrievals(turn_id: int, payload: ChatTurnRetrievalBatch) -> Any:
"""Batch write retrieval records for a turn."""
count = metric_store.create_retrievals(turn_id, payload.retrievals)
return {"turn_id": turn_id, "inserted": count}
@router.get("/turns/{turn_id}/retrievals")
def list_retrievals(turn_id: int) -> List[Any]:
return metric_store.list_retrievals(turn_id)
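A hedged end-to-end walk through the chat tracking endpoints above: the paths and payload fields come from this router and `app/schemas/chat.py`, while the port and the concrete IDs/values are illustrative assumptions.

```python
# Create a session, log one turn, attach retrieval records, then close the session.
import httpx

base = "http://localhost:8000/api/v1/chat"
with httpx.Client(timeout=30) as client:
    session = client.post(f"{base}/sessions", json={"user_id": 1001}).json()
    turn = client.post(
        f"{base}/sessions/{session['id']}/turns",
        json={"user_id": 1001, "user_query": "GMV last 7 days", "intent": "METRIC_QUERY"},
    ).json()
    client.post(
        f"{base}/turns/{turn['id']}/retrievals",
        json={"retrievals": [{"item_type": "METRIC", "item_id": "123", "rank_no": 1, "used_in_sql": True}]},
    )
    client.post(f"{base}/sessions/{session['id']}/close")
```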

app/routers/metrics.py Normal file

@@ -0,0 +1,166 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from fastapi import APIRouter, HTTPException, Query
from app.schemas.metrics import (
MetricCreate,
MetricResultsWriteRequest,
MetricRunTrigger,
MetricScheduleCreate,
MetricScheduleUpdate,
MetricUpdate,
)
from app.services import metric_store
router = APIRouter(prefix="/api/v1", tags=["metrics"])
@router.post("/metrics")
def create_metric(payload: MetricCreate) -> Any:
"""Create a metric definition."""
try:
return metric_store.create_metric(payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.post("/metrics/{metric_id}")
def update_metric(metric_id: int, payload: MetricUpdate) -> Any:
"""Update fields of a metric definition."""
try:
return metric_store.update_metric(metric_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Metric not found")
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metrics/{metric_id}")
def get_metric(metric_id: int) -> Any:
"""Fetch a metric definition by id."""
metric = metric_store.get_metric(metric_id)
if not metric:
raise HTTPException(status_code=404, detail="Metric not found")
return metric
@router.get("/metrics")
def list_metrics(
biz_domain: Optional[str] = None,
is_active: Optional[bool] = None,
keyword: Optional[str] = Query(None, description="Search by code/name"),
limit: int = Query(100, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""List metrics with optional filters."""
return metric_store.list_metrics(
biz_domain=biz_domain,
is_active=is_active,
keyword=keyword,
limit=limit,
offset=offset,
)
@router.post("/metric-schedules")
def create_schedule(payload: MetricScheduleCreate) -> Any:
"""Create a metric schedule."""
try:
return metric_store.create_metric_schedule(payload)
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.post("/metric-schedules/{schedule_id}")
def update_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Any:
"""Update a metric schedule."""
try:
return metric_store.update_metric_schedule(schedule_id, payload)
except KeyError:
raise HTTPException(status_code=404, detail="Schedule not found")
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metrics/{metric_id}/schedules")
def list_schedules(metric_id: int) -> List[Any]:
"""List schedules for one metric."""
return metric_store.list_schedules_for_metric(metric_id=metric_id)
@router.post("/metric-runs/trigger")
def trigger_run(payload: MetricRunTrigger) -> Any:
"""Insert a run record (execution handled externally)."""
try:
return metric_store.trigger_metric_run(payload)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
@router.get("/metric-runs")
def list_runs(
metric_id: Optional[int] = None,
status: Optional[str] = None,
limit: int = Query(100, ge=1, le=500),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""List run records."""
return metric_store.list_metric_runs(
metric_id=metric_id, status=status, limit=limit, offset=offset
)
@router.get("/metric-runs/{run_id}")
def get_run(run_id: int) -> Any:
"""Fetch run details."""
run = metric_store.get_metric_run(run_id)
if not run:
raise HTTPException(status_code=404, detail="Run not found")
return run
@router.post("/metric-results/{metric_id}")
def write_results(metric_id: int, payload: MetricResultsWriteRequest) -> Any:
# Align path metric_id with payload to avoid mismatch.
if payload.metric_id != metric_id:
raise HTTPException(status_code=400, detail="metric_id in path/body mismatch")
try:
inserted = metric_store.write_metric_results(payload)
except KeyError as exc:
raise HTTPException(status_code=404, detail=str(exc)) from exc
except Exception as exc:
raise HTTPException(status_code=400, detail=str(exc)) from exc
return {"metric_id": metric_id, "inserted": inserted}
@router.get("/metric-results")
def query_results(
metric_id: int,
stat_from: Optional[datetime] = None,
stat_to: Optional[datetime] = None,
limit: int = Query(200, ge=1, le=1000),
offset: int = Query(0, ge=0),
) -> List[Any]:
"""Query metric results by time range."""
return metric_store.query_metric_results(
metric_id=metric_id,
stat_from=stat_from,
stat_to=stat_to,
limit=limit,
offset=offset,
)
@router.get("/metric-results/latest")
def latest_result(metric_id: int) -> Any:
"""Fetch the latest metric result."""
result = metric_store.latest_metric_result(metric_id)
if not result:
raise HTTPException(status_code=404, detail="Metric result not found")
return result
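A hedged sketch of the metric lifecycle exposed by this router: register a metric, record a run, then write one result row. Required fields follow `MetricCreate`/`MetricResultsWriteRequest` in `app/schemas/metrics.py`; the port, SQL text, and values are illustrative assumptions.

```python
import httpx

base = "http://localhost:8000/api/v1"
metric_payload = {
    "metric_code": "gmv_daily",
    "metric_name": "Daily GMV",
    "biz_domain": "sales",
    "base_sql": "SELECT DATE(order_time) AS stat_time, SUM(amount) AS metric_value FROM orders GROUP BY 1",
    "time_grain": "DAY",
    "dim_binding": ["region"],
    "update_strategy": "INCR",
}
with httpx.Client(timeout=30) as client:
    metric = client.post(f"{base}/metrics", json=metric_payload).json()
    client.post(f"{base}/metric-runs/trigger", json={"metric_id": metric["id"]})
    client.post(
        f"{base}/metric-results/{metric['id']}",
        json={"metric_id": metric["id"], "results": [{"stat_time": "2024-01-01T00:00:00", "metric_value": 123.45}]},
    )
```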

app/schemas/chat.py Normal file

@@ -0,0 +1,53 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class ChatSessionCreate(BaseModel):
"""Create a chat session to group multiple turns for a user."""
user_id: int = Field(..., description="User ID owning the session.")
session_uuid: Optional[str] = Field(None, description="Optional externally provided UUID.")
status: Optional[str] = Field("OPEN", description="Session status, default OPEN.")
end_time: Optional[datetime] = Field(None, description="Optional end time.")
ext_context: Optional[dict[str, Any]] = Field(None, description="Arbitrary business context.")
class ChatSessionUpdate(BaseModel):
"""Partial update for a chat session."""
status: Optional[str] = Field(None, description="New session status.")
end_time: Optional[datetime] = Field(None, description="Close time override.")
last_turn_id: Optional[int] = Field(None, description="Pointer to last chat turn.")
ext_context: Optional[dict[str, Any]] = Field(None, description="Context patch.")
class ChatTurnCreate(BaseModel):
"""Create a single chat turn with intent/SQL context."""
user_id: int = Field(..., description="User ID for this turn.")
user_query: str = Field(..., description="Raw user query content.")
intent: Optional[str] = Field(None, description="Intent tag such as METRIC_QUERY.")
ast_json: Optional[dict[str, Any]] = Field(None, description="Parsed AST payload.")
generated_sql: Optional[str] = Field(None, description="Final SQL text, if generated.")
sql_status: Optional[str] = Field(None, description="SQL generation/execution status.")
error_msg: Optional[str] = Field(None, description="Error message when SQL failed.")
main_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs referenced in this turn.")
created_metric_ids: Optional[List[int]] = Field(None, description="Metric IDs created in this turn.")
end_time: Optional[datetime] = Field(None, description="Turn end time.")
class ChatTurnRetrievalItem(BaseModel):
"""Record of one retrieved item contributing to a turn."""
item_type: str = Field(..., description="METRIC/SNIPPET/CHAT etc.")
item_id: str = Field(..., description="Identifier such as metric_id or snippet_id.")
item_extra: Optional[dict[str, Any]] = Field(None, description="Additional context like column name.")
similarity_score: Optional[float] = Field(None, description="Similarity score.")
rank_no: Optional[int] = Field(None, description="Ranking position.")
used_in_reasoning: Optional[bool] = Field(False, description="Flag if used in reasoning.")
used_in_sql: Optional[bool] = Field(False, description="Flag if used in final SQL.")
class ChatTurnRetrievalBatch(BaseModel):
"""Batch insert wrapper for retrieval records."""
retrievals: List[ChatTurnRetrievalItem]

app/schemas/metrics.py Normal file

@@ -0,0 +1,99 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class MetricCreate(BaseModel):
"""Create a metric definition with business and technical metadata."""
metric_code: str = Field(..., description="Internal metric code, unique.")
metric_name: str = Field(..., description="Display name.")
metric_aliases: Optional[List[str]] = Field(None, description="Optional alias list.")
biz_domain: str = Field(..., description="Business domain identifier.")
biz_desc: Optional[str] = Field(None, description="Business definition.")
chat_turn_id: Optional[int] = Field(None, description="Source chat turn ID.")
tech_desc: Optional[str] = Field(None, description="Technical definition.")
formula_expr: Optional[str] = Field(None, description="Formula expression text.")
base_sql: str = Field(..., description="Canonical SQL used to compute the metric.")
time_grain: str = Field(..., description="DAY/HOUR/WEEK/MONTH etc.")
dim_binding: List[str] = Field(..., description="Dimension columns bound to the metric.")
update_strategy: str = Field(..., description="FULL/INCR/REALTIME.")
schedule_id: Optional[int] = Field(None, description="Linked schedule id if any.")
schedule_type: Optional[int] = Field(None, description="Scheduler type identifier.")
is_active: bool = Field(True, description="Whether the metric is enabled.")
created_by: Optional[int] = Field(None, description="Creator user id.")
updated_by: Optional[int] = Field(None, description="Updater user id.")
class MetricUpdate(BaseModel):
"""Partial update for an existing metric definition."""
metric_name: Optional[str] = None
metric_aliases: Optional[List[str]] = None
biz_domain: Optional[str] = None
biz_desc: Optional[str] = None
tech_desc: Optional[str] = None
formula_expr: Optional[str] = None
base_sql: Optional[str] = None
time_grain: Optional[str] = None
dim_binding: Optional[List[str]] = None
update_strategy: Optional[str] = None
schedule_id: Optional[int] = None
schedule_type: Optional[int] = None
is_active: Optional[bool] = None
updated_by: Optional[int] = None
class MetricScheduleCreate(BaseModel):
"""Create a cron-based schedule for a metric."""
metric_id: int
cron_expr: str
enabled: bool = True
priority: int = 10
backfill_allowed: bool = True
max_runtime_sec: Optional[int] = None
retry_times: int = 0
owner_team: Optional[str] = None
owner_user_id: Optional[int] = None
class MetricScheduleUpdate(BaseModel):
"""Update fields of an existing metric schedule."""
cron_expr: Optional[str] = None
enabled: Optional[bool] = None
priority: Optional[int] = None
backfill_allowed: Optional[bool] = None
max_runtime_sec: Optional[int] = None
retry_times: Optional[int] = None
owner_team: Optional[str] = None
owner_user_id: Optional[int] = None
class MetricRunTrigger(BaseModel):
"""Trigger a metric run, optionally linking to a chat turn or schedule."""
metric_id: int
schedule_id: Optional[int] = None
source_turn_id: Optional[int] = None
data_time_from: Optional[datetime] = None
data_time_to: Optional[datetime] = None
metric_version: Optional[int] = None
base_sql_snapshot: Optional[str] = None
triggered_by: str = Field("API", description="SCHEDULER/MANUAL/API/QA_TURN")
triggered_at: Optional[datetime] = None
class MetricResultItem(BaseModel):
"""Single metric result row to be persisted."""
stat_time: datetime
metric_value: float
metric_version: Optional[int] = None
extra_dims: Optional[dict[str, Any]] = None
load_time: Optional[datetime] = None
data_version: Optional[int] = None
class MetricResultsWriteRequest(BaseModel):
"""Batch write request for metric results."""
metric_id: int
results: List[MetricResultItem]

app/schemas/rag.py Normal file

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, List
from pydantic import BaseModel, ConfigDict, Field
class RagItemPayload(BaseModel):
"""Payload for creating or updating a single RAG item."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
id: int = Field(..., description="Unique identifier for the RAG item.")
workspace_id: int = Field(..., alias="workspaceId", description="Workspace identifier.")
name: str = Field(..., description="Readable name of the item.")
embedding_data: str = Field(..., alias="embeddingData", description="Serialized embedding payload.")
type: str = Field(..., description='Item type, e.g. "METRIC".')
class RagDeleteRequest(BaseModel):
"""Payload for deleting a single RAG item."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
id: int = Field(..., description="Identifier of the item to delete.")
type: str = Field(..., description="Item type matching the stored record.")
class RagRetrieveRequest(BaseModel):
"""Payload for retrieving RAG items by semantic query."""
model_config = ConfigDict(populate_by_name=True, extra="ignore")
query: str = Field(..., description="Search query text.")
num: int = Field(..., description="Number of items to return.")
workspace_id: int = Field(..., alias="workspaceId", description="Workspace scope for the search.")
type: str = Field(..., description="Item type to search, e.g. METRIC.")
class RagRetrieveResponse(BaseModel):
"""Generic RAG retrieval response wrapper."""
model_config = ConfigDict(extra="allow")
data: List[Any] = Field(default_factory=list, description="Retrieved items.")
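Because `RagItemPayload` uses camelCase aliases with `populate_by_name=True`, serialising with `by_alias=True` matters when posting to the RAG service. A short hedged illustration (field values are made up):

```python
from app.schemas.rag import RagItemPayload

item = RagItemPayload(
    id=1,
    workspace_id=7,                       # snake_case accepted thanks to populate_by_name=True
    name="orders.gmv snippet",
    embedding_data='{"text": "GMV = SUM(amount)"}',
    type="SNIPPET",
)
print(item.model_dump(by_alias=True))
# {'id': 1, 'workspaceId': 7, 'name': ..., 'embeddingData': ..., 'type': 'SNIPPET'}
```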

View File

@@ -1,3 +1,4 @@
 from .gateway import LLMGateway
+from .rag_client import RagAPIClient
 
-__all__ = ["LLMGateway"]
+__all__ = ["LLMGateway", "RagAPIClient"]

View File

@@ -1,53 +1,93 @@
 from __future__ import annotations
 
-import os
-from typing import Dict, Type
+import logging
 
 import httpx
+from pydantic import ValidationError
 
-from app.exceptions import ProviderConfigurationError
-from app.models import LLMProvider, LLMRequest, LLMResponse
-from app.providers import (
-    AnthropicProvider,
-    DeepSeekProvider,
-    GeminiProvider,
-    LLMProviderClient,
-    OpenAIProvider,
-    OpenRouterProvider,
-    QwenProvider,
-)
+from app.exceptions import ProviderAPICallError
+from app.models import LLMChoice, LLMMessage, LLMRequest, LLMResponse
+from app.settings import NEW_API_AUTH_TOKEN, NEW_API_BASE_URL
+
+logger = logging.getLogger(__name__)
 
 
 class LLMGateway:
-    """Simple registry that dispatches chat requests to provider clients."""
+    """Forward chat requests to the configured new-api component."""
 
-    def __init__(self) -> None:
-        self._providers: Dict[LLMProvider, LLMProviderClient] = {}
-        self._factory: Dict[LLMProvider, Type[LLMProviderClient]] = {
-            LLMProvider.OPENAI: OpenAIProvider,
-            LLMProvider.ANTHROPIC: AnthropicProvider,
-            LLMProvider.OPENROUTER: OpenRouterProvider,
-            LLMProvider.GEMINI: GeminiProvider,
-            LLMProvider.QWEN: QwenProvider,
-            LLMProvider.DEEPSEEK: DeepSeekProvider,
-        }
-
-    def get_provider(self, provider: LLMProvider) -> LLMProviderClient:
-        if provider not in self._factory:
-            raise ProviderConfigurationError(f"Unsupported provider '{provider.value}'.")
-        if provider not in self._providers:
-            self._providers[provider] = self._build_provider(provider)
-        return self._providers[provider]
-
-    def _build_provider(self, provider: LLMProvider) -> LLMProviderClient:
-        provider_cls = self._factory[provider]
-        api_key_env = getattr(provider_cls, "api_key_env", None)
-        api_key = os.getenv(api_key_env) if api_key_env else None
-        return provider_cls(api_key)
+    def __init__(
+        self,
+        *,
+        base_url: str | None = None,
+        auth_token: str | None = None,
+    ) -> None:
+        resolved_base = base_url or NEW_API_BASE_URL
+        self._base_url = resolved_base.rstrip("/")
+        self._auth_token = auth_token or NEW_API_AUTH_TOKEN
 
     async def chat(
         self, request: LLMRequest, client: httpx.AsyncClient
     ) -> LLMResponse:
-        provider_client = self.get_provider(request.provider)
-        return await provider_client.chat(request, client)
+        url = f"{self._base_url}/v1/chat/completions"
+        payload = request.model_dump(mode="json", exclude_none=True)
+        headers = {"Content-Type": "application/json"}
+        if self._auth_token:
+            headers["Authorization"] = f"Bearer {self._auth_token}"
+
+        logger.info("Forwarding chat request to new-api at %s", url)
+        try:
+            response = await client.post(url, json=payload, headers=headers)
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            status_code = exc.response.status_code if exc.response else None
+            response_text = exc.response.text if exc.response else ""
+            logger.error(
+                "new-api upstream returned %s: %s",
+                status_code,
+                response_text,
+                exc_info=True,
+            )
+            raise ProviderAPICallError(
+                "Chat completion request failed.",
+                status_code=status_code,
+                response_text=response_text,
+            ) from exc
+        except httpx.HTTPError as exc:
+            logger.error("new-api transport error: %s", exc, exc_info=True)
+            raise ProviderAPICallError(f"Chat completion request failed: {exc}") from exc
+
+        try:
+            data = response.json()
+        except ValueError as exc:
+            logger.error("new-api responded with invalid JSON.", exc_info=True)
+            raise ProviderAPICallError(
+                "Chat completion response was not valid JSON."
+            ) from exc
+        logger.info("new-api payload: %s", data)
+
+        normalized_choices: list[LLMChoice] = []
+        for idx, choice in enumerate(data.get("choices", []) or []):
+            message_payload = choice.get("message") or {}
+            message = LLMMessage(
+                role=message_payload.get("role", "assistant"),
+                content=message_payload.get("content", ""),
+            )
+            normalized_choices.append(
+                LLMChoice(index=choice.get("index", idx), message=message)
+            )
+
+        try:
+            normalized_response = LLMResponse(
+                provider=request.provider,
+                model=data.get("model", request.model),
+                choices=normalized_choices,
+                raw=data,
+            )
+            return normalized_response
+        except ValidationError as exc:
+            logger.error(
+                "new-api response did not match expected schema: %s", data, exc_info=True
+            )
+            raise ProviderAPICallError(
+                "Chat completion response was not in the expected format."
+            ) from exc
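A hedged sketch of calling the rewritten gateway directly: the constructor keywords and the `chat()` signature come from the hunk above, while the `LLMRequest` field names, the token placeholder, and the ports are assumptions based on usage elsewhere in this diff.

```python
# Illustrative only: exercise the new-api-backed gateway outside of FastAPI.
import asyncio
import httpx

from app.models import LLMRequest
from app.services import LLMGateway


async def main() -> None:
    gateway = LLMGateway(base_url="http://localhost:3000", auth_token="sk-...")  # placeholder token
    request = LLMRequest.model_validate(
        {
            "provider": "deepseek",
            "model": "deepseek-chat",
            "messages": [{"role": "user", "content": "ping"}],
        }
    )
    async with httpx.AsyncClient(timeout=120) as client:
        response = await gateway.chat(request, client)
    print(response.choices[0].message.content)


asyncio.run(main())
```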

View File

@@ -22,13 +22,24 @@ from app.models import (
     LLMResponse,
     LLMRole,
 )
-from app.settings import DEFAULT_IMPORT_MODEL, get_supported_import_models
+from app.settings import (
+    DEFAULT_IMPORT_MODEL,
+    NEW_API_AUTH_TOKEN,
+    NEW_API_BASE_URL,
+    get_supported_import_models,
+)
+from app.utils.llm_usage import extract_usage
 
 logger = logging.getLogger(__name__)
 
-IMPORT_GATEWAY_BASE_URL = os.getenv(
-    "IMPORT_GATEWAY_BASE_URL", "http://localhost:8000"
-)
+IMPORT_GATEWAY_BASE_URL = os.getenv("IMPORT_GATEWAY_BASE_URL", NEW_API_BASE_URL)
+
+
+def build_import_gateway_headers() -> dict[str, str]:
+    headers = {"Content-Type": "application/json"}
+    if NEW_API_AUTH_TOKEN:
+        headers["Authorization"] = f"Bearer {NEW_API_AUTH_TOKEN}"
+    return headers
 
 
 def _env_float(name: str, default: float) -> float:
@@ -313,16 +324,18 @@ async def dispatch_import_analysis_job(
     url = f"{IMPORT_GATEWAY_BASE_URL.rstrip('/')}/v1/chat/completions"
 
     logger.info(
-        "Dispatching import %s to %s: %s",
+        "Dispatching import %s to %s using provider=%s model=%s",
         request.import_record_id,
         url,
-        json.dumps(payload, ensure_ascii=False),
+        payload.get("provider"),
+        payload.get("model"),
     )
     timeout = httpx.Timeout(IMPORT_CHAT_TIMEOUT_SECONDS)
+    headers = build_import_gateway_headers()
     try:
-        response = await client.post(url, json=payload, timeout=timeout)
+        response = await client.post(url, json=payload, timeout=timeout, headers=headers)
         response.raise_for_status()
     except httpx.HTTPStatusError as exc:
         body_preview = ""
@@ -347,9 +360,10 @@ async def dispatch_import_analysis_job(
         response.status_code,
     )
     logger.info(
-        "LLM response for %s: %s",
+        "LLM response received for %s (status %s, choices=%s)",
         request.import_record_id,
-        json.dumps(response_data, ensure_ascii=False),
+        response.status_code,
+        len(response_data.get("choices") or []),
     )
 
     try:
@@ -375,18 +389,6 @@ async def dispatch_import_analysis_job(
     return result
 
 
-# Compatibility helper: extract usage fields reported by different model providers
-def extract_usage(resp_json: dict) -> dict:
-    usage = resp_json.get("usage") or resp_json.get("usageMetadata") or {}
-    return {
-        "prompt_tokens": usage.get("prompt_tokens") or usage.get("input_tokens") or usage.get("promptTokenCount"),
-        "completion_tokens": usage.get("completion_tokens") or usage.get("output_tokens") or usage.get("candidatesTokenCount"),
-        "total_tokens": usage.get("total_tokens") or usage.get("totalTokenCount") or (
-            (usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
-            + (usage.get("completion_tokens") or usage.get("output_tokens") or 0)
-        )
-    }
-
 
 async def notify_import_analysis_callback(
     callback_url: str,
     payload: Dict[str, Any],
@@ -415,6 +417,7 @@ async def process_import_analysis_job(
     request: DataImportAnalysisJobRequest,
     client: httpx.AsyncClient,
 ) -> None:
+    # Run the import analysis and ensure the callback fires regardless of success/failure.
     try:
         payload = await dispatch_import_analysis_job(request, client)
     except ProviderAPICallError as exc:
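The `extract_usage` helper deleted above is now imported from `app.utils.llm_usage`, which is not shown in this compare view. The following is a hedged sketch assuming the function was moved over essentially unchanged from the removed code:

```python
# app/utils/llm_usage.py (assumed layout; mirrors the helper removed from import_analysis.py)
from typing import Any, Dict, Optional


def extract_usage(resp_json: Dict[str, Any]) -> Dict[str, Optional[int]]:
    """Normalise token-usage fields reported by different LLM providers."""
    usage = resp_json.get("usage") or resp_json.get("usageMetadata") or {}
    prompt = usage.get("prompt_tokens") or usage.get("input_tokens") or usage.get("promptTokenCount")
    completion = usage.get("completion_tokens") or usage.get("output_tokens") or usage.get("candidatesTokenCount")
    total = usage.get("total_tokens") or usage.get("totalTokenCount") or ((prompt or 0) + (completion or 0))
    return {"prompt_tokens": prompt, "completion_tokens": completion, "total_tokens": total}
```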

View File

@@ -0,0 +1,842 @@
from __future__ import annotations
import hashlib
import json
import logging
from datetime import datetime
from typing import Any, Dict, Iterable, List, Optional
from uuid import uuid4
from sqlalchemy import text
from sqlalchemy.engine import Row
from app.db import get_engine
from app.schemas.chat import (
ChatSessionCreate,
ChatSessionUpdate,
ChatTurnCreate,
ChatTurnRetrievalItem,
)
from app.schemas.metrics import (
MetricCreate,
MetricResultItem,
MetricResultsWriteRequest,
MetricRunTrigger,
MetricScheduleCreate,
MetricScheduleUpdate,
MetricUpdate,
)
logger = logging.getLogger(__name__)
# Common helpers
def _json_dump(value: Any) -> Optional[str]:
"""Safe JSON dumper; returns None on failure to keep DB writes simple."""
if value is None:
return None
if isinstance(value, str):
return value
try:
return json.dumps(value, ensure_ascii=False)
except (TypeError, ValueError):
return None
def _parse_json_fields(payload: Dict[str, Any], fields: Iterable[str]) -> Dict[str, Any]:
"""Parse select fields from JSON strings into dict/list for responses."""
for field in fields:
raw = payload.get(field)
if raw is None or isinstance(raw, (dict, list)):
continue
if isinstance(raw, (bytes, bytearray)):
raw = raw.decode("utf-8", errors="ignore")
if isinstance(raw, str):
try:
payload[field] = json.loads(raw)
except ValueError:
pass
return payload
def _row_to_dict(row: Row[Any]) -> Dict[str, Any]:
return dict(row._mapping)
# Chat sessions & turns
def create_chat_session(payload: ChatSessionCreate) -> Dict[str, Any]:
"""Create a chat session row with optional external UUID."""
engine = get_engine()
session_uuid = payload.session_uuid or str(uuid4())
now = datetime.utcnow()
params = {
"user_id": payload.user_id,
"session_uuid": session_uuid,
"end_time": payload.end_time,
"status": payload.status or "OPEN",
"ext_context": _json_dump(payload.ext_context),
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO chat_session (user_id, session_uuid, end_time, status, ext_context)
VALUES (:user_id, :session_uuid, :end_time, :status, :ext_context)
"""
),
params,
)
session_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
raise RuntimeError("Failed to create chat session.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def update_chat_session(session_id: int, payload: ChatSessionUpdate) -> Dict[str, Any]:
"""Patch selected chat session fields."""
updates = {}
if payload.status is not None:
updates["status"] = payload.status
if payload.end_time is not None:
updates["end_time"] = payload.end_time
if payload.last_turn_id is not None:
updates["last_turn_id"] = payload.last_turn_id
if payload.ext_context is not None:
updates["ext_context"] = _json_dump(payload.ext_context)
if not updates:
current = get_chat_session(session_id)
if not current:
raise KeyError(f"Session {session_id} not found.")
return current
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = session_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE chat_session SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
raise KeyError(f"Session {session_id} not found.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def close_chat_session(session_id: int) -> Dict[str, Any]:
"""Mark a chat session as CLOSED with end_time."""
now = datetime.utcnow()
return update_chat_session(
session_id,
ChatSessionUpdate(status="CLOSED", end_time=now),
)
def get_chat_session(session_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM chat_session WHERE id=:id"), {"id": session_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
return data
def list_chat_sessions(
*,
user_id: Optional[int] = None,
status: Optional[str] = None,
start_from: Optional[datetime] = None,
start_to: Optional[datetime] = None,
limit: int = 50,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""List chat sessions with optional filters and pagination."""
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if user_id is not None:
conditions.append("user_id=:user_id")
params["user_id"] = user_id
if status is not None:
conditions.append("status=:status")
params["status"] = status
if start_from is not None:
conditions.append("created_at>=:start_from")
params["start_from"] = start_from
if start_to is not None:
conditions.append("created_at<=:start_to")
params["start_to"] = start_to
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM chat_session {where_clause} "
"ORDER BY created_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["ext_context"])
results.append(data)
return results
def _next_turn_no(conn, session_id: int) -> int:
row = conn.execute(
text("SELECT COALESCE(MAX(turn_no), 0) + 1 AS next_no FROM chat_turn WHERE session_id=:sid"),
{"sid": session_id},
).first()
if not row:
return 1
return int(row._mapping["next_no"])
def create_chat_turn(session_id: int, payload: ChatTurnCreate) -> Dict[str, Any]:
"""Insert a chat turn and auto-increment turn number within the session."""
engine = get_engine()
now = datetime.utcnow()
params = {
"session_id": session_id,
"user_id": payload.user_id,
"user_query": payload.user_query,
"intent": payload.intent,
"ast_json": _json_dump(payload.ast_json),
"generated_sql": payload.generated_sql,
"sql_status": payload.sql_status,
"error_msg": payload.error_msg,
"main_metric_ids": _json_dump(payload.main_metric_ids),
"created_metric_ids": _json_dump(payload.created_metric_ids),
"end_time": payload.end_time,
}
with engine.begin() as conn:
turn_no = _next_turn_no(conn, session_id)
params["turn_no"] = turn_no
result = conn.execute(
text(
"""
INSERT INTO chat_turn (
session_id, turn_no, user_id,
user_query, intent, ast_json,
generated_sql, sql_status, error_msg,
main_metric_ids, created_metric_ids,
end_time
)
VALUES (
:session_id, :turn_no, :user_id,
:user_query, :intent, :ast_json,
:generated_sql, :sql_status, :error_msg,
:main_metric_ids, :created_metric_ids,
:end_time
)
"""
),
params,
)
turn_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
).first()
if not row:
raise RuntimeError("Failed to create chat turn.")
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
return data
def get_chat_turn(turn_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM chat_turn WHERE id=:id"), {"id": turn_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
return data
def list_chat_turns(session_id: int) -> List[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
"SELECT * FROM chat_turn WHERE session_id=:session_id ORDER BY turn_no ASC"
),
{"session_id": session_id},
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["ast_json", "main_metric_ids", "created_metric_ids"])
results.append(data)
return results
def create_retrievals(turn_id: int, retrievals: List[ChatTurnRetrievalItem]) -> int:
"""Batch insert retrieval records for a turn."""
if not retrievals:
return 0
engine = get_engine()
params_list = []
for item in retrievals:
params_list.append(
{
"turn_id": turn_id,
"item_type": item.item_type,
"item_id": item.item_id,
"item_extra": _json_dump(item.item_extra),
"similarity_score": item.similarity_score,
"rank_no": item.rank_no,
"used_in_reasoning": 1 if item.used_in_reasoning else 0,
"used_in_sql": 1 if item.used_in_sql else 0,
}
)
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO chat_turn_retrieval (
turn_id, item_type, item_id, item_extra,
similarity_score, rank_no, used_in_reasoning, used_in_sql
)
VALUES (
:turn_id, :item_type, :item_id, :item_extra,
:similarity_score, :rank_no, :used_in_reasoning, :used_in_sql
)
"""
),
params_list,
)
return len(retrievals)
def list_retrievals(turn_id: int) -> List[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
"SELECT * FROM chat_turn_retrieval WHERE turn_id=:turn_id ORDER BY created_at ASC, rank_no ASC"
),
{"turn_id": turn_id},
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["item_extra"])
data["used_in_reasoning"] = bool(data.get("used_in_reasoning"))
data["used_in_sql"] = bool(data.get("used_in_sql"))
results.append(data)
return results
# Metric registry
def _metric_sql_hash(sql_text: str) -> str:
"""Compute a stable hash to detect SQL definition changes."""
return hashlib.md5(sql_text.encode("utf-8")).hexdigest()
def create_metric(payload: MetricCreate) -> Dict[str, Any]:
"""Insert a new metric definition; version starts at 1."""
engine = get_engine()
now = datetime.utcnow()
sql_hash = _metric_sql_hash(payload.base_sql)
params = {
"metric_code": payload.metric_code,
"metric_name": payload.metric_name,
"metric_aliases": _json_dump(payload.metric_aliases),
"biz_domain": payload.biz_domain,
"biz_desc": payload.biz_desc,
"chat_turn_id": payload.chat_turn_id,
"tech_desc": payload.tech_desc,
"formula_expr": payload.formula_expr,
"base_sql": payload.base_sql,
"time_grain": payload.time_grain,
"dim_binding": _json_dump(payload.dim_binding),
"update_strategy": payload.update_strategy,
"schedule_id": payload.schedule_id,
"schedule_type": payload.schedule_type,
"version": 1,
"is_active": 1 if payload.is_active else 0,
"sql_hash": sql_hash,
"created_by": payload.created_by,
"updated_by": payload.updated_by,
"created_at": now,
"updated_at": now,
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_def (
metric_code, metric_name, metric_aliases, biz_domain, biz_desc,
chat_turn_id, tech_desc, formula_expr, base_sql,
time_grain, dim_binding, update_strategy,
schedule_id, schedule_type, version, is_active,
sql_hash, created_by, updated_by, created_at, updated_at
)
VALUES (
:metric_code, :metric_name, :metric_aliases, :biz_domain, :biz_desc,
:chat_turn_id, :tech_desc, :formula_expr, :base_sql,
:time_grain, :dim_binding, :update_strategy,
:schedule_id, :schedule_type, :version, :is_active,
:sql_hash, :created_by, :updated_by, :created_at, :updated_at
)
"""
),
params,
)
metric_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
raise RuntimeError("Failed to create metric definition.")
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def update_metric(metric_id: int, payload: MetricUpdate) -> Dict[str, Any]:
"""Update mutable fields of a metric definition and refresh sql_hash when needed."""
updates: Dict[str, Any] = {}
for field in (
"metric_name",
"biz_domain",
"biz_desc",
"tech_desc",
"formula_expr",
"base_sql",
"time_grain",
"update_strategy",
"schedule_id",
"schedule_type",
"updated_by",
):
value = getattr(payload, field)
if value is not None:
updates[field] = value
if payload.metric_aliases is not None:
updates["metric_aliases"] = _json_dump(payload.metric_aliases)
if payload.dim_binding is not None:
updates["dim_binding"] = _json_dump(payload.dim_binding)
if payload.is_active is not None:
updates["is_active"] = 1 if payload.is_active else 0
if payload.base_sql is not None:
updates["sql_hash"] = _metric_sql_hash(payload.base_sql)
if not updates:
current = get_metric(metric_id)
if not current:
raise KeyError(f"Metric {metric_id} not found.")
return current
updates["updated_at"] = datetime.utcnow()
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = metric_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE metric_def SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
raise KeyError(f"Metric {metric_id} not found.")
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def get_metric(metric_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM metric_def WHERE id=:id"), {"id": metric_id}
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
return data
def list_metrics(
*,
biz_domain: Optional[str] = None,
is_active: Optional[bool] = None,
keyword: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""List metric definitions with simple filters and pagination."""
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if biz_domain:
conditions.append("biz_domain=:biz_domain")
params["biz_domain"] = biz_domain
if is_active is not None:
conditions.append("is_active=:is_active")
params["is_active"] = 1 if is_active else 0
if keyword:
conditions.append("(metric_code LIKE :kw OR metric_name LIKE :kw)")
params["kw"] = f"%{keyword}%"
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_def {where_clause} "
"ORDER BY updated_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["metric_aliases", "dim_binding"])
data["is_active"] = bool(data.get("is_active"))
results.append(data)
return results
# Metric schedules
def create_metric_schedule(payload: MetricScheduleCreate) -> Dict[str, Any]:
"""Create a schedule record for a metric."""
engine = get_engine()
params = {
"metric_id": payload.metric_id,
"cron_expr": payload.cron_expr,
"enabled": 1 if payload.enabled else 0,
"priority": payload.priority,
"backfill_allowed": 1 if payload.backfill_allowed else 0,
"max_runtime_sec": payload.max_runtime_sec,
"retry_times": payload.retry_times,
"owner_team": payload.owner_team,
"owner_user_id": payload.owner_user_id,
}
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_schedule (
metric_id, cron_expr, enabled, priority,
backfill_allowed, max_runtime_sec, retry_times,
owner_team, owner_user_id
) VALUES (
:metric_id, :cron_expr, :enabled, :priority,
:backfill_allowed, :max_runtime_sec, :retry_times,
:owner_team, :owner_user_id
)
"""
),
params,
)
schedule_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
).first()
if not row:
raise RuntimeError("Failed to create metric schedule.")
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
return data
def update_metric_schedule(schedule_id: int, payload: MetricScheduleUpdate) -> Dict[str, Any]:
updates: Dict[str, Any] = {}
for field in (
"cron_expr",
"priority",
"max_runtime_sec",
"retry_times",
"owner_team",
"owner_user_id",
):
value = getattr(payload, field)
if value is not None:
updates[field] = value
if payload.enabled is not None:
updates["enabled"] = 1 if payload.enabled else 0
if payload.backfill_allowed is not None:
updates["backfill_allowed"] = 1 if payload.backfill_allowed else 0
if not updates:
current = list_schedules_for_metric(schedule_id=schedule_id)
if current:
return current[0]
raise KeyError(f"Schedule {schedule_id} not found.")
set_clause = ", ".join(f"{key}=:{key}" for key in updates.keys())
params = dict(updates)
params["id"] = schedule_id
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(f"UPDATE metric_schedule SET {set_clause} WHERE id=:id"),
params,
)
row = conn.execute(
text("SELECT * FROM metric_schedule WHERE id=:id"), {"id": schedule_id}
).first()
if not row:
raise KeyError(f"Schedule {schedule_id} not found.")
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
return data
def list_schedules_for_metric(metric_id: Optional[int] = None, schedule_id: Optional[int] = None) -> List[Dict[str, Any]]:
conditions = []
params: Dict[str, Any] = {}
if metric_id is not None:
conditions.append("metric_id=:metric_id")
params["metric_id"] = metric_id
if schedule_id is not None:
conditions.append("id=:id")
params["id"] = schedule_id
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(f"SELECT * FROM metric_schedule {where_clause} ORDER BY id DESC"),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
data["enabled"] = bool(data.get("enabled"))
data["backfill_allowed"] = bool(data.get("backfill_allowed"))
results.append(data)
return results
# Metric runs
def trigger_metric_run(payload: MetricRunTrigger) -> Dict[str, Any]:
"""Create a metric_job_run entry; execution is orchestrated elsewhere."""
metric = get_metric(payload.metric_id)
if not metric:
raise KeyError(f"Metric {payload.metric_id} not found.")
metric_version = payload.metric_version or metric.get("version", 1)
base_sql_snapshot = payload.base_sql_snapshot or metric.get("base_sql")
triggered_at = payload.triggered_at or datetime.utcnow()
params = {
"metric_id": payload.metric_id,
"schedule_id": payload.schedule_id,
"source_turn_id": payload.source_turn_id,
"data_time_from": payload.data_time_from,
"data_time_to": payload.data_time_to,
"metric_version": metric_version,
"base_sql_snapshot": base_sql_snapshot,
"status": "RUNNING",
"error_msg": None,
"affected_rows": None,
"runtime_ms": None,
"triggered_by": payload.triggered_by,
"triggered_at": triggered_at,
"started_at": None,
"finished_at": None,
}
engine = get_engine()
with engine.begin() as conn:
result = conn.execute(
text(
"""
INSERT INTO metric_job_run (
metric_id, schedule_id, source_turn_id,
data_time_from, data_time_to, metric_version,
base_sql_snapshot, status, error_msg,
affected_rows, runtime_ms,
triggered_by, triggered_at, started_at, finished_at
) VALUES (
:metric_id, :schedule_id, :source_turn_id,
:data_time_from, :data_time_to, :metric_version,
:base_sql_snapshot, :status, :error_msg,
:affected_rows, :runtime_ms,
:triggered_by, :triggered_at, :started_at, :finished_at
)
"""
),
params,
)
run_id = result.lastrowid
row = conn.execute(
text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
).first()
if not row:
raise RuntimeError("Failed to create metric job run.")
return _row_to_dict(row)
def get_metric_run(run_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text("SELECT * FROM metric_job_run WHERE id=:id"), {"id": run_id}
).first()
if not row:
return None
return _row_to_dict(row)
def list_metric_runs(
*,
metric_id: Optional[int] = None,
status: Optional[str] = None,
limit: int = 100,
offset: int = 0,
) -> List[Dict[str, Any]]:
conditions = []
params: Dict[str, Any] = {"limit": limit, "offset": offset}
if metric_id is not None:
conditions.append("metric_id=:metric_id")
params["metric_id"] = metric_id
if status is not None:
conditions.append("status=:status")
params["status"] = status
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_job_run {where_clause} "
"ORDER BY triggered_at DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
return [_row_to_dict(row) for row in rows]
# Metric results
def write_metric_results(payload: MetricResultsWriteRequest) -> int:
"""Bulk insert metric_result rows for a metric/version."""
metric = get_metric(payload.metric_id)
if not metric:
raise KeyError(f"Metric {payload.metric_id} not found.")
default_version = metric.get("version", 1)
now = datetime.utcnow()
rows: List[Dict[str, Any]] = []
for item in payload.results:
rows.append(
{
"metric_id": payload.metric_id,
"metric_version": item.metric_version or default_version,
"stat_time": item.stat_time,
"extra_dims": _json_dump(item.extra_dims),
"metric_value": item.metric_value,
"load_time": item.load_time or now,
"data_version": item.data_version,
}
)
if not rows:
return 0
engine = get_engine()
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO metric_result (
metric_id, metric_version, stat_time,
extra_dims, metric_value, load_time, data_version
) VALUES (
:metric_id, :metric_version, :stat_time,
:extra_dims, :metric_value, :load_time, :data_version
)
"""
),
rows,
)
return len(rows)
def query_metric_results(
*,
metric_id: int,
stat_from: Optional[datetime] = None,
stat_to: Optional[datetime] = None,
limit: int = 200,
offset: int = 0,
) -> List[Dict[str, Any]]:
conditions = ["metric_id=:metric_id"]
params: Dict[str, Any] = {
"metric_id": metric_id,
"limit": limit,
"offset": offset,
}
if stat_from is not None:
conditions.append("stat_time>=:stat_from")
params["stat_from"] = stat_from
if stat_to is not None:
conditions.append("stat_time<=:stat_to")
params["stat_to"] = stat_to
where_clause = "WHERE " + " AND ".join(conditions)
engine = get_engine()
with engine.begin() as conn:
rows = conn.execute(
text(
f"SELECT * FROM metric_result {where_clause} "
"ORDER BY stat_time DESC LIMIT :limit OFFSET :offset"
),
params,
).fetchall()
results: List[Dict[str, Any]] = []
for row in rows:
data = _row_to_dict(row)
_parse_json_fields(data, ["extra_dims"])
results.append(data)
return results
def latest_metric_result(metric_id: int) -> Optional[Dict[str, Any]]:
engine = get_engine()
with engine.begin() as conn:
row = conn.execute(
text(
"""
SELECT * FROM metric_result
WHERE metric_id=:metric_id
ORDER BY stat_time DESC
LIMIT 1
"""
),
{"metric_id": metric_id},
).first()
if not row:
return None
data = _row_to_dict(row)
_parse_json_fields(data, ["extra_dims"])
return data
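
For orientation, a minimal usage sketch of the schedule/run/result helpers above. The module path and the exact field sets of the request models are assumptions inferred from the signatures in this file, not verified against app/models.

# Hypothetical usage sketch (not part of the diff). Module path `app.services.metric_store`
# and the model/field names below are assumptions inferred from the signatures above.
from datetime import datetime

from app.models import MetricResultItem, MetricResultsWriteRequest, MetricRunTrigger
from app.services.metric_store import (
    query_metric_results,
    trigger_metric_run,
    write_metric_results,
)

# Record that a run has started; execution itself is orchestrated elsewhere.
run = trigger_metric_run(MetricRunTrigger(metric_id=42, triggered_by="scheduler"))

# Bulk-insert one result row for the metric's current version.
write_metric_results(
    MetricResultsWriteRequest(
        metric_id=42,
        results=[MetricResultItem(stat_time=datetime(2025, 1, 1), metric_value=123.4)],
    )
)

# Read the most recent results back, newest stat_time first.
rows = query_metric_results(metric_id=42, limit=10)
print(run["id"], len(rows))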

View File

@ -0,0 +1,83 @@
from __future__ import annotations
import logging
from typing import Any, Sequence
import httpx
from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.settings import RAG_API_AUTH_TOKEN, RAG_API_BASE_URL
logger = logging.getLogger(__name__)
class RagAPIClient:
"""Thin async client wrapper around the RAG endpoints described in doc/rag-api.md."""
def __init__(self, *, base_url: str | None = None, auth_token: str | None = None) -> None:
resolved_base = base_url or RAG_API_BASE_URL
self._base_url = resolved_base.rstrip("/")
self._auth_token = auth_token or RAG_API_AUTH_TOKEN
def _headers(self) -> dict[str, str]:
headers = {"Content-Type": "application/json"}
if self._auth_token:
headers["Authorization"] = f"Bearer {self._auth_token}"
return headers
async def _post(
self,
client: httpx.AsyncClient,
path: str,
payload: Any,
) -> Any:
url = f"{self._base_url}{path}"
try:
response = await client.post(url, json=payload, headers=self._headers())
response.raise_for_status()
except httpx.HTTPStatusError as exc:
status_code = exc.response.status_code if exc.response else None
response_text = exc.response.text if exc.response else ""
logger.error(
"RAG API responded with an error (%s) for %s: %s",
status_code,
url,
response_text,
exc_info=True,
)
raise ProviderAPICallError(
"RAG API call failed.",
status_code=status_code,
response_text=response_text,
) from exc
except httpx.HTTPError as exc:
logger.error("Transport error calling RAG API %s: %s", url, exc, exc_info=True)
raise ProviderAPICallError(f"RAG API call failed: {exc}") from exc
try:
return response.json()
except ValueError:
logger.warning("RAG API returned non-JSON response for %s; returning raw text.", url)
return {"raw": response.text}
async def add(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/add", body)
async def add_batch(self, client: httpx.AsyncClient, items: Sequence[RagItemPayload]) -> Any:
body = [item.model_dump(by_alias=True, exclude_none=True) for item in items]
return await self._post(client, "/rag/addBatch", body)
async def update(self, client: httpx.AsyncClient, payload: RagItemPayload) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/update", body)
async def delete(self, client: httpx.AsyncClient, payload: RagDeleteRequest) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/delete", body)
async def retrieve(self, client: httpx.AsyncClient, payload: RagRetrieveRequest) -> Any:
body = payload.model_dump(by_alias=True, exclude_none=True)
return await self._post(client, "/rag/retrieve", body)
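
A hedged usage sketch for the client above; the field names on RagRetrieveRequest are assumptions, since only its import is visible in this diff.

# Hypothetical usage sketch (not part of the diff). RagRetrieveRequest field names
# (workspaceId, query, topK) are assumptions for illustration only.
import asyncio

import httpx

from app.schemas.rag import RagRetrieveRequest
from app.services.rag_client import RagAPIClient


async def demo() -> None:
    rag = RagAPIClient()  # falls back to RAG_API_BASE_URL / RAG_API_AUTH_TOKEN from settings
    async with httpx.AsyncClient(timeout=30.0) as client:
        result = await rag.retrieve(
            client, RagRetrieveRequest(workspaceId=1, query="水表数量", topK=5)
        )
        print(result)


asyncio.run(demo())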

View File

@ -26,6 +26,7 @@ from app.services.import_analysis import (
     IMPORT_GATEWAY_BASE_URL,
     resolve_provider_from_model,
 )
+from app.utils.llm_usage import extract_usage as extract_llm_usage
 logger = logging.getLogger(__name__)
@ -37,7 +38,7 @@ PROMPT_FILENAMES = {
     "snippet_generator": "snippet_generator.md",
     "snippet_alias": "snippet_alias_generator.md",
 }
-DEFAULT_CHAT_TIMEOUT_SECONDS = 90.0
+DEFAULT_CHAT_TIMEOUT_SECONDS = 180.0
 @dataclass
@ -47,6 +48,12 @@ class GEProfilingArtifacts:
     docs_path: str
+@dataclass
+class LLMCallResult:
+    data: Any
+    usage: Optional[Dict[str, Any]] = None
 class PipelineActionType:
     GE_PROFILING = "ge_profiling"
     GE_RESULT_DESC = "ge_result_desc"
@ -124,11 +131,16 @@ def _extract_json_payload(content: str) -> str:
     if not stripped:
         raise ValueError("Empty LLM content.")
-    for opener, closer in (("{", "}"), ("[", "]")):
-        start = stripped.find(opener)
-        end = stripped.rfind(closer)
-        if start != -1 and end != -1 and end > start:
-            candidate = stripped[start : end + 1].strip()
+    decoder = json.JSONDecoder()
+    for idx, char in enumerate(stripped):
+        if char not in {"{", "["}:
+            continue
+        try:
+            _, end = decoder.raw_decode(stripped[idx:])
+        except json.JSONDecodeError:
+            continue
+        candidate = stripped[idx : idx + end].strip()
+        if candidate:
             return candidate
     return stripped
@ -559,7 +571,9 @@ async def _call_chat_completions(
     except ValueError as exc:
         raise ProviderAPICallError("Chat completions response was not valid JSON.") from exc
-    return _parse_completion_payload(response_payload)
+    parsed_payload = _parse_completion_payload(response_payload)
+    usage_info = extract_llm_usage(response_payload)
+    return LLMCallResult(data=parsed_payload, usage=usage_info)
 def _normalize_for_json(value: Any) -> Any:
@ -628,7 +642,7 @@ async def _execute_result_desc(
         client=client,
         timeout_seconds=timeout_seconds,
     )
-    if not isinstance(llm_output, dict):
+    if not isinstance(llm_output.data, dict):
         raise ProviderAPICallError("GE result description payload must be a JSON object.")
     return llm_output
@ -651,7 +665,7 @@ async def _execute_snippet_generation(
         client=client,
         timeout_seconds=timeout_seconds,
     )
-    if not isinstance(llm_output, list):
+    if not isinstance(llm_output.data, list):
         raise ProviderAPICallError("Snippet generator must return a JSON array.")
     return llm_output
@ -674,7 +688,7 @@ async def _execute_snippet_alias(
         client=client,
         timeout_seconds=timeout_seconds,
     )
-    if not isinstance(llm_output, list):
+    if not isinstance(llm_output.data, list):
         raise ProviderAPICallError("Snippet alias generator must return a JSON array.")
     return llm_output
@ -711,6 +725,12 @@ async def _run_action_with_callback(
         await _post_callback(callback_url, failure_payload, client)
         raise
+    usage_info: Optional[Dict[str, Any]] = None
+    result_payload = result
+    if isinstance(result, LLMCallResult):
+        usage_info = result.usage
+        result_payload = result.data
     success_payload = dict(callback_base)
     success_payload.update(
         {
@ -724,23 +744,26 @@ async def _run_action_with_callback(
     logger.info(
         "Pipeline action %s output: %s",
         action_type,
-        _preview_for_log(result),
+        _preview_for_log(result_payload),
     )
     if action_type == PipelineActionType.GE_PROFILING:
-        artifacts: GEProfilingArtifacts = result
-        success_payload["profiling_json"] = artifacts.profiling_result
-        success_payload["profiling_summary"] = artifacts.profiling_summary
+        artifacts: GEProfilingArtifacts = result_payload
+        success_payload["ge_profiling_json"] = artifacts.profiling_result
+        success_payload["ge_profiling_summary"] = artifacts.profiling_summary
         success_payload["ge_report_path"] = artifacts.docs_path
     elif action_type == PipelineActionType.GE_RESULT_DESC:
-        success_payload["table_desc_json"] = result
+        success_payload["ge_result_desc_json"] = result_payload
     elif action_type == PipelineActionType.SNIPPET:
-        success_payload["snippet_json"] = result
+        success_payload["snippet_json"] = result_payload
     elif action_type == PipelineActionType.SNIPPET_ALIAS:
-        success_payload["snippet_alias_json"] = result
+        success_payload["snippet_alias_json"] = result_payload
+    if usage_info:
+        success_payload["llm_usage"] = usage_info
     await _post_callback(callback_url, success_payload, client)
-    return result
+    return result_payload
 async def process_table_profiling_job(
@ -762,6 +785,8 @@ async def process_table_profiling_job(
         "table_schema_version_id": request.table_schema_version_id,
         "llm_model": request.llm_model,
         "llm_timeout_seconds": timeout_seconds,
+        "workspace_id": request.workspace_id,
+        "rag_item_type": request.rag_item_type,
     }
     logging_request_payload = _profiling_request_for_log(request)
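
The `_extract_json_payload` hunk above replaces the find/rfind heuristic with `json.JSONDecoder.raw_decode`, which scans for the first position where a complete JSON value actually parses. A standalone, stdlib-only sketch of that pattern:

# Standalone illustration of the raw_decode-based extraction shown in the hunk above.
import json


def extract_json_payload(content: str) -> str:
    stripped = content.strip()
    decoder = json.JSONDecoder()
    for idx, char in enumerate(stripped):
        if char not in {"{", "["}:
            continue
        try:
            # raw_decode returns (obj, end) for the first valid JSON value starting at idx.
            _, end = decoder.raw_decode(stripped[idx:])
        except json.JSONDecodeError:
            continue
        return stripped[idx : idx + end].strip()
    return stripped


print(extract_json_payload('```json\n{"a": 1}\n``` trailing prose'))  # -> {"a": 1}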

View File

@ -1,19 +1,19 @@
 from __future__ import annotations
+import hashlib
 import json
 import logging
-from typing import Any, Dict, Tuple
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 from sqlalchemy import text
 from sqlalchemy.engine import Engine
 from sqlalchemy.exc import SQLAlchemyError
 from app.db import get_engine
-from app.models import (
-    ActionType,
-    TableSnippetUpsertRequest,
-    TableSnippetUpsertResponse,
-)
+from app.models import ActionType, TableSnippetUpsertRequest, TableSnippetUpsertResponse
+from app.schemas.rag import RagItemPayload
+from app.services.rag_client import RagAPIClient
 logger = logging.getLogger(__name__)
@ -38,7 +38,15 @@ def _prepare_table_schema(value: Any) -> str:
     return json.dumps(value, ensure_ascii=False)
+def _prepare_model_params(params: Dict[str, Any] | None) -> str | None:
+    if not params:
+        return None
+    serialized, _ = _serialize_json(params)
+    return serialized
 def _collect_common_columns(request: TableSnippetUpsertRequest) -> Dict[str, Any]:
+    # Build the base column set shared by all action types; action-specific fields are populated later.
     logger.debug(
         "Collecting common columns for table_id=%s version_ts=%s action_type=%s",
         request.table_id,
@ -53,8 +61,34 @@ def _collect_common_columns(request: TableSnippetUpsertRequest) -> Dict[str, Any
         "callback_url": str(request.callback_url),
         "table_schema_version_id": request.table_schema_version_id,
         "table_schema": _prepare_table_schema(request.table_schema),
+        "model": request.model,
+        "model_provider": request.model_provider,
     }
+    payload.update(
+        {
+            "ge_profiling_json": None,
+            "ge_profiling_json_size_bytes": None,
+            "ge_profiling_summary": None,
+            "ge_profiling_summary_size_bytes": None,
+            "ge_profiling_total_size_bytes": None,
+            "ge_profiling_html_report_url": None,
+            "ge_result_desc_json": None,
+            "ge_result_desc_json_size_bytes": None,
+            "snippet_json": None,
+            "snippet_json_size_bytes": None,
+            "snippet_alias_json": None,
+            "snippet_alias_json_size_bytes": None,
+        }
+    )
+    payload["model_params"] = _prepare_model_params(request.model_params)
+    if request.llm_usage is not None:
+        llm_usage_json, _ = _serialize_json(request.llm_usage)
+        if llm_usage_json is not None:
+            payload["llm_usage"] = llm_usage_json
     if request.error_code is not None:
         logger.debug("Adding error_code: %s", request.error_code)
         payload["error_code"] = request.error_code
@ -80,35 +114,35 @@ def _apply_action_payload(
 ) -> None:
     logger.debug("Applying action-specific payload for action_type=%s", request.action_type)
     if request.action_type == ActionType.GE_PROFILING:
-        full_json, full_size = _serialize_json(request.result_json)
-        summary_json, summary_size = _serialize_json(request.result_summary_json)
+        full_json, full_size = _serialize_json(request.ge_profiling_json)
+        summary_json, summary_size = _serialize_json(request.ge_profiling_summary)
         if full_json is not None:
-            payload["ge_profiling_full"] = full_json
-            payload["ge_profiling_full_size_bytes"] = full_size
+            payload["ge_profiling_json"] = full_json
+            payload["ge_profiling_json_size_bytes"] = full_size
         if summary_json is not None:
             payload["ge_profiling_summary"] = summary_json
             payload["ge_profiling_summary_size_bytes"] = summary_size
-        if full_size is not None or summary_size is not None:
-            payload["ge_profiling_total_size_bytes"] = (full_size or 0) + (
-                summary_size or 0
-            )
-        if request.html_report_url:
-            payload["ge_profiling_html_report_url"] = request.html_report_url
+        if request.ge_profiling_total_size_bytes is not None:
+            payload["ge_profiling_total_size_bytes"] = request.ge_profiling_total_size_bytes
+        elif full_size is not None or summary_size is not None:
+            payload["ge_profiling_total_size_bytes"] = (full_size or 0) + (summary_size or 0)
+        if request.ge_profiling_html_report_url:
+            payload["ge_profiling_html_report_url"] = request.ge_profiling_html_report_url
     elif request.action_type == ActionType.GE_RESULT_DESC:
-        full_json, full_size = _serialize_json(request.result_json)
+        full_json, full_size = _serialize_json(request.ge_result_desc_json)
         if full_json is not None:
-            payload["ge_result_desc_full"] = full_json
-            payload["ge_result_desc_full_size_bytes"] = full_size
+            payload["ge_result_desc_json"] = full_json
+            payload["ge_result_desc_json_size_bytes"] = full_size
     elif request.action_type == ActionType.SNIPPET:
-        full_json, full_size = _serialize_json(request.result_json)
+        full_json, full_size = _serialize_json(request.snippet_json)
         if full_json is not None:
-            payload["snippet_full"] = full_json
-            payload["snippet_full_size_bytes"] = full_size
+            payload["snippet_json"] = full_json
+            payload["snippet_json_size_bytes"] = full_size
     elif request.action_type == ActionType.SNIPPET_ALIAS:
-        full_json, full_size = _serialize_json(request.result_json)
+        full_json, full_size = _serialize_json(request.snippet_alias_json)
         if full_json is not None:
-            payload["snippet_alias_full"] = full_json
-            payload["snippet_alias_full_size_bytes"] = full_size
+            payload["snippet_alias_json"] = full_json
+            payload["snippet_alias_json_size_bytes"] = full_size
     else:
         logger.error("Unsupported action type encountered: %s", request.action_type)
         raise ValueError(f"Unsupported action type '{request.action_type}'.")
@ -182,3 +216,425 @@ def upsert_action_result(request: TableSnippetUpsertRequest) -> TableSnippetUpse
         status=request.status,
         updated=updated,
     )
def _decode_json_field(value: Any) -> Any:
"""Decode JSON columns that may be returned as str/bytes/dicts/lists."""
if value is None:
return None
if isinstance(value, (dict, list)):
return value
if isinstance(value, (bytes, bytearray)):
try:
value = value.decode("utf-8")
except Exception: # pragma: no cover - defensive
return None
if isinstance(value, str):
try:
return json.loads(value)
except json.JSONDecodeError:
logger.warning("Failed to decode JSON field: %s", value)
return None
return None
def _coerce_json_array(value: Any) -> List[Any]:
decoded = _decode_json_field(value)
return decoded if isinstance(decoded, list) else []
def _fetch_action_payload(
engine: Engine, table_id: int, version_ts: int, action_type: ActionType
) -> Optional[Dict[str, Any]]:
sql = text(
"""
SELECT id AS action_result_id, snippet_json, snippet_alias_json, updated_at, status
FROM action_results
WHERE table_id = :table_id
AND version_ts = :version_ts
AND action_type = :action_type
AND status IN ('success', 'partial')
ORDER BY CASE status WHEN 'success' THEN 0 ELSE 1 END, updated_at DESC
LIMIT 1
"""
)
with engine.connect() as conn:
row = conn.execute(
sql,
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": action_type.value,
},
).mappings().first()
return dict(row) if row else None
def _load_snippet_sources(
engine: Engine, table_id: int, version_ts: int
) -> Tuple[List[Any], List[Any], Optional[datetime], Optional[int], Optional[int]]:
alias_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET_ALIAS)
snippet_row = _fetch_action_payload(engine, table_id, version_ts, ActionType.SNIPPET)
snippet_json = _coerce_json_array(alias_row.get("snippet_json") if alias_row else None)
alias_json = _coerce_json_array(alias_row.get("snippet_alias_json") if alias_row else None)
updated_at: Optional[datetime] = alias_row.get("updated_at") if alias_row else None
alias_action_id: Optional[int] = alias_row.get("action_result_id") if alias_row else None
snippet_action_id: Optional[int] = snippet_row.get("action_result_id") if snippet_row else None
if not snippet_json and snippet_row:
snippet_json = _coerce_json_array(snippet_row.get("snippet_json"))
if updated_at is None:
updated_at = snippet_row.get("updated_at")
if alias_action_id is None:
alias_action_id = snippet_action_id
if not updated_at and alias_row:
updated_at = alias_row.get("updated_at")
return snippet_json, alias_json, updated_at, alias_action_id, snippet_action_id
def _normalize_aliases(raw_aliases: Any) -> List[Dict[str, Any]]:
aliases: List[Dict[str, Any]] = []
seen: set[str] = set()
if not raw_aliases:
return aliases
if not isinstance(raw_aliases, list):
return aliases
for item in raw_aliases:
if isinstance(item, dict):
text_val = item.get("text")
if not text_val or text_val in seen:
continue
seen.add(text_val)
aliases.append({"text": text_val, "tone": item.get("tone")})
elif isinstance(item, str):
if item in seen:
continue
seen.add(item)
aliases.append({"text": item})
return aliases
def _normalize_str_list(values: Any) -> List[str]:
if not values:
return []
if not isinstance(values, list):
return []
seen: set[str] = set()
normalised: List[str] = []
for val in values:
if not isinstance(val, str):
continue
if val in seen:
continue
seen.add(val)
normalised.append(val)
return normalised
def _merge_alias_lists(primary: List[Dict[str, Any]], secondary: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
merged: List[Dict[str, Any]] = []
seen: set[str] = set()
for source in (primary, secondary):
for item in source:
if not isinstance(item, dict):
continue
text_val = item.get("text")
if not text_val or text_val in seen:
continue
seen.add(text_val)
merged.append({"text": text_val, "tone": item.get("tone")})
return merged
def _merge_str_lists(primary: List[str], secondary: List[str]) -> List[str]:
merged: List[str] = []
seen: set[str] = set()
for source in (primary, secondary):
for item in source:
if item in seen:
continue
seen.add(item)
merged.append(item)
return merged
def _build_alias_map(alias_payload: List[Any]) -> Dict[str, Dict[str, Any]]:
alias_map: Dict[str, Dict[str, Any]] = {}
for item in alias_payload:
if not isinstance(item, dict):
continue
alias_id = item.get("id")
if not alias_id:
continue
existing = alias_map.setdefault(
alias_id,
{"aliases": [], "keywords": [], "intent_tags": []},
)
existing["aliases"] = _merge_alias_lists(
existing["aliases"], _normalize_aliases(item.get("aliases"))
)
existing["keywords"] = _merge_str_lists(
existing["keywords"], _normalize_str_list(item.get("keywords"))
)
existing["intent_tags"] = _merge_str_lists(
existing["intent_tags"], _normalize_str_list(item.get("intent_tags"))
)
return alias_map
def merge_snippet_records_from_db(
table_id: int,
version_ts: int,
*,
engine: Optional[Engine] = None,
) -> List[Dict[str, Any]]:
"""
Load snippet + snippet_alias JSON from action_results after snippet_alias is stored,
then merge into a unified snippet object list ready for downstream RAG.
"""
engine = engine or get_engine()
snippets, aliases, updated_at, alias_action_id, snippet_action_id = _load_snippet_sources(
engine, table_id, version_ts
)
alias_map = _build_alias_map(aliases)
merged: List[Dict[str, Any]] = []
seen_ids: set[str] = set()
for snippet in snippets:
if not isinstance(snippet, dict):
continue
snippet_id = snippet.get("id")
if not snippet_id:
continue
alias_info = alias_map.get(snippet_id)
record = dict(snippet)
record_aliases = _normalize_aliases(record.get("aliases"))
record_keywords = _normalize_str_list(record.get("keywords"))
record_intents = _normalize_str_list(record.get("intent_tags"))
if alias_info:
record_aliases = _merge_alias_lists(record_aliases, alias_info["aliases"])
record_keywords = _merge_str_lists(record_keywords, alias_info["keywords"])
record_intents = _merge_str_lists(record_intents, alias_info["intent_tags"])
record["aliases"] = record_aliases
record["keywords"] = record_keywords
record["intent_tags"] = record_intents
record["table_id"] = table_id
record["version_ts"] = version_ts
record["updated_at_from_action"] = updated_at
record["source"] = "snippet"
record["action_result_id"] = alias_action_id or snippet_action_id
merged.append(record)
seen_ids.add(snippet_id)
for alias_id, alias_info in alias_map.items():
if alias_id in seen_ids:
continue
if alias_action_id is None and snippet_action_id is None:
continue
merged.append(
{
"id": alias_id,
"aliases": alias_info["aliases"],
"keywords": alias_info["keywords"],
"intent_tags": alias_info["intent_tags"],
"table_id": table_id,
"version_ts": version_ts,
"updated_at_from_action": updated_at,
"source": "alias_only",
"action_result_id": alias_action_id or snippet_action_id,
}
)
return merged
def _stable_rag_item_id(table_id: int, version_ts: int, snippet_id: str) -> int:
digest = hashlib.md5(f"{table_id}:{version_ts}:{snippet_id}".encode("utf-8")).hexdigest()
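    # First 16 hex chars = top 64 bits of the MD5 digest; the modulo keeps the id
    # positive and inside signed 64-bit range, so the same (table_id, version_ts,
    # snippet_id) always maps to the same rag_item_id.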
return int(digest[:16], 16) % 9_000_000_000_000_000_000
def _to_serializable(value: Any) -> Any:
if value is None or isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, datetime):
return value.isoformat()
if isinstance(value, dict):
return {k: _to_serializable(v) for k, v in value.items()}
if isinstance(value, list):
return [_to_serializable(v) for v in value]
return str(value)
def _build_rag_text(snippet: Dict[str, Any]) -> str:
# Deterministic text concatenation for embedding input.
parts: List[str] = []
def _add(label: str, value: Any) -> None:
if value is None:
return
if isinstance(value, list):
value = ", ".join([str(v) for v in value if v])
elif isinstance(value, dict):
value = json.dumps(value, ensure_ascii=False)
if value:
parts.append(f"{label}: {value}")
_add("Title", snippet.get("title") or snippet.get("id"))
_add("Description", snippet.get("desc"))
_add("Business", snippet.get("business_caliber"))
_add("Type", snippet.get("type"))
_add("Examples", snippet.get("examples") or [])
_add("Aliases", [a.get("text") for a in snippet.get("aliases") or [] if isinstance(a, dict)])
_add("Keywords", snippet.get("keywords") or [])
_add("IntentTags", snippet.get("intent_tags") or [])
_add("Applicability", snippet.get("applicability"))
_add("DialectSQL", snippet.get("dialect_sql"))
return "\n".join(parts)
def _prepare_rag_payloads(
snippets: List[Dict[str, Any]],
table_id: int,
version_ts: int,
workspace_id: int,
rag_item_type: str = "SNIPPET",
) -> Tuple[List[Dict[str, Any]], List[RagItemPayload]]:
rows: List[Dict[str, Any]] = []
payloads: List[RagItemPayload] = []
now = datetime.utcnow()
for snippet in snippets:
snippet_id = snippet.get("id")
if not snippet_id:
continue
action_result_id = snippet.get("action_result_id")
if action_result_id is None:
logger.warning(
"Skipping snippet without action_result_id for RAG ingestion (table_id=%s version_ts=%s snippet_id=%s)",
table_id,
version_ts,
snippet_id,
)
continue
rag_item_id = _stable_rag_item_id(table_id, version_ts, snippet_id)
rag_text = _build_rag_text(snippet)
serializable_snippet = _to_serializable(snippet)
merged_json = json.dumps(serializable_snippet, ensure_ascii=False)
updated_at_raw = snippet.get("updated_at_from_action") or now
if isinstance(updated_at_raw, str):
try:
updated_at = datetime.fromisoformat(updated_at_raw)
except ValueError:
updated_at = now
else:
updated_at = updated_at_raw if isinstance(updated_at_raw, datetime) else now
created_at = updated_at
row = {
"rag_item_id": rag_item_id,
"workspace_id": workspace_id,
"table_id": table_id,
"version_ts": version_ts,
"created_at": created_at,
"action_result_id": action_result_id,
"snippet_id": snippet_id,
"rag_text": rag_text,
"merged_json": merged_json,
"updated_at": updated_at,
}
rows.append(row)
payloads.append(
RagItemPayload(
id=rag_item_id,
workspaceId=workspace_id,
name=snippet.get("title") or snippet_id,
embeddingData=rag_text,
type=rag_item_type or "SNIPPET",
)
)
return rows, payloads
def _upsert_rag_snippet_rows(engine: Engine, rows: Sequence[Dict[str, Any]]) -> None:
if not rows:
return
delete_sql = text("DELETE FROM rag_snippet WHERE rag_item_id=:rag_item_id")
insert_sql = text(
"""
INSERT INTO rag_snippet (
rag_item_id,
workspace_id,
table_id,
version_ts,
created_at,
action_result_id,
snippet_id,
rag_text,
merged_json,
updated_at
) VALUES (
:rag_item_id,
:workspace_id,
:table_id,
:version_ts,
:created_at,
:action_result_id,
:snippet_id,
:rag_text,
:merged_json,
:updated_at
)
"""
)
with engine.begin() as conn:
for row in rows:
conn.execute(delete_sql, row)
conn.execute(insert_sql, row)
async def ingest_snippet_rag_from_db(
table_id: int,
version_ts: int,
*,
workspace_id: int,
rag_item_type: str = "SNIPPET",
client,
engine: Optional[Engine] = None,
rag_client: Optional[RagAPIClient] = None,
) -> List[int]:
"""
Merge snippet + alias JSON from action_results, persist to rag_snippet, then push to RAG via addBatch.
Returns list of rag_item_id ingested.
"""
engine = engine or get_engine()
snippets = merge_snippet_records_from_db(table_id, version_ts, engine=engine)
if not snippets:
logger.info(
"No snippets available for RAG ingestion (table_id=%s version_ts=%s)",
table_id,
version_ts,
)
return []
rows, payloads = _prepare_rag_payloads(
snippets,
table_id=table_id,
version_ts=version_ts,
workspace_id=workspace_id,
rag_item_type=rag_item_type,
)
_upsert_rag_snippet_rows(engine, rows)
rag_client = rag_client or RagAPIClient()
await rag_client.add_batch(client, payloads)
return [row["rag_item_id"] for row in rows]
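
A hedged end-to-end sketch of calling the ingestion helper above; the module path is an assumption, since the file name is not visible in this view.

# Hypothetical usage sketch (not part of the diff); the module path of
# ingest_snippet_rag_from_db is an assumption for illustration.
import asyncio

import httpx

from app.services.snippet_store import ingest_snippet_rag_from_db


async def demo() -> None:
    async with httpx.AsyncClient(timeout=60.0) as client:
        rag_item_ids = await ingest_snippet_rag_from_db(
            table_id=101,
            version_ts=1761752207,
            workspace_id=1,
            client=client,
        )
    print(f"Ingested {len(rag_item_ids)} RAG items")


asyncio.run(demo())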

View File

@ -20,7 +20,11 @@ PROVIDER_KEY_ENV_MAP: Dict[str, str] = {
 }
-DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "openai:gpt-4.1-mini")
+DEFAULT_IMPORT_MODEL = os.getenv("DEFAULT_IMPORT_MODEL", "deepseek:deepseek-chat")
+NEW_API_BASE_URL = os.getenv("NEW_API_BASE_URL")
+NEW_API_AUTH_TOKEN = os.getenv("NEW_API_AUTH_TOKEN")
+RAG_API_BASE_URL = os.getenv("RAG_API_BASE_URL", "https://tchatbi.agentcarrier.cn/chatbi/api")
+RAG_API_AUTH_TOKEN = os.getenv("RAG_API_AUTH_TOKEN")
 @lru_cache(maxsize=1)

116
app/utils/llm_usage.py Normal file
View File

@ -0,0 +1,116 @@
from __future__ import annotations
from typing import Any, Dict, Iterable, Optional
PROMPT_TOKEN_KEYS: tuple[str, ...] = ("prompt_tokens", "input_tokens", "promptTokenCount")
COMPLETION_TOKEN_KEYS: tuple[str, ...] = (
"completion_tokens",
"output_tokens",
"candidatesTokenCount",
)
TOTAL_TOKEN_KEYS: tuple[str, ...] = ("total_tokens", "totalTokenCount")
USAGE_CONTAINER_KEYS: tuple[str, ...] = ("usage", "usageMetadata", "usage_metadata")
def _normalize_usage_value(value: Any) -> Any:
if isinstance(value, (int, float)):
return int(value)
if isinstance(value, str):
stripped = value.strip()
if not stripped:
return None
try:
numeric = float(stripped)
except ValueError:
return None
return int(numeric)
if isinstance(value, dict):
normalized: Dict[str, Any] = {}
for key, nested_value in value.items():
normalized_value = _normalize_usage_value(nested_value)
if normalized_value is not None:
normalized[key] = normalized_value
return normalized or None
if isinstance(value, (list, tuple, set)):
normalized_list = [
item for item in (_normalize_usage_value(element) for element in value) if item is not None
]
return normalized_list or None
return None
def _first_numeric(payload: Dict[str, Any], keys: Iterable[str]) -> Optional[int]:
for key in keys:
value = payload.get(key)
if isinstance(value, (int, float)):
return int(value)
return None
def _canonicalize_counts(payload: Dict[str, Any]) -> None:
prompt = _first_numeric(payload, PROMPT_TOKEN_KEYS)
completion = _first_numeric(payload, COMPLETION_TOKEN_KEYS)
total = _first_numeric(payload, TOTAL_TOKEN_KEYS)
if prompt is not None:
payload["prompt_tokens"] = prompt
else:
payload.pop("prompt_tokens", None)
if completion is not None:
payload["completion_tokens"] = completion
else:
payload.pop("completion_tokens", None)
if total is not None:
payload["total_tokens"] = total
elif prompt is not None and completion is not None:
payload["total_tokens"] = prompt + completion
else:
payload.pop("total_tokens", None)
for alias in PROMPT_TOKEN_KEYS[1:]:
payload.pop(alias, None)
for alias in COMPLETION_TOKEN_KEYS[1:]:
payload.pop(alias, None)
for alias in TOTAL_TOKEN_KEYS[1:]:
payload.pop(alias, None)
def _extract_usage_container(candidate: Any) -> Optional[Dict[str, Any]]:
if not isinstance(candidate, dict):
return None
for key in USAGE_CONTAINER_KEYS:
value = candidate.get(key)
if isinstance(value, dict):
return value
return None
def extract_usage(payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Unified helper to parse token usage metadata from diverse provider responses."""
if not isinstance(payload, dict):
return None
usage_candidate = _extract_usage_container(payload)
if usage_candidate is None:
raw_section = payload.get("raw")
usage_candidate = _extract_usage_container(raw_section)
if usage_candidate is None:
return None
normalized = _normalize_usage_value(usage_candidate)
if not isinstance(normalized, dict):
return None
_canonicalize_counts(normalized)
return normalized or None
__all__ = ["extract_usage"]
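
A small self-contained check of the helper above against two usage shapes it is meant to cover (an OpenAI-style `usage` block and a Gemini-style `usageMetadata` block); values are illustrative only.

# Example inputs for extract_usage; values are illustrative only.
from app.utils.llm_usage import extract_usage

openai_like = {"usage": {"prompt_tokens": 1078, "completion_tokens": 256, "total_tokens": 1334}}
gemini_like = {"usageMetadata": {"promptTokenCount": "12", "candidatesTokenCount": "34"}}

print(extract_usage(openai_like))  # {'prompt_tokens': 1078, 'completion_tokens': 256, 'total_tokens': 1334}
print(extract_usage(gemini_like))  # {'prompt_tokens': 12, 'completion_tokens': 34, 'total_tokens': 46}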

View File

@ -1,41 +0,0 @@
{
"provider": "deepseek",
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
}
}
],
"raw": {
"id": "67f3cc80-38bc-4bb7-b336-48d4886722c4",
"object": "chat.completion",
"created": 1761752207,
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
},
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 1078,
"completion_tokens": 256,
"total_tokens": 1334,
"prompt_tokens_details": {
"cached_tokens": 1024
},
"prompt_cache_hit_tokens": 1024,
"prompt_cache_miss_tokens": 54
},
"system_fingerprint": "fp_ffc7281d48_prod0820_fp8_kvcache"
}
}

View File

@ -0,0 +1 @@
{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["service_point_id"], "table": "data-ge.water_meter_info", "columns": [{"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}, {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.03666666666666667, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.9, "metric_candidate_score": 0.0}, {"name": "service_point_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列只有一个唯一值 '有效'。", "enumish": true, "null_rate": 
0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}], "quality": {"warning_hints": ["列 'meter_status' 只有一个唯一值 '有效',可能为常量列。"], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["表角色(role)被推断为 'dimension'因为其列几乎完全由ID和类别属性构成且缺少数值指标或时间序列列。", "主键候选(primary_key_candidates) 'service_point_id' 和 'account_id' 是基于命名约定(包含'_id'推断的。其唯一性和非空性未在GE结果中直接度量因此这是一个高置信度的猜测。", "表粒度(grain)可能为 'service_point',与推断的主键 'service_point_id' 相对应。", "未根据列名或数据格式识别出时间列。"], "primary_key_candidates": [["service_point_id"], ["account_id"]]}

View File

@ -0,0 +1,180 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"aliases": [
{
"text": "各个区有多少水表",
"tone": "口语"
},
{
"text": "按维度统计用水点数",
"tone": "中性"
},
{
"text": "各维度用水点数量分布",
"tone": "专业"
}
],
"keywords": [
"用水点数",
"service_point_count",
"数量",
"统计",
"汇总",
"aggregate",
"维度",
"dimension",
"区域",
"district",
"供水所",
"分组统计",
"水表"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn-service-points-by-dimension",
"aliases": [
{
"text": "哪个地方水表最多",
"tone": "口语"
},
{
"text": "用水点数Top-N排名",
"tone": "中性"
},
{
"text": "Top-N用水点数维度排行",
"tone": "专业"
}
],
"keywords": [
"Top-N",
"top",
"排名",
"排行",
"ranking",
"最多",
"用水点数",
"service_point_count",
"维度",
"dimension",
"站点",
"station",
"水表"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_ratio-service-points-by-dimension",
"aliases": [
{
"text": "各种水表各占多少",
"tone": "口语"
},
{
"text": "各维度用水点数占比",
"tone": "中性"
},
{
"text": "用水点维度构成分析",
"tone": "专业"
}
],
"keywords": [
"占比",
"percentage",
"百分比",
"ratio",
"构成",
"分布",
"用水点数",
"水表类型",
"meter_type",
"维度",
"dimension",
"水表"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_quality-check-duplicate-spid",
"aliases": [
{
"text": "有没有重复的水表号",
"tone": "口语"
},
{
"text": "检查重复的用水点ID",
"tone": "中性"
},
{
"text": "用水点ID唯一性校验",
"tone": "专业"
}
],
"keywords": [
"数据质量",
"quality",
"检查",
"校验",
"重复",
"duplicate",
"唯一性",
"uniqueness",
"用水点ID",
"service_point_id",
"异常检测",
"主键"
],
"intent_tags": [
"quality",
"by_dimension"
]
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"aliases": [
{
"text": "给我看城区的机械表",
"tone": "口语"
},
{
"text": "按多维度筛选用水点",
"tone": "中性"
},
{
"text": "多维组合条件过滤用水点",
"tone": "专业"
}
],
"keywords": [
"筛选",
"过滤",
"filter",
"查询",
"明细",
"列表",
"sample",
"用水点",
"区域",
"district",
"水表类型",
"meter_type",
"条件查询"
],
"intent_tags": [
"sample",
"filter"
]
}
]

View File

@ -0,0 +1,186 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"desc": "按指定维度(如区域、供水所)分组,统计各分类下的用水点数量。",
"type": "aggregate",
"title": "按维度统计用水点数",
"examples": [
"按区域统计用水点数量",
"各个供水所分别有多少个用水点"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "district"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"适用于对水表档案信息进行分类汇总统计。",
"可将变量 ${dimension_column} 替换为任一维度列,如 district, supply_office, station, meter_type 等。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数,代表一个独立的服务点(通常对应一个水表)。统计粒度为“指定维度”。"
},
{
"id": "snpt_topn-service-points-by-dimension",
"desc": "按指定维度如区域、站点统计用水点数并展示数量最多的前N个分类。",
"type": "topn",
"title": "Top-N 用水点数维度排名",
"examples": [
"哪个区域的用水点最多",
"用水点数排名前5的站点是哪些"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "station"
},
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC\nLIMIT ${top_n};"
},
"applicability": {
"constraints": {
"notes": [
"维度 `station` 基数较高 (36),建议 Top-N 查询时结合业务场景合理设置 N 值。"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数。排名依据为各维度分类下的用水点总数。统计粒度为“指定维度”。"
},
{
"id": "snpt_ratio-service-points-by-dimension",
"desc": "计算在指定维度下,各分类的用水点数占总用水点数的百分比,以分析其分布构成。",
"type": "ratio",
"title": "各维度用水点数占比",
"examples": [
"不同水表类型meter_type的分布情况",
"各个区域的用水点占比是多少"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "meter_type"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count,\n COUNT(DISTINCT service_point_id) * 100.0 / SUM(COUNT(DISTINCT service_point_id)) OVER () AS percentage\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"SQL模板使用了窗口函数 SUM() OVER()请确保MySQL版本支持8.0+)。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数占比:某分类下的用水点数 / 总用水点数。用水点数以 `service_point_id` 去重计数。统计粒度为“指定维度”。"
},
{
"id": "snpt_quality-check-duplicate-spid",
"desc": "查找在用水点信息表中存在重复的 `service_point_id`,用于数据质量校验。",
"type": "quality",
"title": "检查重复的用水点ID",
"examples": [
"检查是否存在重复的水表档案",
"校验用水点ID的唯一性"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n COUNT(*) AS occurrences\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n service_point_id\nHAVING\n COUNT(*) > 1;"
},
"applicability": {
"constraints": {
"notes": [
"预期返回结果为空。若有返回,则表示数据存在一致性问题,`service_point_id` 未能作为唯一主键。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "重复项:指 `service_point_id` 出现次数大于1的记录。此ID应为表的主键理论上不应重复。"
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"desc": "根据区域、水表类型、供水所等多个维度组合条件,筛选出符合条件的用水点明细。",
"type": "sample",
"title": "多维度筛选用水点列表",
"examples": [
"查询城区的机械表有哪些",
"拉取某个供水所下特定口径水表的列表"
],
"variables": [
{
"name": "district_name",
"type": "string",
"default": "城区"
},
{
"name": "meter_type_name",
"type": "string",
"default": "机械表"
},
{
"name": "limit_num",
"type": "int",
"default": 100
}
],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n account_id,\n district,\n supply_office,\n meter_type,\n meter_subtype,\n meter_diameter\nFROM\n `data-ge.water_meter_info`\nWHERE\n district = '${district_name}'\n AND meter_type = '${meter_type_name}'\n -- AND meter_status = '有效' -- 可选:根据画像,该列为常量'有效',可不加\nLIMIT ${limit_num};"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id",
"account_id",
"district",
"supply_office",
"meter_type",
"meter_subtype",
"meter_diameter"
]
},
"business_caliber": "返回满足所有筛选条件的用水点明细信息。`meter_status` 列只有一个值 '有效',通常无需作为筛选条件。"
}
]

View File

@ -0,0 +1,230 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空11 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空36 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.1,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空13 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空8 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;单一取值(\"有效\"",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空9 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空5 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空4 个枚举值GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示标识列;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.6,
"metric_candidate_score": 0.05
},
{
"name": "account_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示账户标识;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.5,
"metric_candidate_score": 0.05
}
],
"quality": {
"warning_hints": [
"以下列未设置非空校验service_point_id, account_id空值情况未知",
"未识别到时间列"
],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role 判定为 dimension表内列均为枚举/分类或ID未发现数值型度量或时间列34/34 期望均为分类枚举/非空与去重比例。",
"grain 猜测为 service_point_id仅依据命名启发式缺少唯一性与非空度量佐证置信度较低。",
"未识别时间列:列名与期望均未涉及日期/时间,也无最小/最大时间范围可推断。"
],
"primary_key_candidates": []
}

View File

@ -0,0 +1,372 @@
[
{
"id": "snpt_topn_station",
"aliases": [
{
"text": "站点水表排行前N",
"tone": "中性"
},
{
"text": "哪个站点表最多",
"tone": "口语"
},
{
"text": "按站点水表TopN",
"tone": "专业"
}
],
"keywords": [
"TopN",
"排名",
"排行",
"station",
"站点",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"排序",
"榜单"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_district",
"aliases": [
{
"text": "各辖区水表占比",
"tone": "中性"
},
{
"text": "哪个辖区占比高",
"tone": "口语"
},
{
"text": "按辖区水表比例",
"tone": "专业"
}
],
"keywords": [
"占比",
"ratio",
"district",
"辖区",
"水表数",
"meter count",
"distinct",
"去重",
"百分比",
"份额",
"聚合",
"排序",
"分布"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_dist_diameter",
"aliases": [
{
"text": "表径水表数分布",
"tone": "中性"
},
{
"text": "不同口径有多少",
"tone": "口语"
},
{
"text": "按表径去重计数",
"tone": "专业"
}
],
"keywords": [
"分布",
"distribution",
"meter_diameter",
"表径",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"类别",
"category",
"条形图",
"饼图",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_type_subtype_matrix",
"aliases": [
{
"text": "类型×子类水表数",
"tone": "中性"
},
{
"text": "看各类型各子类",
"tone": "口语"
},
{
"text": "类型子类组合统计",
"tone": "专业"
}
],
"keywords": [
"类型",
"type",
"子类",
"subtype",
"组合",
"matrix",
"交叉分析",
"cross-tab",
"水表数",
"meter count",
"distinct",
"去重",
"分布",
"聚合",
"维度"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_quality_spid_uniq",
"aliases": [
{
"text": "服务点ID唯一性检",
"tone": "专业"
},
{
"text": "服务点ID有重复吗",
"tone": "口语"
},
{
"text": "服务点ID完整性评估",
"tone": "中性"
}
],
"keywords": [
"质量检查",
"quality",
"唯一性",
"uniqueness",
"重复",
"duplicate",
"空值",
"NULL",
"完整性",
"integrity",
"service_point_id",
"数据质量",
"统计",
"去重",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_quality_account_nulls",
"aliases": [
{
"text": "账户ID缺失明细",
"tone": "中性"
},
{
"text": "看看哪些账户为空",
"tone": "口语"
},
{
"text": "account_id空值样本",
"tone": "专业"
}
],
"keywords": [
"质量检查",
"缺失",
"missing",
"空值",
"NULL",
"account_id",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"排查",
"过滤",
"WHERE",
"LIMIT"
],
"intent_tags": [
"quality",
"sample"
]
},
{
"id": "snpt_sample_random_rows",
"aliases": [
{
"text": "随机抽样水表明细",
"tone": "中性"
},
{
"text": "随机取几条看看",
"tone": "口语"
},
{
"text": "RAND()样本抽取",
"tone": "专业"
}
],
"keywords": [
"随机",
"random",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"details",
"质检",
"QA",
"RAND()",
"LIMIT",
"抽取",
"数据验证"
],
"intent_tags": [
"sample"
]
},
{
"id": "snpt_filter_office_type_where",
"aliases": [
{
"text": "按所与类型过滤有效",
"tone": "专业"
},
{
"text": "筛选某所的指定类型",
"tone": "中性"
},
{
"text": "只看这所的这种表",
"tone": "口语"
}
],
"keywords": [
"过滤",
"filter",
"WHERE",
"supply_office",
"营业所",
"meter_type",
"类型",
"meter_status",
"有效",
"条件片段",
"筛选",
"查询拼接",
"字段",
"约束"
],
"intent_tags": [
"filter"
]
},
{
"id": "snpt_office_station_dist",
"aliases": [
{
"text": "所站组合水表数",
"tone": "中性"
},
{
"text": "各站在各所有多少",
"tone": "口语"
},
{
"text": "营业所×站点分布",
"tone": "专业"
}
],
"keywords": [
"supply_office",
"营业所",
"station",
"站点",
"层级",
"hierarchy",
"分布",
"distribution",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"交叉分析",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_total_meter_baseline",
"aliases": [
{
"text": "水表总量基线",
"tone": "中性"
},
{
"text": "现在有多少水表",
"tone": "口语"
},
{
"text": "全表去重总数",
"tone": "专业"
}
],
"keywords": [
"总量",
"total",
"baseline",
"基线",
"水表总数",
"meter total",
"service_point_id",
"distinct",
"去重",
"分母",
"denominator",
"占比",
"聚合",
"汇总",
"snapshot"
],
"intent_tags": [
"aggregate"
]
}
]

View File

@ -0,0 +1,330 @@
[
{
"id": "snpt_topn_station",
"desc": "按站点统计水表数量并取前N",
"type": "topn",
"title": "站点TopN水表数",
"examples": [
"各站点水表数量排名前10",
"站点水表覆盖情况排行"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY station\nORDER BY meter_cnt DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"TopN建议N<=36",
"以service_point_id去重计数",
"无时间列,无法做趋势"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=站点。仅统计当前表中的有效记录不含时间口径。安全限制用于分析排名避免扩大LIMIT造成全量导出。"
},
{
"id": "snpt_share_district",
"desc": "统计各辖区水表数及其占比",
"type": "ratio",
"title": "辖区水表占比",
"examples": [
"各辖区水表占比",
"哪个辖区水表最多"
],
"variables": [],
"dialect_sql": {
"mysql": "WITH by_district AS (\n SELECT district, COUNT(DISTINCT service_point_id) AS meter_cnt\n FROM `data-ge`.`water_meter_info`\n GROUP BY district\n), tot AS (\n SELECT COUNT(DISTINCT service_point_id) AS total_cnt\n FROM `data-ge`.`water_meter_info`\n)\nSELECT b.district,\n b.meter_cnt,\n ROUND(b.meter_cnt / NULLIF(t.total_cnt, 0) * 100, 2) AS pct\nFROM by_district b\nCROSS JOIN tot t\nORDER BY pct DESC, b.district;"
},
"applicability": {
"constraints": {
"notes": [
"占比分母为全表service_point_id去重总数",
"service_point_id为空将被忽略"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": null,
"required_columns": [
"district",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=辖区。占比=辖区水表数/全表水表总数。安全限制:仅基于本表,不代表全市/全网口径;无时间维度。"
},
{
"id": "snpt_dist_diameter",
"desc": "按表径统计水表数量分布",
"type": "aggregate",
"title": "表径分布统计",
"examples": [
"不同口径水表有多少",
"查看表径分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_diameter,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_diameter\nORDER BY meter_cnt DESC, meter_diameter;"
},
"applicability": {
"constraints": {
"notes": [
"以service_point_id去重计数",
"适合绘制条形图/饼图"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": null,
"required_columns": [
"meter_diameter",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=表径。安全限制:仅用于分布分析,不含时间过滤;避免用于明细导出。"
},
{
"id": "snpt_type_subtype_matrix",
"desc": "统计水表类型与子类组合的数量",
"type": "aggregate",
"title": "类型子类分布",
"examples": [
"不同类型与子类的水表数量",
"查看类型与子类的组合分布"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type,\n meter_subtype,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_type, meter_subtype\nORDER BY meter_cnt DESC, meter_type, meter_subtype;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=5×9=45",
"以service_point_id去重计数"
],
"fk_join_available": false,
"dim_cardinality_hint": 45
},
"time_column": null,
"required_columns": [
"meter_type",
"meter_subtype",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=类型×子类组合。安全限制:仅用于汇总分析,不包含时间或业务状态变化。"
},
{
"id": "snpt_quality_spid_uniq",
"desc": "评估service_point_id的空值与重复情况",
"type": "quality",
"title": "服务点唯一性检",
"examples": [
"检查服务点ID是否唯一",
"统计service_point_id空值与重复情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n COUNT(*) AS total_rows,\n SUM(service_point_id IS NULL) AS null_cnt,\n COUNT(DISTINCT service_point_id) AS distinct_cnt,\n (COUNT(*) - COUNT(DISTINCT service_point_id)) AS duplicate_rows_est,\n (\n SELECT COUNT(*) FROM (\n SELECT service_point_id\n FROM `data-ge`.`water_meter_info`\n GROUP BY service_point_id\n HAVING COUNT(*) > 1\n ) AS dup\n ) AS dup_key_groups\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"用于键完整性检查",
"重复行估算=总行数-去重数"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "质量检查口径在本表内评估service_point_id的非空与唯一性不代表跨表全局唯一。安全限制仅输出汇总指标不暴露明细重复值。"
},
{
"id": "snpt_quality_account_nulls",
"desc": "抽取account_id为空的记录用于排查",
"type": "quality",
"title": "账户ID缺失明细",
"examples": [
"列出account_id为空的水表",
"抽样查看账户缺失的数据行"
],
"variables": [
{
"name": "limit_n",
"type": "int",
"default": 50
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nWHERE account_id IS NULL\nLIMIT {{limit_n}};"
},
"applicability": {
"constraints": {
"notes": [
"明细仅限小样本抽取",
"建议LIMIT<=100避免全量导出"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"account_id"
]
},
"business_caliber": "质量抽样筛出账户ID缺失的水表记录便于核对。安全限制仅用于样本排查不建议在生产中全量导出如需口径统计请改为COUNT聚合。"
},
{
"id": "snpt_sample_random_rows",
"desc": "随机抽取水表信息用于人工核验",
"type": "sample",
"title": "随机抽样明细",
"examples": [
"抽样查看水表信息",
"随机抽取20条做质检"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"使用RAND()随机,样本不可复现",
"建议限制样本量"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "样本抽取从本表随机返回若干行明细。安全限制避免扩大LIMIT进行全量下载如需可复现样本请改用带种子的随机方法MySQL不原生支持。"
},
{
"id": "snpt_filter_office_type_where",
"desc": "常用WHERE筛选条件片段按营业所与类型且为有效",
"type": "sample",
"title": "机构类型筛选片",
"examples": [
"筛选A营业所的机械表",
"仅查看某营业所的指定类型水表"
],
"variables": [
{
"name": "supply_office",
"type": "string"
},
{
"name": "meter_type",
"type": "string"
}
],
"dialect_sql": {
"mysql": "WHERE supply_office = '{{supply_office}}'\n AND meter_type = '{{meter_type}}'\n AND meter_status = '有效'"
},
"applicability": {
"constraints": {
"notes": [
"这是条件片段,可拼接到其他查询",
"meter_status当前为单一值“有效”"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": null,
"required_columns": [
"supply_office",
"meter_type",
"meter_status"
]
},
"business_caliber": "过滤口径仅保留指定营业所与指定水表类型、且状态为“有效”的记录。安全限制为片段用途需拼接在SELECT…FROM之后使用。"
},
{
"id": "snpt_office_station_dist",
"desc": "按营业所与站点组合统计水表数",
"type": "aggregate",
"title": "所站层级分布",
"examples": [
"按营业所查看各站点水表数",
"所站两级的水表分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT supply_office,\n station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY supply_office, station\nORDER BY supply_office, meter_cnt DESC, station;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=11×36=396",
"以service_point_id去重计数",
"如结果过长可再按TopN筛选"
],
"fk_join_available": false,
"dim_cardinality_hint": 396
},
"time_column": null,
"required_columns": [
"supply_office",
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数粒度=营业所×站点。安全限制:结果行数可能较多,建议在可视化端增加筛选或分页。"
},
{
"id": "snpt_total_meter_baseline",
"desc": "获取全表水表去重总量基线",
"type": "aggregate",
"title": "水表总量基线",
"examples": [
"当前有多少只水表",
"作为占比分析的分母基线"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT COUNT(DISTINCT service_point_id) AS meter_total\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"作为其他占比/分摊分母基线",
"忽略service_point_id为空的记录"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "水表总量=按service_point_id去重计数基于当前表的全量记录。安全限制无时间维度无法反映存量随时间变化。"
}
]
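The snippet objects above pair a `dialect_sql.mysql` template containing `{{variable}}` placeholders with a `variables` list that supplies defaults. A minimal sketch of how a caller could render such a template before execution (assuming plain string substitution; `render_snippet_sql` is an illustrative helper, not part of the service):

from typing import Any, Dict, Optional

def render_snippet_sql(snippet: Dict[str, Any], overrides: Optional[Dict[str, Any]] = None) -> str:
    """Fill {{name}} placeholders in dialect_sql.mysql from variable defaults plus caller overrides."""
    overrides = overrides or {}
    sql = snippet["dialect_sql"]["mysql"]
    for var in snippet.get("variables", []):
        value = overrides.get(var["name"], var.get("default"))
        if value is None:
            raise ValueError(f"missing value for variable {var['name']}")
        if var.get("type") == "int":
            value = int(value)  # reject non-numeric input for numeric placeholders such as top_n
        sql = sql.replace("{{" + var["name"] + "}}", str(value))
    return sql

# Example with a TopN-style snippet (shape matches snpt_topn_station above)
snippet = {
    "dialect_sql": {"mysql": "SELECT station, COUNT(DISTINCT service_point_id) AS meter_cnt "
                             "FROM `data-ge`.`water_meter_info` GROUP BY station "
                             "ORDER BY meter_cnt DESC LIMIT {{top_n}};"},
    "variables": [{"name": "top_n", "type": "int", "default": 10}],
}
print(render_snippet_sql(snippet, {"top_n": 20}))  # keeps N small, in line with the "TopN建议N<=36" note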

View File

@ -0,0 +1,415 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"account_id",
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {},
"comment": "供水管理所名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "宝山供水管理所"
},
{
"pct": null,
"value": "黄浦供水管理所"
},
{
"pct": null,
"value": "青东供水管理所"
},
{
"pct": null,
"value": "虹口供水管理所"
},
{
"pct": null,
"value": "闸北供水管理所"
},
{
"pct": null,
"value": "松北供水管理所"
},
{
"pct": null,
"value": "杨浦供水管理所"
},
{
"pct": null,
"value": "长宁供水管理所"
},
{
"pct": null,
"value": "闵行供水管理所"
},
{
"pct": null,
"value": "徐汇供水管理所"
},
{
"pct": null,
"value": "普陀供水管理所"
}
],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.11,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {},
"comment": "站点名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "新闸站"
},
{
"pct": null,
"value": "宝杨站"
},
{
"pct": null,
"value": "江川站"
},
{
"pct": null,
"value": "长江站"
},
{
"pct": null,
"value": "市光站"
},
{
"pct": null,
"value": "徐泾站"
},
{
"pct": null,
"value": "真北站"
},
{
"pct": null,
"value": "半淞园站"
},
{
"pct": null,
"value": "芙蓉江站"
},
{
"pct": null,
"value": "密云站"
}
],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.36,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {},
"comment": "行政区划名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "普陀区"
},
{
"pct": null,
"value": "闵行区"
},
{
"pct": null,
"value": "嘉定区"
},
{
"pct": null,
"value": "杨浦区"
},
{
"pct": null,
"value": "徐汇区"
},
{
"pct": null,
"value": "黄浦区"
},
{
"pct": null,
"value": "松江区"
},
{
"pct": null,
"value": "长宁区"
},
{
"pct": null,
"value": "青浦区"
},
{
"pct": null,
"value": "虹口区"
}
],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.13,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {},
"comment": "水表直径规格,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "20mm"
},
{
"pct": null,
"value": "15mm"
},
{
"pct": null,
"value": "25mm"
},
{
"pct": null,
"value": "40mm"
},
{
"pct": null,
"value": "150mm"
},
{
"pct": null,
"value": "100mm"
},
{
"pct": null,
"value": "80mm"
},
{
"pct": null,
"value": "50mm"
}
],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.08,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {},
"comment": "水表状态,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "有效"
}
],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.01,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {},
"comment": "水表子类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "旋翼半液封式"
},
{
"pct": null,
"value": "超声波式"
},
{
"pct": null,
"value": "旋翼湿式(指针式)"
},
{
"pct": null,
"value": "旋翼湿式(数字指针式)"
},
{
"pct": null,
"value": "电磁式"
},
{
"pct": null,
"value": "无直管段要求超声波式"
},
{
"pct": null,
"value": "无直管段要求电磁式"
},
{
"pct": null,
"value": "垂直螺翼干式"
},
{
"pct": null,
"value": "机械容积式"
}
],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.09,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {},
"comment": "水表类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "容积式机械水表"
},
{
"pct": null,
"value": "速度式机械水表"
},
{
"pct": null,
"value": "电磁式远传水表"
},
{
"pct": null,
"value": "速度式机电远传水表"
},
{
"pct": null,
"value": "超声波式远传水表"
}
],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {},
"comment": "安装位置,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "嵌墙表"
},
{
"pct": null,
"value": "管道井表"
},
{
"pct": null,
"value": "地下表"
},
{
"pct": null,
"value": "龙头表"
}
],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.04,
"metric_candidate_score": 0.0
},
{
"name": "account_id",
"dtype": "string",
"stats": {},
"comment": "账户ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "string",
"stats": {},
"comment": "服务点ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
}
],
"quality": {
"warning_hints": [],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role判定为dimension因所有列均为枚举或ID类型无metric列",
"grain依据account_id和service_point_id为唯一标识推测",
"未发现时间列因此time字段为null"
],
"primary_key_candidates": [
[
"account_id"
],
[
"service_point_id"
]
]
}

View File

@ -0,0 +1,286 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"aliases": [
{
"text": "供水所水表排行",
"tone": "中性"
},
{
"text": "哪个供水所水表最多",
"tone": "口语"
},
{
"text": "供水管理所水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"供水管理所",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"供水所",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_station",
"aliases": [
{
"text": "站点水表数量排行",
"tone": "中性"
},
{
"text": "哪个站点水表最多",
"tone": "口语"
},
{
"text": "站点维度水表TopN分析",
"tone": "专业"
}
],
"keywords": [
"水表",
"站点",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"站点数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_district",
"aliases": [
{
"text": "区域水表数量排名",
"tone": "中性"
},
{
"text": "哪个区水表最多",
"tone": "口语"
},
{
"text": "行政区水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"区域",
"行政区",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"区",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_share_by_type",
"aliases": [
{
"text": "水表类型占比",
"tone": "中性"
},
{
"text": "哪种水表用得最多",
"tone": "口语"
},
{
"text": "水表类型分布比例",
"tone": "专业"
}
],
"keywords": [
"水表",
"类型",
"占比",
"比例",
"ratio",
"分布",
"meter_type",
"百分比",
"分类统计",
"水表类型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_water_meter_subtype_distribution",
"aliases": [
{
"text": "水表子类型分布",
"tone": "中性"
},
{
"text": "各种子类型水表情况",
"tone": "口语"
},
{
"text": "水表子类型计数与占比",
"tone": "专业"
}
],
"keywords": [
"水表",
"子类型",
"分布",
"数量",
"占比",
"meter_subtype",
"统计",
"count",
"百分比",
"分类统计",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_installation_position_stats",
"aliases": [
{
"text": "安装位置统计",
"tone": "中性"
},
{
"text": "哪种位置装表最多",
"tone": "口语"
},
{
"text": "水表安装位置分布",
"tone": "专业"
}
],
"keywords": [
"水表",
"安装位置",
"统计",
"分布",
"installation_position",
"数量",
"count",
"位置",
"安装点",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_grain_check",
"aliases": [
{
"text": "主键粒度校验",
"tone": "中性"
},
{
"text": "数据有没有重复",
"tone": "口语"
},
{
"text": "数据粒度一致性检查",
"tone": "专业"
}
],
"keywords": [
"主键",
"粒度",
"校验",
"质量",
"quality",
"重复",
"唯一性",
"account_id",
"service_point_id",
"数据校验",
"质量检查",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_water_meter_sample_records",
"aliases": [
{
"text": "水表数据抽样",
"tone": "中性"
},
{
"text": "给我看点水表数据",
"tone": "口语"
},
{
"text": "水表记录样本抽取",
"tone": "专业"
}
],
"keywords": [
"水表",
"样本",
"抽样",
"sample",
"随机",
"记录",
"抽查",
"limit",
"数据结构",
"数据示例",
"sample",
"limit_rows"
],
"intent_tags": [
"sample"
]
}
]

View File

@ -0,0 +1,235 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"desc": "统计各供水管理所下辖水表数量并排序",
"type": "topn",
"title": "供水管理所水表数量排行",
"examples": [
"列出水表最多的前10个供水管理所",
"各供水所水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT supply_office AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY supply_office ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office"
]
},
"business_caliber": "按供水管理所维度聚合水表总数,粒度=供水管理所"
},
{
"id": "snpt_water_meter_top_station",
"desc": "统计各个站点下辖水表数量并排序",
"type": "topn",
"title": "站点水表数量排行",
"examples": [
"列出水表最多的前10个站点",
"各站点水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY station ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [
"高基数维度建议LIMIT<=50"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station"
]
},
"business_caliber": "按站点维度聚合水表总数,粒度=站点"
},
{
"id": "snpt_water_meter_top_district",
"desc": "统计各区水表数量并排序",
"type": "topn",
"title": "区域水表数量排行",
"examples": [
"列出各区水表数量排名",
"哪个区的水表最多?"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT district AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY district ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district"
]
},
"business_caliber": "按行政区划维度聚合水表总数,粒度=区"
},
{
"id": "snpt_water_meter_share_by_type",
"desc": "计算各类水表占总水表的比例",
"type": "ratio",
"title": "水表类型占比分布",
"examples": [
"各类水表占比是多少?",
"哪种类型的水表使用最广泛?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type AS dim_value, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`) AS ratio_percent FROM `data-ge.water_meter_info` GROUP BY meter_type ORDER BY ratio_percent DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type"
]
},
"business_caliber": "按水表类型分类计算其占比,粒度=水表类型"
},
{
"id": "snpt_water_meter_subtype_distribution",
"desc": "展示不同水表子类型的数量及比例",
"type": "aggregate",
"title": "水表子类型分布情况",
"examples": [
"各种子类型水表的数量和占比",
"哪种子类型水表最多?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_subtype AS dim_value, COUNT(*) AS count_value, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`), 2) AS percentage FROM `data-ge.water_meter_info` GROUP BY meter_subtype ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 9
},
"time_column": "nullable",
"required_columns": [
"meter_subtype"
]
},
"business_caliber": "按水表子类型进行计数和百分比统计,粒度=水表子类型"
},
{
"id": "snpt_water_meter_installation_position_stats",
"desc": "统计不同安装位置下的水表数量",
"type": "aggregate",
"title": "安装位置分布统计",
"examples": [
"各种安装位置的水表数量",
"哪种安装位置最为常见?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT installation_position AS dim_value, COUNT(*) AS count_value FROM `data-ge.water_meter_info` GROUP BY installation_position ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 4
},
"time_column": "nullable",
"required_columns": [
"installation_position"
]
},
"business_caliber": "按安装位置对水表进行分组计数,粒度=安装位置"
},
{
"id": "snpt_water_meter_grain_check",
"desc": "验证 account_id 和 service_point_id 是否构成唯一组合",
"type": "quality",
"title": "主键粒度校验",
"examples": [
"这张表的数据粒度是否正确?",
"是否存在重复的服务点记录?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT IF(COUNT(*) = COUNT(DISTINCT account_id, service_point_id), 'PASS', 'FAIL') AS grain_check_result FROM `data-ge.water_meter_info`"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "检验数据是否符合预期的主键粒度account_id + service_point_id"
},
{
"id": "snpt_water_meter_sample_records",
"desc": "随机抽取部分水表信息用于查看结构",
"type": "sample",
"title": "样本抽取",
"examples": [
"给我看几条水表数据的例子",
"抽查一些原始数据看看格式"
],
"variables": [
{
"name": "limit_rows",
"type": "int",
"default": 5
}
],
"dialect_sql": {
"mysql": "SELECT * FROM `data-ge.water_meter_info` ORDER BY RAND() LIMIT {{limit_rows}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": []
},
"business_caliber": "从全量数据中随机采样若干条记录供参考"
}
]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,249 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"aliases": [
{
"text": "哪个供水所用户最多?",
"tone": "口语"
},
{
"text": "按供应办公室统计账户数量",
"tone": "中性"
},
{
"text": "供应办公室账户数TopN排名",
"tone": "专业"
}
],
"keywords": [
"供应办公室",
"账户数",
"TopN",
"排行",
"统计",
"account_id",
"supply_office",
"去重",
"高占比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_station_by_account",
"aliases": [
{
"text": "哪些站点用户最多?",
"tone": "口语"
},
{
"text": "按站点统计账户分布",
"tone": "中性"
},
{
"text": "站点账户数Top20排名",
"tone": "专业"
}
],
"keywords": [
"站点",
"账户数",
"TopN",
"排行",
"统计",
"station",
"account_id",
"去重",
"高负载",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_district_by_account",
"aliases": [
{
"text": "哪个区用户最多?",
"tone": "口语"
},
{
"text": "按行政区统计账户数量",
"tone": "中性"
},
{
"text": "行政区账户数全量排名",
"tone": "专业"
}
],
"keywords": [
"行政区",
"账户数",
"TopN",
"排行",
"统计",
"district",
"account_id",
"去重",
"区域对比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_of_meter_type",
"aliases": [
{
"text": "各类水表占多少比例?",
"tone": "口语"
},
{
"text": "水表类型占比分析",
"tone": "中性"
},
{
"text": "水表类型占比分布",
"tone": "专业"
}
],
"keywords": [
"水表类型",
"占比",
"比例",
"meter_type",
"account_id",
"去重",
"分布",
"主流类型",
"技术选型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_sample_account_service_point",
"aliases": [
{
"text": "随机看10条账户信息",
"tone": "口语"
},
{
"text": "抽样账户与服务点明细",
"tone": "中性"
},
{
"text": "账户-服务点随机抽样验证",
"tone": "专业"
}
],
"keywords": [
"抽样",
"随机",
"样本",
"account_id",
"service_point_id",
"数据质量",
"验证",
"唯一性",
"格式检查",
"sample",
"quality"
],
"intent_tags": [
"sample",
"quality"
]
},
{
"id": "snpt_filter_meter_status_valid",
"aliases": [
{
"text": "只取有效的水表记录",
"tone": "口语"
},
{
"text": "筛选有效水表记录",
"tone": "中性"
},
{
"text": "水表状态有效性过滤",
"tone": "专业"
}
],
"keywords": [
"有效",
"过滤",
"筛选",
"meter_status",
"质量检查",
"断言",
"清洗",
"filter",
"quality"
],
"intent_tags": [
"filter",
"quality"
]
},
{
"id": "snpt_filter_meter_diameter_20mm",
"aliases": [
{
"text": "找出所有20mm水表用户",
"tone": "口语"
},
{
"text": "筛选20mm水表记录",
"tone": "中性"
},
{
"text": "20mm口径水表子集提取",
"tone": "专业"
}
],
"keywords": [
"20mm",
"水表直径",
"过滤",
"筛选",
"meter_diameter",
"子集",
"分析",
"住宅用水",
"规格",
"filter",
"by_dimension"
],
"intent_tags": [
"filter",
"by_dimension"
]
}
]

View File

@ -0,0 +1,227 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"desc": "统计各供应办公室对应的账户数量,识别高占比管理所",
"type": "topn",
"title": "按供应办公室统计账户数",
"examples": [
"哪个供水管理所服务的用户最多?",
"列出前5个账户数最多的供应办公室"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 11
}
],
"dialect_sql": {
"mysql": "SELECT supply_office, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY supply_office\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"供应办公室仅11个唯一值可安全展示全部建议LIMIT 11避免冗余排序"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office",
"account_id"
]
},
"business_caliber": "粒度=供应办公室,指标=去重账户数account_id仅统计水表信息表中有效账户不关联外部表"
},
{
"id": "snpt_topn_station_by_account",
"desc": "统计各站点服务的账户数量,识别高负载站点",
"type": "topn",
"title": "按站点统计账户分布",
"examples": [
"哪些站点服务的用户最多?",
"TOP10用户最多的站点是哪些"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT station, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY station\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"站点有36个唯一值建议LIMIT<=20以避免结果过长高基数维度可能影响查询性能"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station",
"account_id"
]
},
"business_caliber": "粒度=站点station指标=去重账户数account_id基于水表信息表直接聚合不涉及时间维度"
},
{
"id": "snpt_topn_district_by_account",
"desc": "统计各行政区的账户数量,辅助区域资源分配分析",
"type": "topn",
"title": "按行政区统计账户分布",
"examples": [
"哪个区的用水账户最多?",
"列出所有行政区的账户数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 13
}
],
"dialect_sql": {
"mysql": "SELECT district, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY district\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"行政区共13个可完整展示适合用于区域对比分析"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district",
"account_id"
]
},
"business_caliber": "粒度=行政区district指标=去重账户数account_id基于水表信息表聚合反映各区域用户规模"
},
{
"id": "snpt_share_of_meter_type",
"desc": "计算各类水表类型在总账户中的占比,识别主流类型",
"type": "ratio",
"title": "水表类型占比分析",
"examples": [
"各类水表在用户中的占比是多少?",
"电磁式远传水表占总用户比例多少?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type, \n COUNT(DISTINCT account_id) AS account_count,\n ROUND(COUNT(DISTINCT account_id) * 100.0 / SUM(COUNT(DISTINCT account_id)) OVER (), 2) AS percentage\nFROM water_meter_info\nGROUP BY meter_type\nORDER BY account_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"水表类型仅5种适合计算占比可直接展示全量分布"
],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type",
"account_id"
]
},
"business_caliber": "粒度=水表类型meter_type指标=去重账户数占比,分母为全表去重账户总数,反映技术选型分布"
},
{
"id": "snpt_sample_account_service_point",
"desc": "随机抽取部分账户与服务点ID的原始记录用于数据质量核查",
"type": "sample",
"title": "抽样账户与服务点明细",
"examples": [
"随机查看10条账户与服务点的详细信息",
"抽样检查水表信息是否符合预期格式"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT account_id, service_point_id, supply_office, station, district, meter_diameter, meter_type, meter_subtype, installation_position\nFROM water_meter_info\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"主键组合为account_id+service_point_id适合抽样验证唯一性建议样本量≤100"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "粒度=单条水表记录抽取样本用于验证account_id与service_point_id的组合唯一性及维度字段完整性"
},
{
"id": "snpt_filter_meter_status_valid",
"desc": "过滤出水表状态为'有效'的记录,用于后续分析",
"type": "quality",
"title": "筛选有效水表记录",
"examples": [
"只取状态为有效的水表记录",
"确认所有水表是否均为有效状态"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_status = '有效';"
},
"applicability": {
"constraints": {
"notes": [
"meter_status仅存在'有效'值,此条件恒成立;可用于数据清洗流程的显式过滤"
],
"fk_join_available": false,
"dim_cardinality_hint": 1
},
"time_column": "nullable",
"required_columns": [
"meter_status"
]
},
"business_caliber": "仅保留水表状态为'有效'的记录,因全表均为有效值,此过滤为冗余但可作为数据质量校验的显式断言"
},
{
"id": "snpt_filter_meter_diameter_20mm",
"desc": "筛选水表直径为20mm的记录用于特定口径设备分析",
"type": "quality",
"title": "筛选20mm水表记录",
"examples": [
"找出所有使用20mm水表的用户",
"20mm水表分布在哪些站点"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_diameter = '20mm';"
},
"applicability": {
"constraints": {
"notes": [
"水表直径共8种枚举值20mm为常见规格可作为子集分析的起点"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": "nullable",
"required_columns": [
"meter_diameter"
]
},
"business_caliber": "粒度=单条水表记录筛选条件为meter_diameter='20mm',用于分析标准住宅用水表的分布特征"
}
]

57
doc/rag-api.md Normal file
View File

@ -0,0 +1,57 @@
# 添加RAG
curl --location --request POST 'http://127.0.0.1:8000/rag/add' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}'
# 批量添加RAG
curl --location --request POST 'http://127.0.0.1:8000/rag/addBatch' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '[
{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}
]'
# 更新RAG
curl --location --request POST 'http://127.0.0.1:8000/rag/update' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"workspaceId": 0,
"name": "string",
"embeddingData": "string",
"type": "METRIC"
}'
# 删除RAG
curl --location --request POST 'http://127.0.0.1:8000/rag/delete' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"id": 0,
"type": "METRIC"
}'
# 检索RAG
curl --location --request POST 'http://127.0.0.1:8000/rag/retrieve' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer ' \
--data-raw '{
"query": "string",
"num": 0,
"workspaceId": 0,
"type": "METRIC"
}'

49
doc/会话api.md Normal file
View File

@ -0,0 +1,49 @@
# 创建会话
curl -X POST "/api/v1/chat/sessions" \
-H "Content-Type: application/json" \
-d "{\"user_id\": $CHAT_USER_ID}"
# 获取会话
curl "/api/v1/chat/sessions/{session_id}"
# 按用户列出会话
curl "/api/v1/chat/sessions?user_id=$CHAT_USER_ID"
# 更新会话状态
curl -X POST "/api/v1/chat/sessions/{session_id}/update" \
-H "Content-Type: application/json" \
-d '{"status":"PAUSED"}'
# 关闭会话
curl -X POST "/api/v1/chat/sessions/{session_id}/close"
# 创建对话轮次
curl -X POST "/api/v1/chat/sessions/{session_id}/turns" \
-H "Content-Type: application/json" \
-d '{
"user_id": '"$CHAT_USER_ID"',
"user_query": "展示昨天订单GMV",
"intent": "METRIC_QUERY",
"ast_json": {"select":["gmv"],"where":{"dt":"yesterday"}},
"main_metric_ids": [1234],
"created_metric_ids": []
}'
# 获取单条对话轮次
curl "/api/v1/chat/turns/{turn_id}"
# 列出会话下的轮次
curl "/api/v1/chat/sessions/{session_id}/turns"
# 写入检索结果
curl -X POST "/api/v1/chat/turns/{turn_id}/retrievals" \
-H "Content-Type: application/json" \
-d '{
"retrievals": [
{"item_type":"METRIC","item_id":"metric_foo","used_in_sql":true,"rank_no":1},
{"item_type":"SNIPPET","item_id":"snpt_bar","similarity_score":0.77,"rank_no":2}
]
}'
# 列出轮次的检索结果
curl "/api/v1/chat/turns/{turn_id}/retrievals"

69
doc/指标api.md Normal file
View File

@ -0,0 +1,69 @@
# 新建指标
curl -X POST "/api/v1/metrics" \
-H "Content-Type: application/json" \
-d '{
"metric_code": "metric_1234",
"metric_name": "订单数",
"biz_domain": "order",
"biz_desc": "订单总数",
"base_sql": "select count(*) as order_cnt from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"metric_aliases": ["订单量"],
"created_by": '"$METRIC_USER_ID"'
}'
# 更新指标
curl -X POST "/api/v1/metrics/{metric_id}" \
-H "Content-Type: application/json" \
-d '{"metric_name":"订单数-更新","is_active":false}'
# 获取指标
curl "/api/v1/metrics/{metric_id}"
# 新建调度
curl -X POST "/api/v1/metric-schedules" \
-H "Content-Type: application/json" \
-d '{"metric_id":{metric_id},"cron_expr":"0 2 * * *","priority":5,"enabled":true}'
# 更新调度
curl -X POST "/api/v1/metric-schedules/{schedule_id}" \
-H "Content-Type: application/json" \
-d '{"enabled":false,"retry_times":1}'
# 列出某指标的调度
curl "/api/v1/metrics/{metric_id}/schedules"
# 触发运行
curl -X POST "/api/v1/metric-runs/trigger" \
-H "Content-Type: application/json" \
-d '{
"metric_id": {metric_id},
"triggered_by": "API",
"data_time_from": "2024-05-01T00:00:00Z",
"data_time_to": "2024-05-02T00:00:00Z"
}'
# 列出运行
curl "/api/v1/metric-runs?metric_id={metric_id}"
# 获取单次运行
curl "/api/v1/metric-runs/{run_id}"
# 写入指标结果
curl -X POST "/api/v1/metric-results/{metric_id}" \
-H "Content-Type: application/json" \
-d '{
"metric_id": {metric_id},
"results": [
{"stat_time":"2024-05-01T00:00:00Z","metric_value":123.45,"data_version":"{run_id}"},
{"stat_time":"2024-05-02T00:00:00Z","metric_value":234.56,"data_version":"{run_id}"}
]
}'
# 查询指标结果
curl "/api/v1/metric-results?metric_id={metric_id}"
# 查询最新结果
curl "/api/v1/metric-results/latest?metric_id={metric_id}"

83
doc/指标生成.md Normal file
View File

@ -0,0 +1,83 @@
某个用户的一句问话 → 解析成某轮 chat_turn → 这轮用了哪些指标/知识/会话(chat_turn_retrieval)
→ 是否产生了新的指标(metric_def)
→ 是否触发了指标调度运行(metric_job_run.turn_id)
→ 最终产生了哪些指标结果(metric_result.metric_id + stat_time)。文末附一段最小调用示例。
会话域
schema
会话表 chat_session
会话轮次表 chat_turn
会话轮次检索关联表 chat_turn_retrieval
API
1. 创建会话
POST /api/v1/chat/sessions
2. 更新会话轮次
POST /api/v1/chat/sessions/{session_id}/update
3. 结束会话
POST /api/v1/chat/sessions/{session_id}/close
4. 查询会话
GET /api/v1/chat/sessions/{session_id}
5. 会话列表查询(按用户、时间)
GET /api/v1/chat/sessions
6. 创建问答轮次(用户发起 query)
POST /api/v1/chat/sessions/{session_id}/turns
7. 查询某会话的所有轮次
GET /api/v1/chat/sessions/{session_id}/turns
8. 查看单轮问答详情
GET /api/v1/chat/turns/{turn_id}
9. 批量写入某轮的检索结果
POST /api/v1/chat/turns/{turn_id}/retrievals
10. 查询某轮的检索记录
GET /api/v1/chat/turns/{turn_id}/retrievals
11. 更新某轮的检索记录(in future)
POST /api/v1/chat/turns/{turn_id}/retrievals/update
元数据域
schema
指标定义表 metric_def
API
12. 创建指标(来自问答或传统定义)
POST /api/v1/metrics
13. 更新指标
POST /api/v1/metrics/{id}
14. 获取指标详情
GET /api/v1/metrics/{id}
执行调度域(暂定 airflow)
schema
指标调度配置表 metric_schedule
调度运行记录表 metric_job_run
API
1. 创建调度配置
POST /api/v1/metric-schedules
2. 更新调度配置
POST /api/v1/metric-schedules/{id}
3. 查询指标调度配置详情
GET /api/v1/metrics/{metric_id}/schedules
4. 手动触发一次指标运行(例如来自问数)
POST /api/v1/metric-runs/trigger
5. 查询运行记录列表
GET /api/v1/metric-runs
6. 查询单次运行详情
GET /api/v1/metric-runs/{run_id}
数据域
schema
指标结果表(纵表)metric_result
API
1. 查询指标结果(按时间段 & 维度)
GET /api/v1/metric-results
2. 单点查询(最新值)
GET /api/v1/metric-results/latest
3. 批量写入指标结果
POST /api/v1/metric-results/{metric_id}
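A minimal end-to-end sketch of this traceability chain, wired through the endpoints listed above with the payload shapes from doc/会话api.md and doc/指标api.md (the base URL, response field names such as `id`, and the metric payload values are illustrative assumptions, not the service contract):

import requests

BASE = "http://127.0.0.1:8000"  # assumed local deployment
USER_ID = 98765

# 1) a user question becomes a chat_turn inside a chat_session
session = requests.post(f"{BASE}/api/v1/chat/sessions", json={"user_id": USER_ID}).json()
turn = requests.post(
    f"{BASE}/api/v1/chat/sessions/{session['id']}/turns",
    json={
        "user_id": USER_ID,
        "user_query": "展示昨天订单GMV",
        "intent": "METRIC_QUERY",
        "main_metric_ids": [],
        "created_metric_ids": [],
    },
).json()

# 2) the turn may lead to a new metric_def (metric_def.chat_turn_id records the origin)
metric = requests.post(
    f"{BASE}/api/v1/metrics",
    json={
        "metric_code": "metric_gmv_demo",
        "metric_name": "订单GMV",
        "biz_domain": "order",
        "base_sql": "select sum(price * quantity) as gmv from ecommerce_orders",
        "time_grain": "DAY",
        "dim_binding": ["dt"],
        "update_strategy": "FULL",
        "created_by": USER_ID,
    },
).json()

# 3) the same question can trigger a run; metric_job_run keeps the source turn for lineage
run = requests.post(
    f"{BASE}/api/v1/metric-runs/trigger",
    json={"metric_id": metric["id"], "triggered_by": "API"},
).json()

# 4) results land in metric_result and are read back per metric
results = requests.get(f"{BASE}/api/v1/metric-results", params={"metric_id": metric["id"]}).json()
print(turn["id"], metric["id"], run.get("id"), len(results))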

21
file/ecommerce_orders.sql Normal file
View File

@ -0,0 +1,21 @@
CREATE TABLE `ecommerce_orders` (
`order_id` char(36) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'UUID from CSV',
`customer_id` int NOT NULL,
`product_id` int NOT NULL,
`category` varchar(64) COLLATE utf8mb4_unicode_ci NOT NULL,
`price` decimal(10,2) NOT NULL,
`quantity` int NOT NULL,
`order_date` datetime(6) NOT NULL,
`shipping_date` datetime(6) NOT NULL,
`delivery_status` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`payment_method` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`device_type` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`channel` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
`shipping_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
`billing_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
`customer_segment` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
PRIMARY KEY (`order_id`),
KEY `idx_customer` (`customer_id`),
KEY `idx_product` (`product_id`),
KEY `idx_order_date` (`order_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

View File

@ -0,0 +1,40 @@
CREATE TABLE `action_results` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
`table_id` bigint NOT NULL COMMENT '表ID',
`version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
`action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
`status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
`llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
`error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
`error_message` text COLLATE utf8mb4_bin,
`started_at` datetime DEFAULT NULL,
`finished_at` datetime DEFAULT NULL,
`duration_ms` int DEFAULT NULL,
`table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
`table_schema` json NOT NULL,
`ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
`ge_profiling_json_size_bytes` bigint DEFAULT NULL,
`ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要剔除大value_set等',
`ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
`ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
`ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
`ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
`ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
`snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
`snippet_json_size_bytes` bigint DEFAULT NULL,
`snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
`snippet_alias_json_size_bytes` bigint DEFAULT NULL,
`callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
`model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
`model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
PRIMARY KEY (`id`),
UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
KEY `idx_status` (`status`),
KEY `idx_table` (`table_id`,`updated_at`),
KEY `idx_action_time` (`action_type`,`version_ts`),
KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=113 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';
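The `uq_table_ver_action (table_id, version_ts, action_type)` unique key is what lets each pipeline step be written idempotently. A minimal sketch of that upsert pattern with SQLAlchemy, reduced to a few columns (the real service serializes the full payload and size fields; the connection string mirrors the test default and is an assumption):

import json
from sqlalchemy import create_engine, text

# assumed connection string; the service reads DATABASE_URL instead
engine = create_engine("mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4")

UPSERT = text(
    """
    INSERT INTO action_results
        (table_id, version_ts, action_type, status, table_schema_version_id,
         table_schema, snippet_json, callback_url)
    VALUES
        (:table_id, :version_ts, :action_type, :status, :schema_version_id,
         :table_schema, :snippet_json, :callback_url)
    ON DUPLICATE KEY UPDATE
        status = VALUES(status),
        snippet_json = VALUES(snippet_json),
        updated_at = CURRENT_TIMESTAMP
    """
)

with engine.begin() as conn:
    conn.execute(
        UPSERT,
        {
            "table_id": 1,
            "version_ts": 1700000000,
            "action_type": "snippet",
            "status": "success",
            "schema_version_id": "2024050100000000001",  # illustrative 19-char version id
            "table_schema": json.dumps({"columns": []}),
            "snippet_json": json.dumps([]),
            "callback_url": "http://localhost:8000/callback",
        },
    )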

103
file/tableschema/chat.sql Normal file
View File

@ -0,0 +1,103 @@
CREATE TABLE IF NOT EXISTS chat_session (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
user_id BIGINT NOT NULL,
session_uuid CHAR(36) NOT NULL, -- 可用于对外展示的IDUUID
end_time DATETIME NULL,
status VARCHAR(16) NOT NULL DEFAULT 'OPEN', -- OPEN/CLOSED/ABANDONED
last_turn_id BIGINT NULL, -- 指向 chat_turn.id
ext_context JSON NULL, -- 业务上下文
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_session_uuid (session_uuid),
KEY idx_user_time (user_id, created_at),
KEY idx_status_time (status, created_at),
KEY idx_last_turn (last_turn_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE IF NOT EXISTS chat_turn (
id BIGINT AUTO_INCREMENT,
session_id BIGINT NOT NULL, -- 关联 chat_session.id
turn_no INT NOT NULL, -- 会话内轮次序号1,2,3...
user_id BIGINT NOT NULL,
user_query TEXT NOT NULL, -- 原始用户问句
intent VARCHAR(64) NULL, -- METRIC_QUERY/METRIC_EXPLAIN 等
ast_json JSON NULL, -- 解析出来的 AST
generated_sql MEDIUMTEXT NULL, -- 生成的最终SQL
sql_status VARCHAR(32) NULL, -- SUCCESS/FAILED/SKIPPED
error_msg TEXT NULL, -- SQL生成/执行错误信息
main_metric_ids JSON NULL, -- 本轮涉及的指标ID列表
created_metric_ids JSON NULL, -- 本轮新建指标ID列表
end_time DATETIME NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_session_turn (session_id, turn_no),
KEY idx_session_time (session_id, created_at),
KEY idx_intent_time (intent, created_at),
KEY idx_user_time (user_id, created_at)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
CREATE TABLE IF NOT EXISTS chat_turn_retrieval (
id BIGINT AUTO_INCREMENT,
turn_id BIGINT NOT NULL, -- 关联 qa_turn.id
item_type VARCHAR(32) NOT NULL, -- METRIC/SNIPPET/CHAT
item_id VARCHAR(128) NOT NULL, -- metric_id/snippet_id/table_name 等
item_extra JSON NULL, -- 附加信息,如字段名等
similarity_score DECIMAL(10,6) NULL, -- 相似度
rank_no INT NULL, -- 检索排名
used_in_reasoning TINYINT(1) NOT NULL DEFAULT 0, -- 是否参与推理
used_in_sql TINYINT(1) NOT NULL DEFAULT 0, -- 是否影响最终SQL
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_turn (turn_id),
KEY idx_turn_type (turn_id, item_type),
KEY idx_item (item_type, item_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);

View File

@ -0,0 +1,155 @@
CREATE TABLE metric_def (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
metric_code VARCHAR(64) NOT NULL, -- 内部编码order_cnt_delivery
metric_name VARCHAR(128) NOT NULL, -- 中文名:外送订单数
metric_aliases JSON NULL, -- 别名列表
biz_domain VARCHAR(64) NOT NULL, -- 通过table tag获取支持人工配置
biz_desc TEXT NULL, -- 业务口径描述
chat_turn_id BIGINT NULL, -- 来自哪轮会话
tech_desc TEXT NULL, -- 技术口径描述
formula_expr TEXT NULL, -- 公式描述:"sum(pay_amount)"
base_sql MEDIUMTEXT NOT NULL, -- 标准计算SQL逻辑SQL/snippet
time_grain VARCHAR(32) NOT NULL, -- DAY/HOUR/WEEK/MONTH
dim_binding JSON NOT NULL, -- 维度绑定,如 ["dt","store_id","channel"]
update_strategy VARCHAR(32) NOT NULL, -- FULL/INCR/REALTIME
schedule_id BIGINT NULL, -- 调度ID
schedule_type INT NULL, -- 调度类型默认调度cron
version INT NOT NULL DEFAULT 1,
is_active TINYINT(1) NOT NULL DEFAULT 1,
sql_hash VARCHAR(64) NULL, -- base_sql hash 用于版本比较
created_by BIGINT NULL,
updated_by BIGINT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY uk_metric_code (metric_code),
KEY idx_domain_active (biz_domain, is_active),
KEY idx_update_strategy (update_strategy),
KEY idx_name (metric_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE metric_schedule (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
metric_id BIGINT NOT NULL, -- 关联 metric_def.id
cron_expr VARCHAR(64) NOT NULL, -- 调度表达式
enabled TINYINT(1) NOT NULL DEFAULT 1, -- 是否启用
priority INT NOT NULL DEFAULT 10, -- 优先级
backfill_allowed TINYINT(1) NOT NULL DEFAULT 1, -- 是否允许补数
max_runtime_sec INT NULL, -- 最大运行时长(秒)
retry_times INT NOT NULL DEFAULT 0, -- 失败重试次数
owner_team VARCHAR(64) NULL,
owner_user_id BIGINT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY idx_metric_enabled (metric_id, enabled),
KEY idx_owner (owner_team, owner_user_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
CREATE TABLE metric_job_run (
id BIGINT AUTO_INCREMENT,
metric_id BIGINT NOT NULL, -- metric_def.id
schedule_id BIGINT NULL, -- metric_schedule.id手动触发则可为空
source_turn_id BIGINT NULL, -- 若本次运行由某次问答触发,关联 qa_turn.id
data_time_from DATETIME NULL, -- 指标统计时间窗口起
data_time_to DATETIME NULL, -- 指标统计时间窗口止
metric_version INT NOT NULL, -- 执行时使用的指标版本
base_sql_snapshot MEDIUMTEXT NOT NULL, -- 本次执行使用的SQL快照
status VARCHAR(32) NOT NULL, -- RUNNING/SUCCESS/FAILED/SKIPPED
error_msg TEXT NULL,
affected_rows BIGINT NULL, -- 写入行数
runtime_ms BIGINT NULL, -- 执行耗时
triggered_by VARCHAR(32) NOT NULL, -- SCHEDULER/MANUAL/API/QA_TURN
triggered_at DATETIME NOT NULL,
started_at DATETIME NULL,
finished_at DATETIME NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_metric_time (metric_id, data_time_from, data_time_to),
KEY idx_status_time (status, triggered_at),
KEY idx_schedule (schedule_id),
KEY idx_source_turn (source_turn_id)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);
CREATE TABLE metric_result (
id BIGINT AUTO_INCREMENT,
metric_id BIGINT NOT NULL, -- metric_def.id
metric_version INT NOT NULL, -- metric_def.version
stat_time DATETIME NOT NULL, -- 按 time_grain 对齐后的时间
extra_dims JSON NULL, -- 其他维度JSON 存
metric_value DECIMAL(32,8) NOT NULL, -- 指标结果值
load_time DATETIME NOT NULL, -- 入库时间
data_version BIGINT NULL, -- 版本或 job_run id
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- 主键改为联合主键,必须包含 created_at
PRIMARY KEY (id, created_at),
KEY idx_metric_time (metric_id, stat_time),
KEY idx_load_time (load_time)
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
PARTITION BY RANGE COLUMNS(created_at) (
-- 历史数据分区(根据实际需求调整)
PARTITION p202511 VALUES LESS THAN ('2025-12-01'),
PARTITION p202512 VALUES LESS THAN ('2026-01-01'),
-- 2026年按月分区
PARTITION p202601 VALUES LESS THAN ('2026-02-01'),
PARTITION p202602 VALUES LESS THAN ('2026-03-01'),
PARTITION p202603 VALUES LESS THAN ('2026-04-01'),
PARTITION p202604 VALUES LESS THAN ('2026-05-01'),
PARTITION p202605 VALUES LESS THAN ('2026-06-01'),
PARTITION p202606 VALUES LESS THAN ('2026-07-01'),
-- ... 可以预建几个月 ...
-- 兜底分区,存放未来的数据,防止插入报错
PARTITION p_future VALUES LESS THAN (MAXVALUE)
);

View File

@ -0,0 +1,24 @@
CREATE TABLE `rag_snippet` (
`rag_item_id` bigint NOT NULL COMMENT 'RAG item id (stable hash of table/version/snippet_id)',
`workspace_id` bigint NOT NULL COMMENT 'RAG workspace scope',
`table_id` bigint NOT NULL COMMENT '来源表ID',
`version_ts` bigint NOT NULL COMMENT '表版本号',
`action_result_id` bigint NOT NULL COMMENT '来源 action_results 主键IDsnippet_alias 或 snippet 行)',
`snippet_id` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '原始 snippet id',
`rag_text` text COLLATE utf8mb4_bin NOT NULL COMMENT '用于向量化的拼接文本',
`merged_json` json NOT NULL COMMENT '合并后的 snippet 对象',
`created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '写入时间,用于分区',
`updated_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`rag_item_id`,`created_at`),
KEY `idx_action_result` (`action_result_id`),
KEY `idx_workspace` (`workspace_id`),
KEY `idx_table_version` (`table_id`,`version_ts`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
PARTITION BY RANGE COLUMNS (`created_at`) (
PARTITION p202401 VALUES LESS THAN ('2024-02-01'),
PARTITION p202402 VALUES LESS THAN ('2024-03-01'),
PARTITION p202403 VALUES LESS THAN ('2024-04-01'),
PARTITION p202404 VALUES LESS THAN ('2024-05-01'),
PARTITION p202405 VALUES LESS THAN ('2024-06-01'),
PARTITION p_future VALUES LESS THAN (MAXVALUE)
) COMMENT='RAG snippet 索引缓存';
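`rag_text` is the concatenated text handed to the embedder and `merged_json` the merged snippet object. A minimal sketch of how a snippet definition and its alias/keyword entry (see the snippet and snippet_alias files above) could be flattened into those two fields — the concatenation order is an assumption, not the service's actual rule:

import json
from typing import Any, Dict

def build_rag_row(snippet: Dict[str, Any], alias: Dict[str, Any]) -> Dict[str, str]:
    """Merge a snippet definition with its alias/keyword entry and build the embedding text."""
    merged = {
        **snippet,
        "aliases": alias.get("aliases", []),
        "keywords": alias.get("keywords", []),
        "intent_tags": alias.get("intent_tags", []),
    }
    parts = [
        snippet.get("title", ""),
        snippet.get("desc", ""),
        snippet.get("business_caliber", ""),
        " ".join(a.get("text", "") for a in merged["aliases"]),
        " ".join(merged["keywords"]),
    ]
    return {
        "rag_text": "\n".join(p for p in parts if p),
        "merged_json": json.dumps(merged, ensure_ascii=False),
    }

# Example with the snpt_total_meter_baseline snippet and its alias entry from the files above
snippet = {"id": "snpt_total_meter_baseline", "title": "水表总量基线", "desc": "获取全表水表去重总量基线",
           "business_caliber": "水表总量=按service_point_id去重计数"}
alias = {"id": "snpt_total_meter_baseline", "aliases": [{"text": "现在有多少水表", "tone": "口语"}],
         "keywords": ["总量", "total", "baseline"], "intent_tags": ["aggregate"]}
print(build_rag_row(snippet, alias)["rag_text"])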

View File

@ -0,0 +1,40 @@
CREATE TABLE `action_results` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键',
`table_id` bigint NOT NULL COMMENT '表ID',
`version_ts` bigint NOT NULL COMMENT '版本时间戳(版本号)',
`action_type` enum('ge_profiling','ge_result_desc','snippet','snippet_alias') COLLATE utf8mb4_bin NOT NULL COMMENT '动作类型',
`status` enum('pending','running','success','failed','partial') COLLATE utf8mb4_bin NOT NULL DEFAULT 'pending' COMMENT '执行状态',
`model` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型名称',
`model_provider` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型渠道',
`model_params` varchar(100) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '模型参数,如温度',
`llm_usage` json DEFAULT NULL COMMENT 'LLM token usage统计',
`error_code` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL,
`error_message` text COLLATE utf8mb4_bin,
`started_at` datetime DEFAULT NULL,
`finished_at` datetime DEFAULT NULL,
`duration_ms` int DEFAULT NULL,
`table_schema_version_id` varchar(19) COLLATE utf8mb4_bin NOT NULL,
`table_schema` json NOT NULL,
`ge_profiling_json` json DEFAULT NULL COMMENT 'Profiling完整结果JSON',
`ge_profiling_json_size_bytes` bigint DEFAULT NULL,
`ge_profiling_summary` json DEFAULT NULL COMMENT 'Profiling摘要剔除大value_set等',
`ge_profiling_summary_size_bytes` bigint DEFAULT NULL,
`ge_profiling_total_size_bytes` bigint DEFAULT NULL COMMENT '上两者合计',
`ge_profiling_html_report_url` varchar(1024) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'GE报告HTML路径/URL',
`ge_result_desc_json` json DEFAULT NULL COMMENT '表描述结果JSON',
`ge_result_desc_json_size_bytes` bigint DEFAULT NULL,
`snippet_json` json DEFAULT NULL COMMENT 'SQL知识片段结果JSON',
`snippet_json_size_bytes` bigint DEFAULT NULL,
`snippet_alias_json` json DEFAULT NULL COMMENT 'SQL片段改写/丰富结果JSON',
`snippet_alias_json_size_bytes` bigint DEFAULT NULL,
`callback_url` varchar(1024) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
`result_checksum` varbinary(32) DEFAULT NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
`created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uq_table_ver_action` (`table_id`,`version_ts`,`action_type`),
KEY `idx_status` (`status`),
KEY `idx_table` (`table_id`,`updated_at`),
KEY `idx_action_time` (`action_type`,`version_ts`),
KEY `idx_schema_version` (`table_schema_version_id`)
) ENGINE=InnoDB AUTO_INCREMENT=53 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin ROW_FORMAT=DYNAMIC COMMENT='数据分析知识片段表';

6
main.py Normal file
View File

@ -0,0 +1,6 @@
def main():
    print("Hello from data-ge-new!")


if __name__ == "__main__":
    main()

23
project.md Normal file
View File

@ -0,0 +1,23 @@
项目结构与逻辑
app/main.py:创建 FastAPI 应用与生命周期,初始化共享 httpx.AsyncClient 和 LLMGateway,统一异常处理后暴露四个接口:聊天代理、导入分析、表画像流水线、表片段入库。
app/models.py:定义所有请求/响应模型与枚举(LLM 请求、导入分析作业、表画像作业、片段入库等),并给出字段校验与默认值。
app/services:核心业务逻辑:
gateway.py 将 /v1/chat/completions 请求转发到 NEW_API_BASE_URL(带可选 Bearer Token),并归一化返回。
import_analysis.py 组装导入提示词(prompt/data_import_analysis.md)、解析/截断样本、调用统一聊天接口、抽取 JSON 结果与 token 用量,最后回调业务方。
table_profiling.py 串行执行 4 步流水线:Great Expectations profiling → LLM 结果描述(prompt/ge_result_desc_prompt.md)→ 片段生成(prompt/snippet_generator.md)→ 片段别名(prompt/snippet_alias_generator.md),每步都回调状态与结果。
table_snippet.py 将各步骤结果 upsert 到数据库表,自动序列化 JSON/大小信息并构造 INSERT ... ON DUPLICATE KEY UPDATE。
app/providers/*:各云厂商直连客户端(OpenAI/Anthropic/OpenRouter/Gemini/Qwen/DeepSeek),实现统一 chat 接口;当前主流程通过 new-api 转发,但保留直连能力。
prompt/ 存放提示词模板;scripts/ 与 test/ 目录提供接口调用示例和回归样本;table_snippet.sql 给出 action_results 表结构(用于片段与 profiling 结果持久化)。
功能/需求说明
LLM 网关:POST /v1/chat/completions 接收 LLMRequest(provider+model+messages 等),将 payload 透传到 NEW_API_BASE_URL/v1/chat/completions(带可选 NEW_API_AUTH_TOKEN 认证);异常时返回 4xx/5xx 并记录原始响应。调用示例见文末。
导入分析(异步):POST /v1/import/analyze 接收导入样本(rows/headers/raw_csv/table_schema)、目标模型 llm_model(默认 DEFAULT_IMPORT_MODEL,可被 IMPORT_SUPPORTED_MODELS 白名单限制)、温度与回调地址。服务将样本转 CSV、附加 schema,拼接系统+用户消息后调用统一聊天接口,解析首个 choice 中的 JSON 作为分析结果,连同 LLM usage 一并以回调形式返回;失败时回调 status=failed 与错误信息。
表画像流水线(异步):POST /v1/table/profiling 接收表标识、版本号、回调地址及 GE/LLM 配置(datasource/batch_request、连接串模板、LLM 模型与超时)。流水线按顺序执行:
Great Expectations profiling(可指定 profiler 类型、datasource、runtime SQL 查询/表),生成完整与摘要 JSON 及 Data Docs 路径;
调用聊天接口生成 GE 结果描述 JSON;
基于描述生成 SQL 片段数组;
生成片段别名/关键词。
每步成功/失败都会回调,payload 包含 action_type、结果 JSON、模型、llm_usage、报错信息等。
片段结果入库:POST /v1/table/snippet 接收 TableSnippetUpsertRequest(表/版本、action 类型、状态、schema、模型信息、各阶段 JSON 及大小、错误码、时间戳等),组装到 action_results 表进行 UPSERT,返回是否更新已有记录。
配置与运行要求:核心环境变量在 app/settings.py(API Keys、DEFAULT_IMPORT_MODEL、IMPORT_GATEWAY_BASE_URL/NEW_API_BASE_URL、模型白名单、数据库 URL 等);日志使用 logging.yaml,自动创建 logs/;HTTP 客户端超时/代理可通过 HTTP_CLIENT_TIMEOUT、HTTP_CLIENT_TRUST_ENV、HTTP_CLIENT_PROXY 控制。调试可用 uvicorn app.main:app --reload;Docker 由 Dockerfile/docker-compose.yml 提供。
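A minimal call against the gateway endpoint described above, using httpx from the existing dependency set (the provider/model pair is illustrative and must be configured in the downstream new-api instance; the OpenAI-style choices shape of the normalized response is an assumption):

import httpx

payload = {
    "provider": "deepseek",
    "model": "deepseek-chat",
    "messages": [{"role": "user", "content": "用一句话介绍这个服务"}],
}

# IMPORT_GATEWAY_BASE_URL defaults to http://localhost:8000
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])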

21
pyproject.toml Normal file
View File

@ -0,0 +1,21 @@
[project]
name = "data-ge-new"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.111.0",
"uvicorn[standard]>=0.29.0",
"pydantic>=2.6.0",
"sqlalchemy>=2.0.28",
"pymysql>=1.1.0",
"great-expectations[profilers]==0.18.19",
"pandas>=2.0",
"numpy>=1.24",
"openpyxl>=3.1",
"httpx==0.27.2",
"python-dotenv==1.0.1",
"requests>=2.31.0",
"PyYAML>=6.0.1",
]

View File

@ -1,13 +0,0 @@
fastapi>=0.111.0
uvicorn[standard]>=0.29.0
pydantic>=2.6.0
sqlalchemy>=2.0.28
pymysql>=1.1.0
great_expectations>=0.18.0,<0.19.0
pandas>=2.0
numpy>=1.24
openpyxl>=3.1
httpx==0.27.2
python-dotenv==1.0.1
requests>=2.31.0
PyYAML>=6.0.1

View File

@ -1,54 +0,0 @@
CREATE TABLE IF NOT EXISTS action_results (
id BIGINT NOT NULL AUTO_INCREMENT COMMENT '主键',
table_id BIGINT NOT NULL COMMENT '表ID',
version_ts BIGINT NOT NULL COMMENT '版本时间戳(版本号)',
action_type ENUM('ge_profiling','ge_result_desc','snippet','snippet_alias') NOT NULL COMMENT '动作类型',
status ENUM('pending','running','success','failed','partial') NOT NULL DEFAULT 'pending' COMMENT '执行状态',
error_code VARCHAR(128) NULL,
error_message TEXT NULL,
-- 回调 & 观测
callback_url VARCHAR(1024) NOT NULL,
started_at DATETIME NULL,
finished_at DATETIME NULL,
duration_ms INT NULL,
-- 本次schema信息
table_schema_version_id BIGINT NOT NULL,
table_schema JSON NOT NULL,
-- ===== 动作1GE Profiling =====
ge_profiling_full JSON NULL COMMENT 'Profiling完整结果JSON',
ge_profiling_full_size_bytes BIGINT NULL,
ge_profiling_summary JSON NULL COMMENT 'Profiling摘要剔除大value_set等',
ge_profiling_summary_size_bytes BIGINT NULL,
ge_profiling_total_size_bytes BIGINT NULL COMMENT '上两者合计',
ge_profiling_html_report_url VARCHAR(1024) NULL COMMENT 'GE报告HTML路径/URL',
-- ===== 动作2GE Result Desc =====
ge_result_desc_full JSON NULL COMMENT '表描述结果JSON',
ge_result_desc_full_size_bytes BIGINT NULL,
-- ===== 动作3Snippet 生成 =====
snippet_full JSON NULL COMMENT 'SQL知识片段结果JSON',
snippet_full_size_bytes BIGINT NULL,
-- ===== 动作4Snippet Alias 改写 =====
snippet_alias_full JSON NULL COMMENT 'SQL片段改写/丰富结果JSON',
snippet_alias_full_size_bytes BIGINT NULL,
-- 通用可选指标
result_checksum VARBINARY(32) NULL COMMENT '对当前action有效载荷计算的MD5/xxhash',
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (id),
UNIQUE KEY uq_table_ver_action (table_id, version_ts, action_type),
KEY idx_status (status),
KEY idx_table (table_id, updated_at),
KEY idx_action_time (action_type, version_ts),
KEY idx_schema_version (table_schema_version_id)
) ENGINE=InnoDB
ROW_FORMAT=DYNAMIC
COMMENT='数据分析知识片段表';

142
test/test_chat_api_mysql.py Normal file
View File

@ -0,0 +1,142 @@
from __future__ import annotations
import os
import random
from pathlib import Path
from typing import Generator, List
import sys
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
TEST_USER_ID = 872341
SCHEMA_PATH = Path("file/tableschema/chat.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
# Quick connectivity check
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
#_ensure_chat_schema(engine)
app = create_app()
with TestClient(app) as test_client:
yield test_client
# cleanup test artifacts
with engine.begin() as conn:
# remove retrievals and turns tied to test sessions
conn.execute(
text(
"""
DELETE FROM chat_turn_retrieval
WHERE turn_id IN (
SELECT id FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)
)
"""
),
{"uid": TEST_USER_ID},
)
conn.execute(
text("DELETE FROM chat_turn WHERE session_id IN (SELECT id FROM chat_session WHERE user_id=:uid)"),
{"uid": TEST_USER_ID},
)
conn.execute(text("DELETE FROM chat_session WHERE user_id=:uid"), {"uid": TEST_USER_ID})
db.get_engine.cache_clear()
def test_session_lifecycle_mysql(client: TestClient) -> None:
# Create a session
resp = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID})
assert resp.status_code == 200, resp.text
session = resp.json()
session_id = session["id"]
assert session["status"] == "OPEN"
# Get session
assert client.get(f"/api/v1/chat/sessions/{session_id}").status_code == 200
# List sessions (filter by user)
resp = client.get(f"/api/v1/chat/sessions", params={"user_id": TEST_USER_ID})
assert resp.status_code == 200
assert any(item["id"] == session_id for item in resp.json())
# Update status
resp = client.post(f"/api/v1/chat/sessions/{session_id}/update", json={"status": "PAUSED"})
assert resp.status_code == 200
assert resp.json()["status"] == "PAUSED"
# Close session
resp = client.post(f"/api/v1/chat/sessions/{session_id}/close")
assert resp.status_code == 200
assert resp.json()["status"] == "CLOSED"
def test_turns_and_retrievals_mysql(client: TestClient) -> None:
session_id = client.post("/api/v1/chat/sessions", json={"user_id": TEST_USER_ID}).json()["id"]
turn_payload = {
"user_id": TEST_USER_ID,
"user_query": "展示昨天订单GMV",
"intent": "METRIC_QUERY",
"ast_json": {"select": ["gmv"], "where": {"dt": "yesterday"}},
"main_metric_ids": [random.randint(1000, 9999)],
"created_metric_ids": [],
}
resp = client.post(f"/api/v1/chat/sessions/{session_id}/turns", json=turn_payload)
assert resp.status_code == 200, resp.text
turn = resp.json()
turn_id = turn["id"]
assert turn["turn_no"] == 1
# Fetch turn
assert client.get(f"/api/v1/chat/turns/{turn_id}").status_code == 200
# List turns under session
resp = client.get(f"/api/v1/chat/sessions/{session_id}/turns")
assert resp.status_code == 200
assert any(t["id"] == turn_id for t in resp.json())
# Insert retrievals
retrievals_payload = {
"retrievals": [
{"item_type": "METRIC", "item_id": "metric_foo", "used_in_sql": True, "rank_no": 1},
{"item_type": "SNIPPET", "item_id": "snpt_bar", "similarity_score": 0.77, "rank_no": 2},
]
}
resp = client.post(f"/api/v1/chat/turns/{turn_id}/retrievals", json=retrievals_payload)
assert resp.status_code == 200
assert resp.json()["inserted"] == 2
# List retrievals
resp = client.get(f"/api/v1/chat/turns/{turn_id}/retrievals")
assert resp.status_code == 200
items = resp.json()
assert len(items) == 2
assert {item["item_type"] for item in items} == {"METRIC", "SNIPPET"}
if __name__ == "__main__":
import pytest as _pytest
raise SystemExit(_pytest.main([__file__]))

View File

@ -0,0 +1,207 @@
from __future__ import annotations
import os
import random
from datetime import datetime, timedelta
from pathlib import Path
from typing import Generator, List
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
# Ensure project root on path for direct execution
import sys

ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
TEST_USER_ID = 98765
#SCHEMA_PATH = Path("file/tableschema/metrics.sql")
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
# def _run_sql_script(engine, sql_text: str) -> None:
# """Execute semicolon-terminated SQL statements sequentially."""
# statements: List[str] = []
# buffer: List[str] = []
# for line in sql_text.splitlines():
# stripped = line.strip()
# if not stripped or stripped.startswith("--"):
# continue
# buffer.append(line)
# if stripped.endswith(";"):
# statements.append("\n".join(buffer).rstrip(";"))
# buffer = []
# if buffer:
# statements.append("\n".join(buffer))
# with engine.begin() as conn:
# for stmt in statements:
# conn.execute(text(stmt))
# def _ensure_metric_schema(engine) -> None:
# if not SCHEMA_PATH.exists():
# pytest.skip("metrics.sql schema file not found.")
# raw_sql = SCHEMA_PATH.read_text(encoding="utf-8")
# raw_sql = raw_sql.replace("CREATE TABLE metric_def", "CREATE TABLE IF NOT EXISTS metric_def")
# raw_sql = raw_sql.replace("CREATE TABLE metric_schedule", "CREATE TABLE IF NOT EXISTS metric_schedule")
# raw_sql = raw_sql.replace("CREATE TABLE metric_job_run", "CREATE TABLE IF NOT EXISTS metric_job_run")
# raw_sql = raw_sql.replace("CREATE TABLE metric_result", "CREATE TABLE IF NOT EXISTS metric_result")
# _run_sql_script(engine, raw_sql)
@pytest.fixture(scope="module")
def client() -> Generator[TestClient, None, None]:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
#_ensure_metric_schema(engine)
app = create_app()
with TestClient(app) as test_client:
yield test_client
# cleanup test artifacts
with engine.begin() as conn:
conn.execute(text("DELETE FROM metric_result WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_job_run WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_schedule WHERE metric_id IN (SELECT id FROM metric_def WHERE created_by=:uid)"), {"uid": TEST_USER_ID})
conn.execute(text("DELETE FROM metric_def WHERE created_by=:uid"), {"uid": TEST_USER_ID})
db.get_engine.cache_clear()
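# Covers metric definition create/update/read plus schedule create, update, and listing.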
def test_metric_crud_and_schedule_mysql(client: TestClient) -> None:
code = f"metric_{random.randint(1000,9999)}"
create_payload = {
"metric_code": code,
"metric_name": "订单数",
"biz_domain": "order",
"biz_desc": "订单总数",
"base_sql": "select count(*) as order_cnt from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"metric_aliases": ["订单量"],
"created_by": TEST_USER_ID,
}
resp = client.post("/api/v1/metrics", json=create_payload)
assert resp.status_code == 200, resp.text
metric = resp.json()
metric_id = metric["id"]
assert metric["metric_code"] == code
# Update metric
resp = client.post(f"/api/v1/metrics/{metric_id}", json={"metric_name": "订单数-更新", "is_active": False})
assert resp.status_code == 200
assert resp.json()["is_active"] is False
# Get metric
resp = client.get(f"/api/v1/metrics/{metric_id}")
assert resp.status_code == 200
assert resp.json()["metric_name"] == "订单数-更新"
# Create schedule
resp = client.post(
"/api/v1/metric-schedules",
json={"metric_id": metric_id, "cron_expr": "0 2 * * *", "priority": 5, "enabled": True},
)
assert resp.status_code == 200, resp.text
schedule = resp.json()
schedule_id = schedule["id"]
# Update schedule
resp = client.post(f"/api/v1/metric-schedules/{schedule_id}", json={"enabled": False, "retry_times": 1})
assert resp.status_code == 200
assert resp.json()["enabled"] is False
# List schedules for metric
resp = client.get(f"/api/v1/metrics/{metric_id}/schedules")
assert resp.status_code == 200
assert any(s["id"] == schedule_id for s in resp.json())
def test_metric_runs_and_results_mysql(client: TestClient) -> None:
code = f"gmv_{random.randint(1000,9999)}"
metric_id = client.post(
"/api/v1/metrics",
json={
"metric_code": code,
"metric_name": "GMV",
"biz_domain": "order",
"base_sql": "select sum(pay_amount) as gmv from orders",
"time_grain": "DAY",
"dim_binding": ["dt"],
"update_strategy": "FULL",
"created_by": TEST_USER_ID,
},
).json()["id"]
# Trigger run
resp = client.post(
"/api/v1/metric-runs/trigger",
json={
"metric_id": metric_id,
"triggered_by": "API",
"data_time_from": (datetime.utcnow() - timedelta(days=1)).isoformat(),
"data_time_to": datetime.utcnow().isoformat(),
},
)
assert resp.status_code == 200, resp.text
run = resp.json()
run_id = run["id"]
assert run["status"] == "RUNNING"
# List runs
resp = client.get("/api/v1/metric-runs", params={"metric_id": metric_id})
assert resp.status_code == 200
assert any(r["id"] == run_id for r in resp.json())
# Get run
resp = client.get(f"/api/v1/metric-runs/{run_id}")
assert resp.status_code == 200
# Write results
now = datetime.utcnow()
resp = client.post(
f"/api/v1/metric-results/{metric_id}",
json={
"metric_id": metric_id,
"results": [
{"stat_time": (now - timedelta(days=1)).isoformat(), "metric_value": 123.45, "data_version": run_id},
{"stat_time": now.isoformat(), "metric_value": 234.56, "data_version": run_id},
],
},
)
assert resp.status_code == 200, resp.text
assert resp.json()["inserted"] == 2
# Query results
resp = client.get("/api/v1/metric-results", params={"metric_id": metric_id})
assert resp.status_code == 200
results = resp.json()
assert len(results) >= 2
# Latest result
resp = client.get("/api/v1/metric-results/latest", params={"metric_id": metric_id})
assert resp.status_code == 200
latest = resp.json()
assert float(latest["metric_value"]) in {123.45, 234.56}
if __name__ == "__main__":
import pytest as _pytest
raise SystemExit(_pytest.main([__file__]))

91
test/test_rag_client.py Normal file
View File

@ -0,0 +1,91 @@
from __future__ import annotations
import json
import httpx
import pytest
from app.exceptions import ProviderAPICallError
from app.schemas.rag import RagDeleteRequest, RagItemPayload, RagRetrieveRequest
from app.services.rag_client import RagAPIClient
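# All tests run against httpx.MockTransport handlers, so no real RAG service is contacted.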
@pytest.mark.asyncio
async def test_add_sends_payload_and_headers() -> None:
rag_client = RagAPIClient(base_url="http://rag.test", auth_token="secret-token")
def handler(request: httpx.Request) -> httpx.Response:
assert request.method == "POST"
assert str(request.url) == "http://rag.test/rag/add"
assert request.headers["Authorization"] == "Bearer secret-token"
payload = json.loads(request.content.decode())
assert payload == {
"id": 1,
"workspaceId": 2,
"name": "demo",
"embeddingData": "vector",
"type": "METRIC",
}
return httpx.Response(200, json={"ok": True, "echo": payload})
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.add(
client,
RagItemPayload(id=1, workspaceId=2, name="demo", embeddingData="vector", type="METRIC"),
)
assert result["ok"] is True
assert result["echo"]["name"] == "demo"
@pytest.mark.asyncio
async def test_add_batch_serializes_list() -> None:
rag_client = RagAPIClient(base_url="http://rag.test", auth_token=None)
def handler(request: httpx.Request) -> httpx.Response:
payload = json.loads(request.content.decode())
assert request.url.path == "/rag/addBatch"
assert isinstance(payload, list) and len(payload) == 2
return httpx.Response(200, json={"received": len(payload)})
items = [
RagItemPayload(id=1, workspaceId=2, name="a", embeddingData="vec-a", type="METRIC"),
RagItemPayload(id=2, workspaceId=2, name="b", embeddingData="vec-b", type="METRIC"),
]
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.add_batch(client, items)
assert result == {"received": 2}
@pytest.mark.asyncio
async def test_http_error_raises_provider_error() -> None:
rag_client = RagAPIClient(base_url="http://rag.test")
def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(500, text="boom")
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
with pytest.raises(ProviderAPICallError) as excinfo:
await rag_client.delete(client, RagDeleteRequest(id=1, type="METRIC"))
err = excinfo.value
assert err.status_code == 500
assert "boom" in (err.response_text or "")
@pytest.mark.asyncio
async def test_non_json_response_returns_raw_text() -> None:
rag_client = RagAPIClient(base_url="http://rag.test")
def handler(request: httpx.Request) -> httpx.Response:
return httpx.Response(200, text="plain-text-body")
transport = httpx.MockTransport(handler)
async with httpx.AsyncClient(transport=transport) as client:
result = await rag_client.retrieve(
client, RagRetrieveRequest(query="foo", num=1, workspaceId=1, type="METRIC")
)
assert result == {"raw": "plain-text-body"}

View File

@ -0,0 +1,157 @@
from __future__ import annotations
import json
from datetime import datetime
import httpx
import pytest
from sqlalchemy import create_engine, text
from app.services.table_snippet import ingest_snippet_rag_from_db
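# Builds an in-memory SQLite schema with just the columns the ingest path reads and writes.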
def _setup_sqlite_engine():
engine = create_engine("sqlite://")
with engine.begin() as conn:
conn.execute(
text(
"""
CREATE TABLE action_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
table_id INTEGER,
version_ts INTEGER,
action_type TEXT,
status TEXT,
snippet_json TEXT,
snippet_alias_json TEXT,
updated_at TEXT
)
"""
)
)
conn.execute(
text(
"""
CREATE TABLE rag_snippet (
rag_item_id INTEGER PRIMARY KEY,
action_result_id INTEGER NOT NULL,
workspace_id INTEGER,
table_id INTEGER,
version_ts INTEGER,
created_at TEXT,
snippet_id TEXT,
rag_text TEXT,
merged_json TEXT,
updated_at TEXT
)
"""
)
)
return engine
def _insert_action_row(engine, payload: dict) -> None:
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO action_results (table_id, version_ts, action_type, status, snippet_json, snippet_alias_json, updated_at)
VALUES (:table_id, :version_ts, :action_type, :status, :snippet_json, :snippet_alias_json, :updated_at)
"""
),
{
"table_id": payload["table_id"],
"version_ts": payload["version_ts"],
"action_type": payload["action_type"],
"status": payload.get("status", "success"),
"snippet_json": json.dumps(payload.get("snippet_json"), ensure_ascii=False)
if payload.get("snippet_json") is not None
else None,
"snippet_alias_json": json.dumps(payload.get("snippet_alias_json"), ensure_ascii=False)
if payload.get("snippet_alias_json") is not None
else None,
"updated_at": payload.get("updated_at") or datetime.utcnow().isoformat(),
},
)
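# Minimal stand-in for the RAG client: records the batch it receives instead of calling out.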
class _StubRagClient:
def __init__(self) -> None:
self.received = None
async def add_batch(self, _client, items):
self.received = items
return {"count": len(items)}
@pytest.mark.asyncio
async def test_ingest_snippet_rag_from_db_persists_and_calls_rag_client() -> None:
engine = _setup_sqlite_engine()
table_id = 321
version_ts = 20240102000000
snippet_payload = [
{
"id": "snpt_topn",
"title": "TopN",
"aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
"keywords": ["TopN", "站点"],
}
]
alias_payload = [
{
"id": "snpt_topn",
"aliases": [
{"text": "站点水表排行前N", "tone": "中性"},
{"text": "按站点水表TopN", "tone": "专业"},
],
"keywords": ["TopN", "排行"],
"intent_tags": ["topn", "aggregate"],
},
{
"id": "snpt_extra",
"aliases": [{"text": "额外别名"}],
"keywords": ["extra"],
},
]
_insert_action_row(
engine,
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": "snippet_alias",
"snippet_json": snippet_payload,
"snippet_alias_json": alias_payload,
"updated_at": "2024-01-02T00:00:00",
},
)
rag_stub = _StubRagClient()
async with httpx.AsyncClient() as client:
rag_ids = await ingest_snippet_rag_from_db(
table_id=table_id,
version_ts=version_ts,
workspace_id=99,
rag_item_type="SNIPPET",
client=client,
engine=engine,
rag_client=rag_stub,
)
assert rag_stub.received is not None
assert len(rag_stub.received) == 2 # includes alias-only row
assert len(rag_ids) == 2
with engine.connect() as conn:
rows = list(
conn.execute(
text("SELECT snippet_id, action_result_id, rag_text, merged_json FROM rag_snippet ORDER BY snippet_id")
)
)
assert {row[0] for row in rows} == {"snpt_extra", "snpt_topn"}
assert all(row[1] is not None for row in rows)
topn_row = next(row for row in rows if row[0] == "snpt_topn")
assert "TopN" in topn_row[2]
assert "按站点水表TopN" in topn_row[2]
assert "排行" in topn_row[2]

View File

@ -0,0 +1,74 @@
from __future__ import annotations
from app.services.table_profiling import _parse_completion_payload
from app.utils.llm_usage import extract_usage
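# These tests exercise pure parsing helpers with hand-built payloads; no LLM request is made.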
def test_parse_completion_payload_handles_array_with_trailing_text() -> None:
response_payload = {
"choices": [
{
"message": {
"content": """
结果如下:
[
{"id": "snpt_a"},
{"id": "snpt_b"}
]
附加说明:模型可能会输出额外文本。
""".strip()
}
}
]
}
parsed = _parse_completion_payload(response_payload)
assert isinstance(parsed, list)
assert [item["id"] for item in parsed] == ["snpt_a", "snpt_b"]
def test_extract_usage_info_normalizes_numeric_fields() -> None:
response_payload = {
"raw": {
"usage": {
"prompt_tokens": 12.7,
"completion_tokens": 3,
"total_tokens": 15.7,
"prompt_tokens_details": {"cached_tokens": 8.9, "other": None},
"non_numeric": "ignored",
}
}
}
usage = extract_usage(response_payload)
assert usage == {
"prompt_tokens": 12,
"completion_tokens": 3,
"total_tokens": 15,
"prompt_tokens_details": {"cached_tokens": 8},
}
def test_extract_usage_handles_alias_keys() -> None:
response_payload = {
"raw": {
"usageMetadata": {
"input_tokens": 20,
"output_tokens": 4,
}
}
}
usage = extract_usage(response_payload)
assert usage == {
"prompt_tokens": 20,
"completion_tokens": 4,
"total_tokens": 24,
}
def test_extract_usage_returns_none_when_missing() -> None:
assert extract_usage({"raw": {}}) is None

View File

@ -0,0 +1,213 @@
from __future__ import annotations
import json
import os
import random
from datetime import datetime, timedelta
from typing import List
from pathlib import Path
import sys
import pytest
from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import SQLAlchemyError
# Ensure the project root is importable when running directly via python.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from app import db
from app.main import create_app
from app.services.table_snippet import merge_snippet_records_from_db
DEFAULT_MYSQL_URL = "mysql+pymysql://root:12345678@127.0.0.1:3306/data-ge?charset=utf8mb4"
@pytest.fixture()
def mysql_engine() -> Engine:
mysql_url = os.getenv("TEST_DATABASE_URL", DEFAULT_MYSQL_URL)
os.environ["DATABASE_URL"] = mysql_url
db.get_engine.cache_clear()
engine = db.get_engine()
try:
with engine.connect() as conn:
conn.execute(text("SELECT 1"))
exists = conn.execute(text("SHOW TABLES LIKE 'action_results'")).scalar()
if not exists:
pytest.skip("action_results table not found in test database.")
except SQLAlchemyError:
pytest.skip(f"Cannot connect to MySQL at {mysql_url}")
return engine
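# Upserts a single action_results row; ON DUPLICATE KEY UPDATE keeps reruns from failing on existing keys.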
def _insert_action_row(
engine: Engine,
*,
table_id: int,
version_ts: int,
action_type: str,
status: str = "success",
snippet_json: List[dict] | None = None,
snippet_alias_json: List[dict] | None = None,
updated_at: datetime | None = None,
) -> None:
snippet_json_str = json.dumps(snippet_json, ensure_ascii=False) if snippet_json is not None else None
snippet_alias_json_str = (
json.dumps(snippet_alias_json, ensure_ascii=False) if snippet_alias_json is not None else None
)
with engine.begin() as conn:
conn.execute(
text(
"""
INSERT INTO action_results (
table_id, version_ts, action_type, status,
callback_url, table_schema_version_id, table_schema,
snippet_json, snippet_alias_json, updated_at
) VALUES (
:table_id, :version_ts, :action_type, :status,
:callback_url, :table_schema_version_id, :table_schema,
:snippet_json, :snippet_alias_json, :updated_at
)
ON DUPLICATE KEY UPDATE
status=VALUES(status),
snippet_json=VALUES(snippet_json),
snippet_alias_json=VALUES(snippet_alias_json),
updated_at=VALUES(updated_at)
"""
),
{
"table_id": table_id,
"version_ts": version_ts,
"action_type": action_type,
"status": status,
"callback_url": "http://localhost/test-callback",
"table_schema_version_id": "1",
"table_schema": json.dumps({}, ensure_ascii=False),
"snippet_json": snippet_json_str,
"snippet_alias_json": snippet_alias_json_str,
"updated_at": updated_at or datetime.utcnow(),
},
)
def _cleanup(engine: Engine, table_id: int, version_ts: int) -> None:
with engine.begin() as conn:
conn.execute(
text("DELETE FROM action_results WHERE table_id=:table_id AND version_ts=:version_ts"),
{"table_id": table_id, "version_ts": version_ts},
)
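# When both payloads exist, alias data should win for shared ids and alias-only ids
# should surface as separate merged records.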
def test_merge_prefers_alias_row_and_appends_alias_only_entries(mysql_engine: Engine) -> None:
table_id = 990000000 + random.randint(1, 9999)
version_ts = int(datetime.utcnow().strftime("%Y%m%d%H%M%S"))
alias_updated = datetime(2024, 1, 2, 0, 0, 0)
snippet_payload = [
{
"id": "snpt_topn",
"aliases": [{"text": "站点水表排行前N", "tone": "中性"}],
"keywords": ["TopN", "站点"],
}
]
alias_payload = [
{
"id": "snpt_topn",
"aliases": [
{"text": "站点水表排行前N", "tone": "中性"},
{"text": "按站点水表TopN", "tone": "专业"},
],
"keywords": ["TopN", "排行"],
"intent_tags": ["topn", "aggregate"],
},
{
"id": "snpt_extra",
"aliases": [{"text": "额外别名"}],
"keywords": ["extra"],
},
]
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet_alias",
snippet_json=snippet_payload,
snippet_alias_json=alias_payload,
updated_at=alias_updated,
)
try:
merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
assert len(merged) == 2
topn = next(item for item in merged if item["id"] == "snpt_topn")
assert topn["source"] == "snippet"
assert topn["updated_at_from_action"] == alias_updated
assert {a["text"] for a in topn["aliases"]} == {"站点水表排行前N", "按站点水表TopN"}
assert set(topn["keywords"]) == {"TopN", "站点", "排行"}
assert set(topn["intent_tags"]) == {"topn", "aggregate"}
alias_only = next(item for item in merged if item["source"] == "alias_only")
assert alias_only["id"] == "snpt_extra"
assert alias_only["aliases"][0]["text"] == "额外别名"
finally:
_cleanup(mysql_engine, table_id, version_ts)
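# When the alias row has no snippet_json, the merge should fall back to the plain snippet row
# while still unioning the alias row's keywords and keeping its newer updated_at.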
def test_merge_falls_back_to_snippet_row_when_alias_row_missing_snippet_json(mysql_engine: Engine) -> None:
table_id = 991000000 + random.randint(1, 9999)
version_ts = int((datetime.utcnow() + timedelta(seconds=1)).strftime("%Y%m%d%H%M%S"))
alias_updated = datetime(2024, 1, 3, 0, 0, 0)
alias_payload = [
{
"id": "snpt_quality",
"aliases": [{"text": "质量检查"}],
"keywords": ["quality"],
}
]
snippet_payload = [
{
"id": "snpt_quality",
"title": "质量检查",
"keywords": ["data-quality"],
"aliases": [{"text": "质量检查"}],
}
]
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet_alias",
snippet_json=None,
snippet_alias_json=alias_payload,
updated_at=alias_updated,
)
_insert_action_row(
mysql_engine,
table_id=table_id,
version_ts=version_ts,
action_type="snippet",
snippet_json=snippet_payload,
snippet_alias_json=None,
updated_at=datetime(2024, 1, 2, 0, 0, 0),
)
try:
merged = merge_snippet_records_from_db(table_id, version_ts, engine=mysql_engine)
assert len(merged) == 1
record = merged[0]
assert record["id"] == "snpt_quality"
assert record["source"] == "snippet"
assert record["updated_at_from_action"] == alias_updated
assert set(record["keywords"]) == {"data-quality", "quality"}
assert {a["text"] for a in record["aliases"]} == {"质量检查"}
finally:
_cleanup(mysql_engine, table_id, version_ts)

13
uv.lock generated Normal file
View File

@ -0,0 +1,13 @@
version = 1
revision = 1
requires-python = ">=3.11"
resolution-markers = [
"python_full_version >= '3.14'",
"python_full_version >= '3.12' and python_full_version < '3.14'",
"python_full_version < '3.12'",
]
[[package]]
name = "data-ge-new"
version = "0.1.0"
source = { virtual = "." }