init,llm gateway & import_analyse

This commit is contained in:
zhaoawd
2025-10-29 00:38:57 +08:00
commit 0af5f19af9
62 changed files with 3169 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
.venv
gx/uncommitted/
.vscode/

3
app/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .main import create_app
__all__ = ["create_app"]


6
app/exceptions.py Normal file
View File

@ -0,0 +1,6 @@
class ProviderConfigurationError(RuntimeError):
"""Raised when a provider is missing required configuration."""
class ProviderAPICallError(RuntimeError):
"""Raised when the upstream provider responds with an error."""

103
app/main.py Normal file
View File

@ -0,0 +1,103 @@
from __future__ import annotations
from contextlib import asynccontextmanager
import httpx
from fastapi import Depends, FastAPI, HTTPException, Request
from app.exceptions import ProviderAPICallError, ProviderConfigurationError
from app.models import (
DataImportAnalysisRequest,
DataImportAnalysisResponse,
LLMRequest,
LLMResponse,
)
from app.services import LLMGateway
from app.services.import_analysis import build_import_messages, resolve_provider_from_model
@asynccontextmanager
async def lifespan(app: FastAPI):
client = httpx.AsyncClient(timeout=httpx.Timeout(30.0))
gateway = LLMGateway()
try:
app.state.http_client = client # type: ignore[attr-defined]
app.state.gateway = gateway # type: ignore[attr-defined]
yield
finally:
await client.aclose()
def create_app() -> FastAPI:
application = FastAPI(
title="Unified LLM Gateway",
version="0.1.0",
lifespan=lifespan,
)
@application.post(
"/v1/chat/completions",
response_model=LLMResponse,
summary="Dispatch chat completion to upstream provider",
)
async def create_chat_completion(
payload: LLMRequest,
gateway: LLMGateway = Depends(get_gateway),
client: httpx.AsyncClient = Depends(get_http_client),
) -> LLMResponse:
try:
return await gateway.chat(payload, client)
except ProviderConfigurationError as exc:
raise HTTPException(status_code=422, detail=str(exc)) from exc
except ProviderAPICallError as exc:
raise HTTPException(status_code=502, detail=str(exc)) from exc
@application.post(
"/v1/import/analyze",
response_model=DataImportAnalysisResponse,
summary="Analyze import sample data via configured LLM",
)
async def analyze_import_data(
payload: DataImportAnalysisRequest,
gateway: LLMGateway = Depends(get_gateway),
client: httpx.AsyncClient = Depends(get_http_client),
) -> DataImportAnalysisResponse:
try:
provider, model_name = resolve_provider_from_model(payload.llm_model)
except ValueError as exc:
raise HTTPException(status_code=422, detail=str(exc)) from exc
messages = build_import_messages(payload)
llm_request = LLMRequest(
provider=provider,
model=model_name,
messages=messages,
temperature=payload.temperature if payload.temperature is not None else 0.2,
max_tokens=payload.max_tokens,
)
try:
llm_response = await gateway.chat(llm_request, client)
except ProviderConfigurationError as exc:
raise HTTPException(status_code=422, detail=str(exc)) from exc
except ProviderAPICallError as exc:
raise HTTPException(status_code=502, detail=str(exc)) from exc
return DataImportAnalysisResponse(
import_record_id=payload.import_record_id,
llm_response=llm_response,
)
return application
async def get_gateway(request: Request) -> LLMGateway:
return request.app.state.gateway # type: ignore[return-value, attr-defined]
async def get_http_client(request: Request) -> httpx.AsyncClient:
return request.app.state.http_client # type: ignore[return-value, attr-defined]
app = create_app()
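
Once the files above are in place, the two endpoints can be exercised with any HTTP client. The sketch below is illustrative only: it assumes the app is served locally on port 8000 (e.g. via `uvicorn app.main:app`), that the relevant provider API key (such as OPENAI_API_KEY) is exported, and the model names are placeholders rather than values prescribed by this commit.

import httpx

BASE_URL = "http://localhost:8000"  # assumed local dev address

chat_payload = {
    "provider": "openai",
    "model": "gpt-4o-mini",  # placeholder model name
    "messages": [{"role": "user", "content": "Say hello."}],
}
analyze_payload = {
    "import_record_id": "demo-001",
    "example_data": "order_id,price\nA1,19.9\nA2,5.0",
    "table_headers": ["order_id", "price"],
    "llm_model": "openai:gpt-4o-mini",  # 'provider:model' form resolved by import_analysis
}

with httpx.Client(base_url=BASE_URL, timeout=60.0) as client:
    chat = client.post("/v1/chat/completions", json=chat_payload)
    analysis = client.post("/v1/import/analyze", json=analyze_payload)
    print(chat.status_code, analysis.status_code)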

92
app/models.py Normal file
View File

@ -0,0 +1,92 @@
from __future__ import annotations
from enum import Enum
from typing import Any, List, Optional
from pydantic import BaseModel, Field
class LLMRole(str, Enum):
USER = "user"
ASSISTANT = "assistant"
SYSTEM = "system"
class LLMMessage(BaseModel):
role: LLMRole = Field(..., description="Message author role.")
content: str = Field(..., description="Plain text content of the message.")
class LLMProvider(str, Enum):
OPENAI = "openai"
ANTHROPIC = "anthropic"
OPENROUTER = "openrouter"
GEMINI = "gemini"
QWEN = "qwen"
DEEPSEEK = "deepseek"
class LLMRequest(BaseModel):
provider: LLMProvider = Field(..., description="Target LLM provider identifier.")
model: str = Field(..., description="Model name understood by the provider.")
messages: List[LLMMessage] = Field(..., description="Ordered chat messages.")
temperature: Optional[float] = Field(
0.7, description="Sampling temperature when supported."
)
top_p: Optional[float] = Field(
None, description="Top-p nucleus sampling when supported."
)
max_tokens: Optional[int] = Field(
None, description="Maximum tokens to generate when supported."
)
stream: Optional[bool] = Field(
False, description="Enable provider streaming if both sides support it."
)
extra_params: Optional[dict[str, Any]] = Field(
None, description="Provider-specific parameters to merge into the payload."
)
class LLMChoice(BaseModel):
index: int
message: LLMMessage
class LLMResponse(BaseModel):
provider: LLMProvider
model: str
choices: List[LLMChoice]
raw: Optional[dict[str, Any]] = Field(
None, description="Raw provider response for debugging."
)
class DataImportAnalysisRequest(BaseModel):
import_record_id: str = Field(..., description="Unique identifier for this import run.")
example_data: str = Field(
...,
max_length=30_000,
description="Sample rows from the import payload. Limited to 30k characters.",
)
table_headers: List[str] = Field(
...,
min_length=1,
description="Ordered list of table headers associated with the data.",
)
llm_model: str = Field(
...,
description="Model identifier. Accepts 'provider:model' format or plain model name.",
)
temperature: Optional[float] = Field(
None,
description="Optional override for LLM temperature when generating recognition output.",
)
max_tokens: Optional[int] = Field(
None,
description="Optional override for maximum tokens generated during recognition.",
)
class DataImportAnalysisResponse(BaseModel):
import_record_id: str
llm_response: LLMResponse
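
A minimal sketch of building a request with these models (Pydantic v2 is assumed, since the providers below call model_dump(); the model name is an arbitrary example):

from app.models import LLMMessage, LLMProvider, LLMRequest, LLMRole

request = LLMRequest(
    provider=LLMProvider.QWEN,
    model="qwen-plus",  # example model name, not prescribed by this repo
    messages=[
        LLMMessage(role=LLMRole.SYSTEM, content="You are a terse assistant."),
        LLMMessage(role=LLMRole.USER, content="Summarize this table."),
    ],
    max_tokens=256,
)
# Serialize exactly the way the provider clients do.
print(request.model_dump(exclude_none=True))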

17
app/providers/__init__.py Normal file
View File

@ -0,0 +1,17 @@
from .anthropic import AnthropicProvider
from .base import LLMProviderClient
from .deepseek import DeepSeekProvider
from .gemini import GeminiProvider
from .openai import OpenAIProvider
from .openrouter import OpenRouterProvider
from .qwen import QwenProvider
__all__ = [
"LLMProviderClient",
"OpenAIProvider",
"AnthropicProvider",
"OpenRouterProvider",
"GeminiProvider",
"QwenProvider",
"DeepSeekProvider",
]


97
app/providers/anthropic.py Normal file
View File

@ -0,0 +1,97 @@
from __future__ import annotations
from typing import Any, Dict, List, Tuple
import httpx
from app.exceptions import ProviderAPICallError
from app.models import (
LLMChoice,
LLMMessage,
LLMProvider,
LLMRequest,
LLMResponse,
LLMRole,
)
from app.providers.base import LLMProviderClient
class AnthropicProvider(LLMProviderClient):
name = LLMProvider.ANTHROPIC.value
api_key_env = "ANTHROPIC_API_KEY"
base_url = "https://api.anthropic.com/v1/messages"
anthropic_version = "2023-06-01"
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
self.ensure_stream_supported(request.stream)
system_prompt, chat_messages = self._convert_messages(request.messages)
payload = self.merge_payload(
{
"model": request.model,
"messages": chat_messages,
"max_tokens": request.max_tokens or 1024,
"temperature": request.temperature,
"top_p": request.top_p,
},
request.extra_params,
)
if system_prompt:
payload["system"] = system_prompt
headers = {
"x-api-key": self.api_key,
"anthropic-version": self.anthropic_version,
"content-type": "application/json",
}
try:
response = await client.post(self.base_url, json=payload, headers=headers)
response.raise_for_status()
except httpx.HTTPError as exc:
raise ProviderAPICallError(f"Anthropic request failed: {exc}") from exc
data: Dict[str, Any] = response.json()
message = self._build_message(data)
return LLMResponse(
provider=LLMProvider.ANTHROPIC,
model=data.get("model", request.model),
choices=[LLMChoice(index=0, message=message)],
raw=data,
)
@staticmethod
def _convert_messages(
messages: List[LLMMessage],
) -> Tuple[str | None, List[dict[str, Any]]]:
system_parts: List[str] = []
chat_payload: List[dict[str, Any]] = []
for msg in messages:
if msg.role == LLMRole.SYSTEM:
system_parts.append(msg.content)
continue
role = "user" if msg.role == LLMRole.USER else "assistant"
chat_payload.append(
{"role": role, "content": [{"type": "text", "text": msg.content}]}
)
system_prompt = "\n\n".join(system_parts) if system_parts else None
return system_prompt, chat_payload
@staticmethod
def _build_message(data: Dict[str, Any]) -> LLMMessage:
role = data.get("role", "assistant")
content_blocks = data.get("content", [])
text_parts = [
block.get("text", "")
for block in content_blocks
if isinstance(block, dict) and block.get("type") == "text"
]
content = "\n\n".join(part for part in text_parts if part)
return LLMMessage(role=role, content=content)
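
For illustration, the system-message hoisting done by _convert_messages can be checked in isolation (it is a staticmethod, so no API key is needed; values shown in comments follow from the code above):

from app.models import LLMMessage, LLMRole
from app.providers.anthropic import AnthropicProvider

msgs = [
    LLMMessage(role=LLMRole.SYSTEM, content="Answer in English."),
    LLMMessage(role=LLMRole.USER, content="What is GMV?"),
]
system_prompt, chat_messages = AnthropicProvider._convert_messages(msgs)
# system_prompt == "Answer in English."
# chat_messages == [{"role": "user", "content": [{"type": "text", "text": "What is GMV?"}]}]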

44
app/providers/base.py Normal file
View File

@ -0,0 +1,44 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any
import httpx
from app.exceptions import ProviderConfigurationError
from app.models import LLMRequest, LLMResponse
class LLMProviderClient(ABC):
"""Base class for provider-specific chat completion clients."""
name: str
api_key_env: str | None = None
supports_stream: bool = False
def __init__(self, api_key: str | None):
if self.api_key_env and not api_key:
raise ProviderConfigurationError(
f"Provider '{self.name}' requires environment variable '{self.api_key_env}'."
)
self.api_key = api_key or ""
@abstractmethod
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
"""Execute a chat completion call."""
@staticmethod
def merge_payload(base: dict[str, Any], extra: dict[str, Any] | None) -> dict[str, Any]:
"""Merge provider payload with optional extra params, ignoring None values."""
merged = {k: v for k, v in base.items() if v is not None}
if extra:
merged.update({k: v for k, v in extra.items() if v is not None})
return merged
def ensure_stream_supported(self, stream_requested: bool) -> None:
if stream_requested and not self.supports_stream:
raise ProviderConfigurationError(
f"Provider '{self.name}' does not support streaming mode."
)
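
A quick illustration of merge_payload semantics: None values are dropped from the base payload, and non-None extra params win on key collisions.

from app.providers.base import LLMProviderClient

base = {"model": "demo-model", "temperature": None, "max_tokens": 256}
extra = {"stop": ["###"], "temperature": 0.1}
print(LLMProviderClient.merge_payload(base, extra))
# {'model': 'demo-model', 'max_tokens': 256, 'stop': ['###'], 'temperature': 0.1}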

66
app/providers/deepseek.py Normal file
View File

@ -0,0 +1,66 @@
from __future__ import annotations
from typing import Any, Dict, List
import httpx
from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient
class DeepSeekProvider(LLMProviderClient):
name = LLMProvider.DEEPSEEK.value
api_key_env = "DEEPSEEK_API_KEY"
supports_stream = True
base_url = "https://api.deepseek.com/v1/chat/completions"
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
self.ensure_stream_supported(request.stream)
payload = self.merge_payload(
{
"model": request.model,
"messages": [msg.model_dump() for msg in request.messages],
"temperature": request.temperature,
"top_p": request.top_p,
"max_tokens": request.max_tokens,
"stream": request.stream,
},
request.extra_params,
)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
try:
response = await client.post(self.base_url, json=payload, headers=headers)
response.raise_for_status()
except httpx.HTTPError as exc:
raise ProviderAPICallError(f"DeepSeek request failed: {exc}") from exc
data: Dict[str, Any] = response.json()
choices = self._build_choices(data.get("choices", []))
return LLMResponse(
provider=LLMProvider.DEEPSEEK,
model=data.get("model", request.model),
choices=choices,
raw=data,
)
@staticmethod
def _build_choices(choices: List[dict[str, Any]]) -> List[LLMChoice]:
built: List[LLMChoice] = []
for choice in choices:
message_data = choice.get("message") or {}
message = LLMMessage(
role=message_data.get("role", "assistant"),
content=message_data.get("content", ""),
)
built.append(LLMChoice(index=choice.get("index", len(built)), message=message))
return built

112
app/providers/gemini.py Normal file
View File

@ -0,0 +1,112 @@
from __future__ import annotations
from typing import Any, Dict, List, Tuple
import httpx
from app.exceptions import ProviderAPICallError
from app.models import (
LLMChoice,
LLMMessage,
LLMProvider,
LLMRequest,
LLMResponse,
LLMRole,
)
from app.providers.base import LLMProviderClient
class GeminiProvider(LLMProviderClient):
name = LLMProvider.GEMINI.value
api_key_env = "GEMINI_API_KEY"
base_url = "https://generativelanguage.googleapis.com/v1beta"
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
self.ensure_stream_supported(request.stream)
system_instruction, contents = self._convert_messages(request.messages)
config = {
"temperature": request.temperature,
"topP": request.top_p,
"maxOutputTokens": request.max_tokens,
}
payload: Dict[str, Any] = self.merge_payload(
{"contents": contents}, request.extra_params
)
generation_config = {k: v for k, v in config.items() if v is not None}
if generation_config:
payload["generationConfig"] = generation_config
if system_instruction:
payload["systemInstruction"] = {
"role": "system",
"parts": [{"text": system_instruction}],
}
endpoint = f"{self.base_url}/models/{request.model}:generateContent?key={self.api_key}"
headers = {"Content-Type": "application/json"}
try:
response = await client.post(endpoint, json=payload, headers=headers)
response.raise_for_status()
except httpx.HTTPError as exc:
raise ProviderAPICallError(f"Gemini request failed: {exc}") from exc
data: Dict[str, Any] = response.json()
choices = self._build_choices(data.get("candidates", []))
return LLMResponse(
provider=LLMProvider.GEMINI,
model=request.model,
choices=choices,
raw=data,
)
@staticmethod
def _convert_messages(
messages: List[LLMMessage],
) -> Tuple[str | None, List[dict[str, Any]]]:
system_parts: List[str] = []
contents: List[dict[str, Any]] = []
for msg in messages:
if msg.role == LLMRole.SYSTEM:
system_parts.append(msg.content)
continue
role = "user" if msg.role == LLMRole.USER else "model"
contents.append({"role": role, "parts": [{"text": msg.content}]})
system_instruction = "\n\n".join(system_parts) if system_parts else None
return system_instruction, contents
@staticmethod
def _build_choices(candidates: List[dict[str, Any]]) -> List[LLMChoice]:
choices: List[LLMChoice] = []
for idx, candidate in enumerate(candidates):
content = candidate.get("content", {})
parts = content.get("parts", [])
text_parts = [
part.get("text", "")
for part in parts
if isinstance(part, dict) and part.get("text")
]
text = "\n\n".join(text_parts)
choices.append(
LLMChoice(
index=candidate.get("index", idx),
message=LLMMessage(role="assistant", content=text),
)
)
if not choices:
choices.append(
LLMChoice(
index=0,
message=LLMMessage(role="assistant", content=""),
)
)
return choices

66
app/providers/openai.py Normal file
View File

@ -0,0 +1,66 @@
from __future__ import annotations
from typing import Any, Dict, List
import httpx
from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient
class OpenAIProvider(LLMProviderClient):
name = LLMProvider.OPENAI.value
api_key_env = "OPENAI_API_KEY"
supports_stream = True
base_url = "https://api.openai.com/v1/chat/completions"
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
self.ensure_stream_supported(request.stream)
payload = self.merge_payload(
{
"model": request.model,
"messages": [msg.model_dump() for msg in request.messages],
"temperature": request.temperature,
"top_p": request.top_p,
"max_tokens": request.max_tokens,
"stream": request.stream,
},
request.extra_params,
)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
try:
response = await client.post(self.base_url, json=payload, headers=headers)
response.raise_for_status()
except httpx.HTTPError as exc:
raise ProviderAPICallError(f"OpenAI request failed: {exc}") from exc
data: Dict[str, Any] = response.json()
choices = self._build_choices(data.get("choices", []))
return LLMResponse(
provider=LLMProvider.OPENAI,
model=data.get("model", request.model),
choices=choices,
raw=data,
)
@staticmethod
def _build_choices(choices: List[dict[str, Any]]) -> List[LLMChoice]:
built: List[LLMChoice] = []
for choice in choices:
message_data = choice.get("message") or {}
message = LLMMessage(
role=message_data.get("role", "assistant"), # fallback to assistant
content=message_data.get("content", ""),
)
built.append(LLMChoice(index=choice.get("index", len(built)), message=message))
return built

77
app/providers/openrouter.py Normal file
View File

@ -0,0 +1,77 @@
from __future__ import annotations
import os
from typing import Any, Dict, List
import httpx
from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient
class OpenRouterProvider(LLMProviderClient):
name = LLMProvider.OPENROUTER.value
api_key_env = "OPENROUTER_API_KEY"
supports_stream = True
base_url = "https://openrouter.ai/api/v1/chat/completions"
def __init__(self, api_key: str | None):
super().__init__(api_key)
self.site_url = os.getenv("OPENROUTER_SITE_URL")
self.app_name = os.getenv("OPENROUTER_APP_NAME")
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
self.ensure_stream_supported(request.stream)
payload = self.merge_payload(
{
"model": request.model,
"messages": [msg.model_dump() for msg in request.messages],
"temperature": request.temperature,
"top_p": request.top_p,
"max_tokens": request.max_tokens,
"stream": request.stream,
},
request.extra_params,
)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
if self.site_url:
headers["HTTP-Referer"] = self.site_url
if self.app_name:
headers["X-Title"] = self.app_name
try:
response = await client.post(self.base_url, json=payload, headers=headers)
response.raise_for_status()
except httpx.HTTPError as exc:
raise ProviderAPICallError(f"OpenRouter request failed: {exc}") from exc
data: Dict[str, Any] = response.json()
choices = self._build_choices(data.get("choices", []))
return LLMResponse(
provider=LLMProvider.OPENROUTER,
model=data.get("model", request.model),
choices=choices,
raw=data,
)
@staticmethod
def _build_choices(choices: List[dict[str, Any]]) -> List[LLMChoice]:
built: List[LLMChoice] = []
for choice in choices:
message_data = choice.get("message") or {}
message = LLMMessage(
role=message_data.get("role", "assistant"),
content=message_data.get("content", ""),
)
built.append(LLMChoice(index=choice.get("index", len(built)), message=message))
return built

87
app/providers/qwen.py Normal file
View File

@ -0,0 +1,87 @@
from __future__ import annotations
from typing import Any, Dict, List
import httpx
from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient
class QwenProvider(LLMProviderClient):
name = LLMProvider.QWEN.value
api_key_env = "QWEN_API_KEY"
base_url = (
"https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"
)
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
self.ensure_stream_supported(request.stream)
parameters = {
"temperature": request.temperature,
"top_p": request.top_p,
}
if request.max_tokens is not None:
parameters["max_output_tokens"] = request.max_tokens
# Strip None values from parameters
parameters = {k: v for k, v in parameters.items() if v is not None}
payload: Dict[str, Any] = {
"model": request.model,
"input": {"messages": [msg.model_dump() for msg in request.messages]},
}
if parameters:
payload["parameters"] = parameters
payload = self.merge_payload(payload, request.extra_params)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
try:
response = await client.post(self.base_url, json=payload, headers=headers)
response.raise_for_status()
except httpx.HTTPError as exc:
raise ProviderAPICallError(f"Qwen request failed: {exc}") from exc
data: Dict[str, Any] = response.json()
choices = self._build_choices(data.get("output", {}))
return LLMResponse(
provider=LLMProvider.QWEN,
model=request.model,
choices=choices,
raw=data,
)
@staticmethod
def _build_choices(output: Dict[str, Any]) -> List[LLMChoice]:
choices_payload = output.get("choices", [])
if not choices_payload and output.get("text"):
return [
LLMChoice(
index=0,
message=LLMMessage(role="assistant", content=output["text"]),
)
]
choices: List[LLMChoice] = []
for idx, choice in enumerate(choices_payload):
message_data = choice.get("message") or {}
message = LLMMessage(
role=message_data.get("role", "assistant"),
content=message_data.get("content", ""),
)
choices.append(LLMChoice(index=choice.get("index", idx), message=message))
if not choices:
choices.append(
LLMChoice(index=0, message=LLMMessage(role="assistant", content=""))
)
return choices
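
DashScope responses can come back in two shapes, either a flat output.text or OpenAI-style output.choices; _build_choices normalizes both. The payloads below are illustrative only, not real API responses:

from app.providers.qwen import QwenProvider

print(QwenProvider._build_choices({"text": "你好"}))
print(QwenProvider._build_choices(
    {"choices": [{"message": {"role": "assistant", "content": "hi"}}]}
))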

3
app/services/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .gateway import LLMGateway
__all__ = ["LLMGateway"]


53
app/services/gateway.py Normal file
View File

@ -0,0 +1,53 @@
from __future__ import annotations
import os
from typing import Dict, Type
import httpx
from app.exceptions import ProviderConfigurationError
from app.models import LLMProvider, LLMRequest, LLMResponse
from app.providers import (
AnthropicProvider,
DeepSeekProvider,
GeminiProvider,
LLMProviderClient,
OpenAIProvider,
OpenRouterProvider,
QwenProvider,
)
class LLMGateway:
"""Simple registry that dispatches chat requests to provider clients."""
def __init__(self) -> None:
self._providers: Dict[LLMProvider, LLMProviderClient] = {}
self._factory: Dict[LLMProvider, Type[LLMProviderClient]] = {
LLMProvider.OPENAI: OpenAIProvider,
LLMProvider.ANTHROPIC: AnthropicProvider,
LLMProvider.OPENROUTER: OpenRouterProvider,
LLMProvider.GEMINI: GeminiProvider,
LLMProvider.QWEN: QwenProvider,
LLMProvider.DEEPSEEK: DeepSeekProvider,
}
def get_provider(self, provider: LLMProvider) -> LLMProviderClient:
if provider not in self._factory:
raise ProviderConfigurationError(f"Unsupported provider '{provider.value}'.")
if provider not in self._providers:
self._providers[provider] = self._build_provider(provider)
return self._providers[provider]
def _build_provider(self, provider: LLMProvider) -> LLMProviderClient:
provider_cls = self._factory[provider]
api_key_env = getattr(provider_cls, "api_key_env", None)
api_key = os.getenv(api_key_env) if api_key_env else None
return provider_cls(api_key)
async def chat(
self, request: LLMRequest, client: httpx.AsyncClient
) -> LLMResponse:
provider_client = self.get_provider(request.provider)
return await provider_client.chat(request, client)
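
The gateway can also be used outside FastAPI. A minimal sketch, assuming DEEPSEEK_API_KEY (or the key for whichever provider you pick) is exported and the model name is a placeholder:

import asyncio
import httpx

from app.models import LLMMessage, LLMProvider, LLMRequest, LLMRole
from app.services import LLMGateway

async def main() -> None:
    gateway = LLMGateway()
    request = LLMRequest(
        provider=LLMProvider.DEEPSEEK,
        model="deepseek-chat",  # example model name
        messages=[LLMMessage(role=LLMRole.USER, content="ping")],
    )
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await gateway.chat(request, client)
    print(response.choices[0].message.content)

asyncio.run(main())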

91
app/services/import_analysis.py Normal file
View File

@ -0,0 +1,91 @@
from __future__ import annotations
from typing import List, Tuple
from app.models import (
DataImportAnalysisRequest,
LLMMessage,
LLMProvider,
LLMRole,
)
def resolve_provider_from_model(llm_model: str) -> Tuple[LLMProvider, str]:
"""Resolve provider based on the llm_model string.
The llm_model may be provided as 'provider:model' or 'provider/model'.
If no provider prefix is present, fall back to an educated guess based on common model-name patterns.
"""
normalized = llm_model.strip()
provider_hint: str | None = None
model_name = normalized
for delimiter in (":", "/", "|"):
if delimiter in normalized:
provider_hint, model_name = normalized.split(delimiter, 1)
provider_hint = provider_hint.strip().lower()
model_name = model_name.strip()
break
provider_map = {provider.value: provider for provider in LLMProvider}
if provider_hint:
if provider_hint not in provider_map:
raise ValueError(
f"Unsupported provider '{provider_hint}'. Expected one of: {', '.join(provider_map.keys())}."
)
return provider_map[provider_hint], model_name
return _guess_provider_from_model(model_name), model_name
def _guess_provider_from_model(model_name: str) -> LLMProvider:
lowered = model_name.lower()
if lowered.startswith(("gpt", "o1", "text-", "dall-e", "whisper")):
return LLMProvider.OPENAI
if lowered.startswith(("claude", "anthropic")):
return LLMProvider.ANTHROPIC
if lowered.startswith(("gemini", "models/gemini")):
return LLMProvider.GEMINI
if lowered.startswith("qwen"):
return LLMProvider.QWEN
if lowered.startswith("deepseek"):
return LLMProvider.DEEPSEEK
if lowered.startswith(("openrouter", "router-")):
return LLMProvider.OPENROUTER
supported = ", ".join(provider.value for provider in LLMProvider)
raise ValueError(
f"Unable to infer provider from model '{model_name}'. "
f"Please prefix with 'provider:model'. Supported providers: {supported}."
)
def build_import_messages(
request: DataImportAnalysisRequest,
) -> List[LLMMessage]:
"""Create system and user messages for the import analysis prompt."""
headers_formatted = "\n".join(f"- {header}" for header in request.table_headers)
system_prompt = (
"你是一名数据导入识别助手。请根据给定的表头和示例数据,判断字段含义、"
"典型数据类型以及潜在的数据质量问题。最终请返回一个结构化的JSON。\n"
"JSON结构需包含: field_summaries (数组, 每项含 header、meaning、data_type、quality_notes), "
"detected_issues (字符串数组),以及 overall_suggestion (字符串)。"
)
user_prompt = (
f"导入记录ID: {request.import_record_id}\n\n"
"表头信息:\n"
f"{headers_formatted}\n\n"
"示例数据:\n"
f"{request.example_data}\n\n"
"请仔细分析示例数据与表头之间的对应关系并返回符合上述JSON结构的内容。"
)
return [
LLMMessage(role=LLMRole.SYSTEM, content=system_prompt),
LLMMessage(role=LLMRole.USER, content=user_prompt),
]
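
A quick sanity check of the resolution rules above: an explicit prefix wins, otherwise the provider is guessed from the model name, and unrecognized names raise ValueError (which the endpoint maps to HTTP 422).

from app.services.import_analysis import resolve_provider_from_model

print(resolve_provider_from_model("anthropic:claude-3-haiku"))  # (LLMProvider.ANTHROPIC, 'claude-3-haiku')
print(resolve_provider_from_model("qwen-plus"))                 # (LLMProvider.QWEN, 'qwen-plus')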

View File

@ -0,0 +1,47 @@
系统角色(System)
你是“数据画像抽取器”。输入是一段 Great Expectations 的 profiling/validation 结果 JSON,
可能包含列级期望(expect_*)、统计、样例值、类型推断等;也可能带表级/批次元数据。
请将其归一化为一个可被程序消费的“表画像”JSON,对不确定项给出置信度与理由。
禁止臆造不存在的列、时间范围或数值。
用户消息(User)
【输入GE结果JSON】
{{GE_RESULT_JSON}}
【输出要求】(只输出JSON,不要解释文字)
{
"table": "<库.表 或 表名>",
"row_count": <int|null>, // 若未知可为 null
"role": "fact|dimension|unknown", // 依据指标/维度占比与唯一性启发式
"grain": ["<列1>", "<列2>", ...], // 事实粒度猜测(如含 dt/店/类目)
"time": { "column": "<name>|null", "granularity": "day|week|month|unknown", "range": ["YYYY-MM-DD","YYYY-MM-DD"]|null, "has_gaps": true|false|null },
"columns": [
{
"name": "<col>",
"dtype": "<ge推断/物理类型>",
"semantic_type": "dimension|metric|time|text|id|unknown",
"null_rate": <0~1|null>,
"distinct_count": <int|null>,
"distinct_ratio": <0~1|null>,
"stats": { "min": <number|string|null>,"max": <number|string|null>,"mean": <number|null>,"std": <number|null>,"skewness": <number|null> },
"enumish": true|false|null, // 低熵/可枚举
"top_values": [{"value":"<v>","pct":<0~1>}, ...],// 取前K个≤10
"pk_candidate_score": <0~1>, // 唯一性+非空综合评分
"metric_candidate_score": <0~1>, // 数值/偏态/业务词命中
"comment": "<列注释或GE描述|可为空>"
}
],
"primary_key_candidates": [["colA","colB"], ...], // 依据 unique/compound unique 期望
"fk_candidates": [{"from":"<col>","to":"<dim_table(col)>","confidence":<0~1>}],
"quality": {
"failed_expectations": [{"name":"<expect_*>","column":"<col|table>","summary":"<一句话>"}],
"warning_hints": ["空值率>0.2的列: ...", "时间列存在缺口: ..."]
},
"confidence_notes": ["<为什么判定role/grain/time列>"]
}
【判定规则(简要)】
- time列:类型为日期/时间 OR 命中 dt/date/day 等命名;若有 min/max 可给出 range;若间隔缺口≥1天,记 has_gaps=true。
- semantic_type:数值+右偏/方差大→更偏 metric;高唯一/ID命名→id;高基数+文本→text;低熵+有限取值→dimension。
- role:metric列占比高且存在time列→倾向 fact;几乎全是枚举/ID且少数值→dimension。
- 置信不高时给出 null 或 unknown,并写入 confidence_notes。

View File

@ -0,0 +1,127 @@
E-commerce Customer Order Behavior Dataset
A synthetic e-commerce dataset containing 10,000 orders with realistic customer behavior patterns, suitable for e-commerce analytics and machine learning tasks.
Dataset Card for E-commerce Orders
Dataset Summary
This dataset simulates customer order behavior in an e-commerce platform, containing detailed information about orders, customers, products, and delivery patterns. The data is synthetically generated with realistic distributions and patterns.
Supported Tasks
regression: Predict order quantities or prices
classification: Predict delivery status or customer segments
clustering: Identify customer behavior patterns
time-series-forecasting: Analyze order patterns over time
Languages
Not applicable (tabular data)
Dataset Structure
Data Instances
Each instance represents a single e-commerce order with the following fields:
{
'order_id': '5ea92c47-c5b2-4bdd-8a50-d77efd77ec89',
'customer_id': 2350,
'product_id': 995,
'category': 'Electronics',
'price': 403.17,
'quantity': 3,
'order_date': '2024-04-20 14:59:58.897063',
'shipping_date': '2024-04-22 14:59:58.897063',
'delivery_status': 'Delivered',
'payment_method': 'PayPal',
'device_type': 'Mobile',
'channel': 'Paid Search',
'shipping_address': '72166 Cunningham Crescent East Nicholasside Mississippi 85568',
'billing_address': '38199 Edwin Plain Johnborough Maine 81826',
'customer_segment': 'Returning'
}
Data Fields
| Field Name | Type | Description | Value Range |
|---|---|---|---|
| order_id | string | Unique order identifier (UUID4) | - |
| customer_id | int | Customer identifier | 1-3,000 |
| product_id | int | Product identifier | 1-1,000 |
| category | string | Product category | Electronics, Clothing, Home, Books, Beauty, Toys |
| price | float | Product price | $5.00-$500.00 |
| quantity | int | Order quantity | 1-10 |
| order_date | datetime | Order placement timestamp | Last 12 months |
| shipping_date | datetime | Shipping timestamp | 1-7 days after order_date |
| delivery_status | string | Delivery status | Pending, Shipped, Delivered, Returned |
| payment_method | string | Payment method used | Credit Card, PayPal, Debit Card, Apple Pay, Google Pay |
| device_type | string | Ordering device | Desktop, Mobile, Tablet |
| channel | string | Marketing channel | Organic, Paid Search, Email, Social |
| shipping_address | string | Delivery address | Street, City, State, ZIP |
| billing_address | string | Billing address | Street, City, State, ZIP |
| customer_segment | string | Customer type | New, Returning, VIP |
Data Splits
This dataset is provided as a single CSV file without splits.
Dataset Creation
Source Data
This is a synthetic dataset generated using Python with pandas, numpy, and Faker libraries. The data generation process ensures:
Realistic customer behavior patterns
Proper data distributions
Valid relationships between fields
Realistic address formatting
Annotations
No manual annotations (synthetic data)
Considerations for Using the Data
Social Impact of Dataset
This dataset is designed for:
Development of e-commerce analytics systems
Testing of order processing systems
Training of machine learning models for e-commerce
Educational purposes in data science
Discussion of Biases
Because this is a synthetic dataset, care has been taken to:
Use realistic distributions for order patterns
Maintain proper relationships between dates
Create realistic customer segments
Avoid demographic biases in address generation
However, users should note that:
The data patterns are simplified compared to real e-commerce data
The customer behavior patterns are based on general assumptions
Geographic distribution might not reflect real-world patterns
Dataset Statistics
Total Records: 10,000
Distribution Statistics:
Delivery Status:
Delivered: 70%
Shipped: 20%
Pending: 5%
Returned: 5%
Customer Segments:
VIP: ~15%
Returning: ~35%
New: ~50%
Loading and Usage
Using Hugging Face Datasets:
from datasets import load_dataset
dataset = load_dataset("path/to/e-commerce-orders")
# Example: Load as pandas DataFrame
df = dataset['train'].to_pandas()
# Example: Access specific columns
orders = dataset['train']['order_id']
prices = dataset['train']['price']
Data Quality
The dataset has been validated to ensure:
No missing values
Proper value ranges
Valid categorical values
Proper date relationships
Unique order IDs
Valid address formats
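
For reference, the documented guarantees can be re-checked with pandas. This is a sketch only; it assumes the dataset was exported to a local CSV named ecommerce_orders.csv, which is not a file name prescribed by the dataset card.

import pandas as pd

df = pd.read_csv("ecommerce_orders.csv", parse_dates=["order_date", "shipping_date"])

assert df["order_id"].is_unique                          # unique order IDs
assert df.notna().all().all()                            # no missing values
assert (df["shipping_date"] >= df["order_date"]).all()   # proper date relationships

print(df["delivery_status"].value_counts(normalize=True))   # documented as ~70/20/5/5
print(df["customer_segment"].value_counts(normalize=True))  # documented as ~50/35/15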

523
demo/snippet.json Normal file
View File

@ -0,0 +1,523 @@
[
{
"id": "snpt_daily_gmv_trend",
"title": "日GMV趋势",
"desc": "按日统计GMV与订单量趋势",
"type": "trend",
"applicability": {
"required_columns": [
"order_date",
"price",
"quantity"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": null,
"fk_join_available": false,
"notes": [
"GMV=SUM(price*quantity)",
"请避免选择地址等PII字段"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
}
],
"dialect_sql": {
"mysql": "SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv, COUNT(*) AS orders\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY dt\nORDER BY dt;"
},
"business_caliber": "GMV口径price×quantity订单量记录条数粒度=日。",
"examples": [
"近30天GMV趋势",
"2025Q1每日GMV与订单数"
]
},
{
"id": "snpt_daily_gmv_ma7",
"title": "7日GMV均线",
"desc": "GMV按日与7日滑动平均",
"type": "trend",
"applicability": {
"required_columns": [
"order_date",
"price",
"quantity"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": null,
"fk_join_available": false,
"notes": [
"窗口=包含当日的过去7天",
"若日期有缺口,均线可能偏移"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
}
],
"dialect_sql": {
"mysql": "WITH d AS (\n SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n)\nSELECT dt,\n gmv,\n AVG(gmv) OVER (ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS gmv_ma7\nFROM d\nORDER BY dt;"
},
"business_caliber": "GMV=price×quantity窗口=7天含当日按自然日排序计算。",
"examples": [
"本季度GMV与7日均线",
"促销期走势平滑对比"
]
},
{
"id": "snpt_yoy_daily_gmv",
"title": "GMV同比",
"desc": "对比去年同日GMV与同比%",
"type": "ratio",
"applicability": {
"required_columns": [
"order_date",
"price",
"quantity"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": null,
"fk_join_available": false,
"notes": [
"需要查询窗口覆盖到去年的对应日期",
"闰年按日期对齐处理"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
}
],
"dialect_sql": {
"mysql": "WITH cur AS (\n SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n),\nprev AS (\n SELECT DATE(DATE_SUB(order_date, INTERVAL 1 YEAR)) AS dt, SUM(price*quantity) AS gmv_last\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN DATE_SUB({{start_date}}, INTERVAL 1 YEAR) AND DATE_SUB({{end_date}}, INTERVAL 1 YEAR)\n GROUP BY DATE(DATE_SUB(order_date, INTERVAL 1 YEAR))\n)\nSELECT c.dt,\n c.gmv,\n p.gmv_last,\n CASE WHEN p.gmv_last IS NULL OR p.gmv_last=0 THEN NULL ELSE (c.gmv - p.gmv_last)/p.gmv_last END AS yoy\nFROM cur c LEFT JOIN prev p ON c.dt = p.dt\nORDER BY c.dt;"
},
"business_caliber": "同比=当日GMV与去年同日GMV之差/去年同日GMVGMV=price×quantity。",
"examples": [
"最近90天GMV同比曲线",
"节假日同比表现"
]
},
{
"id": "snpt_topn_category_gmv",
"title": "类目GMV排行",
"desc": "按类目统计GMV并取TopN",
"type": "topn",
"applicability": {
"required_columns": [
"order_date",
"category",
"price",
"quantity"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": 6,
"fk_join_available": false,
"notes": [
"类目枚举较少建议TopN<=6用于展示",
"可追加订单量与件数"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
},
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT category,\n SUM(price*quantity) AS gmv,\n COUNT(*) AS orders,\n SUM(quantity) AS qty\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY category\nORDER BY gmv DESC\nLIMIT {{top_n}};"
},
"business_caliber": "GMV=price×quantity统计范围=指定日期内;粒度=类目。",
"examples": [
"上月类目Top5",
"本季度类目GMV结构"
]
},
{
"id": "snpt_share_channel",
"title": "渠道GMV占比",
"desc": "统计各渠道GMV及占比",
"type": "ratio",
"applicability": {
"required_columns": [
"order_date",
"channel",
"price",
"quantity"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": 4,
"fk_join_available": false,
"notes": [
"占比以总GMV为分母占比之和≈100%",
"适合饼图/堆叠柱"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
}
],
"dialect_sql": {
"mysql": "WITH base AS (\n SELECT channel, SUM(price*quantity) AS gmv\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY channel\n), total AS (\n SELECT SUM(gmv) AS tg FROM base\n)\nSELECT b.channel, b.gmv, b.gmv/t.tg AS gmv_share\nFROM base b CROSS JOIN total t\nORDER BY b.gmv DESC;"
},
"business_caliber": "渠道GMV占比=渠道GMV/全部渠道GMV时间范围由参数限定。",
"examples": [
"本月各渠道占比",
"Q1渠道结构对比"
]
},
{
"id": "snpt_topn_product_gmv",
"title": "商品GMV排行",
"desc": "按商品ID统计GMV并取TopN",
"type": "topn",
"applicability": {
"required_columns": [
"order_date",
"product_id",
"price",
"quantity"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": 1000,
"fk_join_available": true,
"notes": [
"product_id基数较高建议LIMIT<=50",
"可与商品维表联查名称等属性"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
},
{
"name": "top_n",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT product_id,\n SUM(price*quantity) AS gmv,\n SUM(quantity) AS qty,\n COUNT(*) AS orders\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY product_id\nORDER BY gmv DESC\nLIMIT {{top_n}};"
},
"business_caliber": "GMV=price×quantity粒度=商品ID。",
"examples": [
"上周热销商品Top20",
"年度销量Top10商品"
]
},
{
"id": "snpt_join_product_dim",
"title": "商品维表联查",
"desc": "以product_id关联商品维表或使用纯ID",
"type": "join",
"applicability": {
"required_columns": [
"product_id"
],
"time_column": null,
"constraints": {
"dim_cardinality_hint": 1000,
"fk_join_available": true,
"notes": [
"若无维表则保留纯ID版输出",
"谨慎选择PII字段勿输出地址类字段"
]
}
},
"variables": [
{
"name": "dim_product",
"type": "identifier"
},
{
"name": "select_cols",
"type": "string",
"default": "f.product_id, f.price, f.quantity"
}
],
"dialect_sql": {
"mysql": "-- 命名版\nSELECT {{select_cols}}\nFROM {{table}} f\nLEFT JOIN {{dim_product}} d ON f.product_id = d.product_id;\n\n-- 纯ID版\nSELECT product_id, price, quantity FROM {{table}};"
},
"business_caliber": "外键product_id→商品维表主键度量来源于事实表price与quantity。",
"examples": [
"联查商品名称后做TopN",
"仅用ID进行商品分析"
]
},
{
"id": "snpt_join_customer_dim",
"title": "客户维表联查",
"desc": "以customer_id关联客户维表或使用纯ID",
"type": "join",
"applicability": {
"required_columns": [
"customer_id"
],
"time_column": null,
"constraints": {
"dim_cardinality_hint": 2713,
"fk_join_available": true,
"notes": [
"如无维表可直接按customer_id聚合",
"避免输出shipping_address/billing_address等PII"
]
}
},
"variables": [
{
"name": "dim_customer",
"type": "identifier"
},
{
"name": "select_cols",
"type": "string",
"default": "c.customer_name, f.customer_id, SUM(f.price*f.quantity) AS gmv"
}
],
"dialect_sql": {
"mysql": "-- 命名版\nSELECT {{select_cols}}\nFROM {{table}} f\nLEFT JOIN {{dim_customer}} c ON f.customer_id = c.customer_id\nGROUP BY c.customer_name, f.customer_id;\n\n-- 纯ID版\nSELECT customer_id, SUM(price*quantity) AS gmv\nFROM {{table}}\nGROUP BY customer_id;"
},
"business_caliber": "外键customer_id→客户维表主键GMV=price×quantity。",
"examples": [
"客户分群GMV",
"重点客户消费额排行"
]
},
{
"id": "snpt_quality_dup_order",
"title": "主键重复检查",
"desc": "检查order_id唯一性并抽样输出",
"type": "quality",
"applicability": {
"required_columns": [
"order_id"
],
"time_column": null,
"constraints": {
"dim_cardinality_hint": 10000,
"fk_join_available": false,
"notes": [
"画像显示order_id应唯一若结果非空为异常"
]
}
},
"variables": [
{
"name": "limit_sample",
"type": "int",
"default": 50
}
],
"dialect_sql": {
"mysql": "WITH d AS (\n SELECT order_id, COUNT(*) AS cnt\n FROM {{table}}\n GROUP BY order_id\n HAVING COUNT(*)>1\n)\nSELECT * FROM d LIMIT {{limit_sample}};"
},
"business_caliber": "主键口径order_id全表唯一用于数据质量预警与排查。",
"examples": [
"是否存在重复订单?",
"查看重复订单样本"
]
},
{
"id": "snpt_quality_price_outlier",
"title": "价格异常检测",
"desc": "基于当日均值±3σ识别异常价",
"type": "quality",
"applicability": {
"required_columns": [
"order_date",
"price"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": null,
"fk_join_available": false,
"notes": [
"仅质量预警,不直接代表业务错误",
"当天样本过少时波动较大"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
},
{
"name": "limit_sample",
"type": "int",
"default": 100
}
],
"dialect_sql": {
"mysql": "WITH stats AS (\n SELECT DATE(order_date) AS dt, AVG(price) AS mu, STDDEV_POP(price) AS sigma\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n)\nSELECT f.*\nFROM {{table}} f\nJOIN stats s ON DATE(f.order_date)=s.dt\nWHERE (f.price > s.mu + 3*s.sigma OR f.price < s.mu - 3*s.sigma)\nLIMIT {{limit_sample}};"
},
"business_caliber": "异常定义价格超出当日均值±3×标准差总体标准差。",
"examples": [
"近30天价格异常样本",
"促销期异常价监控"
]
},
{
"id": "snpt_sample_recent_orders",
"title": "近期明细抽样",
"desc": "抽样查看近期订单核心字段",
"type": "sample",
"applicability": {
"required_columns": [
"order_date",
"order_id",
"customer_id",
"product_id",
"category",
"price",
"quantity",
"channel",
"payment_method",
"delivery_status"
],
"time_column": "order_date",
"constraints": {
"dim_cardinality_hint": null,
"fk_join_available": true,
"notes": [
"为保护隐私不展示shipping_address与billing_address",
"仅用于人工核验"
]
}
},
"variables": [
{
"name": "start_date",
"type": "date"
},
{
"name": "end_date",
"type": "date"
},
{
"name": "limit_rows",
"type": "int",
"default": 100
}
],
"dialect_sql": {
"mysql": "SELECT DATE(order_date) AS dt,\n order_id, customer_id, product_id, category,\n price, quantity, channel, payment_method, delivery_status\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nORDER BY dt DESC\nLIMIT {{limit_rows}};"
},
"business_caliber": "明细抽样用于数据核验不输出PII地址信息。",
"examples": [
"抽样查看上周订单",
"核对节假日订单明细"
]
},
{
"id": "snpt_filter_paid_delivered",
"title": "支付已送达筛选",
"desc": "过滤支付方式为信用卡且配送状态为已送达",
"type": "sample",
"applicability": {
"required_columns": [
"payment_method",
"delivery_status"
],
"time_column": null,
"constraints": {
"dim_cardinality_hint": 5,
"fk_join_available": false,
"notes": [
"此片段为WHERE条件模板可拼接到任意查询",
"delivery_status枚举包含Delivered/Pending/Returned/Shipped"
]
}
},
"variables": [],
"dialect_sql": {
"mysql": "WHERE payment_method = 'Credit Card' AND delivery_status = 'Delivered'"
},
"business_caliber": "口径:支付渠道=信用卡;物流状态=已送达Delivered。可与时间或维度条件叠加。",
"examples": [
"筛选信用卡已送达订单",
"在TopN商品中仅看已送达信用卡订单"
]
},
{
"id": "snpt_filter_device_channel",
"title": "设备渠道筛选",
"desc": "按设备类型与渠道过滤分析范围",
"type": "sample",
"applicability": {
"required_columns": [
"device_type",
"channel"
],
"time_column": null,
"constraints": {
"dim_cardinality_hint": 7,
"fk_join_available": false,
"notes": [
"device_type枚举Desktop/Mobile/Tablet",
"channel枚举Email/Organic/Paid Search/Social"
]
}
},
"variables": [],
"dialect_sql": {
"mysql": "WHERE device_type IN ('Mobile','Desktop') AND channel IN ('Paid Search','Social')"
},
"business_caliber": "限制分析在指定设备与渠道可直接作为WHERE子句片段复用。",
"examples": [
"仅看移动端付费渠道GMV",
"桌面+社媒渠道订单明细"
]
}
]
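
The {{name}} placeholders in dialect_sql suggest simple template substitution. Below is a minimal rendering sketch; the substitution strategy is an assumption for illustration, not something this commit implements.

import json
import re

with open("demo/snippet.json", encoding="utf-8") as fh:
    snippets = {s["id"]: s for s in json.load(fh)}

def render(snippet_id: str, **params: object) -> str:
    # Replace each {{var}} in the MySQL template with the supplied parameter value.
    sql = snippets[snippet_id]["dialect_sql"]["mysql"]
    return re.sub(r"\{\{(\w+)\}\}", lambda m: str(params[m.group(1)]), sql)

print(render(
    "snpt_topn_category_gmv",
    table="ecommerce_orders",
    start_date="'2025-01-01'",
    end_date="'2025-03-31'",
    top_n=5,
))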

View File

@ -0,0 +1,499 @@
[
{
"id": "snpt_daily_gmv_trend",
"aliases": [
{
"text": "每日GMV走势",
"tone": "中性"
},
{
"text": "日销售额趋势",
"tone": "中性"
},
{
"text": "每天卖了多少",
"tone": "口语"
},
{
"text": "按日GMV曲线",
"tone": "专业"
}
],
"keywords": [
"GMV",
"销售额",
"日趋势",
"每日",
"订单量",
"orders",
"price",
"quantity",
"order_date",
"time series",
"趋势图",
"按日聚合"
],
"intent_tags": [
"trend"
]
},
{
"id": "snpt_daily_gmv_ma7",
"aliases": [
{
"text": "GMV七日均线",
"tone": "专业"
},
{
"text": "7天滑动平均",
"tone": "中性"
},
{
"text": "GMV周均走势",
"tone": "中性"
},
{
"text": "GMV平滑曲线",
"tone": "专业"
}
],
"keywords": [
"GMV",
"移动平均",
"MA7",
"七日均线",
"滑动窗口",
"time series",
"order_date",
"price",
"quantity",
"平滑",
"趋势",
"按日聚合"
],
"intent_tags": [
"trend"
]
},
{
"id": "snpt_yoy_daily_gmv",
"aliases": [
{
"text": "GMV日同比",
"tone": "专业"
},
{
"text": "每日同比增速",
"tone": "中性"
},
{
"text": "跟去年同日比",
"tone": "口语"
},
{
"text": "GMV YoY",
"tone": "专业"
}
],
"keywords": [
"同比",
"YoY",
"GMV",
"去年同日",
"增长率",
"price",
"quantity",
"order_date",
"对比分析",
"比值",
"日粒度",
"ratio"
],
"intent_tags": [
"ratio"
]
},
{
"id": "snpt_topn_category_gmv",
"aliases": [
{
"text": "类目GMV排行",
"tone": "中性"
},
{
"text": "类目TopN销量",
"tone": "中性"
},
{
"text": "哪个分类最卖",
"tone": "口语"
},
{
"text": "按类目GMV排序",
"tone": "专业"
}
],
"keywords": [
"TopN",
"分类",
"类目",
"category",
"GMV",
"price",
"quantity",
"排行",
"榜单",
"按类目聚合",
"订单量",
"销量"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_share_channel",
"aliases": [
{
"text": "渠道GMV占比",
"tone": "中性"
},
{
"text": "各渠道份额",
"tone": "中性"
},
{
"text": "哪个渠道占多",
"tone": "口语"
},
{
"text": "渠道结构占比",
"tone": "专业"
}
],
"keywords": [
"占比",
"份额",
"share",
"channel",
"GMV",
"price",
"quantity",
"比例",
"结构分析",
"按渠道聚合",
"饼图",
"堆叠"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_topn_product_gmv",
"aliases": [
{
"text": "商品GMV排行",
"tone": "中性"
},
{
"text": "热销商品TopN",
"tone": "中性"
},
{
"text": "哪款卖得最好",
"tone": "口语"
},
{
"text": "按商品GMV排序",
"tone": "专业"
}
],
"keywords": [
"TopN",
"product_id",
"商品",
"GMV",
"price",
"quantity",
"热销",
"排行",
"销量",
"订单数",
"高基数",
"榜单"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_join_product_dim",
"aliases": [
{
"text": "关联商品维度",
"tone": "专业"
},
{
"text": "商品ID联表",
"tone": "中性"
},
{
"text": "把商品名连上",
"tone": "口语"
},
{
"text": "product维表join",
"tone": "专业"
}
],
"keywords": [
"join",
"维表",
"product_id",
"维度扩展",
"明细补充",
"维度属性",
"联表查询",
"外键",
"选择列",
"维度贴标签",
"by id",
"映射"
],
"intent_tags": [
"by_dimension"
]
},
{
"id": "snpt_join_customer_dim",
"aliases": [
{
"text": "关联客户维度",
"tone": "专业"
},
{
"text": "客户ID联表",
"tone": "中性"
},
{
"text": "把客户信息补齐",
"tone": "口语"
},
{
"text": "customer维表join",
"tone": "专业"
}
],
"keywords": [
"join",
"维表",
"customer_id",
"客户属性",
"GMV聚合",
"外键关联",
"联表查询",
"ID映射",
"维度丰富",
"分群分析",
"by id",
"扩展字段"
],
"intent_tags": [
"by_dimension"
]
},
{
"id": "snpt_quality_dup_order",
"aliases": [
{
"text": "订单主键去重检",
"tone": "专业"
},
{
"text": "重复order_id查找",
"tone": "中性"
},
{
"text": "有没重复订单",
"tone": "口语"
},
{
"text": "主键唯一性校验",
"tone": "专业"
}
],
"keywords": [
"数据质量",
"重复",
"去重",
"order_id",
"唯一性",
"主键检查",
"异常数据",
"质量预警",
"count>1",
"样本抽取",
"校验",
"重复检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_quality_price_outlier",
"aliases": [
{
"text": "价格3σ异常检",
"tone": "专业"
},
{
"text": "当日异常价格",
"tone": "中性"
},
{
"text": "看哪单价格怪",
"tone": "口语"
},
{
"text": "价格离群监控",
"tone": "专业"
}
],
"keywords": [
"异常检测",
"3σ",
"stddev",
"价格",
"price",
"离群点",
"质量规则",
"time series",
"order_date",
"阈值告警",
"数据监控",
"波动"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_sample_recent_orders",
"aliases": [
{
"text": "近期明细抽样",
"tone": "中性"
},
{
"text": "抽查最近订单",
"tone": "口语"
},
{
"text": "近期订单样本",
"tone": "中性"
},
{
"text": "核验明细抽样",
"tone": "专业"
}
],
"keywords": [
"抽样",
"sample",
"明细",
"order_date",
"order_id",
"customer_id",
"product_id",
"category",
"channel",
"payment_method",
"delivery_status",
"核验"
],
"intent_tags": [
"by_dimension"
]
},
{
"id": "snpt_filter_paid_delivered",
"aliases": [
{
"text": "信用卡送达筛选",
"tone": "中性"
},
{
"text": "只看信用卡已送达",
"tone": "口语"
},
{
"text": "信用卡且已送达",
"tone": "中性"
},
{
"text": "付款信用卡已送达",
"tone": "专业"
}
],
"keywords": [
"支付方式",
"信用卡",
"Credit Card",
"配送状态",
"Delivered",
"已送达",
"过滤条件",
"where子句",
"订单筛选",
"支付渠道",
"状态筛选",
"条件片段"
],
"intent_tags": [
"by_dimension"
]
},
{
"id": "snpt_filter_device_channel",
"aliases": [
{
"text": "设备渠道筛选",
"tone": "中性"
},
{
"text": "只看移动付费社媒",
"tone": "口语"
},
{
"text": "设备+渠道过滤",
"tone": "专业"
},
{
"text": "端与渠道条件",
"tone": "中性"
}
],
"keywords": [
"device_type",
"channel",
"设备类型",
"渠道",
"过滤条件",
"where子句",
"Mobile",
"Desktop",
"Paid Search",
"Social",
"范围限定",
"条件片段"
],
"intent_tags": [
"by_dimension"
]
}
]

View File

@ -0,0 +1,52 @@
系统角色(System)
你是“SQL片段别名生成器”。
输入为一个或多个 SQL 片段对象(来自 snippet.json),输出为针对每个片段生成的多样化别名(口语 / 中性 / 专业)、关键词与意图标签。
要求逐个处理所有片段对象,输出同样数量的 JSON 元素。
用户消息(User)
【上下文】
SQL片段对象数组{{SNIPPET_ARRAY}} // snippet.json中的一个或多个片段
【任务要求】
请针对输入数组中的 每个 SQL 片段,输出一个 JSON 对象,结构如下:
{
"id": "<与输入片段id一致>",
"aliases": [
{"text": "…", "tone": "口语|中性|专业"},
{"text": "…", "tone": "专业"}
],
"keywords": [
"GMV","销售额","TopN","category","类目","趋势","同比","客户","订单","质量","异常检测","join","过滤","sample"
],
"intent_tags": ["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
}
生成逻辑规范
1.逐条输出:输入数组中每个片段对应一个输出对象(id 保持一致)。
2.aliases生成
至少 3 个别名,分别覆盖语气类型:口语 / 中性 / 专业。
每个别名≤20字,语义需等价,不得添加不存在的字段或业务口径。
示例:
GMV趋势分析中性
每天卖多少钱(口语)
按日GMV曲线专业
3.keywords生成
8~15个关键词需涵盖片段核心维度、指标、分析类型和语义近义词。
中英文混合(如 "GMV"/"销售额"、"同比"/"YoY"、"类目"/"category" 等)。
包含用于匹配的分析意图关键词(如 “趋势”、“排行”、“占比”、“质量检查”、“过滤” 等)。
4.intent_tags生成
从以下集合中选取与片段type及用途一致
["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
若为条件片段(WHERE句型),补充 "filter";若含维度分组逻辑,补充 "by_dimension"。
5.语言与内容要求
保持正式书面风格,不添加解释说明。
只输出JSON数组,不包含文字描述或额外文本。

View File

@ -0,0 +1,46 @@
系统角色(System)
你是“SQL片段生成器”。只能基于给定“表画像”生成可复用的分析片段。
为每个片段产出标题、用途描述、片段类型、变量、适用条件、SQL模板(mysql方言),并注明业务口径与安全限制。
不要发明画像里没有的列。时间/维度/指标须与画像匹配。
用户消息(User)
【表画像JSON】
{{TABLE_PROFILE_JSON}}
【输出要求】(只输出JSON数组)
[
{
"id": "snpt_<slug>",
"title": "中文标题≤16字",
"desc": "一句话用途",
"type": "aggregate|trend|topn|ratio|quality|join|sample",
"applicability": {
"required_columns": ["<col>", ...],
"time_column": "<dt|nullable>",
"constraints": {
"dim_cardinality_hint": <int|null>, // 用于TopN限制与性能提示
"fk_join_available": true|false,
"notes": ["高基数维度建议LIMIT<=50", "..."]
}
},
"variables": [
{"name":"start_date","type":"date"},
{"name":"end_date","type":"date"},
{"name":"top_n","type":"int","default":10}
],
"dialect_sql": {
"mysql": ""
},
"business_caliber": "清晰口径说明,如 UV以device_id去重粒度=日-类目",
"examples": ["示例问法1","示例问法2"]
}
]
【片段选择建议】
- 若存在 time 列:生成 trend_by_day / yoy_qoq / moving_avg。
- 若存在 enumish 维度(distinct 5~200):生成 topn_by_dimension / share_of_total。
- 若 metric 列:生成 sum/avg/max、分位数/异常检测(3σ/箱线)。
- 有主键/唯一:生成 去重/明细抽样/质量检查。
- 有 fk_candidates:同时生成“join维表命名版”和“纯ID版”。
- 高枚举维度:在 constraints.notes 中强调 LIMIT 建议与可能的性能风险。
- 除了完整的sql片段,还应生成sql里部分内容的sql片段,比如 where payment_method = 'Credit Card' and delivery_status = 'Delivered' 的含义是支付方式为信用卡且配送状态是已送达。

277
demo/table-desc.json Normal file
View File

@ -0,0 +1,277 @@
{
"table": "ecommerce_orders",
"row_count": 10000,
"role": "fact",
"grain": ["order_id"],
"time": {
"column": "order_date",
"granularity": "day",
"range": ["2024-04-20", "2025-04-19"],
"has_gaps": false
},
"columns": [
{
"name": "order_id",
"dtype": "string",
"semantic_type": "id",
"null_rate": 0.0,
"distinct_count": 10000,
"distinct_ratio": 1.0,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 1.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "customer_id",
"dtype": "integer",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 2713,
"distinct_ratio": 0.2713,
"stats": {"min": 1, "max": 2999, "mean": 995.29, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 0.3,
"metric_candidate_score": 0.1,
"comment": ""
},
{
"name": "product_id",
"dtype": "integer",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 1000,
"distinct_ratio": 0.0999,
"stats": {"min": 1, "max": 1000, "mean": 504.87, "std": null, "skewness": null},
"enumish": true,
"top_values": [],
"pk_candidate_score": 0.1,
"metric_candidate_score": 0.1,
"comment": ""
},
{
"name": "category",
"dtype": "string",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 6,
"distinct_ratio": 0.0006,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": "Beauty", "pct": null},
{"value": "Books", "pct": null},
{"value": "Clothing", "pct": null},
{"value": "Electronics", "pct": null},
{"value": "Home", "pct": null},
{"value": "Toys", "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "price",
"dtype": "float",
"semantic_type": "metric",
"null_rate": 0.0,
"distinct_count": 9013,
"distinct_ratio": 0.9013,
"stats": {"min": 5.06, "max": 499.93, "mean": 252.55, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.9,
"comment": ""
},
{
"name": "quantity",
"dtype": "integer",
"semantic_type": "metric",
"null_rate": 0.0,
"distinct_count": 9,
"distinct_ratio": 0.0009,
"stats": {"min": 1, "max": 9, "mean": 2.12, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": 1, "pct": null},
{"value": 2, "pct": null},
{"value": 3, "pct": null},
{"value": 4, "pct": null},
{"value": 5, "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.7,
"comment": ""
},
{
"name": "order_date",
"dtype": "string",
"semantic_type": "time",
"null_rate": 0.0,
"distinct_count": 365,
"distinct_ratio": 0.0365,
"stats": {"min": "2024-04-20", "max": "2025-04-19", "mean": null, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "shipping_date",
"dtype": "string",
"semantic_type": "time",
"null_rate": 0.0,
"distinct_count": 371,
"distinct_ratio": 0.0371,
"stats": {"min": "2024-04-21", "max": "2025-04-26", "mean": null, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "delivery_status",
"dtype": "string",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 4,
"distinct_ratio": 0.0004,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": "Delivered", "pct": null},
{"value": "Pending", "pct": null},
{"value": "Returned", "pct": null},
{"value": "Shipped", "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "payment_method",
"dtype": "string",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 5,
"distinct_ratio": 0.0005,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": "Apple Pay", "pct": null},
{"value": "Credit Card", "pct": null},
{"value": "Debit Card", "pct": null},
{"value": "Google Pay", "pct": null},
{"value": "PayPal", "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "device_type",
"dtype": "string",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 3,
"distinct_ratio": 0.0003,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": "Desktop", "pct": null},
{"value": "Mobile", "pct": null},
{"value": "Tablet", "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "channel",
"dtype": "string",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 4,
"distinct_ratio": 0.0004,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": "Email", "pct": null},
{"value": "Organic", "pct": null},
{"value": "Paid Search", "pct": null},
{"value": "Social", "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "shipping_address",
"dtype": "string",
"semantic_type": "text",
"null_rate": 0.0,
"distinct_count": 10000,
"distinct_ratio": 1.0,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 0.9,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "billing_address",
"dtype": "string",
"semantic_type": "text",
"null_rate": 0.0,
"distinct_count": 10000,
"distinct_ratio": 1.0,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": false,
"top_values": [],
"pk_candidate_score": 0.9,
"metric_candidate_score": 0.0,
"comment": ""
},
{
"name": "customer_segment",
"dtype": "string",
"semantic_type": "dimension",
"null_rate": 0.0,
"distinct_count": 3,
"distinct_ratio": 0.0003,
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
"enumish": true,
"top_values": [
{"value": "New", "pct": null},
{"value": "Returning", "pct": null},
{"value": "VIP", "pct": null}
],
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0,
"comment": ""
}
],
"primary_key_candidates": [["order_id"]],
"fk_candidates": [
{"from": "customer_id", "to": "dim_customer(customer_id)", "confidence": 0.9},
{"from": "product_id", "to": "dim_product(product_id)", "confidence": 0.9}
],
"quality": {
"failed_expectations": [],
"warning_hints": []
},
"confidence_notes": [
"表含时间列(order_date, shipping_date)且含度量列(price, quantity)推断为fact表。",
"order_id唯一性=1.0,确认主键。",
"order_date日期范围连续无缺口粒度为日级。",
"高基数数值字段(price, quantity)符合指标特征。",
"低熵字段(category, delivery_status, payment_method等)为枚举维度。"
]
}
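The confidence notes above follow simple profiling heuristics: a uniqueness ratio near 1.0 marks a primary-key candidate, and low-cardinality string columns are treated as enum-style dimensions. A hedged sketch of how such per-column signals could be derived with pandas is shown below; the helper name and the thresholds (20 distinct values, 0.95 uniqueness) are illustrative assumptions, not the code that produced this file.

```python
# Hedged sketch: derive enumish / pk_candidate_score style signals from a DataFrame column.
# Thresholds (20 distinct values, 0.95 uniqueness) are illustrative assumptions.
import pandas as pd


def profile_column_signals(series: pd.Series, enum_max_distinct: int = 20) -> dict:
    n = len(series)
    distinct = series.nunique(dropna=True)
    distinct_ratio = distinct / n if n else 0.0
    return {
        "name": str(series.name),
        "null_rate": float(series.isna().mean()),
        "distinct_count": int(distinct),
        "distinct_ratio": round(distinct_ratio, 4),
        # Low-cardinality string columns behave like enums (e.g. delivery_status).
        "enumish": bool(distinct <= enum_max_distinct and series.dtype == object),
        # Near-unique columns are primary-key candidates (e.g. order_id).
        "pk_candidate_score": 1.0 if distinct_ratio >= 0.95 else 0.0,
    }


if __name__ == "__main__":
    df = pd.DataFrame({"delivery_status": ["Delivered", "Pending", "Returned", "Shipped"] * 25})
    print(profile_column_signals(df["delivery_status"]))
```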

102
demo/user-query.json Normal file
View File

@ -0,0 +1,102 @@
[
{
"question": "近一年每个月的销售额和订单量变化趋势如何?",
"intent": "trend_analysis",
"related_fields": ["order_date", "price", "quantity"]
},
{
"question": "哪个产品类目的GMV最高",
"intent": "topn_category",
"related_fields": ["category", "price", "quantity"]
},
{
"question": "不同支付方式的订单数量和平均客单价是多少?",
"intent": "aggregate_comparison",
"related_fields": ["payment_method", "price", "quantity"]
},
{
"question": "各营销渠道如Paid Search、Social的GMV占比是多少",
"intent": "ratio_analysis",
"related_fields": ["channel", "price", "quantity"]
},
{
"question": "移动端和桌面端的订单表现差异大吗?",
"intent": "device_comparison",
"related_fields": ["device_type", "price", "quantity"]
},
{
"question": "已送达订单的平均配送时长是多少天?",
"intent": "shipping_time_analysis",
"related_fields": ["order_date", "shipping_date", "delivery_status"]
},
{
"question": "退货Returned订单主要集中在哪些产品类目",
"intent": "return_analysis",
"related_fields": ["delivery_status", "category"]
},
{
"question": "不同客户类型新客、回头客、VIP的平均订单金额是多少",
"intent": "segment_analysis",
"related_fields": ["customer_segment", "price", "quantity"]
},
{
"question": "每个客户的平均下单频率是多少?",
"intent": "customer_behavior",
"related_fields": ["customer_id", "order_date"]
},
{
"question": "近期是否存在价格异常或超高订单?",
"intent": "quality_outlier",
"related_fields": ["price", "order_date"]
},
{
"question": "哪个支付方式的退货率最高?",
"intent": "return_ratio_by_payment",
"related_fields": ["payment_method", "delivery_status"]
},
{
"question": "哪些商品在VIP客户中最受欢迎",
"intent": "vip_product_preference",
"related_fields": ["customer_segment", "product_id", "price", "quantity"]
},
{
"question": "下单后平均几天发货?",
"intent": "shipping_speed",
"related_fields": ["order_date", "shipping_date"]
},
{
"question": "从哪些渠道来的新用户最多?",
"intent": "user_acquisition_channel",
"related_fields": ["channel", "customer_segment"]
},
{
"question": "订单数量在周末和工作日有什么差异?",
"intent": "weekday_pattern",
"related_fields": ["order_date"]
},
{
"question": "每个设备类型的平均订单金额是多少?",
"intent": "device_gmv_comparison",
"related_fields": ["device_type", "price", "quantity"]
},
{
"question": "本月退货率与上月相比是否上升?",
"intent": "return_trend",
"related_fields": ["delivery_status", "order_date"]
},
{
"question": "哪些客户下单金额最高?",
"intent": "top_customers",
"related_fields": ["customer_id", "price", "quantity"]
},
{
"question": "不同类目的平均客单价GMV/订单量)是多少?",
"intent": "category_avg_order_value",
"related_fields": ["category", "price", "quantity"]
},
{
"question": "不同渠道的订单平均转化周期(下单到发货)是多少?",
"intent": "conversion_cycle",
"related_fields": ["channel", "order_date", "shipping_date"]
}
]
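Each entry above pairs a natural-language question with an intent label and the columns it touches. A quick consistency check against the profiled schema might look like the sketch below; the file paths and the top-level "columns" key are assumptions, since the code that consumes this corpus is not part of this listing.

```python
# Hedged sketch: verify that every related_field in the query corpus exists in the
# profiled schema. The file names and the "columns" key are assumptions.
import json
from pathlib import Path


def check_query_fields(profile_path: Path, queries_path: Path) -> list[str]:
    profile = json.loads(profile_path.read_text(encoding="utf-8"))
    known_columns = {col["name"] for col in profile["columns"]}
    problems = []
    for query in json.loads(queries_path.read_text(encoding="utf-8")):
        missing = [f for f in query["related_fields"] if f not in known_columns]
        if missing:
            problems.append(f"{query['intent']}: unknown fields {missing}")
    return problems


if __name__ == "__main__":
    issues = check_query_fields(Path("demo/import-analysis.json"), Path("demo/user-query.json"))
    print("\n".join(issues) or "all related_fields resolve to profiled columns")
```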

BIN
file/全国品牌.xlsx Normal file

Binary file not shown.

332
ge_v1.py Normal file
View File

@ -0,0 +1,332 @@
"""Great Expectations profiling helper for Excel sources.
This script loads a user-provided Excel file into pandas, profiles it with
Great Expectations, writes a lightweight analysis summary to JSON, and exposes
the path to GE Data Docs for manual inspection.
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
from pathlib import Path
from typing import Any, Dict
import numpy as np
import pandas as pd
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import FileDataContext
from great_expectations.exceptions import (
DataContextError,
InvalidDataContextConfigError,
)
try:
from great_expectations.profile.user_configurable_profiler import (
UserConfigurableProfiler,
)
except ImportError:
try:
from great_expectations.profiler.user_configurable_profiler import (
UserConfigurableProfiler,
)
except ImportError as err:
raise ImportError(
"UserConfigurableProfiler is not available; please install a compatible "
"version of great_expectations (>=0.15,<0.19) or add the profiling extra."
) from err
RESULTS_DIR = Path("results")
DEFAULT_EXCEL_PATH = Path("file") / "全国品牌.xlsx"
DEFAULT_BATCH_ID = "initial_profile"
def parse_cli_args() -> argparse.Namespace:
"""Parse command line options for Excel ingestion."""
parser = argparse.ArgumentParser(description="Profile an Excel file with GE")
parser.add_argument(
"--excel-path",
type=Path,
default=DEFAULT_EXCEL_PATH,
help="Path to the Excel file to analyse (default: ./file/全国品牌.xlsx)",
)
parser.add_argument(
"--sheet-name",
default=0,
help="Excel sheet name or index to load (default: 0)",
)
parser.add_argument(
"--header-row",
type=int,
default=0,
help="Row index (0-based) to use as the header (default: 0)",
)
parser.add_argument(
"--clean-results",
action="store_true",
help="Remove the previous results directory before running",
)
parser.add_argument(
"--ge-root",
type=Path,
default=Path("gx_project"),
help="Directory to host the Great Expectations project (default: ./gx_project)",
)
return parser.parse_args()
def reset_results_dir(clean_results: bool) -> None:
"""Remove prior results folder when requested and ensure directory exists."""
if clean_results and RESULTS_DIR.exists():
shutil.rmtree(RESULTS_DIR)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
def load_excel_as_dataframe(excel_path: Path, sheet_name: Any, header_row: int) -> pd.DataFrame:
"""Load Excel data into a DataFrame and provide basic logging."""
if not excel_path.exists():
raise FileNotFoundError(f"Excel file not found: {excel_path}")
df = pd.read_excel(excel_path, sheet_name=sheet_name, header=header_row)
print(f"Loaded Excel data: {excel_path} ({len(df)} rows, {len(df.columns)} columns)")
return df
def get_datasource_config(datasource_name: str, data_connector_name: str) -> Dict[str, Any]:
"""Assemble a minimal Pandas datasource configuration."""
return {
"name": datasource_name,
"class_name": "Datasource",
"execution_engine": {"class_name": "PandasExecutionEngine"},
"data_connectors": {
data_connector_name: {
"class_name": "RuntimeDataConnector",
"runtime_keys": ["batch_id"],
}
},
}
def clean_value(value: Any) -> Any:
"""Convert numpy/pandas scalar types into JSON serialisable values."""
if isinstance(value, (np.generic,)):
return value.item()
if isinstance(value, pd.Timestamp):
return value.isoformat()
if pd.isna(value):
return None
return value
def build_column_profile(series: pd.Series) -> Dict[str, Any]:
"""Generate a compact per-column profile for JSON output."""
stats = series.describe()
profiled_stats = {key: clean_value(val) for key, val in stats.items()}
return {
"name": str(series.name),
"dtype": str(series.dtype),
"non_null_count": int(series.count()),
"null_count": int(series.isna().sum()),
"unique_count": int(series.nunique(dropna=True)),
"stats": profiled_stats,
}
def build_analysis_summary(df: pd.DataFrame, sample_size: int = 5) -> Dict[str, Any]:
"""Collate basic statistics to accompany GE outputs."""
summary = {
"shape": {"rows": int(df.shape[0]), "columns": int(df.shape[1])},
"columns": [build_column_profile(df[col]) for col in df.columns],
"sample_rows": [
{key: clean_value(value) for key, value in row.items()} for row in df.head(sample_size).to_dict(orient="records")
],
}
return summary
def serialize_batch_request(batch_request: Any) -> Dict[str, Any]:
"""Convert differing batch request types into plain dictionaries."""
if hasattr(batch_request, "to_json_dict"):
return batch_request.to_json_dict()
if hasattr(batch_request, "dict"):
return batch_request.dict()
if hasattr(batch_request, "model_dump"):
return batch_request.model_dump()
return {"repr": repr(batch_request)}
def ensure_data_context(ge_root: Path) -> gx.DataContext:
"""Create or repair a file-backed GE data context as needed."""
ge_root = ge_root.resolve()
config_path = ge_root / "gx" / "great_expectations.yml"
if not config_path.exists():
FileDataContext.create(project_root_dir=str(ge_root))
try:
return gx.get_context(project_root_dir=str(ge_root))
except InvalidDataContextConfigError:
print("Existing Great Expectations config invalid; recreating project root.")
shutil.rmtree(ge_root, ignore_errors=True)
FileDataContext.create(project_root_dir=str(ge_root))
return gx.get_context(project_root_dir=str(ge_root))
def run_ge_profiling(
context: gx.DataContext,
df: pd.DataFrame,
datasource_name: str,
data_connector_name: str,
data_asset_name: str,
expectation_suite_name: str,
) -> Dict[str, Any]:
"""Register datasource, build expectations, and capture validation results."""
if hasattr(context, "sources"):
datasource = context.sources.add_or_update_pandas(name=datasource_name)
try:
datasource.delete_asset(data_asset_name)
except (gx.exceptions.DataConnectorError, ValueError, KeyError, LookupError, AttributeError):
pass
asset = datasource.add_dataframe_asset(name=data_asset_name)
batch_request = asset.build_batch_request(dataframe=df)
print(f"Datasource registered (fluent): {datasource_name}")
else:
datasource_config = get_datasource_config(datasource_name, data_connector_name)
try:
context.add_datasource(**datasource_config)
print(f"Datasource registered: {datasource_name}")
except gx.exceptions.GreatExpectationsError as err:
print(f"Datasource registration issue: {err}")
batch_request = RuntimeBatchRequest(
datasource_name=datasource_name,
data_connector_name=data_connector_name,
data_asset_name=data_asset_name,
runtime_parameters={"batch_data": df},
batch_identifiers={"batch_id": DEFAULT_BATCH_ID},
)
try:
context.delete_expectation_suite(expectation_suite_name=expectation_suite_name)
except DataContextError:
pass
if hasattr(context, "create_expectation_suite"):
context.create_expectation_suite(
expectation_suite_name=expectation_suite_name, overwrite_existing=True
)
else:
context.add_expectation_suite(expectation_suite_name=expectation_suite_name)
validator = context.get_validator(
batch_request=batch_request, expectation_suite_name=expectation_suite_name
)
profiler = UserConfigurableProfiler(profile_dataset=validator)
expectation_suite = profiler.build_suite()
context.add_or_update_expectation_suite(expectation_suite=expectation_suite)
validation_result = validator.validate(result_format="SUMMARY")
context.build_data_docs()
data_docs_path = (
Path(context.root_directory)
/ "uncommitted"
/ "data_docs"
/ "local_site"
/ "index.html"
)
print(
f"Expectation suite saved: {expectation_suite_name} ({len(expectation_suite.expectations)} expectations)"
)
return {
"batch_request": serialize_batch_request(batch_request),
"expectation_suite_name": expectation_suite_name,
"expectations_count": len(expectation_suite.expectations),
"validation_result": validation_result.to_json_dict(),
"data_docs_path": os.path.abspath(data_docs_path),
}
def assemble_payload(
excel_path: Path,
sheet_name: Any,
dataframe_summary: Dict[str, Any],
ge_summary: Dict[str, Any],
) -> Dict[str, Any]:
"""Combine pandas and GE artefacts into a single JSON payload."""
return {
"source": {
"excel_path": str(excel_path.resolve()),
"sheet_name": sheet_name,
},
"analysis": dataframe_summary,
"great_expectations": ge_summary,
}
def save_json_payload(payload: Dict[str, Any], output_path: Path) -> None:
"""Persist the combined analysis payload to disk."""
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
json.dump(payload, f, ensure_ascii=False, indent=2)
print(f"JSON analysis saved to: {output_path}")
def main() -> None:
args = parse_cli_args()
reset_results_dir(clean_results=args.clean_results)
context = ensure_data_context(args.ge_root)
print(f"Great Expectations Data Context initialized at {context.root_directory}.")
df = load_excel_as_dataframe(args.excel_path, args.sheet_name, args.header_row)
dataframe_summary = build_analysis_summary(df)
file_stem = args.excel_path.stem
datasource_name = f"{file_stem}_datasource"
data_connector_name = "runtime_data_connector"
data_asset_name = f"{file_stem}_asset"
expectation_suite_name = f"{file_stem}_suite"
ge_summary = run_ge_profiling(
context,
df,
datasource_name,
data_connector_name,
data_asset_name,
expectation_suite_name,
)
payload = assemble_payload(
excel_path=args.excel_path,
sheet_name=args.sheet_name,
dataframe_summary=dataframe_summary,
ge_summary=ge_summary,
)
output_path = RESULTS_DIR / f"{file_stem}_analysis.json"
save_json_payload(payload, output_path)
print(
f"Data Docs generated. Open in browser: file://{ge_summary['data_docs_path']}"
)
if __name__ == "__main__":
main()
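After a run, the combined payload written by save_json_payload can be inspected directly. A minimal sketch, assuming the default input file so the stem resolves to 全国品牌:

```python
# Hedged sketch: load the payload written by save_json_payload above.
# The stem "全国品牌" matches DEFAULT_EXCEL_PATH; adjust for other inputs.
import json
from pathlib import Path

payload = json.loads(Path("results/全国品牌_analysis.json").read_text(encoding="utf-8"))
print(payload["analysis"]["shape"])
print(payload["great_expectations"]["expectations_count"])
```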

104
ge_v2.py Normal file
View File

@ -0,0 +1,104 @@
import great_expectations as gx
from datasets import load_dataset
import pandas as pd
import os
import webbrowser

from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

# --- 1. Load the Hugging Face dataset and convert it to a pandas DataFrame ---
print("🚚 1. Loading the 'millat/e-commerce-orders' dataset from Hugging Face...")
# Load the dataset; keep only the train split
hf_dataset = load_dataset("millat/e-commerce-orders", split="train")
# Convert to a pandas DataFrame, the format GX works with most often
df = hf_dataset.to_pandas()
print(f"✅ Dataset loaded successfully with {len(df)} rows.")
print("\n📝 Preview of the first 5 rows:")
print(df.head())

# --- 2. Initialize the Great Expectations (GX) project ---
# This creates a project folder in the current directory to hold all GX config and results
print("\n🏗️ 2. Initializing the Great Expectations project...")
context = gx.get_context()
print("✅ GX project context created.")

# --- 3. Add a datasource and connect the DataFrame to GX ---
# Register the pandas DataFrame as a datasource so GX knows how to access it
print("\n🔗 3. Adding the DataFrame as a GX datasource...")
datasource_name = "my_ecommerce_datasource"
# Fluent API: add_or_update keeps repeated runs of the script from failing
datasource = context.sources.add_or_update_pandas(name=datasource_name)
data_asset_name = "orders_table"
data_asset = datasource.add_dataframe_asset(name=data_asset_name, dataframe=df)
print("✅ Datasource and data asset configured.")

# --- 4. Generate an expectation suite with the automatic profiler ---
print("\n🔍 4. Scanning the data with the automatic profiler to generate expectations...")
# Build a batch request that tells GX which data asset to process
batch_request = data_asset.build_batch_request()
# Name of the expectation suite
expectation_suite_name = "ecommerce_profiling_suite"
# Create the expectation suite, or fetch it if it already exists
try:
    suite = context.get_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f" - Found existing expectation suite '{expectation_suite_name}'")
except gx.exceptions.DataContextError:
    suite = context.add_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f" - Created new expectation suite '{expectation_suite_name}'")
# Build a Validator for the profiler to consume
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
# Core step: UserConfigurableProfiler analyses the data and creates expectations automatically
profiler = UserConfigurableProfiler(profile_dataset=validator)
suite = profiler.build_suite()
# Persist the suite generated by the profiler
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)
print("✅ Automatic profiling complete; expectations generated and saved.")

# --- 5. Create and run a Checkpoint to validate the data ---
print("\n🛡️ 5. Creating and running a Checkpoint to validate the data...")
checkpoint_name = "ecommerce_profiling_checkpoint"
try:
    # Check whether the checkpoint already exists
    checkpoint = context.get_checkpoint(name=checkpoint_name)
    print(f" - Loaded existing checkpoint '{checkpoint_name}'")
except gx.exceptions.CheckpointNotFoundError:
    # Create a new checkpoint if it does not exist yet
    checkpoint_config = {
        "name": checkpoint_name,
        "validations": [
            {
                "batch_request": batch_request,
                "expectation_suite_name": expectation_suite_name,
            }
        ],
    }
    context.add_or_update_checkpoint(**checkpoint_config)
    checkpoint = context.get_checkpoint(name=checkpoint_name)
    print(f" - Created new checkpoint '{checkpoint_name}'")
# Run the checkpoint; it compares the data against the suite we just generated
checkpoint_result = checkpoint.run()
print("✅ Checkpoint run finished; data validation complete.")

# --- 6. Build and open Data Docs to inspect the results ---
print("\n📊 6. Building and opening Data Docs to view the profiling report...")
# This generates an HTML report
context.build_data_docs()
# Resolve the Data Docs path and open it in the browser automatically
docs_path = os.path.join(context.root_directory, "uncommitted", "data_docs", "local_site", "index.html")
print(f"\n🎉 Profiling report generated! Open it in your browser:\nfile://{os.path.abspath(docs_path)}")
webbrowser.open(f"file://{os.path.abspath(docs_path)}")

11
requirements.txt Normal file
View File

@ -0,0 +1,11 @@
fastapi>=0.111.0
uvicorn[standard]>=0.29.0
pydantic>=2.6.0
sqlalchemy>=2.0.28
pymysql>=1.1.0
great_expectations>=0.18.0,<0.19.0
pandas>=2.0
numpy>=1.24
openpyxl>=3.1
httpx==0.27.2
python-dotenv==1.0.1

Binary file not shown.

110
scripts/deepseek_request.py Normal file
View File

@ -0,0 +1,110 @@
from __future__ import annotations

import argparse
import asyncio
import json
from typing import Any, Dict

import httpx

DEFAULT_URL = "http://127.0.0.1:8000/v1/chat/completions"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Send a DeepSeek chat completion request to the local LLM gateway."
    )
    parser.add_argument(
        "--url",
        default=DEFAULT_URL,
        help=f"Gateway endpoint URL (default: {DEFAULT_URL})",
    )
    parser.add_argument(
        "--model",
        default="deepseek-chat",
        help="DeepSeek model to use (default: deepseek-chat).",
    )
    parser.add_argument(
        "--system",
        default="You are a helpful assistant.",
        help="Optional system prompt.",
    )
    parser.add_argument(
        "--prompt",
        default="写一段简短的中文欢迎词。",
        help="User message content to send.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Sampling temperature.",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens for the response.",
    )
    parser.add_argument(
        "--stream",
        action="store_true",
        help="Enable streaming mode (DeepSeek supports it).",
    )
    parser.add_argument(
        "--extra",
        help="Optional JSON string with extra provider parameters.",
    )
    return parser.parse_args()


async def send_request(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
        response = await client.post(url, json=payload)
        response.raise_for_status()
        return response.json()


def build_payload(args: argparse.Namespace) -> Dict[str, Any]:
    extra_params = None
    if args.extra:
        try:
            extra_params = json.loads(args.extra)
        except json.JSONDecodeError as exc:
            raise SystemExit(f"Invalid JSON passed to --extra: {exc}") from exc
    payload: Dict[str, Any] = {
        "provider": "deepseek",
        "model": args.model,
        "messages": [
            {"role": "system", "content": args.system},
            {"role": "user", "content": args.prompt},
        ],
        "temperature": args.temperature,
        "max_tokens": args.max_tokens,
        "stream": args.stream,
    }
    if extra_params:
        payload["extra_params"] = extra_params
    return payload


async def main() -> None:
    args = parse_args()
    payload = build_payload(args)
    try:
        result = await send_request(args.url, payload)
    except httpx.HTTPStatusError as exc:
        detail = exc.response.text
        raise SystemExit(f"Gateway returned {exc.response.status_code}: {detail}") from exc
    except httpx.HTTPError as exc:
        raise SystemExit(f"HTTP error calling gateway: {exc}") from exc
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    asyncio.run(main())

19
todo.md Normal file
View File

@ -0,0 +1,19 @@
## Technology choice
Python + Great Expectations

## Design principles
1. Service-oriented: wrap GE's capabilities in a standalone microservice that exposes data-quality definition, validation, and reporting over a RESTful API.
2. Configuration-driven: all Expectations, datasource connections, and Checkpoints are configurable, with default and custom versions of the expectations.
3. Focused on validation: GE is used to discover and verify problems; fixing them is left to later data cleaning and correction.
4. Asynchronous: handle multiple analysis tasks at the same time without blocking the workflow.
5. Incremental analysis: data is analysed and revised several times before it is ready for use.

## Architecture
Build the RESTful API service with FastAPI: high performance, built-in OpenAPI (Swagger UI) docs, and strong async support.
1. Analysis workflow management: parse API requests and manage the analysis task lifecycle asynchronously (accept request, schedule the service, store the report, trigger notifications); a minimal endpoint sketch follows at the end of this note.
2. GE wrapper: manage Datasources, Expectation Suites, and Checkpoints through programmatic configuration, and run data profiling and validation.
3. Expectation repository and result storage: keep expectations (as JSON) in a Git repository / MySQL for version control, auditing, and collaboration on data-quality rules; persist each structured validation result in the database for quality-trend analysis, history tracing, and dashboards.
4. LLM API gateway service: a unified multi-provider gateway for LLM calls (switch between OpenAI / Anthropic / OpenRouter / Gemini / Qwen / DeepSeek).
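A minimal sketch of the asynchronous analysis-task lifecycle described in item 1 above. The endpoint paths, the AnalysisRequest fields, and the run_profiling helper are illustrative assumptions, not part of this commit; the in-memory dict stands in for the MySQL result store.

```python
# Hedged sketch of the async analysis-task lifecycle (accept request, schedule,
# store report). Names and paths are assumptions for illustration only.
from __future__ import annotations

import uuid
from typing import Dict

from fastapi import BackgroundTasks, FastAPI
from pydantic import BaseModel

app = FastAPI(title="Analysis Service (sketch)")

# In-memory stand-in for the MySQL result store mentioned above.
REPORTS: Dict[str, dict] = {}


class AnalysisRequest(BaseModel):
    excel_path: str
    sheet_name: str | int = 0


def run_profiling(task_id: str, request: AnalysisRequest) -> None:
    """Placeholder for the GE wrapper (profile + validate), then persist the report."""
    # In the real service this would call the GE wrapper (see ge_v1.py) and
    # write the structured validation result to the database.
    REPORTS[task_id] = {"status": "completed", "source": request.excel_path}


@app.post("/v1/analysis-tasks", status_code=202)
async def create_analysis_task(request: AnalysisRequest, background_tasks: BackgroundTasks) -> dict:
    task_id = uuid.uuid4().hex
    REPORTS[task_id] = {"status": "pending", "source": request.excel_path}
    background_tasks.add_task(run_profiling, task_id, request)
    return {"task_id": task_id, "status": "pending"}


@app.get("/v1/analysis-tasks/{task_id}")
async def get_analysis_task(task_id: str) -> dict:
    return REPORTS.get(task_id, {"status": "not_found"})
```

A POST returns a task_id immediately with HTTP 202, and the stored report can be polled once the background profiling finishes.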