init: LLM gateway & import analysis
.gitignore (vendored, new file)
@@ -0,0 +1,3 @@
.venv
gx/uncommitted/
.vscode/
app/__init__.py (new file)
@@ -0,0 +1,3 @@
from .main import create_app

__all__ = ["create_app"]
BIN app/__pycache__/__init__.cpython-311.pyc (new file, binary not shown)
BIN app/__pycache__/__init__.cpython-312.pyc (new file, binary not shown)
BIN app/__pycache__/exceptions.cpython-311.pyc (new file, binary not shown)
BIN app/__pycache__/exceptions.cpython-312.pyc (new file, binary not shown)
BIN app/__pycache__/main.cpython-311.pyc (new file, binary not shown)
BIN app/__pycache__/main.cpython-312.pyc (new file, binary not shown)
BIN app/__pycache__/models.cpython-311.pyc (new file, binary not shown)
BIN app/__pycache__/models.cpython-312.pyc (new file, binary not shown)
app/exceptions.py (new file)
@@ -0,0 +1,6 @@
class ProviderConfigurationError(RuntimeError):
    """Raised when a provider is missing required configuration."""


class ProviderAPICallError(RuntimeError):
    """Raised when the upstream provider responds with an error."""
app/main.py (new file)
@@ -0,0 +1,103 @@
from __future__ import annotations

from contextlib import asynccontextmanager

import httpx
from fastapi import Depends, FastAPI, HTTPException, Request

from app.exceptions import ProviderAPICallError, ProviderConfigurationError
from app.models import (
    DataImportAnalysisRequest,
    DataImportAnalysisResponse,
    LLMRequest,
    LLMResponse,
)
from app.services import LLMGateway
from app.services.import_analysis import build_import_messages, resolve_provider_from_model


@asynccontextmanager
async def lifespan(app: FastAPI):
    client = httpx.AsyncClient(timeout=httpx.Timeout(30.0))
    gateway = LLMGateway()
    try:
        app.state.http_client = client  # type: ignore[attr-defined]
        app.state.gateway = gateway  # type: ignore[attr-defined]
        yield
    finally:
        await client.aclose()


def create_app() -> FastAPI:
    application = FastAPI(
        title="Unified LLM Gateway",
        version="0.1.0",
        lifespan=lifespan,
    )

    @application.post(
        "/v1/chat/completions",
        response_model=LLMResponse,
        summary="Dispatch chat completion to upstream provider",
    )
    async def create_chat_completion(
        payload: LLMRequest,
        gateway: LLMGateway = Depends(get_gateway),
        client: httpx.AsyncClient = Depends(get_http_client),
    ) -> LLMResponse:
        try:
            return await gateway.chat(payload, client)
        except ProviderConfigurationError as exc:
            raise HTTPException(status_code=422, detail=str(exc)) from exc
        except ProviderAPICallError as exc:
            raise HTTPException(status_code=502, detail=str(exc)) from exc

    @application.post(
        "/v1/import/analyze",
        response_model=DataImportAnalysisResponse,
        summary="Analyze import sample data via configured LLM",
    )
    async def analyze_import_data(
        payload: DataImportAnalysisRequest,
        gateway: LLMGateway = Depends(get_gateway),
        client: httpx.AsyncClient = Depends(get_http_client),
    ) -> DataImportAnalysisResponse:
        try:
            provider, model_name = resolve_provider_from_model(payload.llm_model)
        except ValueError as exc:
            raise HTTPException(status_code=422, detail=str(exc)) from exc

        messages = build_import_messages(payload)

        llm_request = LLMRequest(
            provider=provider,
            model=model_name,
            messages=messages,
            temperature=payload.temperature if payload.temperature is not None else 0.2,
            max_tokens=payload.max_tokens,
        )

        try:
            llm_response = await gateway.chat(llm_request, client)
        except ProviderConfigurationError as exc:
            raise HTTPException(status_code=422, detail=str(exc)) from exc
        except ProviderAPICallError as exc:
            raise HTTPException(status_code=502, detail=str(exc)) from exc

        return DataImportAnalysisResponse(
            import_record_id=payload.import_record_id,
            llm_response=llm_response,
        )

    return application


async def get_gateway(request: Request) -> LLMGateway:
    return request.app.state.gateway  # type: ignore[return-value, attr-defined]


async def get_http_client(request: Request) -> httpx.AsyncClient:
    return request.app.state.http_client  # type: ignore[return-value, attr-defined]


app = create_app()
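
Editor's note: a minimal smoke test of the completion endpoint, assuming the app is served locally (for example with uvicorn app.main:app --port 8000) and the matching provider key (here OPENAI_API_KEY) is exported. Port and model name are placeholders, not part of this commit:

import httpx

payload = {
    "provider": "openai",
    "model": "gpt-4o-mini",  # placeholder model name
    "messages": [{"role": "user", "content": "ping"}],
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
# LLMResponse schema: provider / model / choices[{index, message}] / raw
print(resp.json()["choices"][0]["message"]["content"])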
app/models.py (new file)
@@ -0,0 +1,92 @@
from __future__ import annotations

from enum import Enum
from typing import Any, List, Optional

from pydantic import BaseModel, Field


class LLMRole(str, Enum):
    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"


class LLMMessage(BaseModel):
    role: LLMRole = Field(..., description="Message author role.")
    content: str = Field(..., description="Plain text content of the message.")


class LLMProvider(str, Enum):
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    OPENROUTER = "openrouter"
    GEMINI = "gemini"
    QWEN = "qwen"
    DEEPSEEK = "deepseek"


class LLMRequest(BaseModel):
    provider: LLMProvider = Field(..., description="Target LLM provider identifier.")
    model: str = Field(..., description="Model name understood by the provider.")
    messages: List[LLMMessage] = Field(..., description="Ordered chat messages.")
    temperature: Optional[float] = Field(
        0.7, description="Sampling temperature when supported."
    )
    top_p: Optional[float] = Field(
        None, description="Top-p nucleus sampling when supported."
    )
    max_tokens: Optional[int] = Field(
        None, description="Maximum tokens to generate when supported."
    )
    stream: Optional[bool] = Field(
        False, description="Enable provider streaming if both sides support it."
    )
    extra_params: Optional[dict[str, Any]] = Field(
        None, description="Provider-specific parameters to merge into the payload."
    )


class LLMChoice(BaseModel):
    index: int
    message: LLMMessage


class LLMResponse(BaseModel):
    provider: LLMProvider
    model: str
    choices: List[LLMChoice]
    raw: Optional[dict[str, Any]] = Field(
        None, description="Raw provider response for debugging."
    )


class DataImportAnalysisRequest(BaseModel):
    import_record_id: str = Field(..., description="Unique identifier for this import run.")
    example_data: str = Field(
        ...,
        max_length=30_000,
        description="Sample rows from the import payload. Limited to 30k characters.",
    )
    table_headers: List[str] = Field(
        ...,
        min_length=1,
        description="Ordered list of table headers associated with the data.",
    )
    llm_model: str = Field(
        ...,
        description="Model identifier. Accepts 'provider:model' format or plain model name.",
    )
    temperature: Optional[float] = Field(
        None,
        description="Optional override for LLM temperature when generating recognition output.",
    )
    max_tokens: Optional[int] = Field(
        None,
        description="Optional override for maximum tokens generated during recognition.",
    )


class DataImportAnalysisResponse(BaseModel):
    import_record_id: str
    llm_response: LLMResponse
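
Editor's note: for reference, a request that passes DataImportAnalysisRequest validation; all values are illustrative:

from app.models import DataImportAnalysisRequest

req = DataImportAnalysisRequest(
    import_record_id="imp-001",                      # illustrative id
    example_data="1,Widget,9.99\n2,Gadget,19.99",    # sample rows, <= 30k chars
    table_headers=["id", "name", "price"],           # at least one header required
    llm_model="openai:gpt-4o-mini",                  # 'provider:model' form
)
print(req.model_dump_json(indent=2))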
app/providers/__init__.py (new file)
@@ -0,0 +1,17 @@
from .anthropic import AnthropicProvider
from .base import LLMProviderClient
from .deepseek import DeepSeekProvider
from .gemini import GeminiProvider
from .openai import OpenAIProvider
from .openrouter import OpenRouterProvider
from .qwen import QwenProvider

__all__ = [
    "LLMProviderClient",
    "OpenAIProvider",
    "AnthropicProvider",
    "OpenRouterProvider",
    "GeminiProvider",
    "QwenProvider",
    "DeepSeekProvider",
]
BIN app/providers/__pycache__/__init__.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/__init__.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/anthropic.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/anthropic.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/base.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/base.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/deepseek.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/deepseek.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/gemini.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/gemini.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/openai.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/openai.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/openrouter.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/openrouter.cpython-312.pyc (new file, binary not shown)
BIN app/providers/__pycache__/qwen.cpython-311.pyc (new file, binary not shown)
BIN app/providers/__pycache__/qwen.cpython-312.pyc (new file, binary not shown)
app/providers/anthropic.py (new file)
@@ -0,0 +1,97 @@
from __future__ import annotations

from typing import Any, Dict, List, Tuple

import httpx

from app.exceptions import ProviderAPICallError
from app.models import (
    LLMChoice,
    LLMMessage,
    LLMProvider,
    LLMRequest,
    LLMResponse,
    LLMRole,
)
from app.providers.base import LLMProviderClient


class AnthropicProvider(LLMProviderClient):
    name = LLMProvider.ANTHROPIC.value
    api_key_env = "ANTHROPIC_API_KEY"
    base_url = "https://api.anthropic.com/v1/messages"
    anthropic_version = "2023-06-01"

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        self.ensure_stream_supported(request.stream)

        system_prompt, chat_messages = self._convert_messages(request.messages)

        payload = self.merge_payload(
            {
                "model": request.model,
                "messages": chat_messages,
                "max_tokens": request.max_tokens or 1024,
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
            request.extra_params,
        )

        if system_prompt:
            payload["system"] = system_prompt

        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": self.anthropic_version,
            "content-type": "application/json",
        }

        try:
            response = await client.post(self.base_url, json=payload, headers=headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ProviderAPICallError(f"Anthropic request failed: {exc}") from exc

        data: Dict[str, Any] = response.json()
        message = self._build_message(data)
        return LLMResponse(
            provider=LLMProvider.ANTHROPIC,
            model=data.get("model", request.model),
            choices=[LLMChoice(index=0, message=message)],
            raw=data,
        )

    @staticmethod
    def _convert_messages(
        messages: List[LLMMessage],
    ) -> Tuple[str | None, List[dict[str, Any]]]:
        system_parts: List[str] = []
        chat_payload: List[dict[str, Any]] = []

        for msg in messages:
            if msg.role == LLMRole.SYSTEM:
                system_parts.append(msg.content)
                continue

            role = "user" if msg.role == LLMRole.USER else "assistant"
            chat_payload.append(
                {"role": role, "content": [{"type": "text", "text": msg.content}]}
            )

        system_prompt = "\n\n".join(system_parts) if system_parts else None
        return system_prompt, chat_payload

    @staticmethod
    def _build_message(data: Dict[str, Any]) -> LLMMessage:
        role = data.get("role", "assistant")
        content_blocks = data.get("content", [])
        text_parts = [
            block.get("text", "")
            for block in content_blocks
            if isinstance(block, dict) and block.get("type") == "text"
        ]
        content = "\n\n".join(part for part in text_parts if part)
        return LLMMessage(role=role, content=content)
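
Editor's note: a quick check of the message conversion, showing how system messages are folded into Anthropic's separate "system" field. This calls the private helper directly, for illustration only:

from app.models import LLMMessage, LLMRole
from app.providers.anthropic import AnthropicProvider

system, chat = AnthropicProvider._convert_messages(
    [
        LLMMessage(role=LLMRole.SYSTEM, content="Be terse."),
        LLMMessage(role=LLMRole.USER, content="ping"),
    ]
)
print(system)  # "Be terse."
print(chat)    # [{'role': 'user', 'content': [{'type': 'text', 'text': 'ping'}]}]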
app/providers/base.py (new file)
@@ -0,0 +1,44 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any

import httpx

from app.exceptions import ProviderConfigurationError
from app.models import LLMRequest, LLMResponse


class LLMProviderClient(ABC):
    """Base class for provider-specific chat completion clients."""

    name: str
    api_key_env: str | None = None
    supports_stream: bool = False

    def __init__(self, api_key: str | None):
        if self.api_key_env and not api_key:
            raise ProviderConfigurationError(
                f"Provider '{self.name}' requires environment variable '{self.api_key_env}'."
            )
        self.api_key = api_key or ""

    @abstractmethod
    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        """Execute a chat completion call."""

    @staticmethod
    def merge_payload(base: dict[str, Any], extra: dict[str, Any] | None) -> dict[str, Any]:
        """Merge provider payload with optional extra params, ignoring None values."""
        merged = {k: v for k, v in base.items() if v is not None}
        if extra:
            merged.update({k: v for k, v in extra.items() if v is not None})
        return merged

    def ensure_stream_supported(self, stream_requested: bool) -> None:
        if stream_requested and not self.supports_stream:
            raise ProviderConfigurationError(
                f"Provider '{self.name}' does not support streaming mode."
            )
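
Editor's note: merge_payload drops None values from both the base payload and extra_params, which is what lets every provider pass temperature/top_p unconditionally. For illustration:

from app.providers.base import LLMProviderClient

merged = LLMProviderClient.merge_payload(
    {"model": "m", "temperature": None, "max_tokens": 256},
    {"top_k": 40, "seed": None},
)
print(merged)  # {'model': 'm', 'max_tokens': 256, 'top_k': 40}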
app/providers/deepseek.py (new file)
@@ -0,0 +1,66 @@
from __future__ import annotations

from typing import Any, Dict, List

import httpx

from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient


class DeepSeekProvider(LLMProviderClient):
    name = LLMProvider.DEEPSEEK.value
    api_key_env = "DEEPSEEK_API_KEY"
    supports_stream = True
    base_url = "https://api.deepseek.com/v1/chat/completions"

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        self.ensure_stream_supported(request.stream)

        payload = self.merge_payload(
            {
                "model": request.model,
                "messages": [msg.model_dump() for msg in request.messages],
                "temperature": request.temperature,
                "top_p": request.top_p,
                "max_tokens": request.max_tokens,
                "stream": request.stream,
            },
            request.extra_params,
        )

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            response = await client.post(self.base_url, json=payload, headers=headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ProviderAPICallError(f"DeepSeek request failed: {exc}") from exc

        data: Dict[str, Any] = response.json()
        choices = self._build_choices(data.get("choices", []))

        return LLMResponse(
            provider=LLMProvider.DEEPSEEK,
            model=data.get("model", request.model),
            choices=choices,
            raw=data,
        )

    @staticmethod
    def _build_choices(choices: List[dict[str, Any]]) -> List[LLMChoice]:
        built: List[LLMChoice] = []
        for choice in choices:
            message_data = choice.get("message") or {}
            message = LLMMessage(
                role=message_data.get("role", "assistant"),
                content=message_data.get("content", ""),
            )
            built.append(LLMChoice(index=choice.get("index", len(built)), message=message))
        return built
app/providers/gemini.py (new file)
@@ -0,0 +1,112 @@
from __future__ import annotations

from typing import Any, Dict, List, Tuple

import httpx

from app.exceptions import ProviderAPICallError
from app.models import (
    LLMChoice,
    LLMMessage,
    LLMProvider,
    LLMRequest,
    LLMResponse,
    LLMRole,
)
from app.providers.base import LLMProviderClient


class GeminiProvider(LLMProviderClient):
    name = LLMProvider.GEMINI.value
    api_key_env = "GEMINI_API_KEY"
    base_url = "https://generativelanguage.googleapis.com/v1beta"

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        self.ensure_stream_supported(request.stream)

        system_instruction, contents = self._convert_messages(request.messages)
        config = {
            "temperature": request.temperature,
            "topP": request.top_p,
            "maxOutputTokens": request.max_tokens,
        }

        payload: Dict[str, Any] = self.merge_payload(
            {"contents": contents}, request.extra_params
        )

        generation_config = {k: v for k, v in config.items() if v is not None}
        if generation_config:
            payload["generationConfig"] = generation_config
        if system_instruction:
            payload["systemInstruction"] = {
                "role": "system",
                "parts": [{"text": system_instruction}],
            }

        endpoint = f"{self.base_url}/models/{request.model}:generateContent?key={self.api_key}"

        headers = {"Content-Type": "application/json"}

        try:
            response = await client.post(endpoint, json=payload, headers=headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ProviderAPICallError(f"Gemini request failed: {exc}") from exc

        data: Dict[str, Any] = response.json()
        choices = self._build_choices(data.get("candidates", []))

        return LLMResponse(
            provider=LLMProvider.GEMINI,
            model=request.model,
            choices=choices,
            raw=data,
        )

    @staticmethod
    def _convert_messages(
        messages: List[LLMMessage],
    ) -> Tuple[str | None, List[dict[str, Any]]]:
        system_parts: List[str] = []
        contents: List[dict[str, Any]] = []

        for msg in messages:
            if msg.role == LLMRole.SYSTEM:
                system_parts.append(msg.content)
                continue

            role = "user" if msg.role == LLMRole.USER else "model"
            contents.append({"role": role, "parts": [{"text": msg.content}]})

        system_instruction = "\n\n".join(system_parts) if system_parts else None
        return system_instruction, contents

    @staticmethod
    def _build_choices(candidates: List[dict[str, Any]]) -> List[LLMChoice]:
        choices: List[LLMChoice] = []
        for idx, candidate in enumerate(candidates):
            content = candidate.get("content", {})
            parts = content.get("parts", [])
            text_parts = [
                part.get("text", "")
                for part in parts
                if isinstance(part, dict) and part.get("text")
            ]
            text = "\n\n".join(text_parts)
            choices.append(
                LLMChoice(
                    index=candidate.get("index", idx),
                    message=LLMMessage(role="assistant", content=text),
                )
            )
        if not choices:
            choices.append(
                LLMChoice(
                    index=0,
                    message=LLMMessage(role="assistant", content=""),
                )
            )
        return choices
app/providers/openai.py (new file)
@@ -0,0 +1,66 @@
from __future__ import annotations

from typing import Any, Dict, List

import httpx

from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient


class OpenAIProvider(LLMProviderClient):
    name = LLMProvider.OPENAI.value
    api_key_env = "OPENAI_API_KEY"
    supports_stream = True
    base_url = "https://api.openai.com/v1/chat/completions"

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        self.ensure_stream_supported(request.stream)

        payload = self.merge_payload(
            {
                "model": request.model,
                "messages": [msg.model_dump() for msg in request.messages],
                "temperature": request.temperature,
                "top_p": request.top_p,
                "max_tokens": request.max_tokens,
                "stream": request.stream,
            },
            request.extra_params,
        )

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            response = await client.post(self.base_url, json=payload, headers=headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ProviderAPICallError(f"OpenAI request failed: {exc}") from exc

        data: Dict[str, Any] = response.json()
        choices = self._build_choices(data.get("choices", []))

        return LLMResponse(
            provider=LLMProvider.OPENAI,
            model=data.get("model", request.model),
            choices=choices,
            raw=data,
        )

    @staticmethod
    def _build_choices(choices: List[dict[str, Any]]) -> List[LLMChoice]:
        built: List[LLMChoice] = []
        for choice in choices:
            message_data = choice.get("message") or {}
            message = LLMMessage(
                role=message_data.get("role", "assistant"),  # fallback to assistant
                content=message_data.get("content", ""),
            )
            built.append(LLMChoice(index=choice.get("index", len(built)), message=message))
        return built
app/providers/openrouter.py (new file)
@@ -0,0 +1,77 @@
from __future__ import annotations

import os
from typing import Any, Dict, List

import httpx

from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient


class OpenRouterProvider(LLMProviderClient):
    name = LLMProvider.OPENROUTER.value
    api_key_env = "OPENROUTER_API_KEY"
    supports_stream = True
    base_url = "https://openrouter.ai/api/v1/chat/completions"

    def __init__(self, api_key: str | None):
        super().__init__(api_key)
        self.site_url = os.getenv("OPENROUTER_SITE_URL")
        self.app_name = os.getenv("OPENROUTER_APP_NAME")

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        self.ensure_stream_supported(request.stream)

        payload = self.merge_payload(
            {
                "model": request.model,
                "messages": [msg.model_dump() for msg in request.messages],
                "temperature": request.temperature,
                "top_p": request.top_p,
                "max_tokens": request.max_tokens,
                "stream": request.stream,
            },
            request.extra_params,
        )

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        if self.site_url:
            headers["HTTP-Referer"] = self.site_url
        if self.app_name:
            headers["X-Title"] = self.app_name

        try:
            response = await client.post(self.base_url, json=payload, headers=headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ProviderAPICallError(f"OpenRouter request failed: {exc}") from exc

        data: Dict[str, Any] = response.json()
        choices = self._build_choices(data.get("choices", []))

        return LLMResponse(
            provider=LLMProvider.OPENROUTER,
            model=data.get("model", request.model),
            choices=choices,
            raw=data,
        )

    @staticmethod
    def _build_choices(choices: List[dict[str, Any]]) -> List[LLMChoice]:
        built: List[LLMChoice] = []
        for choice in choices:
            message_data = choice.get("message") or {}
            message = LLMMessage(
                role=message_data.get("role", "assistant"),
                content=message_data.get("content", ""),
            )
            built.append(LLMChoice(index=choice.get("index", len(built)), message=message))
        return built
app/providers/qwen.py (new file)
@@ -0,0 +1,87 @@
from __future__ import annotations

from typing import Any, Dict, List

import httpx

from app.exceptions import ProviderAPICallError
from app.models import LLMChoice, LLMMessage, LLMProvider, LLMRequest, LLMResponse
from app.providers.base import LLMProviderClient


class QwenProvider(LLMProviderClient):
    name = LLMProvider.QWEN.value
    api_key_env = "QWEN_API_KEY"
    base_url = (
        "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"
    )

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        self.ensure_stream_supported(request.stream)

        parameters = {
            "temperature": request.temperature,
            "top_p": request.top_p,
        }
        if request.max_tokens is not None:
            parameters["max_output_tokens"] = request.max_tokens

        # Strip None values from parameters
        parameters = {k: v for k, v in parameters.items() if v is not None}

        payload: Dict[str, Any] = {
            "model": request.model,
            "input": {"messages": [msg.model_dump() for msg in request.messages]},
        }
        if parameters:
            payload["parameters"] = parameters

        payload = self.merge_payload(payload, request.extra_params)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

        try:
            response = await client.post(self.base_url, json=payload, headers=headers)
            response.raise_for_status()
        except httpx.HTTPError as exc:
            raise ProviderAPICallError(f"Qwen request failed: {exc}") from exc

        data: Dict[str, Any] = response.json()
        choices = self._build_choices(data.get("output", {}))

        return LLMResponse(
            provider=LLMProvider.QWEN,
            model=request.model,
            choices=choices,
            raw=data,
        )

    @staticmethod
    def _build_choices(output: Dict[str, Any]) -> List[LLMChoice]:
        choices_payload = output.get("choices", [])
        if not choices_payload and output.get("text"):
            return [
                LLMChoice(
                    index=0,
                    message=LLMMessage(role="assistant", content=output["text"]),
                )
            ]

        choices: List[LLMChoice] = []
        for idx, choice in enumerate(choices_payload):
            message_data = choice.get("message") or {}
            message = LLMMessage(
                role=message_data.get("role", "assistant"),
                content=message_data.get("content", ""),
            )
            choices.append(LLMChoice(index=choice.get("index", idx), message=message))
        if not choices:
            choices.append(
                LLMChoice(index=0, message=LLMMessage(role="assistant", content=""))
            )
        return choices
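
Editor's note: DashScope responses sometimes carry output.text instead of an output.choices list; the fallback branch can be exercised directly (private helper, illustration only):

from app.providers.qwen import QwenProvider

choices = QwenProvider._build_choices({"text": "hello"})
print(choices[0].message.content)  # "hello", wrapped as a single assistant choice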
app/services/__init__.py (new file)
@@ -0,0 +1,3 @@
from .gateway import LLMGateway

__all__ = ["LLMGateway"]
BIN app/services/__pycache__/__init__.cpython-311.pyc (new file, binary not shown)
BIN app/services/__pycache__/__init__.cpython-312.pyc (new file, binary not shown)
BIN app/services/__pycache__/gateway.cpython-311.pyc (new file, binary not shown)
BIN app/services/__pycache__/gateway.cpython-312.pyc (new file, binary not shown)
BIN app/services/__pycache__/import_analysis.cpython-311.pyc (new file, binary not shown)
BIN app/services/__pycache__/import_analysis.cpython-312.pyc (new file, binary not shown)
app/services/gateway.py (new file)
@@ -0,0 +1,53 @@
from __future__ import annotations

import os
from typing import Dict, Type

import httpx

from app.exceptions import ProviderConfigurationError
from app.models import LLMProvider, LLMRequest, LLMResponse
from app.providers import (
    AnthropicProvider,
    DeepSeekProvider,
    GeminiProvider,
    LLMProviderClient,
    OpenAIProvider,
    OpenRouterProvider,
    QwenProvider,
)


class LLMGateway:
    """Simple registry that dispatches chat requests to provider clients."""

    def __init__(self) -> None:
        self._providers: Dict[LLMProvider, LLMProviderClient] = {}
        self._factory: Dict[LLMProvider, Type[LLMProviderClient]] = {
            LLMProvider.OPENAI: OpenAIProvider,
            LLMProvider.ANTHROPIC: AnthropicProvider,
            LLMProvider.OPENROUTER: OpenRouterProvider,
            LLMProvider.GEMINI: GeminiProvider,
            LLMProvider.QWEN: QwenProvider,
            LLMProvider.DEEPSEEK: DeepSeekProvider,
        }

    def get_provider(self, provider: LLMProvider) -> LLMProviderClient:
        if provider not in self._factory:
            raise ProviderConfigurationError(f"Unsupported provider '{provider.value}'.")

        if provider not in self._providers:
            self._providers[provider] = self._build_provider(provider)
        return self._providers[provider]

    def _build_provider(self, provider: LLMProvider) -> LLMProviderClient:
        provider_cls = self._factory[provider]
        api_key_env = getattr(provider_cls, "api_key_env", None)
        api_key = os.getenv(api_key_env) if api_key_env else None
        return provider_cls(api_key)

    async def chat(
        self, request: LLMRequest, client: httpx.AsyncClient
    ) -> LLMResponse:
        provider_client = self.get_provider(request.provider)
        return await provider_client.chat(request, client)
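
Editor's note: the gateway can also be driven without FastAPI. A sketch, assuming OPENAI_API_KEY is exported; the model name is a placeholder:

import asyncio

import httpx

from app.models import LLMMessage, LLMProvider, LLMRequest, LLMRole
from app.services import LLMGateway


async def main() -> None:
    gateway = LLMGateway()
    request = LLMRequest(
        provider=LLMProvider.OPENAI,
        model="gpt-4o-mini",  # placeholder model name
        messages=[LLMMessage(role=LLMRole.USER, content="ping")],
    )
    # Provider clients are built lazily on first use and cached per provider.
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await gateway.chat(request, client)
    print(response.choices[0].message.content)


asyncio.run(main())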
app/services/import_analysis.py (new file)
@@ -0,0 +1,91 @@
from __future__ import annotations

from typing import List, Tuple

from app.models import (
    DataImportAnalysisRequest,
    LLMMessage,
    LLMProvider,
    LLMRole,
)


def resolve_provider_from_model(llm_model: str) -> Tuple[LLMProvider, str]:
    """Resolve provider based on the llm_model string.

    The llm_model may be provided as 'provider:model' or 'provider/model'.
    If no provider prefix is present, try an educated guess from common model name patterns.
    """

    normalized = llm_model.strip()
    provider_hint: str | None = None
    model_name = normalized

    # NOTE: "/" counts as a delimiter here, so a bare "models/gemini-..." name is
    # split into the hint "models" and never reaches the prefix guess below.
    for delimiter in (":", "/", "|"):
        if delimiter in normalized:
            provider_hint, model_name = normalized.split(delimiter, 1)
            provider_hint = provider_hint.strip().lower()
            model_name = model_name.strip()
            break

    provider_map = {provider.value: provider for provider in LLMProvider}

    if provider_hint:
        if provider_hint not in provider_map:
            raise ValueError(
                f"Unsupported provider '{provider_hint}'. Expected one of: {', '.join(provider_map.keys())}."
            )
        return provider_map[provider_hint], model_name

    return _guess_provider_from_model(model_name), model_name


def _guess_provider_from_model(model_name: str) -> LLMProvider:
    lowered = model_name.lower()

    if lowered.startswith(("gpt", "o1", "text-", "dall-e", "whisper")):
        return LLMProvider.OPENAI
    if lowered.startswith(("claude", "anthropic")):
        return LLMProvider.ANTHROPIC
    if lowered.startswith(("gemini", "models/gemini")):
        return LLMProvider.GEMINI
    if lowered.startswith("qwen"):
        return LLMProvider.QWEN
    if lowered.startswith("deepseek"):
        return LLMProvider.DEEPSEEK
    if lowered.startswith(("openrouter", "router-")):
        return LLMProvider.OPENROUTER

    supported = ", ".join(provider.value for provider in LLMProvider)
    raise ValueError(
        f"Unable to infer provider from model '{model_name}'. "
        f"Please prefix with 'provider:model'. Supported providers: {supported}."
    )


def build_import_messages(
    request: DataImportAnalysisRequest,
) -> List[LLMMessage]:
    """Create system and user messages for the import analysis prompt."""
    headers_formatted = "\n".join(f"- {header}" for header in request.table_headers)

    system_prompt = (
        "You are a data-import recognition assistant. Based on the given table "
        "headers and sample data, determine each field's meaning, its typical "
        "data type, and any potential data quality issues. Return a structured JSON result.\n"
        "The JSON must contain: field_summaries (array; each item has header, "
        "meaning, data_type, quality_notes), detected_issues (array of strings), "
        "and overall_suggestion (string)."
    )

    user_prompt = (
        f"Import record ID: {request.import_record_id}\n\n"
        "Table headers:\n"
        f"{headers_formatted}\n\n"
        "Sample data:\n"
        f"{request.example_data}\n\n"
        "Carefully analyze how the sample data maps onto the table headers and "
        "return content matching the JSON structure above."
    )

    return [
        LLMMessage(role=LLMRole.SYSTEM, content=system_prompt),
        LLMMessage(role=LLMRole.USER, content=user_prompt),
    ]
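
Editor's note: how resolution behaves on a couple of inputs:

from app.services.import_analysis import resolve_provider_from_model

# An explicit prefix wins over pattern matching.
print(resolve_provider_from_model("anthropic:claude-3-5-sonnet"))
# (<LLMProvider.ANTHROPIC: 'anthropic'>, 'claude-3-5-sonnet')

# Bare names fall back to the prefix guess.
print(resolve_provider_from_model("deepseek-chat"))
# (<LLMProvider.DEEPSEEK: 'deepseek'>, 'deepseek-chat')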
demo/GE_result_desc_prompt.txt (new file)
@@ -0,0 +1,47 @@
System role (System)
You are a "data profile extractor". The input is a Great Expectations profiling/validation result JSON,
which may contain column-level expectations (expect_*), statistics, sample values, and type inference; it may also carry table-level/batch metadata.
Normalize it into a program-consumable "table profile" JSON, giving a confidence score and rationale for anything uncertain.
Do not invent columns, time ranges, or values that do not exist.

User message (User)
[Input: GE result JSON]
{{GE_RESULT_JSON}}

[Output requirements (output JSON only, no explanatory text)]
{
  "table": "<db.table or table name>",
  "row_count": <int|null>,                  // may be null if unknown
  "role": "fact|dimension|unknown",         // heuristic from metric/dimension share and uniqueness
  "grain": ["<col1>", "<col2>", ...],       // guessed fact grain (e.g. date/store/category)
  "time": { "column": "<name>|null", "granularity": "day|week|month|unknown", "range": ["YYYY-MM-DD","YYYY-MM-DD"]|null, "has_gaps": true|false|null },
  "columns": [
    {
      "name": "<col>",
      "dtype": "<GE-inferred/physical type>",
      "semantic_type": "dimension|metric|time|text|id|unknown",
      "null_rate": <0~1|null>,
      "distinct_count": <int|null>,
      "distinct_ratio": <0~1|null>,
      "stats": { "min": <number|string|null>, "max": <number|string|null>, "mean": <number|null>, "std": <number|null>, "skewness": <number|null> },
      "enumish": true|false|null,            // low entropy / enumerable
      "top_values": [{"value":"<v>","pct":<0~1>}, ...],  // top K (<=10)
      "pk_candidate_score": <0~1>,           // combined uniqueness + non-null score
      "metric_candidate_score": <0~1>,       // numeric / skewed / business-term hits
      "comment": "<column comment or GE description; may be empty>"
    }
  ],
  "primary_key_candidates": [["colA","colB"], ...],  // from unique/compound-unique expectations
  "fk_candidates": [{"from":"<col>","to":"<dim_table(col)>","confidence":<0~1>}],
  "quality": {
    "failed_expectations": [{"name":"<expect_*>","column":"<col|table>","summary":"<one sentence>"}],
    "warning_hints": ["columns with null rate > 0.2: ...", "time column has gaps: ..."]
  },
  "confidence_notes": ["<why role/grain/time column were judged this way>"]
}

[Heuristics (brief)]
- time column: date/time typed OR name matches dt/date/day etc.; if min/max exist, emit range; if any gap >= 1 day, set has_gaps=true.
- semantic_type: numeric + right-skewed/high variance -> lean metric; highly unique / ID-like name -> id; high cardinality + text -> text; low entropy + bounded values -> dimension.
- role: high share of metric columns plus a time column -> lean fact; mostly enums/IDs with few numerics -> dimension.
- When confidence is low, emit null or unknown and record the reason in confidence_notes.
demo/e-commerce-orders_desc.md (new file)
@@ -0,0 +1,127 @@
# E-commerce Customer Order Behavior Dataset

A synthetic e-commerce dataset containing 10,000 orders with realistic customer behavior patterns, suitable for e-commerce analytics and machine learning tasks.

## Dataset Card for E-commerce Orders

### Dataset Summary

This dataset simulates customer order behavior on an e-commerce platform, containing detailed information about orders, customers, products, and delivery patterns. The data is synthetically generated with realistic distributions and patterns.

### Supported Tasks

- regression: predict order quantities or prices
- classification: predict delivery status or customer segments
- clustering: identify customer behavior patterns
- time-series-forecasting: analyze order patterns over time

### Languages

Not applicable (tabular data).

## Dataset Structure

### Data Instances

Each instance represents a single e-commerce order with the following fields:

```python
{
    'order_id': '5ea92c47-c5b2-4bdd-8a50-d77efd77ec89',
    'customer_id': 2350,
    'product_id': 995,
    'category': 'Electronics',
    'price': 403.17,
    'quantity': 3,
    'order_date': '2024-04-20 14:59:58.897063',
    'shipping_date': '2024-04-22 14:59:58.897063',
    'delivery_status': 'Delivered',
    'payment_method': 'PayPal',
    'device_type': 'Mobile',
    'channel': 'Paid Search',
    'shipping_address': '72166 Cunningham Crescent East Nicholasside Mississippi 85568',
    'billing_address': '38199 Edwin Plain Johnborough Maine 81826',
    'customer_segment': 'Returning'
}
```

### Data Fields

| Field Name | Type | Description | Value Range |
| --- | --- | --- | --- |
| order_id | string | Unique order identifier (UUID4) | - |
| customer_id | int | Customer identifier | 1-3,000 |
| product_id | int | Product identifier | 1-1,000 |
| category | string | Product category | Electronics, Clothing, Home, Books, Beauty, Toys |
| price | float | Product price | $5.00-$500.00 |
| quantity | int | Order quantity | 1-10 |
| order_date | datetime | Order placement timestamp | Last 12 months |
| shipping_date | datetime | Shipping timestamp | 1-7 days after order_date |
| delivery_status | string | Delivery status | Pending, Shipped, Delivered, Returned |
| payment_method | string | Payment method used | Credit Card, PayPal, Debit Card, Apple Pay, Google Pay |
| device_type | string | Ordering device | Desktop, Mobile, Tablet |
| channel | string | Marketing channel | Organic, Paid Search, Email, Social |
| shipping_address | string | Delivery address | Street, City, State, ZIP |
| billing_address | string | Billing address | Street, City, State, ZIP |
| customer_segment | string | Customer type | New, Returning, VIP |

### Data Splits

This dataset is provided as a single CSV file without splits.

## Dataset Creation

### Source Data

This is a synthetic dataset generated using Python with the pandas, numpy, and Faker libraries. The data generation process ensures:

- realistic customer behavior patterns
- proper data distributions
- valid relationships between fields
- realistic address formatting

### Annotations

No manual annotations (synthetic data).

## Considerations for Using the Data

### Social Impact of Dataset

This dataset is designed for:

- development of e-commerce analytics systems
- testing of order processing systems
- training of machine learning models for e-commerce
- educational purposes in data science

### Discussion of Biases

As a synthetic dataset, care has been taken to:

- use realistic distributions for order patterns
- maintain proper relationships between dates
- create realistic customer segments
- avoid demographic biases in address generation

However, users should note that:

- the data patterns are simplified compared to real e-commerce data
- the customer behavior patterns are based on general assumptions
- the geographic distribution might not reflect real-world patterns

### Dataset Statistics

Total records: 10,000

Delivery status distribution:

- Delivered: 70%
- Shipped: 20%
- Pending: 5%
- Returned: 5%

Customer segment distribution:

- VIP: ~15%
- Returning: ~35%
- New: ~50%

## Loading and Usage

Using Hugging Face Datasets:

```python
from datasets import load_dataset

dataset = load_dataset("path/to/e-commerce-orders")

# Example: load as a pandas DataFrame
df = dataset['train'].to_pandas()

# Example: access specific columns
orders = dataset['train']['order_id']
prices = dataset['train']['price']
```

## Data Quality

The dataset has been validated to ensure:

- no missing values
- proper value ranges
- valid categorical values
- proper date relationships
- unique order IDs
- valid address formats
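
Editor's note: a minimal sketch of re-checking a few of the stated guarantees with pandas, assuming the CSV has been downloaded locally as orders.csv (hypothetical path):

```python
import pandas as pd

df = pd.read_csv("orders.csv", parse_dates=["order_date", "shipping_date"])

assert df["order_id"].is_unique                    # unique order IDs
assert df.notna().all().all()                      # no missing values
assert df["price"].between(5.00, 500.00).all()     # stated price range
assert df["quantity"].between(1, 10).all()         # stated quantity range

# Shipping happens 1-7 days after the order was placed.
lag_days = (df["shipping_date"] - df["order_date"]).dt.days
assert lag_days.between(1, 7).all()
```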
demo/snippet.json (new file)
@@ -0,0 +1,523 @@
[
  {
    "id": "snpt_daily_gmv_trend",
    "title": "Daily GMV trend",
    "desc": "Daily GMV and order-count trend",
    "type": "trend",
    "applicability": {
      "required_columns": ["order_date", "price", "quantity"],
      "time_column": "order_date",
      "constraints": {
        "dim_cardinality_hint": null,
        "fk_join_available": false,
        "notes": [
          "GMV = SUM(price*quantity)",
          "Avoid selecting PII fields such as addresses"
        ]
      }
    },
    "variables": [
      {"name": "start_date", "type": "date"},
      {"name": "end_date", "type": "date"}
    ],
    "dialect_sql": {
      "mysql": "SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv, COUNT(*) AS orders\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY dt\nORDER BY dt;"
    },
    "business_caliber": "GMV definition: price × quantity; order count: number of records; grain = day.",
    "examples": ["GMV trend over the last 30 days", "Daily GMV and order count for 2025 Q1"]
  },
  {
    "id": "snpt_daily_gmv_ma7",
    "title": "7-day GMV moving average",
    "desc": "Daily GMV with a 7-day moving average",
    "type": "trend",
    "applicability": {
      "required_columns": ["order_date", "price", "quantity"],
      "time_column": "order_date",
      "constraints": {
        "dim_cardinality_hint": null,
        "fk_join_available": false,
        "notes": [
          "Window = the past 7 days including the current day",
          "If the dates have gaps, the moving average may drift"
        ]
      }
    },
    "variables": [
      {"name": "start_date", "type": "date"},
      {"name": "end_date", "type": "date"}
    ],
    "dialect_sql": {
      "mysql": "WITH d AS (\n  SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv\n  FROM {{table}}\n  WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n  GROUP BY dt\n)\nSELECT dt,\n       gmv,\n       AVG(gmv) OVER (ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS gmv_ma7\nFROM d\nORDER BY dt;"
    },
    "business_caliber": "GMV = price × quantity; window = 7 days (including the current day), computed over dates in natural order.",
    "examples": ["This quarter's GMV with its 7-day moving average", "Smoothed trend comparison over a promotion period"]
  },
  {
    "id": "snpt_yoy_daily_gmv",
    "title": "GMV year-over-year (daily)",
    "desc": "Compare GMV against the same day last year, with YoY %",
    "type": "ratio",
    "applicability": {
      "required_columns": ["order_date", "price", "quantity"],
      "time_column": "order_date",
      "constraints": {
        "dim_cardinality_hint": null,
        "fk_join_available": false,
        "notes": [
          "The query window must also cover the corresponding dates last year",
          "Leap years are aligned by calendar date"
        ]
      }
    },
    "variables": [
      {"name": "start_date", "type": "date"},
      {"name": "end_date", "type": "date"}
    ],
    "dialect_sql": {
      "mysql": "WITH cur AS (\n  SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv\n  FROM {{table}}\n  WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n  GROUP BY dt\n),\nprev AS (\n  SELECT DATE(DATE_SUB(order_date, INTERVAL 1 YEAR)) AS dt, SUM(price*quantity) AS gmv_last\n  FROM {{table}}\n  WHERE DATE(order_date) BETWEEN DATE_SUB({{start_date}}, INTERVAL 1 YEAR) AND DATE_SUB({{end_date}}, INTERVAL 1 YEAR)\n  GROUP BY DATE(DATE_SUB(order_date, INTERVAL 1 YEAR))\n)\nSELECT c.dt,\n       c.gmv,\n       p.gmv_last,\n       CASE WHEN p.gmv_last IS NULL OR p.gmv_last=0 THEN NULL ELSE (c.gmv - p.gmv_last)/p.gmv_last END AS yoy\nFROM cur c LEFT JOIN prev p ON c.dt = p.dt\nORDER BY c.dt;"
    },
    "business_caliber": "YoY = (same-day GMV minus same-day-last-year GMV) / same-day-last-year GMV; GMV = price × quantity.",
    "examples": ["GMV YoY curve for the last 90 days", "Holiday YoY performance"]
  },
  {
    "id": "snpt_topn_category_gmv",
    "title": "Category GMV ranking",
    "desc": "GMV by category, top N",
    "type": "topn",
    "applicability": {
      "required_columns": ["order_date", "category", "price", "quantity"],
      "time_column": "order_date",
      "constraints": {
        "dim_cardinality_hint": 6,
        "fk_join_available": false,
        "notes": [
          "Few category values; TopN <= 6 recommended for display",
          "Order count and unit count can be added"
        ]
      }
    },
    "variables": [
      {"name": "start_date", "type": "date"},
      {"name": "end_date", "type": "date"},
      {"name": "top_n", "type": "int", "default": 10}
    ],
    "dialect_sql": {
      "mysql": "SELECT category,\n       SUM(price*quantity) AS gmv,\n       COUNT(*) AS orders,\n       SUM(quantity) AS qty\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY category\nORDER BY gmv DESC\nLIMIT {{top_n}};"
    },
    "business_caliber": "GMV = price × quantity; scope = the given date range; grain = category.",
    "examples": ["Top 5 categories last month", "Category GMV mix this quarter"]
  },
  {
    "id": "snpt_share_channel",
    "title": "Channel GMV share",
    "desc": "GMV and share of total by channel",
    "type": "ratio",
    "applicability": {
      "required_columns": ["order_date", "channel", "price", "quantity"],
      "time_column": "order_date",
      "constraints": {
        "dim_cardinality_hint": 4,
        "fk_join_available": false,
        "notes": [
          "Share uses total GMV as the denominator; shares sum to ~100%",
          "Suits pie or stacked-bar charts"
        ]
      }
    },
    "variables": [
      {"name": "start_date", "type": "date"},
      {"name": "end_date", "type": "date"}
    ],
    "dialect_sql": {
      "mysql": "WITH base AS (\n  SELECT channel, SUM(price*quantity) AS gmv\n  FROM {{table}}\n  WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n  GROUP BY channel\n), total AS (\n  SELECT SUM(gmv) AS tg FROM base\n)\nSELECT b.channel, b.gmv, b.gmv/t.tg AS gmv_share\nFROM base b CROSS JOIN total t\nORDER BY b.gmv DESC;"
    },
    "business_caliber": "Channel GMV share = channel GMV / GMV across all channels; time range bounded by parameters.",
    "examples": ["Channel shares this month", "Q1 channel mix comparison"]
  },
  {
    "id": "snpt_topn_product_gmv",
    "title": "Product GMV ranking",
    "desc": "GMV by product_id, top N",
    "type": "topn",
    "applicability": {
      "required_columns": ["order_date", "product_id", "price", "quantity"],
      "time_column": "order_date",
      "constraints": {
        "dim_cardinality_hint": 1000,
        "fk_join_available": true,
        "notes": [
          "product_id has high cardinality; LIMIT <= 50 recommended",
          "Can join the product dimension table for names and other attributes"
        ]
      }
    },
    "variables": [
      {"name": "start_date", "type": "date"},
      {"name": "end_date", "type": "date"},
      {"name": "top_n", "type": "int", "default": 20}
    ],
    "dialect_sql": {
      "mysql": "SELECT product_id,\n       SUM(price*quantity) AS gmv,\n       SUM(quantity) AS qty,\n       COUNT(*) AS orders\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY product_id\nORDER BY gmv DESC\nLIMIT {{top_n}};"
    },
    "business_caliber": "GMV = price × quantity; grain = product_id.",
    "examples": ["Top 20 best sellers last week", "Top 10 products by annual sales"]
  },
  {
    "id": "snpt_join_product_dim",
    "title": "Product dimension join",
    "desc": "Join the product dimension table on product_id, or use bare IDs",
    "type": "join",
    "applicability": {
      "required_columns": ["product_id"],
      "time_column": null,
      "constraints": {
        "dim_cardinality_hint": 1000,
        "fk_join_available": true,
        "notes": [
          "If no dimension table exists, keep the bare-ID variant of the output",
          "Choose PII fields carefully; never output address-type fields"
        ]
      }
    },
    "variables": [
      {"name": "dim_product", "type": "identifier"},
      {"name": "select_cols", "type": "string", "default": "f.product_id, f.price, f.quantity"}
    ],
    "dialect_sql": {
      "mysql": "-- named variant\nSELECT {{select_cols}}\nFROM {{table}} f\nLEFT JOIN {{dim_product}} d ON f.product_id = d.product_id;\n\n-- bare-ID variant\nSELECT product_id, price, quantity FROM {{table}};"
    },
    "business_caliber": "Foreign key: product_id -> product dimension primary key; measures come from the fact table's price and quantity.",
    "examples": ["Join product names, then rank TopN", "Product analysis using IDs only"]
  },
  {
    "id": "snpt_join_customer_dim",
    "title": "Customer dimension join",
    "desc": "Join the customer dimension table on customer_id, or use bare IDs",
    "type": "join",
    "applicability": {
      "required_columns": ["customer_id"],
      "time_column": null,
      "constraints": {
        "dim_cardinality_hint": 2713,
        "fk_join_available": true,
        "notes": [
          "Without a dimension table, aggregate directly by customer_id",
          "Avoid outputting PII such as shipping_address/billing_address"
        ]
      }
    },
    "variables": [
      {"name": "dim_customer", "type": "identifier"},
      {"name": "select_cols", "type": "string", "default": "c.customer_name, f.customer_id, SUM(f.price*f.quantity) AS gmv"}
    ],
    "dialect_sql": {
      "mysql": "-- named variant\nSELECT {{select_cols}}\nFROM {{table}} f\nLEFT JOIN {{dim_customer}} c ON f.customer_id = c.customer_id\nGROUP BY c.customer_name, f.customer_id;\n\n-- bare-ID variant\nSELECT customer_id, SUM(price*quantity) AS gmv\nFROM {{table}}\nGROUP BY customer_id;"
    },
    "business_caliber": "Foreign key: customer_id -> customer dimension primary key; GMV = price × quantity.",
    "examples": ["GMV by customer segment", "Top customers by spend"]
  },
  {
    "id": "snpt_quality_dup_order",
    "title": "Primary-key duplicate check",
    "desc": "Check order_id uniqueness and output a sample of duplicates",
    "type": "quality",
    "applicability": {
      "required_columns": ["order_id"],
      "time_column": null,
      "constraints": {
        "dim_cardinality_hint": 10000,
        "fk_join_available": false,
        "notes": [
          "The profile says order_id should be unique; a non-empty result is an anomaly"
        ]
      }
    },
    "variables": [
      {"name": "limit_sample", "type": "int", "default": 50}
    ],
    "dialect_sql": {
      "mysql": "WITH d AS (\n  SELECT order_id, COUNT(*) AS cnt\n  FROM {{table}}\n  GROUP BY order_id\n  HAVING COUNT(*)>1\n)\nSELECT * FROM d LIMIT {{limit_sample}};"
|
||||||
|
},
|
||||||
|
"business_caliber": "主键口径:order_id全表唯一;用于数据质量预警与排查。",
|
||||||
|
"examples": [
|
||||||
|
"是否存在重复订单?",
|
||||||
|
"查看重复订单样本"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "snpt_quality_price_outlier",
|
||||||
|
"title": "价格异常检测",
|
||||||
|
"desc": "基于当日均值±3σ识别异常价",
|
||||||
|
"type": "quality",
|
||||||
|
"applicability": {
|
||||||
|
"required_columns": [
|
||||||
|
"order_date",
|
||||||
|
"price"
|
||||||
|
],
|
||||||
|
"time_column": "order_date",
|
||||||
|
"constraints": {
|
||||||
|
"dim_cardinality_hint": null,
|
||||||
|
"fk_join_available": false,
|
||||||
|
"notes": [
|
||||||
|
"仅质量预警,不直接代表业务错误",
|
||||||
|
"当天样本过少时波动较大"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "start_date",
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "end_date",
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "limit_sample",
|
||||||
|
"type": "int",
|
||||||
|
"default": 100
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"dialect_sql": {
|
||||||
|
"mysql": "WITH stats AS (\n SELECT DATE(order_date) AS dt, AVG(price) AS mu, STDDEV_POP(price) AS sigma\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n)\nSELECT f.*\nFROM {{table}} f\nJOIN stats s ON DATE(f.order_date)=s.dt\nWHERE (f.price > s.mu + 3*s.sigma OR f.price < s.mu - 3*s.sigma)\nLIMIT {{limit_sample}};"
|
||||||
|
},
|
||||||
|
"business_caliber": "异常定义:价格超出当日均值±3×标准差(总体标准差)。",
|
||||||
|
"examples": [
|
||||||
|
"近30天价格异常样本",
|
||||||
|
"促销期异常价监控"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "snpt_sample_recent_orders",
|
||||||
|
"title": "近期明细抽样",
|
||||||
|
"desc": "抽样查看近期订单核心字段",
|
||||||
|
"type": "sample",
|
||||||
|
"applicability": {
|
||||||
|
"required_columns": [
|
||||||
|
"order_date",
|
||||||
|
"order_id",
|
||||||
|
"customer_id",
|
||||||
|
"product_id",
|
||||||
|
"category",
|
||||||
|
"price",
|
||||||
|
"quantity",
|
||||||
|
"channel",
|
||||||
|
"payment_method",
|
||||||
|
"delivery_status"
|
||||||
|
],
|
||||||
|
"time_column": "order_date",
|
||||||
|
"constraints": {
|
||||||
|
"dim_cardinality_hint": null,
|
||||||
|
"fk_join_available": true,
|
||||||
|
"notes": [
|
||||||
|
"为保护隐私,不展示shipping_address与billing_address",
|
||||||
|
"仅用于人工核验"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "start_date",
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "end_date",
|
||||||
|
"type": "date"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "limit_rows",
|
||||||
|
"type": "int",
|
||||||
|
"default": 100
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"dialect_sql": {
|
||||||
|
"mysql": "SELECT DATE(order_date) AS dt,\n order_id, customer_id, product_id, category,\n price, quantity, channel, payment_method, delivery_status\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nORDER BY dt DESC\nLIMIT {{limit_rows}};"
|
||||||
|
},
|
||||||
|
"business_caliber": "明细抽样用于数据核验;不输出PII地址信息。",
|
||||||
|
"examples": [
|
||||||
|
"抽样查看上周订单",
|
||||||
|
"核对节假日订单明细"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "snpt_filter_paid_delivered",
|
||||||
|
"title": "支付已送达筛选",
|
||||||
|
"desc": "过滤支付方式为信用卡且配送状态为已送达",
|
||||||
|
"type": "sample",
|
||||||
|
"applicability": {
|
||||||
|
"required_columns": [
|
||||||
|
"payment_method",
|
||||||
|
"delivery_status"
|
||||||
|
],
|
||||||
|
"time_column": null,
|
||||||
|
"constraints": {
|
||||||
|
"dim_cardinality_hint": 5,
|
||||||
|
"fk_join_available": false,
|
||||||
|
"notes": [
|
||||||
|
"此片段为WHERE条件模板,可拼接到任意查询",
|
||||||
|
"delivery_status枚举包含Delivered/Pending/Returned/Shipped"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"variables": [],
|
||||||
|
"dialect_sql": {
|
||||||
|
"mysql": "WHERE payment_method = 'Credit Card' AND delivery_status = 'Delivered'"
|
||||||
|
},
|
||||||
|
"business_caliber": "口径:支付渠道=信用卡;物流状态=已送达(Delivered)。可与时间或维度条件叠加。",
|
||||||
|
"examples": [
|
||||||
|
"筛选信用卡已送达订单",
|
||||||
|
"在TopN商品中仅看已送达信用卡订单"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "snpt_filter_device_channel",
|
||||||
|
"title": "设备渠道筛选",
|
||||||
|
"desc": "按设备类型与渠道过滤分析范围",
|
||||||
|
"type": "sample",
|
||||||
|
"applicability": {
|
||||||
|
"required_columns": [
|
||||||
|
"device_type",
|
||||||
|
"channel"
|
||||||
|
],
|
||||||
|
"time_column": null,
|
||||||
|
"constraints": {
|
||||||
|
"dim_cardinality_hint": 7,
|
||||||
|
"fk_join_available": false,
|
||||||
|
"notes": [
|
||||||
|
"device_type枚举:Desktop/Mobile/Tablet",
|
||||||
|
"channel枚举:Email/Organic/Paid Search/Social"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"variables": [],
|
||||||
|
"dialect_sql": {
|
||||||
|
"mysql": "WHERE device_type IN ('Mobile','Desktop') AND channel IN ('Paid Search','Social')"
|
||||||
|
},
|
||||||
|
"business_caliber": "限制分析在指定设备与渠道;可直接作为WHERE子句片段复用。",
|
||||||
|
"examples": [
|
||||||
|
"仅看移动端付费渠道GMV",
|
||||||
|
"桌面+社媒渠道订单明细"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
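The last two snippets above are bare WHERE fragments rather than complete statements. A minimal composition sketch in Python (merge_filter and the inlined SQL strings are illustrative; this commit does not ship a composer) showing how such a fragment can be spliced into the product-TopN template before variable substitution:

def merge_filter(host_sql: str, fragment_sql: str) -> str:
    # Strip the fragment's leading WHERE, then append it with AND; the host
    # templates in this file all place GROUP BY right after their WHERE block.
    extra = fragment_sql.strip()
    if extra.upper().startswith("WHERE"):
        extra = extra[len("WHERE"):].strip()
    head, sep, tail = host_sql.partition("\nGROUP BY")
    return f"{head}\n  AND {extra}{sep}{tail}"

topn_sql = (
    "SELECT product_id, SUM(price*quantity) AS gmv\n"
    "FROM {{table}}\n"
    "WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n"
    "GROUP BY product_id\nORDER BY gmv DESC\nLIMIT {{top_n}};"
)
paid_delivered = "WHERE payment_method = 'Credit Card' AND delivery_status = 'Delivered'"
print(merge_filter(topn_sql, paid_delivered))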
499 demo/snippet_alias_generator.json Normal file
@ -0,0 +1,499 @@
[
  {
    "id": "snpt_daily_gmv_trend",
    "aliases": [
      {"text": "每日GMV走势", "tone": "中性"},
      {"text": "日销售额趋势", "tone": "中性"},
      {"text": "每天卖了多少", "tone": "口语"},
      {"text": "按日GMV曲线", "tone": "专业"}
    ],
    "keywords": ["GMV", "销售额", "日趋势", "每日", "订单量", "orders", "price", "quantity", "order_date", "time series", "趋势图", "按日聚合"],
    "intent_tags": ["trend"]
  },
  {
    "id": "snpt_daily_gmv_ma7",
    "aliases": [
      {"text": "GMV七日均线", "tone": "专业"},
      {"text": "7天滑动平均", "tone": "中性"},
      {"text": "GMV周均走势", "tone": "中性"},
      {"text": "GMV平滑曲线", "tone": "专业"}
    ],
    "keywords": ["GMV", "移动平均", "MA7", "七日均线", "滑动窗口", "time series", "order_date", "price", "quantity", "平滑", "趋势", "按日聚合"],
    "intent_tags": ["trend"]
  },
  {
    "id": "snpt_yoy_daily_gmv",
    "aliases": [
      {"text": "GMV日同比", "tone": "专业"},
      {"text": "每日同比增速", "tone": "中性"},
      {"text": "跟去年同日比", "tone": "口语"},
      {"text": "GMV YoY(日)", "tone": "专业"}
    ],
    "keywords": ["同比", "YoY", "GMV", "去年同日", "增长率", "price", "quantity", "order_date", "对比分析", "比值", "日粒度", "ratio"],
    "intent_tags": ["ratio"]
  },
  {
    "id": "snpt_topn_category_gmv",
    "aliases": [
      {"text": "类目GMV排行", "tone": "中性"},
      {"text": "类目TopN销量", "tone": "中性"},
      {"text": "哪个分类最卖", "tone": "口语"},
      {"text": "按类目GMV排序", "tone": "专业"}
    ],
    "keywords": ["TopN", "分类", "类目", "category", "GMV", "price", "quantity", "排行", "榜单", "按类目聚合", "订单量", "销量"],
    "intent_tags": ["topn", "by_dimension"]
  },
  {
    "id": "snpt_share_channel",
    "aliases": [
      {"text": "渠道GMV占比", "tone": "中性"},
      {"text": "各渠道份额", "tone": "中性"},
      {"text": "哪个渠道占多", "tone": "口语"},
      {"text": "渠道结构占比", "tone": "专业"}
    ],
    "keywords": ["占比", "份额", "share", "channel", "GMV", "price", "quantity", "比例", "结构分析", "按渠道聚合", "饼图", "堆叠"],
    "intent_tags": ["ratio", "by_dimension"]
  },
  {
    "id": "snpt_topn_product_gmv",
    "aliases": [
      {"text": "商品GMV排行", "tone": "中性"},
      {"text": "热销商品TopN", "tone": "中性"},
      {"text": "哪款卖得最好", "tone": "口语"},
      {"text": "按商品GMV排序", "tone": "专业"}
    ],
    "keywords": ["TopN", "product_id", "商品", "GMV", "price", "quantity", "热销", "排行", "销量", "订单数", "高基数", "榜单"],
    "intent_tags": ["topn", "by_dimension"]
  },
  {
    "id": "snpt_join_product_dim",
    "aliases": [
      {"text": "关联商品维度", "tone": "专业"},
      {"text": "商品ID联表", "tone": "中性"},
      {"text": "把商品名连上", "tone": "口语"},
      {"text": "product维表join", "tone": "专业"}
    ],
    "keywords": ["join", "维表", "product_id", "维度扩展", "明细补充", "维度属性", "联表查询", "外键", "选择列", "维度贴标签", "by id", "映射"],
    "intent_tags": ["by_dimension"]
  },
  {
    "id": "snpt_join_customer_dim",
    "aliases": [
      {"text": "关联客户维度", "tone": "专业"},
      {"text": "客户ID联表", "tone": "中性"},
      {"text": "把客户信息补齐", "tone": "口语"},
      {"text": "customer维表join", "tone": "专业"}
    ],
    "keywords": ["join", "维表", "customer_id", "客户属性", "GMV聚合", "外键关联", "联表查询", "ID映射", "维度丰富", "分群分析", "by id", "扩展字段"],
    "intent_tags": ["by_dimension"]
  },
  {
    "id": "snpt_quality_dup_order",
    "aliases": [
      {"text": "订单主键去重检", "tone": "专业"},
      {"text": "重复order_id查找", "tone": "中性"},
      {"text": "有没重复订单", "tone": "口语"},
      {"text": "主键唯一性校验", "tone": "专业"}
    ],
    "keywords": ["数据质量", "重复", "去重", "order_id", "唯一性", "主键检查", "异常数据", "质量预警", "count>1", "样本抽取", "校验", "重复检测"],
    "intent_tags": ["quality"]
  },
  {
    "id": "snpt_quality_price_outlier",
    "aliases": [
      {"text": "价格3σ异常检", "tone": "专业"},
      {"text": "当日异常价格", "tone": "中性"},
      {"text": "看哪单价格怪", "tone": "口语"},
      {"text": "价格离群监控", "tone": "专业"}
    ],
    "keywords": ["异常检测", "3σ", "stddev", "价格", "price", "离群点", "质量规则", "time series", "order_date", "阈值告警", "数据监控", "波动"],
    "intent_tags": ["quality"]
  },
  {
    "id": "snpt_sample_recent_orders",
    "aliases": [
      {"text": "近期明细抽样", "tone": "中性"},
      {"text": "抽查最近订单", "tone": "口语"},
      {"text": "近期订单样本", "tone": "中性"},
      {"text": "核验明细抽样", "tone": "专业"}
    ],
    "keywords": ["抽样", "sample", "明细", "order_date", "order_id", "customer_id", "product_id", "category", "channel", "payment_method", "delivery_status", "核验"],
    "intent_tags": ["by_dimension"]
  },
  {
    "id": "snpt_filter_paid_delivered",
    "aliases": [
      {"text": "信用卡送达筛选", "tone": "中性"},
      {"text": "只看信用卡已送达", "tone": "口语"},
      {"text": "信用卡且已送达", "tone": "中性"},
      {"text": "付款信用卡已送达", "tone": "专业"}
    ],
    "keywords": ["支付方式", "信用卡", "Credit Card", "配送状态", "Delivered", "已送达", "过滤条件", "where子句", "订单筛选", "支付渠道", "状态筛选", "条件片段"],
    "intent_tags": ["by_dimension"]
  },
  {
    "id": "snpt_filter_device_channel",
    "aliases": [
      {"text": "设备渠道筛选", "tone": "中性"},
      {"text": "只看移动付费社媒", "tone": "口语"},
      {"text": "设备+渠道过滤", "tone": "专业"},
      {"text": "端与渠道条件", "tone": "中性"}
    ],
    "keywords": ["device_type", "channel", "设备类型", "渠道", "过滤条件", "where子句", "Mobile", "Desktop", "Paid Search", "Social", "范围限定", "条件片段"],
    "intent_tags": ["by_dimension"]
  }
]
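One plausible consumer of these alias records is lexical snippet retrieval. A sketch under that assumption (the scoring weights are invented for illustration; no matcher exists in this commit):

import json

def match_snippets(question: str, records: list, top_k: int = 3) -> list:
    # Naive lexical scoring: +2 per alias substring hit, +1 per keyword hit.
    scored = []
    for rec in records:
        score = sum(2 for alias in rec["aliases"] if alias["text"] in question)
        score += sum(1 for kw in rec["keywords"] if kw.lower() in question.lower())
        if score > 0:
            scored.append((score, rec["id"]))
    return [snippet_id for _, snippet_id in sorted(scored, reverse=True)[:top_k]]

records = json.load(open("demo/snippet_alias_generator.json", encoding="utf-8"))
print(match_snippets("本月各渠道GMV占比是多少", records))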
52 demo/snippet_alias_generator.txt Normal file
@ -0,0 +1,52 @@
System role (System)
You are the "SQL snippet alias generator".
Input is one or more SQL snippet objects (from snippet.json); output is, for each snippet, a set of diverse aliases (colloquial / neutral / professional), keywords, and intent tags.
Process every snippet object one by one and output the same number of JSON elements.

User message (User)
[Context]

Array of SQL snippet objects: {{SNIPPET_ARRAY}}  // one or more snippets from snippet.json

[Task requirements]
For each SQL snippet in the input array, output one JSON object with this structure:

{
  "id": "<same as the input snippet id>",
  "aliases": [
    {"text": "…", "tone": "口语|中性|专业"},
    {"text": "…", "tone": "专业"}
  ],
  "keywords": [
    "GMV","销售额","TopN","category","类目","趋势","同比","客户","订单","质量","异常检测","join","过滤","sample"
  ],
  "intent_tags": ["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
}

Generation rules
1. One output per input: each snippet in the input array maps to exactly one output object (keep the id unchanged).

2. aliases
At least 3 aliases, covering the tone types: colloquial (口语) / neutral (中性) / professional (专业).
Each alias is at most 20 characters, semantically equivalent to the snippet, and must not introduce fields or business definitions that do not exist.
Examples:
GMV趋势分析 (neutral)
每天卖多少钱 (colloquial)
按日GMV曲线 (professional)

3. keywords
8-15 keywords covering the snippet's core dimensions, metrics, analysis type, and close synonyms.
Mix Chinese and English (e.g. "GMV"/"销售额", "同比"/"YoY", "类目"/"category").
Include analysis-intent keywords used for matching (e.g. "趋势", "排行", "占比", "质量检查", "过滤").

4. intent_tags

Pick from the following set, consistent with the snippet's type and purpose:
["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]

For condition snippets (WHERE clauses), add "filter"; if the snippet involves grouping by a dimension, add "by_dimension".

5. Language and content

Keep a formal written style; do not add explanations.

Output only the JSON array, with no prose or extra text.
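To exercise this template end to end, one can substitute {{SNIPPET_ARRAY}} and post the result through the gateway. A sketch: the endpoint and payload shape follow scripts/deepseek_request.py, while the demo/snippet.json path is an assumption about where the snippet library lives:

import json

import httpx

template = open("demo/snippet_alias_generator.txt", encoding="utf-8").read()
snippets = json.load(open("demo/snippet.json", encoding="utf-8"))  # assumed location
prompt = template.replace("{{SNIPPET_ARRAY}}", json.dumps(snippets[:3], ensure_ascii=False))

resp = httpx.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "provider": "deepseek",
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
    },
    timeout=60.0,
)
resp.raise_for_status()
print(resp.json())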
46 demo/snippet_generator.txt Normal file
@ -0,0 +1,46 @@
System role (System)
You are the "SQL snippet generator". You may only generate reusable analysis snippets based on the given table profile.
For each snippet produce: title, purpose description, snippet type, variables, applicability conditions, and a SQL template (MySQL dialect), and document the business definition and safety restrictions.
Do not invent columns absent from the profile. Time columns, dimensions, and metrics must match the profile.

User message (User)
[Table profile JSON]
{{TABLE_PROFILE_JSON}}

[Output requirements (output only a JSON array)]
[
  {
    "id": "snpt_<slug>",
    "title": "Chinese title (≤16 chars)",
    "desc": "one-sentence purpose",
    "type": "aggregate|trend|topn|ratio|quality|join|sample",
    "applicability": {
      "required_columns": ["<col>", ...],
      "time_column": "<dt|nullable>",
      "constraints": {
        "dim_cardinality_hint": <int|null>, // used for TopN limits and performance hints
        "fk_join_available": true|false,
        "notes": ["high-cardinality dimensions should use LIMIT<=50", "..."]
      }
    },
    "variables": [
      {"name":"start_date","type":"date"},
      {"name":"end_date","type":"date"},
      {"name":"top_n","type":"int","default":10}
    ],
    "dialect_sql": {
      "mysql": ""
    },
    "business_caliber": "clear definition, e.g. UV deduplicated by device_id; grain = day x category",
    "examples": ["sample question 1","sample question 2"]
  }
]

[Snippet selection guidelines]
- If a time column exists: generate trend_by_day / yoy_qoq / moving_avg.
- If an enumish dimension exists (distinct 5~200): generate topn_by_dimension / share_of_total.
- If metric columns exist: generate sum/avg/max, quantiles, and outlier detection (3σ / box plot).
- If a primary/unique key exists: generate deduplication / detail sampling / quality checks.
- If fk_candidates exist: generate both a "join the dimension table" version and a "plain ID" version.
- For high-cardinality enum dimensions: stress LIMIT recommendations and possible performance risks in constraints.notes.
- Besides complete SQL snippets, also emit partial SQL fragments, e.g. WHERE payment_method = 'Credit Card' AND delivery_status = 'Delivered', meaning payment method is credit card and delivery status is delivered.
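The SQL templates this prompt requests use {{name}} placeholders. A minimal rendering sketch (callers pre-format every value, quoting dates and strings but leaving identifiers and integers bare, so substitution itself stays a plain string replace; nothing in this commit prescribes the renderer):

def render(sql_template: str, values: dict) -> str:
    # Callers pre-format each value (quote dates/strings, leave identifiers
    # and integers bare), so substitution is a dumb, predictable replace.
    out = sql_template
    for name, val in values.items():
        out = out.replace("{{" + name + "}}", val)
    return out

print(render(
    "SELECT category, SUM(price*quantity) AS gmv FROM {{table}} "
    "WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}} "
    "GROUP BY category ORDER BY gmv DESC LIMIT {{top_n}};",
    {"table": "ecommerce_orders", "start_date": "'2025-03-01'",
     "end_date": "'2025-03-31'", "top_n": "5"},
))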
277 demo/table-desc.json Normal file
@ -0,0 +1,277 @@
{
  "table": "ecommerce_orders",
  "row_count": 10000,
  "role": "fact",
  "grain": ["order_id"],
  "time": {"column": "order_date", "granularity": "day", "range": ["2024-04-20", "2025-04-19"], "has_gaps": false},
  "columns": [
    {"name": "order_id", "dtype": "string", "semantic_type": "id", "null_rate": 0.0, "distinct_count": 10000, "distinct_ratio": 1.0,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 1.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "customer_id", "dtype": "integer", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 2713, "distinct_ratio": 0.2713,
     "stats": {"min": 1, "max": 2999, "mean": 995.29, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 0.3, "metric_candidate_score": 0.1, "comment": ""},
    {"name": "product_id", "dtype": "integer", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 1000, "distinct_ratio": 0.0999,
     "stats": {"min": 1, "max": 1000, "mean": 504.87, "std": null, "skewness": null},
     "enumish": true, "top_values": [], "pk_candidate_score": 0.1, "metric_candidate_score": 0.1, "comment": ""},
    {"name": "category", "dtype": "string", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 6, "distinct_ratio": 0.0006,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": "Beauty", "pct": null}, {"value": "Books", "pct": null}, {"value": "Clothing", "pct": null}, {"value": "Electronics", "pct": null}, {"value": "Home", "pct": null}, {"value": "Toys", "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "price", "dtype": "float", "semantic_type": "metric", "null_rate": 0.0, "distinct_count": 9013, "distinct_ratio": 0.9013,
     "stats": {"min": 5.06, "max": 499.93, "mean": 252.55, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 0.0, "metric_candidate_score": 0.9, "comment": ""},
    {"name": "quantity", "dtype": "integer", "semantic_type": "metric", "null_rate": 0.0, "distinct_count": 9, "distinct_ratio": 0.0009,
     "stats": {"min": 1, "max": 9, "mean": 2.12, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": 1, "pct": null}, {"value": 2, "pct": null}, {"value": 3, "pct": null}, {"value": 4, "pct": null}, {"value": 5, "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.7, "comment": ""},
    {"name": "order_date", "dtype": "string", "semantic_type": "time", "null_rate": 0.0, "distinct_count": 365, "distinct_ratio": 0.0365,
     "stats": {"min": "2024-04-20", "max": "2025-04-19", "mean": null, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "shipping_date", "dtype": "string", "semantic_type": "time", "null_rate": 0.0, "distinct_count": 371, "distinct_ratio": 0.0371,
     "stats": {"min": "2024-04-21", "max": "2025-04-26", "mean": null, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "delivery_status", "dtype": "string", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 4, "distinct_ratio": 0.0004,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": "Delivered", "pct": null}, {"value": "Pending", "pct": null}, {"value": "Returned", "pct": null}, {"value": "Shipped", "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "payment_method", "dtype": "string", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 5, "distinct_ratio": 0.0005,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": "Apple Pay", "pct": null}, {"value": "Credit Card", "pct": null}, {"value": "Debit Card", "pct": null}, {"value": "Google Pay", "pct": null}, {"value": "PayPal", "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "device_type", "dtype": "string", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 3, "distinct_ratio": 0.0003,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": "Desktop", "pct": null}, {"value": "Mobile", "pct": null}, {"value": "Tablet", "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "channel", "dtype": "string", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 4, "distinct_ratio": 0.0004,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": "Email", "pct": null}, {"value": "Organic", "pct": null}, {"value": "Paid Search", "pct": null}, {"value": "Social", "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "shipping_address", "dtype": "string", "semantic_type": "text", "null_rate": 0.0, "distinct_count": 10000, "distinct_ratio": 1.0,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 0.9, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "billing_address", "dtype": "string", "semantic_type": "text", "null_rate": 0.0, "distinct_count": 10000, "distinct_ratio": 1.0,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": false, "top_values": [], "pk_candidate_score": 0.9, "metric_candidate_score": 0.0, "comment": ""},
    {"name": "customer_segment", "dtype": "string", "semantic_type": "dimension", "null_rate": 0.0, "distinct_count": 3, "distinct_ratio": 0.0003,
     "stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
     "enumish": true,
     "top_values": [{"value": "New", "pct": null}, {"value": "Returning", "pct": null}, {"value": "VIP", "pct": null}],
     "pk_candidate_score": 0.0, "metric_candidate_score": 0.0, "comment": ""}
  ],
  "primary_key_candidates": [["order_id"]],
  "fk_candidates": [
    {"from": "customer_id", "to": "dim_customer(customer_id)", "confidence": 0.9},
    {"from": "product_id", "to": "dim_product(product_id)", "confidence": 0.9}
  ],
  "quality": {"failed_expectations": [], "warning_hints": []},
  "confidence_notes": [
    "表含时间列(order_date, shipping_date)且含度量列(price, quantity),推断为fact表。",
    "order_id唯一性=1.0,确认主键。",
    "order_date日期范围连续无缺口,粒度为日级。",
    "高基数数值字段(price, quantity)符合指标特征。",
    "低熵字段(category, delivery_status, payment_method等)为枚举维度。"
  ]
}
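The selection guidelines in demo/snippet_generator.txt can be applied to this profile mechanically. A sketch (the thresholds mirror the "distinct 5~200" and metric-score hints; they are heuristics rather than hard rules -- channel, with 4 distinct values, still received a share snippet):

import json

profile = json.load(open("demo/table-desc.json", encoding="utf-8"))
enumish_dims = [
    c["name"] for c in profile["columns"]
    if c["semantic_type"] == "dimension" and c["enumish"] and 5 <= c["distinct_count"] <= 200
]
metrics = [c["name"] for c in profile["columns"] if c["metric_candidate_score"] >= 0.7]
print("topn/share candidates:", enumish_dims)  # ['category', 'payment_method']
print("metric columns:", metrics)              # ['price', 'quantity']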
102 demo/user-query.json Normal file
@ -0,0 +1,102 @@
[
  {"question": "近一年每个月的销售额和订单量变化趋势如何?", "intent": "trend_analysis", "related_fields": ["order_date", "price", "quantity"]},
  {"question": "哪个产品类目的GMV最高?", "intent": "topn_category", "related_fields": ["category", "price", "quantity"]},
  {"question": "不同支付方式的订单数量和平均客单价是多少?", "intent": "aggregate_comparison", "related_fields": ["payment_method", "price", "quantity"]},
  {"question": "各营销渠道(如Paid Search、Social)的GMV占比是多少?", "intent": "ratio_analysis", "related_fields": ["channel", "price", "quantity"]},
  {"question": "移动端和桌面端的订单表现差异大吗?", "intent": "device_comparison", "related_fields": ["device_type", "price", "quantity"]},
  {"question": "已送达订单的平均配送时长是多少天?", "intent": "shipping_time_analysis", "related_fields": ["order_date", "shipping_date", "delivery_status"]},
  {"question": "退货(Returned)订单主要集中在哪些产品类目?", "intent": "return_analysis", "related_fields": ["delivery_status", "category"]},
  {"question": "不同客户类型(新客、回头客、VIP)的平均订单金额是多少?", "intent": "segment_analysis", "related_fields": ["customer_segment", "price", "quantity"]},
  {"question": "每个客户的平均下单频率是多少?", "intent": "customer_behavior", "related_fields": ["customer_id", "order_date"]},
  {"question": "近期是否存在价格异常或超高订单?", "intent": "quality_outlier", "related_fields": ["price", "order_date"]},
  {"question": "哪个支付方式的退货率最高?", "intent": "return_ratio_by_payment", "related_fields": ["payment_method", "delivery_status"]},
  {"question": "哪些商品在VIP客户中最受欢迎?", "intent": "vip_product_preference", "related_fields": ["customer_segment", "product_id", "price", "quantity"]},
  {"question": "下单后平均几天发货?", "intent": "shipping_speed", "related_fields": ["order_date", "shipping_date"]},
  {"question": "从哪些渠道来的新用户最多?", "intent": "user_acquisition_channel", "related_fields": ["channel", "customer_segment"]},
  {"question": "订单数量在周末和工作日有什么差异?", "intent": "weekday_pattern", "related_fields": ["order_date"]},
  {"question": "每个设备类型的平均订单金额是多少?", "intent": "device_gmv_comparison", "related_fields": ["device_type", "price", "quantity"]},
  {"question": "本月退货率与上月相比是否上升?", "intent": "return_trend", "related_fields": ["delivery_status", "order_date"]},
  {"question": "哪些客户下单金额最高?", "intent": "top_customers", "related_fields": ["customer_id", "price", "quantity"]},
  {"question": "不同类目的平均客单价(GMV/订单量)是多少?", "intent": "category_avg_order_value", "related_fields": ["category", "price", "quantity"]},
  {"question": "不同渠道的订单平均转化周期(下单到发货)是多少?", "intent": "conversion_cycle", "related_fields": ["channel", "order_date", "shipping_date"]}
]
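A quick sanity check this question set enables: verify each question's related_fields are covered by at least one snippet's required_columns. A sketch (the demo/snippet.json path is assumed):

import json

queries = json.load(open("demo/user-query.json", encoding="utf-8"))
snippets = json.load(open("demo/snippet.json", encoding="utf-8"))  # assumed location
covered_cols = {col for s in snippets for col in s["applicability"]["required_columns"]}

for q in queries:
    missing = set(q["related_fields"]) - covered_cols
    if missing:
        print(q["intent"], "lacks snippet coverage for", sorted(missing))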
BIN file/全国品牌.xlsx Normal file
Binary file not shown.
BIN file/长江电力CYPC概览历史工作票-向家坝-机械类-2024.01.01-2025.06.30.xlsx Normal file
Binary file not shown.
332 ge_v1.py Normal file
@ -0,0 +1,332 @@
"""Great Expectations profiling helper for Excel sources.

This script loads a user-provided Excel file into pandas, profiles it with
Great Expectations, writes a lightweight analysis summary to JSON, and exposes
the path to GE Data Docs for manual inspection.
"""

from __future__ import annotations

import argparse
import json
import os
import shutil
from pathlib import Path
from typing import Any, Dict

import numpy as np
import pandas as pd
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import FileDataContext
from great_expectations.exceptions import (
    DataContextError,
    InvalidDataContextConfigError,
)

try:
    from great_expectations.profile.user_configurable_profiler import (
        UserConfigurableProfiler,
    )
except ImportError:
    try:
        from great_expectations.profiler.user_configurable_profiler import (
            UserConfigurableProfiler,
        )
    except ImportError as err:
        raise ImportError(
            "UserConfigurableProfiler is not available; please install a compatible "
            "version of great_expectations (>=0.15,<0.19) or add the profiling extra."
        ) from err


RESULTS_DIR = Path("results")
DEFAULT_EXCEL_PATH = Path("file") / "全国品牌.xlsx"
DEFAULT_BATCH_ID = "initial_profile"


def parse_cli_args() -> argparse.Namespace:
    """Parse command line options for Excel ingestion."""

    parser = argparse.ArgumentParser(description="Profile an Excel file with GE")
    parser.add_argument(
        "--excel-path",
        type=Path,
        default=DEFAULT_EXCEL_PATH,
        help="Path to the Excel file to analyse (default: ./file/全国品牌.xlsx)",
    )
    parser.add_argument(
        "--sheet-name",
        default=0,
        help="Excel sheet name or index to load (default: 0)",
    )
    parser.add_argument(
        "--header-row",
        type=int,
        default=0,
        help="Row index (0-based) to use as the header (default: 0)",
    )
    parser.add_argument(
        "--clean-results",
        action="store_true",
        help="Remove the previous results directory before running",
    )
    parser.add_argument(
        "--ge-root",
        type=Path,
        default=Path("gx_project"),
        help="Directory to host the Great Expectations project (default: ./gx_project)",
    )
    return parser.parse_args()


def reset_results_dir(clean_results: bool) -> None:
    """Remove prior results folder when requested and ensure directory exists."""

    if clean_results and RESULTS_DIR.exists():
        shutil.rmtree(RESULTS_DIR)
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)


def load_excel_as_dataframe(excel_path: Path, sheet_name: Any, header_row: int) -> pd.DataFrame:
    """Load Excel data into a DataFrame and provide basic logging."""

    if not excel_path.exists():
        raise FileNotFoundError(f"Excel file not found: {excel_path}")

    df = pd.read_excel(excel_path, sheet_name=sheet_name, header=header_row)
    print(f"Loaded Excel data: {excel_path} ({len(df)} rows, {len(df.columns)} columns)")
    return df


def get_datasource_config(datasource_name: str, data_connector_name: str) -> Dict[str, Any]:
    """Assemble a minimal Pandas datasource configuration."""

    return {
        "name": datasource_name,
        "class_name": "Datasource",
        "execution_engine": {"class_name": "PandasExecutionEngine"},
        "data_connectors": {
            data_connector_name: {
                "class_name": "RuntimeDataConnector",
                "runtime_keys": ["batch_id"],
            }
        },
    }


def clean_value(value: Any) -> Any:
    """Convert numpy/pandas scalar types into JSON serialisable values."""

    # NaN/NaT must be handled before the numpy branch, otherwise
    # np.float64("nan").item() would leak a bare NaN into the JSON payload.
    if pd.isna(value):
        return None
    if isinstance(value, (np.generic,)):
        return value.item()
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    return value


def build_column_profile(series: pd.Series) -> Dict[str, Any]:
    """Generate a compact per-column profile for JSON output."""

    stats = series.describe()
    profiled_stats = {key: clean_value(val) for key, val in stats.items()}

    return {
        "name": str(series.name),
        "dtype": str(series.dtype),
        "non_null_count": int(series.count()),
        "null_count": int(series.isna().sum()),
        "unique_count": int(series.nunique(dropna=True)),
        "stats": profiled_stats,
    }


def build_analysis_summary(df: pd.DataFrame, sample_size: int = 5) -> Dict[str, Any]:
    """Collate basic statistics to accompany GE outputs."""

    summary = {
        "shape": {"rows": int(df.shape[0]), "columns": int(df.shape[1])},
        "columns": [build_column_profile(df[col]) for col in df.columns],
        "sample_rows": [
            {key: clean_value(value) for key, value in row.items()}
            for row in df.head(sample_size).to_dict(orient="records")
        ],
    }
    return summary


def serialize_batch_request(batch_request: Any) -> Dict[str, Any]:
    """Convert differing batch request types into plain dictionaries."""

    if hasattr(batch_request, "to_json_dict"):
        return batch_request.to_json_dict()
    if hasattr(batch_request, "dict"):
        return batch_request.dict()
    if hasattr(batch_request, "model_dump"):
        return batch_request.model_dump()
    return {"repr": repr(batch_request)}


def ensure_data_context(ge_root: Path) -> gx.DataContext:
    """Create or repair a file-backed GE data context as needed."""

    ge_root = ge_root.resolve()
    config_path = ge_root / "gx" / "great_expectations.yml"
    if not config_path.exists():
        FileDataContext.create(project_root_dir=str(ge_root))

    try:
        return gx.get_context(project_root_dir=str(ge_root))
    except InvalidDataContextConfigError:
        print("Existing Great Expectations config invalid; recreating project root.")
        shutil.rmtree(ge_root, ignore_errors=True)
        FileDataContext.create(project_root_dir=str(ge_root))
        return gx.get_context(project_root_dir=str(ge_root))


def run_ge_profiling(
    context: gx.DataContext,
    df: pd.DataFrame,
    datasource_name: str,
    data_connector_name: str,
    data_asset_name: str,
    expectation_suite_name: str,
) -> Dict[str, Any]:
    """Register datasource, build expectations, and capture validation results."""

    if hasattr(context, "sources"):
        datasource = context.sources.add_or_update_pandas(name=datasource_name)
        try:
            datasource.delete_asset(data_asset_name)
        except (gx.exceptions.DataConnectorError, ValueError, KeyError, LookupError, AttributeError):
            pass
        asset = datasource.add_dataframe_asset(name=data_asset_name)
        batch_request = asset.build_batch_request(dataframe=df)
        print(f"Datasource registered (fluent): {datasource_name}")
    else:
        datasource_config = get_datasource_config(datasource_name, data_connector_name)
        try:
            context.add_datasource(**datasource_config)
            print(f"Datasource registered: {datasource_name}")
        except gx.exceptions.GreatExpectationsError as err:
            print(f"Datasource registration issue: {err}")
        batch_request = RuntimeBatchRequest(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            runtime_parameters={"batch_data": df},
            batch_identifiers={"batch_id": DEFAULT_BATCH_ID},
        )

    try:
        context.delete_expectation_suite(expectation_suite_name=expectation_suite_name)
    except DataContextError:
        pass

    if hasattr(context, "create_expectation_suite"):
        context.create_expectation_suite(
            expectation_suite_name=expectation_suite_name, overwrite_existing=True
        )
    else:
        context.add_expectation_suite(expectation_suite_name=expectation_suite_name)

    validator = context.get_validator(
        batch_request=batch_request, expectation_suite_name=expectation_suite_name
    )
    profiler = UserConfigurableProfiler(profile_dataset=validator)
    expectation_suite = profiler.build_suite()
    context.add_or_update_expectation_suite(expectation_suite=expectation_suite)
    validation_result = validator.validate(result_format="SUMMARY")

    context.build_data_docs()
    data_docs_path = (
        Path(context.root_directory)
        / "uncommitted"
        / "data_docs"
        / "local_site"
        / "index.html"
    )

    print(
        f"Expectation suite saved: {expectation_suite_name} ({len(expectation_suite.expectations)} expectations)"
    )

    return {
        "batch_request": serialize_batch_request(batch_request),
        "expectation_suite_name": expectation_suite_name,
        "expectations_count": len(expectation_suite.expectations),
        "validation_result": validation_result.to_json_dict(),
        "data_docs_path": os.path.abspath(data_docs_path),
    }


def assemble_payload(
    excel_path: Path,
    sheet_name: Any,
    dataframe_summary: Dict[str, Any],
    ge_summary: Dict[str, Any],
) -> Dict[str, Any]:
    """Combine pandas and GE artefacts into a single JSON payload."""

    return {
        "source": {
            "excel_path": str(excel_path.resolve()),
            "sheet_name": sheet_name,
        },
        "analysis": dataframe_summary,
        "great_expectations": ge_summary,
    }


def save_json_payload(payload: Dict[str, Any], output_path: Path) -> None:
    """Persist the combined analysis payload to disk."""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"JSON analysis saved to: {output_path}")


def main() -> None:
    args = parse_cli_args()
    reset_results_dir(clean_results=args.clean_results)

    context = ensure_data_context(args.ge_root)
    print(f"Great Expectations Data Context initialized at {context.root_directory}.")

    df = load_excel_as_dataframe(args.excel_path, args.sheet_name, args.header_row)
    dataframe_summary = build_analysis_summary(df)

    file_stem = args.excel_path.stem
    datasource_name = f"{file_stem}_datasource"
    data_connector_name = "runtime_data_connector"
    data_asset_name = f"{file_stem}_asset"
    expectation_suite_name = f"{file_stem}_suite"

    ge_summary = run_ge_profiling(
        context,
        df,
        datasource_name,
        data_connector_name,
        data_asset_name,
        expectation_suite_name,
    )

    payload = assemble_payload(
        excel_path=args.excel_path,
        sheet_name=args.sheet_name,
        dataframe_summary=dataframe_summary,
        ge_summary=ge_summary,
    )

    output_path = RESULTS_DIR / f"{file_stem}_analysis.json"
    save_json_payload(payload, output_path)

    print(
        f"Data Docs generated. Open in browser: file://{ge_summary['data_docs_path']}"
    )


if __name__ == "__main__":
    main()
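The helpers above can also be used programmatically instead of via the CLI; a sketch assuming the default Excel file is present under file/:

from pathlib import Path

import ge_v1

df = ge_v1.load_excel_as_dataframe(Path("file") / "全国品牌.xlsx", sheet_name=0, header_row=0)
summary = ge_v1.build_analysis_summary(df, sample_size=3)
print(summary["shape"], [col["name"] for col in summary["columns"]])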
104 ge_v2.py Normal file
@ -0,0 +1,104 @@
import great_expectations as gx
from datasets import load_dataset
import pandas as pd
import os
import webbrowser
from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)

# --- 1. Load the Hugging Face dataset and convert it to a pandas DataFrame ---
print("🚚 1. Loading the 'millat/e-commerce-orders' dataset from Hugging Face...")
# Load only the train split of the dataset.
hf_dataset = load_dataset("millat/e-commerce-orders", split="train")
# Convert to a pandas DataFrame, the format GX works with most commonly.
df = hf_dataset.to_pandas()
print(f"✅ Dataset loaded: {len(df)} rows.")
print("\n📝 Preview of the first 5 rows:")
print(df.head())


# --- 2. Initialize the Great Expectations (GX) project ---
# Creates or reuses a GX project context in the current working directory,
# which holds all GX configuration and results.
print("\n🏗️ 2. Initializing the Great Expectations project...")
context = gx.get_context()
print("✅ GX project context created.")


# --- 3. Add a datasource and connect the DataFrame to GX ---
# Register the pandas DataFrame as a datasource so GX knows how to access it.
print("\n🔗 3. Adding the DataFrame as a GX datasource...")
datasource_name = "my_ecommerce_datasource"
# Fluent API: add_or_update keeps repeated script runs from erroring out.
datasource = context.sources.add_or_update_pandas(name=datasource_name)

data_asset_name = "orders_table"
data_asset = datasource.add_dataframe_asset(name=data_asset_name, dataframe=df)
print("✅ Datasource and data asset configured.")


# --- 4. Generate an expectation suite with the automatic profiler ---
print("\n🔍 4. Scanning the data with the automatic profiler to generate expectations...")
# Build a batch request that tells GX which data asset to process.
batch_request = data_asset.build_batch_request()

# Name of the expectation suite.
expectation_suite_name = "ecommerce_profiling_suite"
# Create the expectation suite, or fetch it if it already exists.
try:
    suite = context.get_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f" - Found existing expectation suite '{expectation_suite_name}'.")
except gx.exceptions.DataContextError:
    suite = context.add_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f" - Created new expectation suite '{expectation_suite_name}'.")

# Build a validator for the profiler to consume.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
# Core step: UserConfigurableProfiler analyses the data and creates expectations.
profiler = UserConfigurableProfiler(profile_dataset=validator)
suite = profiler.build_suite()
# Save the expectation suite produced by the profiler.
context.save_expectation_suite(expectation_suite=suite, expectation_suite_name=expectation_suite_name)
print("✅ Automatic profiling finished; expectations generated and saved.")


# --- 5. Create and run a checkpoint to validate the data ---
print("\n🛡️ 5. Creating and running a checkpoint to validate the data...")
checkpoint_name = "ecommerce_profiling_checkpoint"
try:
    # Check whether the checkpoint already exists.
    checkpoint = context.get_checkpoint(name=checkpoint_name)
    print(f" - Loaded existing checkpoint '{checkpoint_name}'.")
except gx.exceptions.CheckpointNotFoundError:
    # If not, create a new one.
    checkpoint_config = {
        "name": checkpoint_name,
        "validations": [
            {
                "batch_request": batch_request,
                "expectation_suite_name": expectation_suite_name,
            }
        ],
    }
    context.add_or_update_checkpoint(**checkpoint_config)
    checkpoint = context.get_checkpoint(name=checkpoint_name)
    print(f" - Created new checkpoint '{checkpoint_name}'.")

# Run the checkpoint, comparing the data against the freshly generated suite.
checkpoint_result = checkpoint.run()
print("✅ Checkpoint run complete; data validation finished.")


# --- 6. Build and open Data Docs to inspect the results ---
print("\n📊 6. Building and opening Data Docs for the profiling report...")
# This renders an HTML report.
context.build_data_docs()

# Resolve the Data Docs path and open it in the browser automatically.
docs_path = os.path.join(context.root_directory, "uncommitted", "data_docs", "local_site", "index.html")
print(f"\n🎉 Profiling report generated! Open it in a browser:\nfile://{os.path.abspath(docs_path)}")

webbrowser.open(f"file://{os.path.abspath(docs_path)}")
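If a pass/fail digest of the validation is wanted, a sketch that could be appended to the end of the script above (the nested keys follow the usual GE CheckpointResult JSON layout; treat them as an assumption for this pinned version):

# Continuation sketch: summarise checkpoint_result from the script above.
summary = checkpoint_result.to_json_dict()
for run in summary["run_results"].values():
    stats = run["validation_result"]["statistics"]
    print(f"{stats['successful_expectations']}/{stats['evaluated_expectations']} expectations passed")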
11 requirements.txt Normal file
@ -0,0 +1,11 @@
fastapi>=0.111.0
uvicorn[standard]>=0.29.0
pydantic>=2.6.0
sqlalchemy>=2.0.28
pymysql>=1.1.0
great_expectations>=0.18.0,<0.19.0
pandas>=2.0
numpy>=1.24
openpyxl>=3.1
httpx==0.27.2
python-dotenv==1.0.1
BIN scripts/__pycache__/deepseek_request.cpython-312.pyc Normal file
Binary file not shown.
110 scripts/deepseek_request.py Normal file
@ -0,0 +1,110 @@
from __future__ import annotations

import argparse
import asyncio
import json
from typing import Any, Dict

import httpx


DEFAULT_URL = "http://127.0.0.1:8000/v1/chat/completions"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Send a DeepSeek chat completion request to the local LLM gateway."
    )
    parser.add_argument(
        "--url",
        default=DEFAULT_URL,
        help=f"Gateway endpoint URL (default: {DEFAULT_URL})",
    )
    parser.add_argument(
        "--model",
        default="deepseek-chat",
        help="DeepSeek model to use (default: deepseek-chat).",
    )
    parser.add_argument(
        "--system",
        default="You are a helpful assistant.",
        help="Optional system prompt.",
    )
    parser.add_argument(
        "--prompt",
        default="写一段简短的中文欢迎词。",
        help="User message content to send.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Sampling temperature.",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens for the response.",
    )
    parser.add_argument(
        "--stream",
        action="store_true",
        help="Enable streaming mode (DeepSeek supports it).",
    )
    parser.add_argument(
        "--extra",
        help="Optional JSON string with extra provider parameters.",
    )
    return parser.parse_args()


async def send_request(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
        response = await client.post(url, json=payload)
        response.raise_for_status()
        return response.json()


def build_payload(args: argparse.Namespace) -> Dict[str, Any]:
    extra_params = None
    if args.extra:
        try:
            extra_params = json.loads(args.extra)
        except json.JSONDecodeError as exc:
            raise SystemExit(f"Invalid JSON passed to --extra: {exc}") from exc

    payload: Dict[str, Any] = {
        "provider": "deepseek",
        "model": args.model,
        "messages": [
            {"role": "system", "content": args.system},
            {"role": "user", "content": args.prompt},
        ],
        "temperature": args.temperature,
        "max_tokens": args.max_tokens,
        "stream": args.stream,
    }

    if extra_params:
        payload["extra_params"] = extra_params

    return payload


async def main() -> None:
    args = parse_args()
    payload = build_payload(args)
    try:
        result = await send_request(args.url, payload)
    except httpx.HTTPStatusError as exc:
        detail = exc.response.text
        raise SystemExit(f"Gateway returned {exc.response.status_code}: {detail}") from exc
    except httpx.HTTPError as exc:
        raise SystemExit(f"HTTP error calling gateway: {exc}") from exc

    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    asyncio.run(main())
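For reference, the request body build_payload() assembles for the defaults above (importing the module this way assumes scripts/ is on the import path, as the committed __pycache__ suggests it was):

from argparse import Namespace

from scripts.deepseek_request import build_payload

args = Namespace(model="deepseek-chat", system="You are a helpful assistant.",
                 prompt="写一段简短的中文欢迎词。", temperature=0.7,
                 max_tokens=512, stream=False, extra=None)
print(build_payload(args))
# {'provider': 'deepseek', 'model': 'deepseek-chat', 'messages': [...],
#  'temperature': 0.7, 'max_tokens': 512, 'stream': False}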
19 todo.md Normal file
@ -0,0 +1,19 @@
## Tech stack
Python + Great Expectations

## Design principles
1. Service-oriented: wrap GE's capabilities in a standalone microservice that exposes data-quality definition, validation, and reporting over a RESTful API.
2. Configuration-driven: all expectations, datasource connections, and checkpoints are configurable, with both default and custom versions of each expectation.
3. Validation-focused: use GE to surface validation problems; fixing them is left to later data cleaning and editing steps.
4. Asynchronous: handle multiple analysis tasks concurrently without blocking the pipeline.
5. Incremental analysis: data is analysed and revised several times before it is fit for use.

## Architecture
Build the RESTful API service with FastAPI: high performance, built-in OpenAPI (Swagger UI) docs, and strong async support.
1. Analysis workflow management
Parse API requests and manage the analysis-task lifecycle asynchronously: accept request -> schedule service -> store report -> trigger notification.
2. GE wrapper
Manage Datasources, Expectation Suites, and Checkpoints through programmatic configuration; run data profiling and data validation.
3. Expectation store and result storage
Store expectations in a Git repository (MySQL? expectations are JSON) for version control, auditing, and collaboration on data-quality rules; persist each validation result in structured form to a database for trend analysis, historical traceability, and dashboards.
4. LLM API gateway service
LLM API calls behind a unified multi-provider gateway (switch between OpenAI / Anthropic / OpenRouter / Gemini / Qwen / DeepSeek with one setting); see the registry sketch below.
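A minimal sketch for item 4's provider switch: one registry keyed by provider name, so changing providers is a single request field or config value (base URLs and env-var names here are illustrative assumptions, not verified configuration):

import os

# Hypothetical registry; entries are illustrative, not verified configuration.
PROVIDERS = {
    "deepseek": {"base_url": "https://api.deepseek.com", "env_key": "DEEPSEEK_API_KEY"},
    "openai": {"base_url": "https://api.openai.com/v1", "env_key": "OPENAI_API_KEY"},
    "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env_key": "OPENROUTER_API_KEY"},
}

def resolve_provider(name: str) -> tuple:
    """Return (base_url, api_key) for a provider, failing loudly when unconfigured."""
    cfg = PROVIDERS.get(name)
    if cfg is None:
        raise ValueError(f"unknown provider: {name}")
    api_key = os.environ.get(cfg["env_key"], "")
    if not api_key:
        raise RuntimeError(f"{cfg['env_key']} is not set")
    return cfg["base_url"], api_key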