table profiling功能开发

This commit is contained in:
zhaoawd
2025-11-03 00:18:26 +08:00
parent 557efc4bf1
commit c2a08e4134
6 changed files with 1280 additions and 16 deletions

View File

@ -1,5 +1,6 @@
from __future__ import annotations
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union
@ -135,3 +136,89 @@ class DataImportAnalysisJobRequest(BaseModel):
class DataImportAnalysisJobAck(BaseModel):
    # Acknowledgement model for a data import analysis job.
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model docstring into the generated JSON schema,
    # which would change the schema this model currently emits.

    # Required field; carries the import record identifier back to the caller.
    import_record_id: str = Field(
        default=...,
        description="Echo of the import record identifier",
    )

    # May be omitted on construction, in which case it is "accepted".
    status: str = Field(
        default="accepted",
        description="Processing status acknowledgement.",
    )
class TableProfilingJobRequest(BaseModel):
    # Request model describing one table profiling job.
    #
    # Only table_id, version_ts and callback_url are required; every other
    # field has a default (None, or "user_configurable" for ge_profiler_type).
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model docstring into the generated JSON schema,
    # which would change the schema this model currently emits.

    # --- Job identity and callback (all required) -------------------------
    table_id: str = Field(
        default=...,
        description="Unique identifier for the table to profile.",
    )
    # The pattern enforces exactly fourteen digits; it does not check that
    # the digits form a valid calendar date/time.
    version_ts: str = Field(
        default=...,
        description="Version timestamp expressed as fourteen digit string (yyyyMMddHHmmss).",
        pattern=r"^\d{14}$",
    )
    # HttpUrl is imported outside this block.
    callback_url: HttpUrl = Field(
        default=...,
        description="Callback endpoint invoked after each pipeline action completes.",
    )

    # --- Schema snapshot --------------------------------------------------
    # Typed as Any, so no structure is enforced on the snapshot here.
    table_schema: Optional[Any] = Field(
        default=None,
        description="Schema structure snapshot for the current table version.",
    )
    table_schema_version_id: Optional[str] = Field(
        default=None,
        description="Identifier for the schema snapshot provided in table_schema.",
    )

    # --- Locating and accessing the source table --------------------------
    # Free-form dicts: the keys shown in the description are an example only
    # and are not validated by this model.
    table_link_info: Optional[Dict[str, Any]] = Field(
        default=None,
        description=(
            "Information describing how to locate the source table for profiling. "
            "For example: {'type': 'sql', "
            "'connection_string': 'mysql+pymysql://user:pass@host/db', "
            "'table': 'schema.table_name'}."
        ),
    )
    table_access_info: Optional[Dict[str, Any]] = Field(
        default=None,
        description=(
            "Credentials or supplemental parameters required to access the table "
            "described in table_link_info. "
            "These values can be merged into the connection string using Python "
            "format placeholders."
        ),
    )

    # --- Great Expectations (GE) settings ---------------------------------
    ge_batch_request: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional Great Expectations batch request payload used for profiling.",
    )
    ge_expectation_suite_name: Optional[str] = Field(
        default=None,
        description="Expectation suite name used during profiling. Created automatically when absent.",
    )
    ge_data_context_root: Optional[str] = Field(
        default=None,
        description="Custom root directory for the Great Expectations data context. Defaults to project ./gx.",
    )
    ge_datasource_name: Optional[str] = Field(
        default=None,
        description="Datasource name registered inside the GE context when batch_request is not supplied.",
    )
    ge_data_asset_name: Optional[str] = Field(
        default=None,
        description="Data asset reference used when inferring batch request from datasource configuration.",
    )
    # Plain str: the two identifiers named in the description are not
    # enforced by this model, so any string passes validation here.
    ge_profiler_type: str = Field(
        default="user_configurable",
        description="Profiler implementation identifier. Currently supports 'user_configurable' or 'data_assistant'.",
    )

    # --- LLM model selection ----------------------------------------------
    # llm_model is described as the default; the three fields after it are
    # described as per-action overrides (actions 2, 3 and 4).
    llm_model: Optional[str] = Field(
        default=None,
        description="Default LLM model spec applied to prompt-based actions when overrides are omitted.",
    )
    result_desc_model: Optional[str] = Field(
        default=None,
        description="LLM model override used for GE result description (action 2).",
    )
    snippet_model: Optional[str] = Field(
        default=None,
        description="LLM model override used for snippet generation (action 3).",
    )
    snippet_alias_model: Optional[str] = Field(
        default=None,
        description="LLM model override used for snippet alias enrichment (action 4).",
    )

    # --- Miscellaneous ----------------------------------------------------
    extra_options: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Miscellaneous execution flags applied across pipeline steps.",
    )
class TableProfilingJobAck(BaseModel):
    # Acknowledgement model for a table profiling job.
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model docstring into the generated JSON schema,
    # which would change the schema this model currently emits.

    # Required; carries the table identifier back to the caller.
    table_id: str = Field(
        default=...,
        description="Echo of the table identifier.",
    )

    # Required; a plain str with no format constraint applied in this model.
    version_ts: str = Field(
        default=...,
        description="Echo of the profiling version timestamp (yyyyMMddHHmmss).",
    )

    # May be omitted on construction, in which case it is "accepted".
    status: str = Field(
        default="accepted",
        description="Processing acknowledgement status.",
    )