init,llm gateway & import_analyse
332 ge_v1.py Normal file
@@ -0,0 +1,332 @@
"""Great Expectations profiling helper for Excel sources.

This script loads a user-provided Excel file into pandas, profiles it with
Great Expectations, writes a lightweight analysis summary to JSON, and exposes
the path to GE Data Docs for manual inspection.
"""
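
# Example invocation (a sketch; the flags are those defined in
# parse_cli_args() below, and the default workbook is expected at
# ./file/全国品牌.xlsx):
#
#   python ge_v1.py --excel-path file/全国品牌.xlsx --sheet-name 0 --clean-results
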
from __future__ import annotations

import argparse
import json
import os
import shutil
from pathlib import Path
from typing import Any, Dict

import numpy as np
import pandas as pd
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import FileDataContext
from great_expectations.exceptions import (
    DataContextError,
    InvalidDataContextConfigError,
)

# UserConfigurableProfiler has moved between module paths across
# great_expectations releases; try both known locations before failing.
try:
    from great_expectations.profile.user_configurable_profiler import (
        UserConfigurableProfiler,
    )
except ImportError:
    try:
        from great_expectations.profiler.user_configurable_profiler import (
            UserConfigurableProfiler,
        )
    except ImportError as err:
        raise ImportError(
            "UserConfigurableProfiler is not available; please install a compatible "
            "version of great_expectations (>=0.15,<0.19) or add the profiling extra."
        ) from err


RESULTS_DIR = Path("results")
DEFAULT_EXCEL_PATH = Path("file") / "全国品牌.xlsx"  # "全国品牌" means "nationwide brands"
DEFAULT_BATCH_ID = "initial_profile"


def parse_cli_args() -> argparse.Namespace:
    """Parse command line options for Excel ingestion."""

    parser = argparse.ArgumentParser(description="Profile an Excel file with GE")
    parser.add_argument(
        "--excel-path",
        type=Path,
        default=DEFAULT_EXCEL_PATH,
        help="Path to the Excel file to analyse (default: ./file/全国品牌.xlsx)",
    )
    parser.add_argument(
        "--sheet-name",
        default=0,
        help="Excel sheet name or index to load (default: 0)",
    )
    parser.add_argument(
        "--header-row",
        type=int,
        default=0,
        help="Row index (0-based) to use as the header (default: 0)",
    )
    parser.add_argument(
        "--clean-results",
        action="store_true",
        help="Remove the previous results directory before running",
    )
    parser.add_argument(
        "--ge-root",
        type=Path,
        default=Path("gx_project"),
        help="Directory to host the Great Expectations project (default: ./gx_project)",
    )
    return parser.parse_args()


def reset_results_dir(clean_results: bool) -> None:
    """Remove prior results folder when requested and ensure directory exists."""

    if clean_results and RESULTS_DIR.exists():
        shutil.rmtree(RESULTS_DIR)
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)


def load_excel_as_dataframe(excel_path: Path, sheet_name: Any, header_row: int) -> pd.DataFrame:
    """Load Excel data into a DataFrame and provide basic logging."""

    if not excel_path.exists():
        raise FileNotFoundError(f"Excel file not found: {excel_path}")

    # argparse delivers --sheet-name as a string, so recover the integer
    # form when the user passed a sheet index rather than a sheet name.
    if isinstance(sheet_name, str) and sheet_name.isdigit():
        sheet_name = int(sheet_name)

    df = pd.read_excel(excel_path, sheet_name=sheet_name, header=header_row)
    print(f"Loaded Excel data: {excel_path} ({len(df)} rows, {len(df.columns)} columns)")
    return df


def get_datasource_config(datasource_name: str, data_connector_name: str) -> Dict[str, Any]:
    """Assemble a minimal Pandas datasource configuration."""

    return {
        "name": datasource_name,
        "class_name": "Datasource",
        "execution_engine": {"class_name": "PandasExecutionEngine"},
        "data_connectors": {
            data_connector_name: {
                "class_name": "RuntimeDataConnector",
                # RuntimeDataConnector takes "batch_identifiers" here; the
                # old "runtime_keys" spelling is rejected by the GE versions
                # this script supports.
                "batch_identifiers": ["batch_id"],
            }
        },
    }


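# Note: this block-config shape is only used on the legacy (pre-fluent) code
# path; run_ge_profiling() pairs it with a RuntimeBatchRequest whose
# batch_identifiers={"batch_id": ...} must match the connector keys above.
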
def clean_value(value: Any) -> Any:
    """Convert numpy/pandas scalar types into JSON serialisable values."""

    # Check for missing values first: NaN is itself an np.generic, so testing
    # np.generic first would leak float("nan") into the JSON output.
    if pd.isna(value):
        return None
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    return value


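# Illustrative conversions for clean_value() above (hypothetical inputs):
#   clean_value(np.int64(7))                 -> 7
#   clean_value(pd.Timestamp("2024-01-01"))  -> "2024-01-01T00:00:00"
#   clean_value(np.nan)                      -> None
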
def build_column_profile(series: pd.Series) -> Dict[str, Any]:
    """Generate a compact per-column profile for JSON output."""

    stats = series.describe()
    profiled_stats = {key: clean_value(val) for key, val in stats.items()}

    return {
        "name": str(series.name),
        "dtype": str(series.dtype),
        "non_null_count": int(series.count()),
        "null_count": int(series.isna().sum()),
        "unique_count": int(series.nunique(dropna=True)),
        "stats": profiled_stats,
    }


def build_analysis_summary(df: pd.DataFrame, sample_size: int = 5) -> Dict[str, Any]:
    """Collate basic statistics to accompany GE outputs."""

    summary = {
        "shape": {"rows": int(df.shape[0]), "columns": int(df.shape[1])},
        "columns": [build_column_profile(df[col]) for col in df.columns],
        "sample_rows": [
            {key: clean_value(value) for key, value in row.items()}
            for row in df.head(sample_size).to_dict(orient="records")
        ],
    }
    return summary


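# Shape of the summary returned by build_analysis_summary() above
# (key names come from the builders; the values shown are illustrative):
#   {"shape": {"rows": 120, "columns": 8},
#    "columns": [{"name": "...", "dtype": "...", "non_null_count": 118, ...}],
#    "sample_rows": [{...}, ...]}
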
def serialize_batch_request(batch_request: Any) -> Dict[str, Any]:
    """Convert differing batch request types into plain dictionaries."""

    if hasattr(batch_request, "to_json_dict"):  # classic GE batch requests
        return batch_request.to_json_dict()
    if hasattr(batch_request, "model_dump"):  # pydantic v2 models
        return batch_request.model_dump()
    if hasattr(batch_request, "dict"):  # pydantic v1 models
        return batch_request.dict()
    return {"repr": repr(batch_request)}


def ensure_data_context(ge_root: Path) -> gx.DataContext:
    """Create or repair a file-backed GE data context as needed."""

    ge_root = ge_root.resolve()
    config_path = ge_root / "gx" / "great_expectations.yml"
    if not config_path.exists():
        FileDataContext.create(project_root_dir=str(ge_root))

    try:
        return gx.get_context(project_root_dir=str(ge_root))
    except InvalidDataContextConfigError:
        # A half-written or stale config cannot be loaded; start the project over.
        print("Existing Great Expectations config invalid; recreating project root.")
        shutil.rmtree(ge_root, ignore_errors=True)
        FileDataContext.create(project_root_dir=str(ge_root))
        return gx.get_context(project_root_dir=str(ge_root))


def run_ge_profiling(
    context: gx.DataContext,
    df: pd.DataFrame,
    datasource_name: str,
    data_connector_name: str,
    data_asset_name: str,
    expectation_suite_name: str,
) -> Dict[str, Any]:
    """Register datasource, build expectations, and capture validation results."""

    if hasattr(context, "sources"):
        # Fluent API (newer GE): register the in-memory DataFrame as a named asset.
        datasource = context.sources.add_or_update_pandas(name=datasource_name)
        try:
            datasource.delete_asset(data_asset_name)
        except (gx.exceptions.DataConnectorError, ValueError, KeyError, LookupError, AttributeError):
            pass
        asset = datasource.add_dataframe_asset(name=data_asset_name)
        batch_request = asset.build_batch_request(dataframe=df)
        print(f"Datasource registered (fluent): {datasource_name}")
    else:
        # Legacy block-config API: register the datasource, then hand the
        # DataFrame over via a RuntimeBatchRequest.
        datasource_config = get_datasource_config(datasource_name, data_connector_name)
        try:
            context.add_datasource(**datasource_config)
            print(f"Datasource registered: {datasource_name}")
        except gx.exceptions.GreatExpectationsError as err:
            print(f"Datasource registration issue: {err}")
        batch_request = RuntimeBatchRequest(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            runtime_parameters={"batch_data": df},
            batch_identifiers={"batch_id": DEFAULT_BATCH_ID},
        )

    try:
        context.delete_expectation_suite(expectation_suite_name=expectation_suite_name)
    except DataContextError:
        pass

    if hasattr(context, "create_expectation_suite"):
        context.create_expectation_suite(
            expectation_suite_name=expectation_suite_name, overwrite_existing=True
        )
    else:
        context.add_expectation_suite(expectation_suite_name=expectation_suite_name)

    validator = context.get_validator(
        batch_request=batch_request, expectation_suite_name=expectation_suite_name
    )
    profiler = UserConfigurableProfiler(profile_dataset=validator)
    expectation_suite = profiler.build_suite()
    if hasattr(context, "add_or_update_expectation_suite"):
        context.add_or_update_expectation_suite(expectation_suite=expectation_suite)
    else:
        # Older contexts lack add_or_update_*; fall back to a plain save.
        context.save_expectation_suite(expectation_suite=expectation_suite)
    validation_result = validator.validate(result_format="SUMMARY")

    context.build_data_docs()
    data_docs_path = (
        Path(context.root_directory)
        / "uncommitted"
        / "data_docs"
        / "local_site"
        / "index.html"
    )

    print(
        f"Expectation suite saved: {expectation_suite_name} ({len(expectation_suite.expectations)} expectations)"
    )

    return {
        "batch_request": serialize_batch_request(batch_request),
        "expectation_suite_name": expectation_suite_name,
        "expectations_count": len(expectation_suite.expectations),
        "validation_result": validation_result.to_json_dict(),
        "data_docs_path": os.path.abspath(data_docs_path),
    }


def assemble_payload(
    excel_path: Path,
    sheet_name: Any,
    dataframe_summary: Dict[str, Any],
    ge_summary: Dict[str, Any],
) -> Dict[str, Any]:
    """Combine pandas and GE artefacts into a single JSON payload."""

    return {
        "source": {
            "excel_path": str(excel_path.resolve()),
            "sheet_name": sheet_name,
        },
        "analysis": dataframe_summary,
        "great_expectations": ge_summary,
    }


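# The payload assembled above has three top-level keys: "source", "analysis",
# and "great_expectations"; main() writes it to results/<stem>_analysis.json.
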
def save_json_payload(payload: Dict[str, Any], output_path: Path) -> None:
    """Persist the combined analysis payload to disk."""

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"JSON analysis saved to: {output_path}")


def main() -> None:
    args = parse_cli_args()
    reset_results_dir(clean_results=args.clean_results)

    context = ensure_data_context(args.ge_root)
    print(f"Great Expectations Data Context initialized at {context.root_directory}.")

    df = load_excel_as_dataframe(args.excel_path, args.sheet_name, args.header_row)
    dataframe_summary = build_analysis_summary(df)

    file_stem = args.excel_path.stem
    datasource_name = f"{file_stem}_datasource"
    data_connector_name = "runtime_data_connector"
    data_asset_name = f"{file_stem}_asset"
    expectation_suite_name = f"{file_stem}_suite"

    ge_summary = run_ge_profiling(
        context,
        df,
        datasource_name,
        data_connector_name,
        data_asset_name,
        expectation_suite_name,
    )

    payload = assemble_payload(
        excel_path=args.excel_path,
        sheet_name=args.sheet_name,
        dataframe_summary=dataframe_summary,
        ge_summary=ge_summary,
    )

    output_path = RESULTS_DIR / f"{file_stem}_analysis.json"
    save_json_payload(payload, output_path)

    print(
        f"Data Docs generated. Open in browser: file://{ge_summary['data_docs_path']}"
    )


if __name__ == "__main__":
    main()