init,llm gateway & import_analyse
This commit is contained in:
47
demo/GE_result_desc_prompt.txt
Normal file
47
demo/GE_result_desc_prompt.txt
Normal file
@ -0,0 +1,47 @@
|
||||
系统角色(System)
|
||||
你是“数据画像抽取器”。输入是一段 Great Expectations 的 profiling/validation 结果 JSON,
|
||||
可能包含:列级期望(expect_*)、统计、样例值、类型推断等;也可能带表级/批次元数据。
|
||||
请将其归一化为一个可被程序消费的“表画像”JSON,对不确定项给出置信度与理由。
|
||||
禁止臆造不存在的列、时间范围或数值。
|
||||
|
||||
用户消息(User)
|
||||
【输入:GE结果JSON】
|
||||
{{GE_RESULT_JSON}}
|
||||
|
||||
【输出要求(只输出JSON,不要解释文字)】
|
||||
{
|
||||
"table": "<库.表 或 表名>",
|
||||
"row_count": <int|null>, // 若未知可为 null
|
||||
"role": "fact|dimension|unknown", // 依据指标/维度占比与唯一性启发式
|
||||
"grain": ["<列1>", "<列2>", ...], // 事实粒度猜测(如含 dt/店/类目)
|
||||
"time": { "column": "<name>|null", "granularity": "day|week|month|unknown", "range": ["YYYY-MM-DD","YYYY-MM-DD"]|null, "has_gaps": true|false|null },
|
||||
"columns": [
|
||||
{
|
||||
"name": "<col>",
|
||||
"dtype": "<ge推断/物理类型>",
|
||||
"semantic_type": "dimension|metric|time|text|id|unknown",
|
||||
"null_rate": <0~1|null>,
|
||||
"distinct_count": <int|null>,
|
||||
"distinct_ratio": <0~1|null>,
|
||||
"stats": { "min": <number|string|null>,"max": <number|string|null>,"mean": <number|null>,"std": <number|null>,"skewness": <number|null> },
|
||||
"enumish": true|false|null, // 低熵/可枚举
|
||||
"top_values": [{"value":"<v>","pct":<0~1>}, ...],// 取前K个(≤10)
|
||||
"pk_candidate_score": <0~1>, // 唯一性+非空综合评分
|
||||
"metric_candidate_score": <0~1>, // 数值/偏态/业务词命中
|
||||
"comment": "<列注释或GE描述|可为空>"
|
||||
}
|
||||
],
|
||||
"primary_key_candidates": [["colA","colB"], ...], // 依据 unique/compound unique 期望
|
||||
"fk_candidates": [{"from":"<col>","to":"<dim_table(col)>","confidence":<0~1>}],
|
||||
"quality": {
|
||||
"failed_expectations": [{"name":"<expect_*>","column":"<col|table>","summary":"<一句话>"}],
|
||||
"warning_hints": ["空值率>0.2的列: ...", "时间列存在缺口: ..."]
|
||||
},
|
||||
"confidence_notes": ["<为什么判定role/grain/time列>"]
|
||||
}
|
||||
|
||||
【判定规则(简要)】
|
||||
- time列:类型为日期/时间 OR 命中 dt/date/day 等命名;若有 min/max 可给出 range;若间隔缺口≥1天记 has_gaps=true。
|
||||
- semantic_type:数值+右偏/方差大→更偏 metric;高唯一/ID命名→id;高基数+文本→text;低熵+有限取值→dimension。
|
||||
- role:metric列占比高且存在time列→倾向 fact;几乎全是枚举/ID且少数值→dimension。
|
||||
- 置信不高时给出 null 或 unknown,并写入 confidence_notes。
|
||||
127
demo/e-commerce-orders_desc.md
Normal file
127
demo/e-commerce-orders_desc.md
Normal file
@ -0,0 +1,127 @@
|
||||
E-commerce Customer Order Behavior Dataset
|
||||
A synthetic e-commerce dataset containing 10,000 orders with realistic customer behavior patterns, suitable for e-commerce analytics and machine learning tasks.
|
||||
|
||||
Dataset Card for E-commerce Orders
|
||||
Dataset Summary
|
||||
This dataset simulates customer order behavior in an e-commerce platform, containing detailed information about orders, customers, products, and delivery patterns. The data is synthetically generated with realistic distributions and patterns.
|
||||
|
||||
Supported Tasks
|
||||
regression: Predict order quantities or prices
|
||||
classification: Predict delivery status or customer segments
|
||||
clustering: Identify customer behavior patterns
|
||||
time-series-forecasting: Analyze order patterns over time
|
||||
Languages
|
||||
Not applicable (tabular data)
|
||||
|
||||
Dataset Structure
|
||||
Data Instances
|
||||
Each instance represents a single e-commerce order with the following fields:
|
||||
|
||||
{
|
||||
'order_id': '5ea92c47-c5b2-4bdd-8a50-d77efd77ec89',
|
||||
'customer_id': 2350,
|
||||
'product_id': 995,
|
||||
'category': 'Electronics',
|
||||
'price': 403.17,
|
||||
'quantity': 3,
|
||||
'order_date': '2024-04-20 14:59:58.897063',
|
||||
'shipping_date': '2024-04-22 14:59:58.897063',
|
||||
'delivery_status': 'Delivered',
|
||||
'payment_method': 'PayPal',
|
||||
'device_type': 'Mobile',
|
||||
'channel': 'Paid Search',
|
||||
'shipping_address': '72166 Cunningham Crescent East Nicholasside Mississippi 85568',
|
||||
'billing_address': '38199 Edwin Plain Johnborough Maine 81826',
|
||||
'customer_segment': 'Returning'
|
||||
}
|
||||
|
||||
Data Fields
|
||||
Field Name Type Description Value Range
|
||||
order_id string Unique order identifier (UUID4) -
|
||||
customer_id int Customer identifier 1-3,000
|
||||
product_id int Product identifier 1-1,000
|
||||
category string Product category Electronics, Clothing, Home, Books, Beauty, Toys
|
||||
price float Product price $5.00-$500.00
|
||||
quantity int Order quantity 1-10
|
||||
order_date datetime Order placement timestamp Last 12 months
|
||||
shipping_date datetime Shipping timestamp 1-7 days after order_date
|
||||
delivery_status string Delivery status Pending, Shipped, Delivered, Returned
|
||||
payment_method string Payment method used Credit Card, PayPal, Debit Card, Apple Pay, Google Pay
|
||||
device_type string Ordering device Desktop, Mobile, Tablet
|
||||
channel string Marketing channel Organic, Paid Search, Email, Social
|
||||
shipping_address string Delivery address Street, City, State, ZIP
|
||||
billing_address string Billing address Street, City, State, ZIP
|
||||
customer_segment string Customer type New, Returning, VIP
|
||||
Data Splits
|
||||
This dataset is provided as a single CSV file without splits.
|
||||
|
||||
Dataset Creation
|
||||
Source Data
|
||||
This is a synthetic dataset generated using Python with pandas, numpy, and Faker libraries. The data generation process ensures:
|
||||
|
||||
Realistic customer behavior patterns
|
||||
Proper data distributions
|
||||
Valid relationships between fields
|
||||
Realistic address formatting
|
||||
Annotations
|
||||
No manual annotations (synthetic data)
|
||||
|
||||
Considerations for Using the Data
|
||||
Social Impact of Dataset
|
||||
This dataset is designed for:
|
||||
|
||||
Development of e-commerce analytics systems
|
||||
Testing of order processing systems
|
||||
Training of machine learning models for e-commerce
|
||||
Educational purposes in data science
|
||||
Discussion of Biases
|
||||
As a synthetic dataset, care has been taken to:
|
||||
|
||||
Use realistic distributions for order patterns
|
||||
Maintain proper relationships between dates
|
||||
Create realistic customer segments
|
||||
Avoid demographic biases in address generation
|
||||
However, users should note that:
|
||||
|
||||
The data patterns are simplified compared to real e-commerce data
|
||||
The customer behavior patterns are based on general assumptions
|
||||
Geographic distribution might not reflect real-world patterns
|
||||
Dataset Statistics
|
||||
Total Records: 10,000
|
||||
|
||||
Distribution Statistics:
|
||||
|
||||
Delivery Status:
|
||||
|
||||
Delivered: 70%
|
||||
Shipped: 20%
|
||||
Pending: 5%
|
||||
Returned: 5%
|
||||
Customer Segments:
|
||||
|
||||
VIP: ~15%
|
||||
Returning: ~35%
|
||||
New: ~50%
|
||||
Loading and Usage
|
||||
Using Huggingface Datasets:
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("path/to/e-commerce-orders")
|
||||
|
||||
# Example: Load as pandas DataFrame
|
||||
df = dataset['train'].to_pandas()
|
||||
|
||||
# Example: Access specific columns
|
||||
orders = dataset['train']['order_id']
|
||||
prices = dataset['train']['price']
|
||||
|
||||
Data Quality
|
||||
The dataset has been validated to ensure:
|
||||
|
||||
No missing values
|
||||
Proper value ranges
|
||||
Valid categorical values
|
||||
Proper date relationships
|
||||
Unique order IDs
|
||||
Valid address formats
|
||||
523
demo/snippet.json
Normal file
523
demo/snippet.json
Normal file
@ -0,0 +1,523 @@
|
||||
[
|
||||
{
|
||||
"id": "snpt_daily_gmv_trend",
|
||||
"title": "日GMV趋势",
|
||||
"desc": "按日统计GMV与订单量趋势",
|
||||
"type": "trend",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"price",
|
||||
"quantity"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": null,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"GMV=SUM(price*quantity)",
|
||||
"请避免选择地址等PII字段"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv, COUNT(*) AS orders\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY dt\nORDER BY dt;"
|
||||
},
|
||||
"business_caliber": "GMV口径:price×quantity;订单量:记录条数;粒度=日。",
|
||||
"examples": [
|
||||
"近30天GMV趋势",
|
||||
"2025Q1每日GMV与订单数"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_daily_gmv_ma7",
|
||||
"title": "7日GMV均线",
|
||||
"desc": "GMV按日与7日滑动平均",
|
||||
"type": "trend",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"price",
|
||||
"quantity"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": null,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"窗口=包含当日的过去7天",
|
||||
"若日期有缺口,均线可能偏移"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "WITH d AS (\n SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n)\nSELECT dt,\n gmv,\n AVG(gmv) OVER (ORDER BY dt ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS gmv_ma7\nFROM d\nORDER BY dt;"
|
||||
},
|
||||
"business_caliber": "GMV=price×quantity;窗口=7天(含当日),按自然日排序计算。",
|
||||
"examples": [
|
||||
"本季度GMV与7日均线",
|
||||
"促销期走势平滑对比"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_yoy_daily_gmv",
|
||||
"title": "GMV同比(日)",
|
||||
"desc": "对比去年同日GMV与同比%",
|
||||
"type": "ratio",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"price",
|
||||
"quantity"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": null,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"需要查询窗口覆盖到去年的对应日期",
|
||||
"闰年按日期对齐处理"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "WITH cur AS (\n SELECT DATE(order_date) AS dt, SUM(price*quantity) AS gmv\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n),\nprev AS (\n SELECT DATE(DATE_ADD(order_date, INTERVAL 1 YEAR)) AS dt, SUM(price*quantity) AS gmv_last\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN DATE_SUB({{start_date}}, INTERVAL 1 YEAR) AND DATE_SUB({{end_date}}, INTERVAL 1 YEAR)\n GROUP BY DATE(DATE_ADD(order_date, INTERVAL 1 YEAR))\n)\nSELECT c.dt,\n c.gmv,\n p.gmv_last,\n CASE WHEN p.gmv_last IS NULL OR p.gmv_last=0 THEN NULL ELSE (c.gmv - p.gmv_last)/p.gmv_last END AS yoy\nFROM cur c LEFT JOIN prev p ON c.dt = p.dt\nORDER BY c.dt;"
|
||||
},
|
||||
"business_caliber": "同比=当日GMV与去年同日GMV之差/去年同日GMV;GMV=price×quantity。",
|
||||
"examples": [
|
||||
"最近90天GMV同比曲线",
|
||||
"节假日同比表现"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_topn_category_gmv",
|
||||
"title": "类目GMV排行",
|
||||
"desc": "按类目统计GMV并取TopN",
|
||||
"type": "topn",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"category",
|
||||
"price",
|
||||
"quantity"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 6,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"类目枚举较少,建议TopN<=6用于展示",
|
||||
"可追加订单量与件数"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "top_n",
|
||||
"type": "int",
|
||||
"default": 10
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "SELECT category,\n SUM(price*quantity) AS gmv,\n COUNT(*) AS orders,\n SUM(quantity) AS qty\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY category\nORDER BY gmv DESC\nLIMIT {{top_n}};"
|
||||
},
|
||||
"business_caliber": "GMV=price×quantity;统计范围=指定日期内;粒度=类目。",
|
||||
"examples": [
|
||||
"上月类目Top5",
|
||||
"本季度类目GMV结构"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_share_channel",
|
||||
"title": "渠道GMV占比",
|
||||
"desc": "统计各渠道GMV及占比",
|
||||
"type": "ratio",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"channel",
|
||||
"price",
|
||||
"quantity"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 4,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"占比以总GMV为分母;占比之和≈100%",
|
||||
"适合饼图/堆叠柱"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "WITH base AS (\n SELECT channel, SUM(price*quantity) AS gmv\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY channel\n), total AS (\n SELECT SUM(gmv) AS tg FROM base\n)\nSELECT b.channel, b.gmv, b.gmv/t.tg AS gmv_share\nFROM base b CROSS JOIN total t\nORDER BY b.gmv DESC;"
|
||||
},
|
||||
"business_caliber": "渠道GMV占比=渠道GMV/全部渠道GMV;时间范围由参数限定。",
|
||||
"examples": [
|
||||
"本月各渠道占比",
|
||||
"Q1渠道结构对比"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_topn_product_gmv",
|
||||
"title": "商品GMV排行",
|
||||
"desc": "按商品ID统计GMV并取TopN",
|
||||
"type": "topn",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"product_id",
|
||||
"price",
|
||||
"quantity"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 1000,
|
||||
"fk_join_available": true,
|
||||
"notes": [
|
||||
"product_id基数较高,建议LIMIT<=50",
|
||||
"可与商品维表联查名称等属性"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "top_n",
|
||||
"type": "int",
|
||||
"default": 20
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "SELECT product_id,\n SUM(price*quantity) AS gmv,\n SUM(quantity) AS qty,\n COUNT(*) AS orders\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nGROUP BY product_id\nORDER BY gmv DESC\nLIMIT {{top_n}};"
|
||||
},
|
||||
"business_caliber": "GMV=price×quantity;粒度=商品ID。",
|
||||
"examples": [
|
||||
"上周热销商品Top20",
|
||||
"年度销量Top10商品"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_join_product_dim",
|
||||
"title": "商品维表联查",
|
||||
"desc": "以product_id关联商品维表或使用纯ID",
|
||||
"type": "join",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"product_id"
|
||||
],
|
||||
"time_column": null,
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 1000,
|
||||
"fk_join_available": true,
|
||||
"notes": [
|
||||
"若无维表则保留纯ID版输出",
|
||||
"谨慎选择PII字段,勿输出地址类字段"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "dim_product",
|
||||
"type": "identifier"
|
||||
},
|
||||
{
|
||||
"name": "select_cols",
|
||||
"type": "string",
|
||||
"default": "f.product_id, f.price, f.quantity"
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "-- 命名版\nSELECT {{select_cols}}\nFROM {{table}} f\nLEFT JOIN {{dim_product}} d ON f.product_id = d.product_id;\n\n-- 纯ID版\nSELECT product_id, price, quantity FROM {{table}};"
|
||||
},
|
||||
"business_caliber": "外键:product_id→商品维表主键;度量来源于事实表price与quantity。",
|
||||
"examples": [
|
||||
"联查商品名称后做TopN",
|
||||
"仅用ID进行商品分析"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_join_customer_dim",
|
||||
"title": "客户维表联查",
|
||||
"desc": "以customer_id关联客户维表或使用纯ID",
|
||||
"type": "join",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"customer_id"
|
||||
],
|
||||
"time_column": null,
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 2713,
|
||||
"fk_join_available": true,
|
||||
"notes": [
|
||||
"如无维表,可直接按customer_id聚合",
|
||||
"避免输出shipping_address/billing_address等PII"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "dim_customer",
|
||||
"type": "identifier"
|
||||
},
|
||||
{
|
||||
"name": "select_cols",
|
||||
"type": "string",
|
||||
"default": "c.customer_name, f.customer_id, SUM(f.price*f.quantity) AS gmv"
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "-- 命名版\nSELECT {{select_cols}}\nFROM {{table}} f\nLEFT JOIN {{dim_customer}} c ON f.customer_id = c.customer_id\nGROUP BY c.customer_name, f.customer_id;\n\n-- 纯ID版\nSELECT customer_id, SUM(price*quantity) AS gmv\nFROM {{table}}\nGROUP BY customer_id;"
|
||||
},
|
||||
"business_caliber": "外键:customer_id→客户维表主键;GMV=price×quantity。",
|
||||
"examples": [
|
||||
"客户分群GMV",
|
||||
"重点客户消费额排行"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_quality_dup_order",
|
||||
"title": "主键重复检查",
|
||||
"desc": "检查order_id唯一性并抽样输出",
|
||||
"type": "quality",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_id"
|
||||
],
|
||||
"time_column": null,
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 10000,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"画像显示order_id应唯一;若结果非空为异常"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "limit_sample",
|
||||
"type": "int",
|
||||
"default": 50
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "WITH d AS (\n SELECT order_id, COUNT(*) AS cnt\n FROM {{table}}\n GROUP BY order_id\n HAVING COUNT(*)>1\n)\nSELECT * FROM d LIMIT {{limit_sample}};"
|
||||
},
|
||||
"business_caliber": "主键口径:order_id全表唯一;用于数据质量预警与排查。",
|
||||
"examples": [
|
||||
"是否存在重复订单?",
|
||||
"查看重复订单样本"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_quality_price_outlier",
|
||||
"title": "价格异常检测",
|
||||
"desc": "基于当日均值±3σ识别异常价",
|
||||
"type": "quality",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"price"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": null,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"仅质量预警,不直接代表业务错误",
|
||||
"当天样本过少时波动较大"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "limit_sample",
|
||||
"type": "int",
|
||||
"default": 100
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "WITH stats AS (\n SELECT DATE(order_date) AS dt, AVG(price) AS mu, STDDEV_POP(price) AS sigma\n FROM {{table}}\n WHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\n GROUP BY dt\n)\nSELECT f.*\nFROM {{table}} f\nJOIN stats s ON DATE(f.order_date)=s.dt\nWHERE (f.price > s.mu + 3*s.sigma OR f.price < s.mu - 3*s.sigma)\nLIMIT {{limit_sample}};"
|
||||
},
|
||||
"business_caliber": "异常定义:价格超出当日均值±3×标准差(总体标准差)。",
|
||||
"examples": [
|
||||
"近30天价格异常样本",
|
||||
"促销期异常价监控"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_sample_recent_orders",
|
||||
"title": "近期明细抽样",
|
||||
"desc": "抽样查看近期订单核心字段",
|
||||
"type": "sample",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"order_date",
|
||||
"order_id",
|
||||
"customer_id",
|
||||
"product_id",
|
||||
"category",
|
||||
"price",
|
||||
"quantity",
|
||||
"channel",
|
||||
"payment_method",
|
||||
"delivery_status"
|
||||
],
|
||||
"time_column": "order_date",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": null,
|
||||
"fk_join_available": true,
|
||||
"notes": [
|
||||
"为保护隐私,不展示shipping_address与billing_address",
|
||||
"仅用于人工核验"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{
|
||||
"name": "start_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "end_date",
|
||||
"type": "date"
|
||||
},
|
||||
{
|
||||
"name": "limit_rows",
|
||||
"type": "int",
|
||||
"default": 100
|
||||
}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": "SELECT DATE(order_date) AS dt,\n order_id, customer_id, product_id, category,\n price, quantity, channel, payment_method, delivery_status\nFROM {{table}}\nWHERE DATE(order_date) BETWEEN {{start_date}} AND {{end_date}}\nORDER BY dt DESC\nLIMIT {{limit_rows}};"
|
||||
},
|
||||
"business_caliber": "明细抽样用于数据核验;不输出PII地址信息。",
|
||||
"examples": [
|
||||
"抽样查看上周订单",
|
||||
"核对节假日订单明细"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_filter_paid_delivered",
|
||||
"title": "信用卡已送达筛选",
|
||||
"desc": "过滤支付方式为信用卡且配送状态为已送达",
|
||||
"type": "sample",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"payment_method",
|
||||
"delivery_status"
|
||||
],
|
||||
"time_column": null,
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 5,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"此片段为WHERE条件模板,可拼接到任意查询",
|
||||
"delivery_status枚举包含Delivered/Pending/Returned/Shipped"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [],
|
||||
"dialect_sql": {
|
||||
"mysql": "WHERE payment_method = 'Credit Card' AND delivery_status = 'Delivered'"
|
||||
},
|
||||
"business_caliber": "口径:支付方式(payment_method)=信用卡(Credit Card);配送状态(delivery_status)=已送达(Delivered)。可与时间或维度条件叠加。",
|
||||
"examples": [
|
||||
"筛选信用卡已送达订单",
|
||||
"在TopN商品中仅看已送达信用卡订单"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_filter_device_channel",
|
||||
"title": "设备渠道筛选",
|
||||
"desc": "按设备类型与渠道过滤分析范围",
|
||||
"type": "sample",
|
||||
"applicability": {
|
||||
"required_columns": [
|
||||
"device_type",
|
||||
"channel"
|
||||
],
|
||||
"time_column": null,
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": 7,
|
||||
"fk_join_available": false,
|
||||
"notes": [
|
||||
"device_type枚举:Desktop/Mobile/Tablet",
|
||||
"channel枚举:Email/Organic/Paid Search/Social"
|
||||
]
|
||||
}
|
||||
},
|
||||
"variables": [],
|
||||
"dialect_sql": {
|
||||
"mysql": "WHERE device_type IN ('Mobile','Desktop') AND channel IN ('Paid Search','Social')"
|
||||
},
|
||||
"business_caliber": "限制分析在指定设备与渠道;可直接作为WHERE子句片段复用。",
|
||||
"examples": [
|
||||
"仅看移动端付费渠道GMV",
|
||||
"桌面+社媒渠道订单明细"
|
||||
]
|
||||
}
|
||||
]
|
||||
499
demo/snippet_alias_generator.json
Normal file
499
demo/snippet_alias_generator.json
Normal file
@ -0,0 +1,499 @@
|
||||
[
|
||||
{
|
||||
"id": "snpt_daily_gmv_trend",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "每日GMV走势",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "日销售额趋势",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "每天卖了多少",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "按日GMV曲线",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"GMV",
|
||||
"销售额",
|
||||
"日趋势",
|
||||
"每日",
|
||||
"订单量",
|
||||
"orders",
|
||||
"price",
|
||||
"quantity",
|
||||
"order_date",
|
||||
"time series",
|
||||
"趋势图",
|
||||
"按日聚合"
|
||||
],
|
||||
"intent_tags": [
|
||||
"trend"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_daily_gmv_ma7",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "GMV七日均线",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "7天滑动平均",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "GMV周均走势",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "GMV平滑曲线",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"GMV",
|
||||
"移动平均",
|
||||
"MA7",
|
||||
"七日均线",
|
||||
"滑动窗口",
|
||||
"time series",
|
||||
"order_date",
|
||||
"price",
|
||||
"quantity",
|
||||
"平滑",
|
||||
"趋势",
|
||||
"按日聚合"
|
||||
],
|
||||
"intent_tags": [
|
||||
"trend"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_yoy_daily_gmv",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "GMV日同比",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "每日同比增速",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "跟去年同日比",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "GMV YoY(日)",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"同比",
|
||||
"YoY",
|
||||
"GMV",
|
||||
"去年同日",
|
||||
"增长率",
|
||||
"price",
|
||||
"quantity",
|
||||
"order_date",
|
||||
"对比分析",
|
||||
"比值",
|
||||
"日粒度",
|
||||
"ratio"
|
||||
],
|
||||
"intent_tags": [
|
||||
"ratio"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_topn_category_gmv",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "类目GMV排行",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "类目TopN销量",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "哪个分类最卖",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "按类目GMV排序",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"TopN",
|
||||
"分类",
|
||||
"类目",
|
||||
"category",
|
||||
"GMV",
|
||||
"price",
|
||||
"quantity",
|
||||
"排行",
|
||||
"榜单",
|
||||
"按类目聚合",
|
||||
"订单量",
|
||||
"销量"
|
||||
],
|
||||
"intent_tags": [
|
||||
"topn",
|
||||
"by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_share_channel",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "渠道GMV占比",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "各渠道份额",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "哪个渠道占多",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "渠道结构占比",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"占比",
|
||||
"份额",
|
||||
"share",
|
||||
"channel",
|
||||
"GMV",
|
||||
"price",
|
||||
"quantity",
|
||||
"比例",
|
||||
"结构分析",
|
||||
"按渠道聚合",
|
||||
"饼图",
|
||||
"堆叠"
|
||||
],
|
||||
"intent_tags": [
|
||||
"ratio",
|
||||
"by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_topn_product_gmv",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "商品GMV排行",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "热销商品TopN",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "哪款卖得最好",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "按商品GMV排序",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"TopN",
|
||||
"product_id",
|
||||
"商品",
|
||||
"GMV",
|
||||
"price",
|
||||
"quantity",
|
||||
"热销",
|
||||
"排行",
|
||||
"销量",
|
||||
"订单数",
|
||||
"高基数",
|
||||
"榜单"
|
||||
],
|
||||
"intent_tags": [
|
||||
"topn",
|
||||
"by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_join_product_dim",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "关联商品维度",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "商品ID联表",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "把商品名连上",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "product维表join",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"join",
|
||||
"维表",
|
||||
"product_id",
|
||||
"维度扩展",
|
||||
"明细补充",
|
||||
"维度属性",
|
||||
"联表查询",
|
||||
"外键",
|
||||
"选择列",
|
||||
"维度贴标签",
|
||||
"by id",
|
||||
"映射"
|
||||
],
|
||||
"intent_tags": [
|
||||
"join", "by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_join_customer_dim",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "关联客户维度",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "客户ID联表",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "把客户信息补齐",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "customer维表join",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"join",
|
||||
"维表",
|
||||
"customer_id",
|
||||
"客户属性",
|
||||
"GMV聚合",
|
||||
"外键关联",
|
||||
"联表查询",
|
||||
"ID映射",
|
||||
"维度丰富",
|
||||
"分群分析",
|
||||
"by id",
|
||||
"扩展字段"
|
||||
],
|
||||
"intent_tags": [
|
||||
"join", "by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_quality_dup_order",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "订单主键去重检",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "重复order_id查找",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "有没重复订单",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "主键唯一性校验",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"数据质量",
|
||||
"重复",
|
||||
"去重",
|
||||
"order_id",
|
||||
"唯一性",
|
||||
"主键检查",
|
||||
"异常数据",
|
||||
"质量预警",
|
||||
"count>1",
|
||||
"样本抽取",
|
||||
"校验",
|
||||
"重复检测"
|
||||
],
|
||||
"intent_tags": [
|
||||
"quality"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_quality_price_outlier",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "价格3σ异常检",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "当日异常价格",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "看哪单价格怪",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "价格离群监控",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"异常检测",
|
||||
"3σ",
|
||||
"stddev",
|
||||
"价格",
|
||||
"price",
|
||||
"离群点",
|
||||
"质量规则",
|
||||
"time series",
|
||||
"order_date",
|
||||
"阈值告警",
|
||||
"数据监控",
|
||||
"波动"
|
||||
],
|
||||
"intent_tags": [
|
||||
"quality"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_sample_recent_orders",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "近期明细抽样",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "抽查最近订单",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "近期订单样本",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "核验明细抽样",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"抽样",
|
||||
"sample",
|
||||
"明细",
|
||||
"order_date",
|
||||
"order_id",
|
||||
"customer_id",
|
||||
"product_id",
|
||||
"category",
|
||||
"channel",
|
||||
"payment_method",
|
||||
"delivery_status",
|
||||
"核验"
|
||||
],
|
||||
"intent_tags": [
|
||||
"sample", "by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_filter_paid_delivered",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "信用卡送达筛选",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "只看信用卡已送达",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "信用卡且已送达",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "付款信用卡已送达",
|
||||
"tone": "专业"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"支付方式",
|
||||
"信用卡",
|
||||
"Credit Card",
|
||||
"配送状态",
|
||||
"Delivered",
|
||||
"已送达",
|
||||
"过滤条件",
|
||||
"where子句",
|
||||
"订单筛选",
|
||||
"支付渠道",
|
||||
"状态筛选",
|
||||
"条件片段"
|
||||
],
|
||||
"intent_tags": [
|
||||
"filter", "by_dimension"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "snpt_filter_device_channel",
|
||||
"aliases": [
|
||||
{
|
||||
"text": "设备渠道筛选",
|
||||
"tone": "中性"
|
||||
},
|
||||
{
|
||||
"text": "只看移动付费社媒",
|
||||
"tone": "口语"
|
||||
},
|
||||
{
|
||||
"text": "设备+渠道过滤",
|
||||
"tone": "专业"
|
||||
},
|
||||
{
|
||||
"text": "端与渠道条件",
|
||||
"tone": "中性"
|
||||
}
|
||||
],
|
||||
"keywords": [
|
||||
"device_type",
|
||||
"channel",
|
||||
"设备类型",
|
||||
"渠道",
|
||||
"过滤条件",
|
||||
"where子句",
|
||||
"Mobile",
|
||||
"Desktop",
|
||||
"Paid Search",
|
||||
"Social",
|
||||
"范围限定",
|
||||
"条件片段"
|
||||
],
|
||||
"intent_tags": [
|
||||
"filter", "by_dimension"
|
||||
]
|
||||
}
|
||||
]
|
||||
52
demo/snippet_alias_generator.txt
Normal file
52
demo/snippet_alias_generator.txt
Normal file
@ -0,0 +1,52 @@
|
||||
系统角色(System)
|
||||
你是“SQL片段别名生成器”。
|
||||
输入为一个或多个 SQL 片段对象(来自 snippet.json),输出为针对每个片段生成的多样化别名(口语 / 中性 / 专业)、关键词与意图标签。
|
||||
要求逐个处理所有片段对象,输出同样数量的 JSON 元素。
|
||||
|
||||
用户消息(User)
|
||||
【上下文】
|
||||
|
||||
SQL片段对象数组:{{SNIPPET_ARRAY}} // snippet.json中的一个或多个片段
|
||||
|
||||
【任务要求】
|
||||
请针对输入数组中的 每个 SQL 片段,输出一个 JSON 对象,结构如下:
|
||||
|
||||
{
|
||||
"id": "<与输入片段id一致>",
|
||||
"aliases": [
|
||||
{"text": "…", "tone": "口语|中性|专业"},
|
||||
{"text": "…", "tone": "专业"}
|
||||
],
|
||||
"keywords": [
|
||||
"GMV","销售额","TopN","category","类目","趋势","同比","客户","订单","质量","异常检测","join","过滤","sample"
|
||||
],
|
||||
"intent_tags": ["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
|
||||
}
|
||||
|
||||
生成逻辑规范
|
||||
1.逐条输出:输入数组中每个片段对应一个输出对象(id 保持一致)。
|
||||
|
||||
2.aliases生成
|
||||
至少 3 个别名,分别覆盖语气类型:口语 / 中性 / 专业。
|
||||
≤20字,语义需等价,不得添加不存在的字段或业务口径。
|
||||
示例:
|
||||
GMV趋势分析(中性)
|
||||
每天卖多少钱(口语)
|
||||
按日GMV曲线(专业)
|
||||
3.keywords生成
|
||||
8~15个关键词,需涵盖片段核心维度、指标、分析类型和语义近义词。
|
||||
中英文混合(如 "GMV"/"销售额"、"同比"/"YoY"、"类目"/"category" 等)。
|
||||
包含用于匹配的分析意图关键词(如 “趋势”、“排行”、“占比”、“质量检查”、“过滤” 等)。
|
||||
|
||||
4.intent_tags生成
|
||||
|
||||
从以下集合中选取,与片段type及用途一致:
|
||||
["aggregate","trend","topn","ratio","quality","join","sample","filter","by_dimension"]
|
||||
|
||||
若为条件片段(WHERE句型),补充 "filter";若含维度分组逻辑,补充 "by_dimension"。
|
||||
|
||||
5.语言与内容要求
|
||||
|
||||
除“口语”语气的别名外,其余文本保持正式书面风格;不添加解释说明。
|
||||
|
||||
只输出JSON数组,不包含文字描述或额外文本。
|
||||
46
demo/snippet_generator.txt
Normal file
46
demo/snippet_generator.txt
Normal file
@ -0,0 +1,46 @@
|
||||
系统角色(System)
|
||||
你是“SQL片段生成器”。只能基于给定“表画像”生成可复用的分析片段。
|
||||
为每个片段产出:标题、用途描述、片段类型、变量、适用条件、SQL模板(mysql方言),并注明业务口径与安全限制。
|
||||
不要发明画像里没有的列。时间/维度/指标须与画像匹配。
|
||||
|
||||
用户消息(User)
|
||||
【表画像JSON】
|
||||
{{TABLE_PROFILE_JSON}}
|
||||
|
||||
【输出要求(只输出JSON数组)】
|
||||
[
|
||||
{
|
||||
"id": "snpt_<slug>",
|
||||
"title": "中文标题(≤16字)",
|
||||
"desc": "一句话用途",
|
||||
"type": "aggregate|trend|topn|ratio|quality|join|sample",
|
||||
"applicability": {
|
||||
"required_columns": ["<col>", ...],
|
||||
"time_column": "<dt|nullable>",
|
||||
"constraints": {
|
||||
"dim_cardinality_hint": <int|null>, // 用于TopN限制与性能提示
|
||||
"fk_join_available": true|false,
|
||||
"notes": ["高基数维度建议LIMIT<=50", "..."]
|
||||
}
|
||||
},
|
||||
"variables": [
|
||||
{"name":"start_date","type":"date"},
|
||||
{"name":"end_date","type":"date"},
|
||||
{"name":"top_n","type":"int","default":10}
|
||||
],
|
||||
"dialect_sql": {
|
||||
"mysql": ""
|
||||
},
|
||||
"business_caliber": "清晰口径说明,如 UV以device_id去重;粒度=日-类目",
|
||||
"examples": ["示例问法1","示例问法2"]
|
||||
}
|
||||
]
|
||||
|
||||
【片段选择建议】
|
||||
- 若存在 time 列:生成 trend_by_day / yoy_qoq / moving_avg。
|
||||
- 若存在 enumish 维度(distinct 5~200):生成 topn_by_dimension / share_of_total。
|
||||
- 若存在 metric 列:生成 sum/avg/max、分位数/异常检测(3σ/箱线图)。
|
||||
- 有主键/唯一:生成 去重/明细抽样/质量检查。
|
||||
- 有 fk_candidates:同时生成“join维表命名版”和“纯ID版”。
|
||||
- 高枚举维度:在 constraints.notes 中强调 LIMIT 建议与可能的性能风险。
|
||||
- 除完整的SQL片段外,还应生成SQL中局部内容的片段(如WHERE条件片段),并说明其业务含义。例如 where payment_method = 'Credit Card' and delivery_status = 'Delivered' 的含义是:支付方式为信用卡且配送状态为已送达。
|
||||
277
demo/table-desc.json
Normal file
277
demo/table-desc.json
Normal file
@ -0,0 +1,277 @@
|
||||
{
|
||||
"table": "ecommerce_orders",
|
||||
"row_count": 10000,
|
||||
"role": "fact",
|
||||
"grain": ["order_id"],
|
||||
"time": {
|
||||
"column": "order_date",
|
||||
"granularity": "day",
|
||||
"range": ["2024-04-20", "2025-04-19"],
|
||||
"has_gaps": false
|
||||
},
|
||||
"columns": [
|
||||
{
|
||||
"name": "order_id",
|
||||
"dtype": "string",
|
||||
"semantic_type": "id",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 10000,
|
||||
"distinct_ratio": 1.0,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 1.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "customer_id",
|
||||
"dtype": "integer",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 2713,
|
||||
"distinct_ratio": 0.2713,
|
||||
"stats": {"min": 1, "max": 2999, "mean": 995.29, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.3,
|
||||
"metric_candidate_score": 0.1,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "product_id",
|
||||
"dtype": "integer",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 1000,
|
||||
"distinct_ratio": 0.0999,
|
||||
"stats": {"min": 1, "max": 1000, "mean": 504.87, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.1,
|
||||
"metric_candidate_score": 0.1,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "category",
|
||||
"dtype": "string",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 6,
|
||||
"distinct_ratio": 0.0006,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": "Beauty", "pct": null},
|
||||
{"value": "Books", "pct": null},
|
||||
{"value": "Clothing", "pct": null},
|
||||
{"value": "Electronics", "pct": null},
|
||||
{"value": "Home", "pct": null},
|
||||
{"value": "Toys", "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"dtype": "float",
|
||||
"semantic_type": "metric",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 9013,
|
||||
"distinct_ratio": 0.9013,
|
||||
"stats": {"min": 5.06, "max": 499.93, "mean": 252.55, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.9,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "quantity",
|
||||
"dtype": "integer",
|
||||
"semantic_type": "metric",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 9,
|
||||
"distinct_ratio": 0.0009,
|
||||
"stats": {"min": 1, "max": 9, "mean": 2.12, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": 1, "pct": null},
|
||||
{"value": 2, "pct": null},
|
||||
{"value": 3, "pct": null},
|
||||
{"value": 4, "pct": null},
|
||||
{"value": 5, "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.7,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "order_date",
|
||||
"dtype": "string",
|
||||
"semantic_type": "time",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 365,
|
||||
"distinct_ratio": 0.0365,
|
||||
"stats": {"min": "2024-04-20", "max": "2025-04-19", "mean": null, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "shipping_date",
|
||||
"dtype": "string",
|
||||
"semantic_type": "time",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 371,
|
||||
"distinct_ratio": 0.0371,
|
||||
"stats": {"min": "2024-04-21", "max": "2025-04-26", "mean": null, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "delivery_status",
|
||||
"dtype": "string",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 4,
|
||||
"distinct_ratio": 0.0004,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": "Delivered", "pct": null},
|
||||
{"value": "Pending", "pct": null},
|
||||
{"value": "Returned", "pct": null},
|
||||
{"value": "Shipped", "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "payment_method",
|
||||
"dtype": "string",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 5,
|
||||
"distinct_ratio": 0.0005,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": "Apple Pay", "pct": null},
|
||||
{"value": "Credit Card", "pct": null},
|
||||
{"value": "Debit Card", "pct": null},
|
||||
{"value": "Google Pay", "pct": null},
|
||||
{"value": "PayPal", "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "device_type",
|
||||
"dtype": "string",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 3,
|
||||
"distinct_ratio": 0.0003,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": "Desktop", "pct": null},
|
||||
{"value": "Mobile", "pct": null},
|
||||
{"value": "Tablet", "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "channel",
|
||||
"dtype": "string",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 4,
|
||||
"distinct_ratio": 0.0004,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": "Email", "pct": null},
|
||||
{"value": "Organic", "pct": null},
|
||||
{"value": "Paid Search", "pct": null},
|
||||
{"value": "Social", "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "shipping_address",
|
||||
"dtype": "string",
|
||||
"semantic_type": "text",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 10000,
|
||||
"distinct_ratio": 1.0,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.9,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "billing_address",
|
||||
"dtype": "string",
|
||||
"semantic_type": "text",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 10000,
|
||||
"distinct_ratio": 1.0,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": false,
|
||||
"top_values": [],
|
||||
"pk_candidate_score": 0.9,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
},
|
||||
{
|
||||
"name": "customer_segment",
|
||||
"dtype": "string",
|
||||
"semantic_type": "dimension",
|
||||
"null_rate": 0.0,
|
||||
"distinct_count": 3,
|
||||
"distinct_ratio": 0.0003,
|
||||
"stats": {"min": null, "max": null, "mean": null, "std": null, "skewness": null},
|
||||
"enumish": true,
|
||||
"top_values": [
|
||||
{"value": "New", "pct": null},
|
||||
{"value": "Returning", "pct": null},
|
||||
{"value": "VIP", "pct": null}
|
||||
],
|
||||
"pk_candidate_score": 0.0,
|
||||
"metric_candidate_score": 0.0,
|
||||
"comment": ""
|
||||
}
|
||||
],
|
||||
"primary_key_candidates": [["order_id"]],
|
||||
"fk_candidates": [
|
||||
{"from": "customer_id", "to": "dim_customer(customer_id)", "confidence": 0.9},
|
||||
{"from": "product_id", "to": "dim_product(product_id)", "confidence": 0.9}
|
||||
],
|
||||
"quality": {
|
||||
"failed_expectations": [],
|
||||
"warning_hints": []
|
||||
},
|
||||
"confidence_notes": [
|
||||
"表含时间列(order_date, shipping_date)且含度量列(price, quantity),推断为fact表。",
|
||||
"order_id唯一性=1.0,确认主键。",
|
||||
"order_date日期范围连续无缺口,粒度为日级。",
|
||||
"数值字段(price, quantity)符合指标特征;其中price为高基数,quantity为低基数(仅9个取值)。",
|
||||
"低熵字段(category, delivery_status, payment_method等)为枚举维度。"
|
||||
]
|
||||
}
|
||||
102
demo/user-query.json
Normal file
102
demo/user-query.json
Normal file
@ -0,0 +1,102 @@
|
||||
[
|
||||
{
|
||||
"question": "近一年每个月的销售额和订单量变化趋势如何?",
|
||||
"intent": "trend_analysis",
|
||||
"related_fields": ["order_date", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "哪个产品类目的GMV最高?",
|
||||
"intent": "topn_category",
|
||||
"related_fields": ["category", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "不同支付方式的订单数量和平均客单价是多少?",
|
||||
"intent": "aggregate_comparison",
|
||||
"related_fields": ["payment_method", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "各营销渠道(如Paid Search、Social)的GMV占比是多少?",
|
||||
"intent": "ratio_analysis",
|
||||
"related_fields": ["channel", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "移动端和桌面端的订单表现差异大吗?",
|
||||
"intent": "device_comparison",
|
||||
"related_fields": ["device_type", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "已送达订单的平均配送时长是多少天?",
|
||||
"intent": "shipping_time_analysis",
|
||||
"related_fields": ["order_date", "shipping_date", "delivery_status"]
|
||||
},
|
||||
{
|
||||
"question": "退货(Returned)订单主要集中在哪些产品类目?",
|
||||
"intent": "return_analysis",
|
||||
"related_fields": ["delivery_status", "category"]
|
||||
},
|
||||
{
|
||||
"question": "不同客户类型(新客、回头客、VIP)的平均订单金额是多少?",
|
||||
"intent": "segment_analysis",
|
||||
"related_fields": ["customer_segment", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "每个客户的平均下单频率是多少?",
|
||||
"intent": "customer_behavior",
|
||||
"related_fields": ["customer_id", "order_date"]
|
||||
},
|
||||
{
|
||||
"question": "近期是否存在价格异常或超高订单?",
|
||||
"intent": "quality_outlier",
|
||||
"related_fields": ["price", "order_date"]
|
||||
},
|
||||
{
|
||||
"question": "哪个支付方式的退货率最高?",
|
||||
"intent": "return_ratio_by_payment",
|
||||
"related_fields": ["payment_method", "delivery_status"]
|
||||
},
|
||||
{
|
||||
"question": "哪些商品在VIP客户中最受欢迎?",
|
||||
"intent": "vip_product_preference",
|
||||
"related_fields": ["customer_segment", "product_id", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "下单后平均几天发货?",
|
||||
"intent": "shipping_speed",
|
||||
"related_fields": ["order_date", "shipping_date"]
|
||||
},
|
||||
{
|
||||
"question": "从哪些渠道来的新用户最多?",
|
||||
"intent": "user_acquisition_channel",
|
||||
"related_fields": ["channel", "customer_segment"]
|
||||
},
|
||||
{
|
||||
"question": "订单数量在周末和工作日有什么差异?",
|
||||
"intent": "weekday_pattern",
|
||||
"related_fields": ["order_date"]
|
||||
},
|
||||
{
|
||||
"question": "每个设备类型的平均订单金额是多少?",
|
||||
"intent": "device_gmv_comparison",
|
||||
"related_fields": ["device_type", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "本月退货率与上月相比是否上升?",
|
||||
"intent": "return_trend",
|
||||
"related_fields": ["delivery_status", "order_date"]
|
||||
},
|
||||
{
|
||||
"question": "哪些客户下单金额最高?",
|
||||
"intent": "top_customers",
|
||||
"related_fields": ["customer_id", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "不同类目的平均客单价(GMV/订单量)是多少?",
|
||||
"intent": "category_avg_order_value",
|
||||
"related_fields": ["category", "price", "quantity"]
|
||||
},
|
||||
{
|
||||
"question": "不同渠道的订单平均转化周期(下单到发货)是多少?",
|
||||
"intent": "conversion_cycle",
|
||||
"related_fields": ["channel", "order_date", "shipping_date"]
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user