demo数据

This commit is contained in:
zhaoawd
2025-11-14 00:58:00 +08:00
parent 7eb3c059a1
commit a72ca3593e
13 changed files with 2733 additions and 0 deletions

View File

@ -0,0 +1 @@
{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["service_point_id"], "table": "data-ge.water_meter_info", "columns": [{"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}, {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.03666666666666667, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.9, "metric_candidate_score": 0.0}, {"name": "service_point_id", 
"dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标如空值率、唯一性缺失但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列只有一个唯一值 '有效'。", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}], "quality": {"warning_hints": ["列 'meter_status' 只有一个唯一值 '有效',可能为常量列。"], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["表角色(role)被推断为 'dimension'因为其列几乎完全由ID和类别属性构成且缺少数值指标或时间序列列。", 
"主键候选(primary_key_candidates) 'service_point_id' 和 'account_id' 是基于命名约定(包含'_id')推断的。其唯一性和非空性未在GE结果中直接度量,因此这是一个高置信度的猜测。", "表粒度(grain)可能为 'service_point',与推断的主键 'service_point_id' 相对应。", "未根据列名或数据格式识别出时间列。"], "primary_key_candidates": [["service_point_id"], ["account_id"]]}

View File

@ -0,0 +1,180 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"aliases": [
{
"text": "各个区有多少水表",
"tone": "口语"
},
{
"text": "按维度统计用水点数",
"tone": "中性"
},
{
"text": "各维度用水点数量分布",
"tone": "专业"
}
],
"keywords": [
"用水点数",
"service_point_count",
"数量",
"统计",
"汇总",
"aggregate",
"维度",
"dimension",
"区域",
"district",
"供水所",
"分组统计",
"水表"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn-service-points-by-dimension",
"aliases": [
{
"text": "哪个地方水表最多",
"tone": "口语"
},
{
"text": "用水点数Top-N排名",
"tone": "中性"
},
{
"text": "Top-N用水点数维度排行",
"tone": "专业"
}
],
"keywords": [
"Top-N",
"top",
"排名",
"排行",
"ranking",
"最多",
"用水点数",
"service_point_count",
"维度",
"dimension",
"站点",
"station",
"水表"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_ratio-service-points-by-dimension",
"aliases": [
{
"text": "各种水表各占多少",
"tone": "口语"
},
{
"text": "各维度用水点数占比",
"tone": "中性"
},
{
"text": "用水点维度构成分析",
"tone": "专业"
}
],
"keywords": [
"占比",
"percentage",
"百分比",
"ratio",
"构成",
"分布",
"用水点数",
"水表类型",
"meter_type",
"维度",
"dimension",
"水表"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_quality-check-duplicate-spid",
"aliases": [
{
"text": "有没有重复的水表号",
"tone": "口语"
},
{
"text": "检查重复的用水点ID",
"tone": "中性"
},
{
"text": "用水点ID唯一性校验",
"tone": "专业"
}
],
"keywords": [
"数据质量",
"quality",
"检查",
"校验",
"重复",
"duplicate",
"唯一性",
"uniqueness",
"用水点ID",
"service_point_id",
"异常检测",
"主键"
],
"intent_tags": [
"quality",
"by_dimension"
]
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"aliases": [
{
"text": "给我看城区的机械表",
"tone": "口语"
},
{
"text": "按多维度筛选用水点",
"tone": "中性"
},
{
"text": "多维组合条件过滤用水点",
"tone": "专业"
}
],
"keywords": [
"筛选",
"过滤",
"filter",
"查询",
"明细",
"列表",
"sample",
"用水点",
"区域",
"district",
"水表类型",
"meter_type",
"条件查询"
],
"intent_tags": [
"sample",
"filter"
]
}
]

View File

@ -0,0 +1,186 @@
[
{
"id": "snpt_count-service-points-by-dimension",
"desc": "按指定维度(如区域、供水所)分组,统计各分类下的用水点数量。",
"type": "aggregate",
"title": "按维度统计用水点数",
"examples": [
"按区域统计用水点数量",
"各个供水所分别有多少个用水点"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "district"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"适用于对水表档案信息进行分类汇总统计。",
"可将变量 ${dimension_column} 替换为任一维度列,如 district, supply_office, station, meter_type 等。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数,代表一个独立的服务点(通常对应一个水表)。统计粒度为“指定维度”。"
},
{
"id": "snpt_topn-service-points-by-dimension",
"desc": "按指定维度(如区域、站点)统计用水点数,并展示数量最多的前N个分类。",
"type": "topn",
"title": "Top-N 用水点数维度排名",
"examples": [
"哪个区域的用水点最多",
"用水点数排名前5的站点是哪些"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "station"
},
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC\nLIMIT ${top_n};"
},
"applicability": {
"constraints": {
"notes": [
"维度 `station` 基数较高 (36),建议 Top-N 查询时结合业务场景合理设置 N 值。"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数:对 `service_point_id` 进行去重计数。排名依据为各维度分类下的用水点总数。统计粒度为“指定维度”。"
},
{
"id": "snpt_ratio-service-points-by-dimension",
"desc": "计算在指定维度下,各分类的用水点数占总用水点数的百分比,以分析其分布构成。",
"type": "ratio",
"title": "各维度用水点数占比",
"examples": [
"不同水表类型(meter_type)的分布情况",
"各个区域的用水点占比是多少"
],
"variables": [
{
"name": "dimension_column",
"type": "column",
"default": "meter_type"
}
],
"dialect_sql": {
"mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count,\n COUNT(DISTINCT service_point_id) * 100.0 / SUM(COUNT(DISTINCT service_point_id)) OVER () AS percentage\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"SQL模板使用了窗口函数 SUM() OVER(),请确保MySQL版本支持(8.0+)。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "用水点数占比:某分类下的用水点数 / 总用水点数。用水点数以 `service_point_id` 去重计数。统计粒度为“指定维度”。"
},
{
"id": "snpt_quality-check-duplicate-spid",
"desc": "查找在用水点信息表中存在重复的 `service_point_id`,用于数据质量校验。",
"type": "quality",
"title": "检查重复的用水点ID",
"examples": [
"检查是否存在重复的水表档案",
"校验用水点ID的唯一性"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n COUNT(*) AS occurrences\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n service_point_id\nHAVING\n COUNT(*) > 1;"
},
"applicability": {
"constraints": {
"notes": [
"预期返回结果为空。若有返回,则表示数据存在一致性问题,`service_point_id` 未能作为唯一主键。"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "重复项:指 `service_point_id` 出现次数大于1的记录。此ID应为表的主键,理论上不应重复。"
},
{
"id": "snpt_sample-filter-service-points-by-dims",
"desc": "根据区域、水表类型、供水所等多个维度组合条件,筛选出符合条件的用水点明细。",
"type": "sample",
"title": "多维度筛选用水点列表",
"examples": [
"查询城区的机械表有哪些",
"拉取某个供水所下特定口径水表的列表"
],
"variables": [
{
"name": "district_name",
"type": "string",
"default": "城区"
},
{
"name": "meter_type_name",
"type": "string",
"default": "机械表"
},
{
"name": "limit_num",
"type": "int",
"default": 100
}
],
"dialect_sql": {
"mysql": "SELECT\n service_point_id,\n account_id,\n district,\n supply_office,\n meter_type,\n meter_subtype,\n meter_diameter\nFROM\n `data-ge.water_meter_info`\nWHERE\n district = '${district_name}'\n AND meter_type = '${meter_type_name}'\n -- AND meter_status = '有效' -- 可选:根据画像,该列为常量'有效',可不加\nLIMIT ${limit_num};"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id",
"account_id",
"district",
"supply_office",
"meter_type",
"meter_subtype",
"meter_diameter"
]
},
"business_caliber": "返回满足所有筛选条件的用水点明细信息。`meter_status` 列只有一个值 '有效',通常无需作为筛选条件。"
}
]

View File

@ -0,0 +1,230 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;11 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;36 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.1,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;13 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;8 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;单一取值(\"有效\")",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.0,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;9 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.03,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;5 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "非空;4 个枚举值(GE 约束)",
"enumish": true,
"null_rate": 0.0,
"top_values": [],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.02,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示标识列;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.6,
"metric_candidate_score": 0.05
},
{
"name": "account_id",
"dtype": "unknown",
"stats": {
"max": null,
"min": null,
"std": null,
"mean": null,
"skewness": null
},
"comment": "命名指示账户标识;未提供唯一性或非空验证",
"enumish": null,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.5,
"metric_candidate_score": 0.05
}
],
"quality": {
"warning_hints": [
"以下列未设置非空校验:service_point_id, account_id;空值情况未知",
"未识别到时间列"
],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role 判定为 dimension:表内列均为枚举/分类或ID,未发现数值型度量或时间列;34/34 期望均为分类枚举/非空与去重比例。",
"grain 猜测为 service_point_id:仅依据命名启发式,缺少唯一性与非空度量佐证,置信度较低。",
"未识别时间列:列名与期望均未涉及日期/时间,也无最小/最大时间范围可推断。"
],
"primary_key_candidates": []
}

View File

@ -0,0 +1,372 @@
[
{
"id": "snpt_topn_station",
"aliases": [
{
"text": "站点水表排行前N",
"tone": "中性"
},
{
"text": "哪个站点表最多",
"tone": "口语"
},
{
"text": "按站点水表TopN",
"tone": "专业"
}
],
"keywords": [
"TopN",
"排名",
"排行",
"station",
"站点",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"排序",
"榜单"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_district",
"aliases": [
{
"text": "各辖区水表占比",
"tone": "中性"
},
{
"text": "哪个辖区占比高",
"tone": "口语"
},
{
"text": "按辖区水表比例",
"tone": "专业"
}
],
"keywords": [
"占比",
"ratio",
"district",
"辖区",
"水表数",
"meter count",
"distinct",
"去重",
"百分比",
"份额",
"聚合",
"排序",
"分布"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_dist_diameter",
"aliases": [
{
"text": "表径水表数分布",
"tone": "中性"
},
{
"text": "不同口径有多少",
"tone": "口语"
},
{
"text": "按表径去重计数",
"tone": "专业"
}
],
"keywords": [
"分布",
"distribution",
"meter_diameter",
"表径",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"类别",
"category",
"条形图",
"饼图",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_type_subtype_matrix",
"aliases": [
{
"text": "类型×子类水表数",
"tone": "中性"
},
{
"text": "看各类型各子类",
"tone": "口语"
},
{
"text": "类型子类组合统计",
"tone": "专业"
}
],
"keywords": [
"类型",
"type",
"子类",
"subtype",
"组合",
"matrix",
"交叉分析",
"cross-tab",
"水表数",
"meter count",
"distinct",
"去重",
"分布",
"聚合",
"维度"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_quality_spid_uniq",
"aliases": [
{
"text": "服务点ID唯一性检",
"tone": "专业"
},
{
"text": "服务点ID有重复吗",
"tone": "口语"
},
{
"text": "服务点ID完整性评估",
"tone": "中性"
}
],
"keywords": [
"质量检查",
"quality",
"唯一性",
"uniqueness",
"重复",
"duplicate",
"空值",
"NULL",
"完整性",
"integrity",
"service_point_id",
"数据质量",
"统计",
"去重",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_quality_account_nulls",
"aliases": [
{
"text": "账户ID缺失明细",
"tone": "中性"
},
{
"text": "看看哪些账户为空",
"tone": "口语"
},
{
"text": "account_id空值样本",
"tone": "专业"
}
],
"keywords": [
"质量检查",
"缺失",
"missing",
"空值",
"NULL",
"account_id",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"排查",
"过滤",
"WHERE",
"LIMIT"
],
"intent_tags": [
"quality",
"sample"
]
},
{
"id": "snpt_sample_random_rows",
"aliases": [
{
"text": "随机抽样水表明细",
"tone": "中性"
},
{
"text": "随机取几条看看",
"tone": "口语"
},
{
"text": "RAND()样本抽取",
"tone": "专业"
}
],
"keywords": [
"随机",
"random",
"样本",
"sample",
"抽样",
"sampling",
"明细",
"details",
"质检",
"QA",
"RAND()",
"LIMIT",
"抽取",
"数据验证"
],
"intent_tags": [
"sample"
]
},
{
"id": "snpt_filter_office_type_where",
"aliases": [
{
"text": "按所与类型过滤有效",
"tone": "专业"
},
{
"text": "筛选某所的指定类型",
"tone": "中性"
},
{
"text": "只看这所的这种表",
"tone": "口语"
}
],
"keywords": [
"过滤",
"filter",
"WHERE",
"supply_office",
"营业所",
"meter_type",
"类型",
"meter_status",
"有效",
"条件片段",
"筛选",
"查询拼接",
"字段",
"约束"
],
"intent_tags": [
"filter"
]
},
{
"id": "snpt_office_station_dist",
"aliases": [
{
"text": "所站组合水表数",
"tone": "中性"
},
{
"text": "各站在各所有多少",
"tone": "口语"
},
{
"text": "营业所×站点分布",
"tone": "专业"
}
],
"keywords": [
"supply_office",
"营业所",
"station",
"站点",
"层级",
"hierarchy",
"分布",
"distribution",
"水表数",
"meter count",
"distinct",
"去重",
"聚合",
"交叉分析",
"排行"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_total_meter_baseline",
"aliases": [
{
"text": "水表总量基线",
"tone": "中性"
},
{
"text": "现在有多少水表",
"tone": "口语"
},
{
"text": "全表去重总数",
"tone": "专业"
}
],
"keywords": [
"总量",
"total",
"baseline",
"基线",
"水表总数",
"meter total",
"service_point_id",
"distinct",
"去重",
"分母",
"denominator",
"占比",
"聚合",
"汇总",
"snapshot"
],
"intent_tags": [
"aggregate"
]
}
]

View File

@ -0,0 +1,330 @@
[
{
"id": "snpt_topn_station",
"desc": "按站点统计水表数量并取前N",
"type": "topn",
"title": "站点TopN水表数",
"examples": [
"各站点水表数量排名前10",
"站点水表覆盖情况排行"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY station\nORDER BY meter_cnt DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"TopN建议N<=36",
"以service_point_id去重计数",
"无时间列,无法做趋势"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": null,
"required_columns": [
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数,粒度=站点。仅统计当前表中的有效记录,不含时间口径。安全限制:用于分析排名,避免扩大LIMIT造成全量导出。"
},
{
"id": "snpt_share_district",
"desc": "统计各辖区水表数及其占比",
"type": "ratio",
"title": "辖区水表占比",
"examples": [
"各辖区水表占比",
"哪个辖区水表最多"
],
"variables": [],
"dialect_sql": {
"mysql": "WITH by_district AS (\n SELECT district, COUNT(DISTINCT service_point_id) AS meter_cnt\n FROM `data-ge`.`water_meter_info`\n GROUP BY district\n), tot AS (\n SELECT COUNT(DISTINCT service_point_id) AS total_cnt\n FROM `data-ge`.`water_meter_info`\n)\nSELECT b.district,\n b.meter_cnt,\n ROUND(b.meter_cnt / NULLIF(t.total_cnt, 0) * 100, 2) AS pct\nFROM by_district b\nCROSS JOIN tot t\nORDER BY pct DESC, b.district;"
},
"applicability": {
"constraints": {
"notes": [
"占比分母为全表service_point_id去重总数",
"service_point_id为空将被忽略"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": null,
"required_columns": [
"district",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数,粒度=辖区。占比=辖区水表数/全表水表总数。安全限制:仅基于本表,不代表全市/全网口径;无时间维度。"
},
{
"id": "snpt_dist_diameter",
"desc": "按表径统计水表数量分布",
"type": "aggregate",
"title": "表径分布统计",
"examples": [
"不同口径水表有多少",
"查看表径分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_diameter,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_diameter\nORDER BY meter_cnt DESC, meter_diameter;"
},
"applicability": {
"constraints": {
"notes": [
"以service_point_id去重计数",
"适合绘制条形图/饼图"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": null,
"required_columns": [
"meter_diameter",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数,粒度=表径。安全限制:仅用于分布分析,不含时间过滤;避免用于明细导出。"
},
{
"id": "snpt_type_subtype_matrix",
"desc": "统计水表类型与子类组合的数量",
"type": "aggregate",
"title": "类型子类分布",
"examples": [
"不同类型与子类的水表数量",
"查看类型与子类的组合分布"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type,\n meter_subtype,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_type, meter_subtype\nORDER BY meter_cnt DESC, meter_type, meter_subtype;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=5×9=45",
"以service_point_id去重计数"
],
"fk_join_available": false,
"dim_cardinality_hint": 45
},
"time_column": null,
"required_columns": [
"meter_type",
"meter_subtype",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数,粒度=类型×子类组合。安全限制:仅用于汇总分析,不包含时间或业务状态变化。"
},
{
"id": "snpt_quality_spid_uniq",
"desc": "评估service_point_id的空值与重复情况",
"type": "quality",
"title": "服务点唯一性检",
"examples": [
"检查服务点ID是否唯一",
"统计service_point_id空值与重复情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT\n COUNT(*) AS total_rows,\n SUM(service_point_id IS NULL) AS null_cnt,\n COUNT(DISTINCT service_point_id) AS distinct_cnt,\n (COUNT(*) - COUNT(DISTINCT service_point_id)) AS duplicate_rows_est,\n (\n SELECT COUNT(*) FROM (\n SELECT service_point_id\n FROM `data-ge`.`water_meter_info`\n GROUP BY service_point_id\n HAVING COUNT(*) > 1\n ) AS dup\n ) AS dup_key_groups\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"用于键完整性检查",
"重复行估算=总行数-去重数"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "质量检查口径:在本表内评估service_point_id的非空与唯一性,不代表跨表全局唯一。安全限制:仅输出汇总指标,不暴露明细重复值。"
},
{
"id": "snpt_quality_account_nulls",
"desc": "抽取account_id为空的记录用于排查",
"type": "quality",
"title": "账户ID缺失明细",
"examples": [
"列出account_id为空的水表",
"抽样查看账户缺失的数据行"
],
"variables": [
{
"name": "limit_n",
"type": "int",
"default": 50
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nWHERE account_id IS NULL\nLIMIT {{limit_n}};"
},
"applicability": {
"constraints": {
"notes": [
"明细仅限小样本抽取",
"建议LIMIT<=100避免全量导出"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": null,
"required_columns": [
"account_id"
]
},
"business_caliber": "质量抽样:筛出账户ID缺失的水表记录,便于核对。安全限制:仅用于样本排查,不建议在生产中全量导出;如需口径统计,请改为COUNT聚合。"
},
{
"id": "snpt_sample_random_rows",
"desc": "随机抽取水表信息用于人工核验",
"type": "sample",
"title": "随机抽样明细",
"examples": [
"抽样查看水表信息",
"随机抽取20条做质检"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"使用RAND()随机,样本不可复现",
"建议限制样本量"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "样本抽取:从本表随机返回若干行明细。安全限制:避免扩大LIMIT进行全量下载;如需可复现样本,请改用带种子的随机方法(MySQL不原生支持)。"
},
{
"id": "snpt_filter_office_type_where",
"desc": "常用WHERE筛选条件片段:按营业所与类型,且为有效",
"type": "sample",
"title": "机构类型筛选片",
"examples": [
"筛选A营业所的机械表",
"仅查看某营业所的指定类型水表"
],
"variables": [
{
"name": "supply_office",
"type": "string"
},
{
"name": "meter_type",
"type": "string"
}
],
"dialect_sql": {
"mysql": "WHERE supply_office = '{{supply_office}}'\n AND meter_type = '{{meter_type}}'\n AND meter_status = '有效'"
},
"applicability": {
"constraints": {
"notes": [
"这是条件片段,可拼接到其他查询",
"meter_status当前为单一值“有效”"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": null,
"required_columns": [
"supply_office",
"meter_type",
"meter_status"
]
},
"business_caliber": "过滤口径:仅保留指定营业所与指定水表类型、且状态为“有效”的记录。安全限制:为片段用途,需拼接在SELECT…FROM之后使用。"
},
{
"id": "snpt_office_station_dist",
"desc": "按营业所与站点组合统计水表数",
"type": "aggregate",
"title": "所站层级分布",
"examples": [
"按营业所查看各站点水表数",
"所站两级的水表分布情况"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT supply_office,\n station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY supply_office, station\nORDER BY supply_office, meter_cnt DESC, station;"
},
"applicability": {
"constraints": {
"notes": [
"组合基数<=11×36=396",
"以service_point_id去重计数",
"如结果过长可再按TopN筛选"
],
"fk_join_available": false,
"dim_cardinality_hint": 396
},
"time_column": null,
"required_columns": [
"supply_office",
"station",
"service_point_id"
]
},
"business_caliber": "水表数=按service_point_id去重计数,粒度=营业所×站点。安全限制:结果行数可能较多,建议在可视化端增加筛选或分页。"
},
{
"id": "snpt_total_meter_baseline",
"desc": "获取全表水表去重总量基线",
"type": "aggregate",
"title": "水表总量基线",
"examples": [
"当前有多少只水表",
"作为占比分析的分母基线"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT COUNT(DISTINCT service_point_id) AS meter_total\nFROM `data-ge`.`water_meter_info`;"
},
"applicability": {
"constraints": {
"notes": [
"作为其他占比/分摊分母基线",
"忽略service_point_id为空的记录"
],
"fk_join_available": false,
"dim_cardinality_hint": 300
},
"time_column": null,
"required_columns": [
"service_point_id"
]
},
"business_caliber": "水表总量=按service_point_id去重计数,基于当前表的全量记录。安全限制:无时间维度,无法反映存量随时间变化。"
}
]

View File

@ -0,0 +1,415 @@
{
"role": "dimension",
"time": {
"range": null,
"column": null,
"has_gaps": null,
"granularity": "unknown"
},
"grain": [
"account_id",
"service_point_id"
],
"table": "data-ge.water_meter_info",
"columns": [
{
"name": "supply_office",
"dtype": "string",
"stats": {},
"comment": "供水管理所名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "宝山供水管理所"
},
{
"pct": null,
"value": "黄浦供水管理所"
},
{
"pct": null,
"value": "青东供水管理所"
},
{
"pct": null,
"value": "虹口供水管理所"
},
{
"pct": null,
"value": "闸北供水管理所"
},
{
"pct": null,
"value": "松北供水管理所"
},
{
"pct": null,
"value": "杨浦供水管理所"
},
{
"pct": null,
"value": "长宁供水管理所"
},
{
"pct": null,
"value": "闵行供水管理所"
},
{
"pct": null,
"value": "徐汇供水管理所"
},
{
"pct": null,
"value": "普陀供水管理所"
}
],
"semantic_type": "dimension",
"distinct_count": 11,
"distinct_ratio": 0.03666666666666667,
"pk_candidate_score": 0.11,
"metric_candidate_score": 0.0
},
{
"name": "station",
"dtype": "string",
"stats": {},
"comment": "站点名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "新闸站"
},
{
"pct": null,
"value": "宝杨站"
},
{
"pct": null,
"value": "江川站"
},
{
"pct": null,
"value": "长江站"
},
{
"pct": null,
"value": "市光站"
},
{
"pct": null,
"value": "徐泾站"
},
{
"pct": null,
"value": "真北站"
},
{
"pct": null,
"value": "半淞园站"
},
{
"pct": null,
"value": "芙蓉江站"
},
{
"pct": null,
"value": "密云站"
}
],
"semantic_type": "dimension",
"distinct_count": 36,
"distinct_ratio": 0.12,
"pk_candidate_score": 0.36,
"metric_candidate_score": 0.0
},
{
"name": "district",
"dtype": "string",
"stats": {},
"comment": "行政区划名称,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "普陀区"
},
{
"pct": null,
"value": "闵行区"
},
{
"pct": null,
"value": "嘉定区"
},
{
"pct": null,
"value": "杨浦区"
},
{
"pct": null,
"value": "徐汇区"
},
{
"pct": null,
"value": "黄浦区"
},
{
"pct": null,
"value": "松江区"
},
{
"pct": null,
"value": "长宁区"
},
{
"pct": null,
"value": "青浦区"
},
{
"pct": null,
"value": "虹口区"
}
],
"semantic_type": "dimension",
"distinct_count": 13,
"distinct_ratio": 0.043333333333333335,
"pk_candidate_score": 0.13,
"metric_candidate_score": 0.0
},
{
"name": "meter_diameter",
"dtype": "string",
"stats": {},
"comment": "水表直径规格,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "20mm"
},
{
"pct": null,
"value": "15mm"
},
{
"pct": null,
"value": "25mm"
},
{
"pct": null,
"value": "40mm"
},
{
"pct": null,
"value": "150mm"
},
{
"pct": null,
"value": "100mm"
},
{
"pct": null,
"value": "80mm"
},
{
"pct": null,
"value": "50mm"
}
],
"semantic_type": "dimension",
"distinct_count": 8,
"distinct_ratio": 0.02666666666666667,
"pk_candidate_score": 0.08,
"metric_candidate_score": 0.0
},
{
"name": "meter_status",
"dtype": "string",
"stats": {},
"comment": "水表状态,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "有效"
}
],
"semantic_type": "dimension",
"distinct_count": 1,
"distinct_ratio": 0.0033333333333333335,
"pk_candidate_score": 0.01,
"metric_candidate_score": 0.0
},
{
"name": "meter_subtype",
"dtype": "string",
"stats": {},
"comment": "水表子类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "旋翼半液封式"
},
{
"pct": null,
"value": "超声波式"
},
{
"pct": null,
"value": "旋翼湿式(指针式)"
},
{
"pct": null,
"value": "旋翼湿式(数字指针式)"
},
{
"pct": null,
"value": "电磁式"
},
{
"pct": null,
"value": "无直管段要求超声波式"
},
{
"pct": null,
"value": "无直管段要求电磁式"
},
{
"pct": null,
"value": "垂直螺翼干式"
},
{
"pct": null,
"value": "机械容积式"
}
],
"semantic_type": "dimension",
"distinct_count": 9,
"distinct_ratio": 0.03,
"pk_candidate_score": 0.09,
"metric_candidate_score": 0.0
},
{
"name": "meter_type",
"dtype": "string",
"stats": {},
"comment": "水表类型,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "容积式机械水表"
},
{
"pct": null,
"value": "速度式机械水表"
},
{
"pct": null,
"value": "电磁式远传水表"
},
{
"pct": null,
"value": "速度式机电远传水表"
},
{
"pct": null,
"value": "超声波式远传水表"
}
],
"semantic_type": "dimension",
"distinct_count": 5,
"distinct_ratio": 0.016666666666666666,
"pk_candidate_score": 0.05,
"metric_candidate_score": 0.0
},
{
"name": "installation_position",
"dtype": "string",
"stats": {},
"comment": "安装位置,枚举值",
"enumish": true,
"null_rate": 0.0,
"top_values": [
{
"pct": null,
"value": "嵌墙表"
},
{
"pct": null,
"value": "管道井表"
},
{
"pct": null,
"value": "地下表"
},
{
"pct": null,
"value": "龙头表"
}
],
"semantic_type": "dimension",
"distinct_count": 4,
"distinct_ratio": 0.013333333333333334,
"pk_candidate_score": 0.04,
"metric_candidate_score": 0.0
},
{
"name": "account_id",
"dtype": "string",
"stats": {},
"comment": "账户ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
},
{
"name": "service_point_id",
"dtype": "string",
"stats": {},
"comment": "服务点ID",
"enumish": false,
"null_rate": null,
"top_values": [],
"semantic_type": "id",
"distinct_count": null,
"distinct_ratio": null,
"pk_candidate_score": 0.95,
"metric_candidate_score": 0.0
}
],
"quality": {
"warning_hints": [],
"failed_expectations": []
},
"row_count": 300,
"fk_candidates": [],
"confidence_notes": [
"role判定为dimension,因所有列均为枚举或ID类型,无metric列",
"grain依据account_id和service_point_id为唯一标识推测",
"未发现时间列,因此time字段为null"
],
"primary_key_candidates": [
[
"account_id"
],
[
"service_point_id"
]
]
}

View File

@ -0,0 +1,286 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"aliases": [
{
"text": "供水所水表排行",
"tone": "中性"
},
{
"text": "哪个供水所水表最多",
"tone": "口语"
},
{
"text": "供水管理所水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"供水管理所",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"供水所",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_station",
"aliases": [
{
"text": "站点水表数量排行",
"tone": "中性"
},
{
"text": "哪个站点水表最多",
"tone": "口语"
},
{
"text": "站点维度水表TopN分析",
"tone": "专业"
}
],
"keywords": [
"水表",
"站点",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"站点数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_top_district",
"aliases": [
{
"text": "区域水表数量排名",
"tone": "中性"
},
{
"text": "哪个区水表最多",
"tone": "口语"
},
{
"text": "行政区水表TopN统计",
"tone": "专业"
}
],
"keywords": [
"水表",
"区域",
"行政区",
"排行",
"TopN",
"数量",
"统计",
"count",
"排名",
"前N",
"区",
"水表数",
"维度聚合",
"by_dimension",
"topn"
],
"intent_tags": [
"topn",
"by_dimension"
]
},
{
"id": "snpt_water_meter_share_by_type",
"aliases": [
{
"text": "水表类型占比",
"tone": "中性"
},
{
"text": "哪种水表用得最多",
"tone": "口语"
},
{
"text": "水表类型分布比例",
"tone": "专业"
}
],
"keywords": [
"水表",
"类型",
"占比",
"比例",
"ratio",
"分布",
"meter_type",
"百分比",
"分类统计",
"水表类型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"by_dimension"
]
},
{
"id": "snpt_water_meter_subtype_distribution",
"aliases": [
{
"text": "水表子类型分布",
"tone": "中性"
},
{
"text": "各种子类型水表情况",
"tone": "口语"
},
{
"text": "水表子类型计数与占比",
"tone": "专业"
}
],
"keywords": [
"水表",
"子类型",
"分布",
"数量",
"占比",
"meter_subtype",
"统计",
"count",
"百分比",
"分类统计",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_installation_position_stats",
"aliases": [
{
"text": "安装位置统计",
"tone": "中性"
},
{
"text": "哪种位置装表最多",
"tone": "口语"
},
{
"text": "水表安装位置分布",
"tone": "专业"
}
],
"keywords": [
"水表",
"安装位置",
"统计",
"分布",
"installation_position",
"数量",
"count",
"位置",
"安装点",
"aggregate",
"by_dimension"
],
"intent_tags": [
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_water_meter_grain_check",
"aliases": [
{
"text": "主键粒度校验",
"tone": "中性"
},
{
"text": "数据有没有重复",
"tone": "口语"
},
{
"text": "数据粒度一致性检查",
"tone": "专业"
}
],
"keywords": [
"主键",
"粒度",
"校验",
"质量",
"quality",
"重复",
"唯一性",
"account_id",
"service_point_id",
"数据校验",
"质量检查",
"异常检测"
],
"intent_tags": [
"quality"
]
},
{
"id": "snpt_water_meter_sample_records",
"aliases": [
{
"text": "水表数据抽样",
"tone": "中性"
},
{
"text": "给我看点水表数据",
"tone": "口语"
},
{
"text": "水表记录样本抽取",
"tone": "专业"
}
],
"keywords": [
"水表",
"样本",
"抽样",
"sample",
"随机",
"记录",
"抽查",
"limit",
"数据结构",
"数据示例",
"sample",
"limit_rows"
],
"intent_tags": [
"sample"
]
}
]

View File

@ -0,0 +1,235 @@
[
{
"id": "snpt_water_meter_top_supply_office",
"desc": "统计各供水管理所下辖水表数量并排序",
"type": "topn",
"title": "供水管理所水表数量排行",
"examples": [
"列出水表最多的前10个供水管理所",
"各供水所水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT supply_office AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY supply_office ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office"
]
},
"business_caliber": "按供水管理所维度聚合水表总数,粒度=供水管理所"
},
{
"id": "snpt_water_meter_top_station",
"desc": "统计各个站点下辖水表数量并排序",
"type": "topn",
"title": "站点水表数量排行",
"examples": [
"列出水表最多的前10个站点",
"各站点水表数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT station AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY station ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [
"高基数维度建议LIMIT<=50"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station"
]
},
"business_caliber": "按站点维度聚合水表总数,粒度=站点"
},
{
"id": "snpt_water_meter_top_district",
"desc": "统计各区水表数量并排序",
"type": "topn",
"title": "区域水表数量排行",
"examples": [
"列出各区水表数量排名",
"哪个区的水表最多?"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT district AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY district ORDER BY metric_value DESC LIMIT {{top_n}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district"
]
},
"business_caliber": "按行政区划维度聚合水表总数,粒度=区"
},
{
"id": "snpt_water_meter_share_by_type",
"desc": "计算各类水表占总水表的比例",
"type": "ratio",
"title": "水表类型占比分布",
"examples": [
"各类水表占比是多少?",
"哪种类型的水表使用最广泛?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type AS dim_value, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`) AS ratio_percent FROM `data-ge.water_meter_info` GROUP BY meter_type ORDER BY ratio_percent DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type"
]
},
"business_caliber": "按水表类型分类计算其占比,粒度=水表类型"
},
{
"id": "snpt_water_meter_subtype_distribution",
"desc": "展示不同水表子类型的数量及比例",
"type": "aggregate",
"title": "水表子类型分布情况",
"examples": [
"各种子类型水表的数量和占比",
"哪种子类型水表最多?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_subtype AS dim_value, COUNT(*) AS count_value, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`), 2) AS percentage FROM `data-ge.water_meter_info` GROUP BY meter_subtype ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 9
},
"time_column": "nullable",
"required_columns": [
"meter_subtype"
]
},
"business_caliber": "按水表子类型进行计数和百分比统计,粒度=水表子类型"
},
{
"id": "snpt_water_meter_installation_position_stats",
"desc": "统计不同安装位置下的水表数量",
"type": "aggregate",
"title": "安装位置分布统计",
"examples": [
"各种安装位置的水表数量",
"哪种安装位置最为常见?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT installation_position AS dim_value, COUNT(*) AS count_value FROM `data-ge.water_meter_info` GROUP BY installation_position ORDER BY count_value DESC"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": 4
},
"time_column": "nullable",
"required_columns": [
"installation_position"
]
},
"business_caliber": "按安装位置对水表进行分组计数,粒度=安装位置"
},
{
"id": "snpt_water_meter_grain_check",
"desc": "验证 account_id 和 service_point_id 是否构成唯一组合",
"type": "quality",
"title": "主键粒度校验",
"examples": [
"这张表的数据粒度是否正确?",
"是否存在重复的服务点记录?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT IF(COUNT(*) = COUNT(DISTINCT account_id, service_point_id), 'PASS', 'FAIL') AS grain_check_result FROM `data-ge.water_meter_info`"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "检验数据是否符合预期的主键粒度account_id + service_point_id"
},
{
"id": "snpt_water_meter_sample_records",
"desc": "随机抽取部分水表信息用于查看结构",
"type": "sample",
"title": "样本抽取",
"examples": [
"给我看几条水表数据的例子",
"抽查一些原始数据看看格式"
],
"variables": [
{
"name": "limit_rows",
"type": "int",
"default": 5
}
],
"dialect_sql": {
"mysql": "SELECT * FROM `data-ge.water_meter_info` ORDER BY RAND() LIMIT {{limit_rows}}"
},
"applicability": {
"constraints": {
"notes": [],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": []
},
"business_caliber": "从全量数据中随机采样若干条记录供参考"
}
]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,249 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"aliases": [
{
"text": "哪个供水所用户最多?",
"tone": "口语"
},
{
"text": "按供应办公室统计账户数量",
"tone": "中性"
},
{
"text": "供应办公室账户数TopN排名",
"tone": "专业"
}
],
"keywords": [
"供应办公室",
"账户数",
"TopN",
"排行",
"统计",
"account_id",
"supply_office",
"去重",
"高占比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_station_by_account",
"aliases": [
{
"text": "哪些站点用户最多?",
"tone": "口语"
},
{
"text": "按站点统计账户分布",
"tone": "中性"
},
{
"text": "站点账户数Top20排名",
"tone": "专业"
}
],
"keywords": [
"站点",
"账户数",
"TopN",
"排行",
"统计",
"station",
"account_id",
"去重",
"高负载",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_topn_district_by_account",
"aliases": [
{
"text": "哪个区用户最多?",
"tone": "口语"
},
{
"text": "按行政区统计账户数量",
"tone": "中性"
},
{
"text": "行政区账户数全量排名",
"tone": "专业"
}
],
"keywords": [
"行政区",
"账户数",
"TopN",
"排行",
"统计",
"district",
"account_id",
"去重",
"区域对比",
"维度分析",
"by_dimension",
"aggregate",
"topn"
],
"intent_tags": [
"topn",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_share_of_meter_type",
"aliases": [
{
"text": "各类水表占多少比例?",
"tone": "口语"
},
{
"text": "水表类型占比分析",
"tone": "中性"
},
{
"text": "水表类型占比分布",
"tone": "专业"
}
],
"keywords": [
"水表类型",
"占比",
"比例",
"meter_type",
"account_id",
"去重",
"分布",
"主流类型",
"技术选型",
"ratio",
"aggregate",
"by_dimension"
],
"intent_tags": [
"ratio",
"aggregate",
"by_dimension"
]
},
{
"id": "snpt_sample_account_service_point",
"aliases": [
{
"text": "随机看10条账户信息",
"tone": "口语"
},
{
"text": "抽样账户与服务点明细",
"tone": "中性"
},
{
"text": "账户-服务点随机抽样验证",
"tone": "专业"
}
],
"keywords": [
"抽样",
"随机",
"样本",
"account_id",
"service_point_id",
"数据质量",
"验证",
"唯一性",
"格式检查",
"sample",
"quality"
],
"intent_tags": [
"sample",
"quality"
]
},
{
"id": "snpt_filter_meter_status_valid",
"aliases": [
{
"text": "只取有效的水表记录",
"tone": "口语"
},
{
"text": "筛选有效水表记录",
"tone": "中性"
},
{
"text": "水表状态有效性过滤",
"tone": "专业"
}
],
"keywords": [
"有效",
"过滤",
"筛选",
"meter_status",
"质量检查",
"断言",
"清洗",
"filter",
"quality"
],
"intent_tags": [
"filter",
"quality"
]
},
{
"id": "snpt_filter_meter_diameter_20mm",
"aliases": [
{
"text": "找出所有20mm水表用户",
"tone": "口语"
},
{
"text": "筛选20mm水表记录",
"tone": "中性"
},
{
"text": "20mm口径水表子集提取",
"tone": "专业"
}
],
"keywords": [
"20mm",
"水表直径",
"过滤",
"筛选",
"meter_diameter",
"子集",
"分析",
"住宅用水",
"规格",
"filter",
"by_dimension"
],
"intent_tags": [
"filter",
"by_dimension"
]
}
]

View File

@ -0,0 +1,227 @@
[
{
"id": "snpt_topn_supply_office_by_account",
"desc": "统计各供应办公室对应的账户数量,识别高占比管理所",
"type": "topn",
"title": "按供应办公室统计账户数",
"examples": [
"哪个供水管理所服务的用户最多?",
"列出前5个账户数最多的供应办公室"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 11
}
],
"dialect_sql": {
"mysql": "SELECT supply_office, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY supply_office\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"供应办公室仅11个唯一值可安全展示全部建议LIMIT 11避免冗余排序"
],
"fk_join_available": false,
"dim_cardinality_hint": 11
},
"time_column": "nullable",
"required_columns": [
"supply_office",
"account_id"
]
},
"business_caliber": "粒度=供应办公室,指标=去重账户数account_id仅统计水表信息表中有效账户不关联外部表"
},
{
"id": "snpt_topn_station_by_account",
"desc": "统计各站点服务的账户数量,识别高负载站点",
"type": "topn",
"title": "按站点统计账户分布",
"examples": [
"哪些站点服务的用户最多?",
"TOP10用户最多的站点是哪些"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 20
}
],
"dialect_sql": {
"mysql": "SELECT station, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY station\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"站点有36个唯一值建议LIMIT<=20以避免结果过长高基数维度可能影响查询性能"
],
"fk_join_available": false,
"dim_cardinality_hint": 36
},
"time_column": "nullable",
"required_columns": [
"station",
"account_id"
]
},
"business_caliber": "粒度=站点station指标=去重账户数account_id基于水表信息表直接聚合不涉及时间维度"
},
{
"id": "snpt_topn_district_by_account",
"desc": "统计各行政区的账户数量,辅助区域资源分配分析",
"type": "topn",
"title": "按行政区统计账户分布",
"examples": [
"哪个区的用水账户最多?",
"列出所有行政区的账户数量排名"
],
"variables": [
{
"name": "top_n",
"type": "int",
"default": 13
}
],
"dialect_sql": {
"mysql": "SELECT district, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY district\nORDER BY account_count DESC\nLIMIT {{top_n}};"
},
"applicability": {
"constraints": {
"notes": [
"行政区共13个可完整展示适合用于区域对比分析"
],
"fk_join_available": false,
"dim_cardinality_hint": 13
},
"time_column": "nullable",
"required_columns": [
"district",
"account_id"
]
},
"business_caliber": "粒度=行政区district指标=去重账户数account_id基于水表信息表聚合反映各区域用户规模"
},
{
"id": "snpt_share_of_meter_type",
"desc": "计算各类水表类型在总账户中的占比,识别主流类型",
"type": "ratio",
"title": "水表类型占比分析",
"examples": [
"各类水表在用户中的占比是多少?",
"电磁式远传水表占总用户比例多少?"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT meter_type, \n COUNT(DISTINCT account_id) AS account_count,\n ROUND(COUNT(DISTINCT account_id) * 100.0 / SUM(COUNT(DISTINCT account_id)) OVER (), 2) AS percentage\nFROM water_meter_info\nGROUP BY meter_type\nORDER BY account_count DESC;"
},
"applicability": {
"constraints": {
"notes": [
"水表类型仅5种适合计算占比可直接展示全量分布"
],
"fk_join_available": false,
"dim_cardinality_hint": 5
},
"time_column": "nullable",
"required_columns": [
"meter_type",
"account_id"
]
},
"business_caliber": "粒度=水表类型(meter_type),指标=去重账户数占比,分母为各水表类型去重账户数之和(同一账户对应多种水表类型时会被重复计入),反映技术选型分布"
},
{
"id": "snpt_sample_account_service_point",
"desc": "随机抽取部分账户与服务点ID的原始记录用于数据质量核查",
"type": "sample",
"title": "抽样账户与服务点明细",
"examples": [
"随机查看10条账户与服务点的详细信息",
"抽样检查水表信息是否符合预期格式"
],
"variables": [
{
"name": "sample_size",
"type": "int",
"default": 10
}
],
"dialect_sql": {
"mysql": "SELECT account_id, service_point_id, supply_office, station, district, meter_diameter, meter_type, meter_subtype, installation_position\nFROM water_meter_info\nORDER BY RAND()\nLIMIT {{sample_size}};"
},
"applicability": {
"constraints": {
"notes": [
"主键组合为account_id+service_point_id适合抽样验证唯一性建议样本量≤100"
],
"fk_join_available": false,
"dim_cardinality_hint": null
},
"time_column": "nullable",
"required_columns": [
"account_id",
"service_point_id"
]
},
"business_caliber": "粒度=单条水表记录抽取样本用于验证account_id与service_point_id的组合唯一性及维度字段完整性"
},
{
"id": "snpt_filter_meter_status_valid",
"desc": "过滤出水表状态为'有效'的记录,用于后续分析",
"type": "quality",
"title": "筛选有效水表记录",
"examples": [
"只取状态为有效的水表记录",
"确认所有水表是否均为有效状态"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_status = '有效';"
},
"applicability": {
"constraints": {
"notes": [
"meter_status仅存在'有效'值,此条件恒成立;可用于数据清洗流程的显式过滤"
],
"fk_join_available": false,
"dim_cardinality_hint": 1
},
"time_column": "nullable",
"required_columns": [
"meter_status"
]
},
"business_caliber": "仅保留水表状态为'有效'的记录,因全表均为有效值,此过滤为冗余但可作为数据质量校验的显式断言"
},
{
"id": "snpt_filter_meter_diameter_20mm",
"desc": "筛选水表直径为20mm的记录用于特定口径设备分析",
"type": "quality",
"title": "筛选20mm水表记录",
"examples": [
"找出所有使用20mm水表的用户",
"20mm水表分布在哪些站点"
],
"variables": [],
"dialect_sql": {
"mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_diameter = '20mm';"
},
"applicability": {
"constraints": {
"notes": [
"水表直径共8种枚举值20mm为常见规格可作为子集分析的起点"
],
"fk_join_available": false,
"dim_cardinality_hint": 8
},
"time_column": "nullable",
"required_columns": [
"meter_diameter"
]
},
"business_caliber": "粒度=单条水表记录筛选条件为meter_diameter='20mm',用于分析标准住宅用水表的分布特征"
}
]

21
file/ecommerce_orders.sql Normal file
View File

@ -0,0 +1,21 @@
-- Order table keyed by the order UUID: `order_id` is the primary key, so each
-- order appears at most once.
-- String columns declare no collation of their own; they inherit
-- utf8mb4 / utf8mb4_unicode_ci from the table-level defaults at the bottom.
CREATE TABLE `ecommerce_orders` (
    -- Keys
    `order_id`         CHAR(36)      NOT NULL COMMENT 'UUID from CSV',
    `customer_id`      INT           NOT NULL,
    `product_id`       INT           NOT NULL,

    -- Ordered item
    `category`         VARCHAR(64)   NOT NULL,
    `price`            DECIMAL(10,2) NOT NULL,  -- exact decimal, two fractional digits
    `quantity`         INT           NOT NULL,

    -- Timestamps, stored with microsecond precision
    `order_date`       DATETIME(6)   NOT NULL,
    `shipping_date`    DATETIME(6)   NOT NULL,

    -- Categorical attributes
    `delivery_status`  VARCHAR(32)   NOT NULL,
    `payment_method`   VARCHAR(32)   NOT NULL,
    `device_type`      VARCHAR(32)   NOT NULL,
    `channel`          VARCHAR(32)   NOT NULL,

    -- Free-text addresses
    `shipping_address` VARCHAR(255)  NOT NULL,
    `billing_address`  VARCHAR(255)  NOT NULL,

    `customer_segment` VARCHAR(32)   NOT NULL,

    PRIMARY KEY (`order_id`),

    -- Secondary indexes for lookups by customer, product, and order time
    INDEX `idx_customer`   (`customer_id`),
    INDEX `idx_product`    (`product_id`),
    INDEX `idx_order_date` (`order_date`)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci;