From a72ca3593ec6d2df0833b4499dd20cbb02f4a05b Mon Sep 17 00:00:00 2001 From: zhaoawd Date: Fri, 14 Nov 2025 00:58:00 +0800 Subject: [PATCH] =?UTF-8?q?demo=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- demo/水务/水务-gemini2.5-ge-result.json | 1 + demo/水务/水务-gemini2.5-snippet-alias.json | 180 ++++++++ demo/水务/水务-gemini2.5-snippet.json | 186 ++++++++ demo/水务/水务-gpt5-ge-desc.json | 230 ++++++++++ demo/水务/水务-gpt5-snippet-alias.json | 372 ++++++++++++++++ demo/水务/水务-gpt5-snippet.json | 330 ++++++++++++++ demo/水务/水务-qwen3-coder-480b-ge-desc.json | 415 ++++++++++++++++++ .../水务-qwen3-coder-480b-snippet-alias.json | 286 ++++++++++++ demo/水务/水务-qwen3-coder-480b-snippet.json | 235 ++++++++++ demo/水务/水务-qwen3-next-80b-ge-desc.json | 1 + .../水务-qwen3-next-80b-snippet-alias.json | 249 +++++++++++ demo/水务/水务-qwen3-next-80b-snippet.json | 227 ++++++++++ file/ecommerce_orders.sql | 21 + 13 files changed, 2733 insertions(+) create mode 100644 demo/水务/水务-gemini2.5-ge-result.json create mode 100644 demo/水务/水务-gemini2.5-snippet-alias.json create mode 100644 demo/水务/水务-gemini2.5-snippet.json create mode 100644 demo/水务/水务-gpt5-ge-desc.json create mode 100644 demo/水务/水务-gpt5-snippet-alias.json create mode 100644 demo/水务/水务-gpt5-snippet.json create mode 100644 demo/水务/水务-qwen3-coder-480b-ge-desc.json create mode 100644 demo/水务/水务-qwen3-coder-480b-snippet-alias.json create mode 100644 demo/水务/水务-qwen3-coder-480b-snippet.json create mode 100644 demo/水务/水务-qwen3-next-80b-ge-desc.json create mode 100644 demo/水务/水务-qwen3-next-80b-snippet-alias.json create mode 100644 demo/水务/水务-qwen3-next-80b-snippet.json create mode 100644 file/ecommerce_orders.sql diff --git a/demo/水务/水务-gemini2.5-ge-result.json b/demo/水务/水务-gemini2.5-ge-result.json new file mode 100644 index 0000000..232f153 --- /dev/null +++ b/demo/水务/水务-gemini2.5-ge-result.json @@ -0,0 +1 @@ +{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["service_point_id"], "table": "data-ge.water_meter_info", "columns": [{"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}, {"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.03666666666666667, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标(如空值率、唯一性)缺失,但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.9, "metric_candidate_score": 0.0}, {"name": "service_point_id", "dtype": "unknown", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列的统计指标(如空值率、唯一性)缺失,但根据命名规则推断为ID。", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 0.95, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "该列只有一个唯一值 '有效'。", "enumish": true, "null_rate": 0.0, "top_values": [], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}], "quality": {"warning_hints": ["列 'meter_status' 只有一个唯一值 '有效',可能为常量列。"], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["表角色(role)被推断为 'dimension',因为其列几乎完全由ID和类别属性构成,且缺少数值指标或时间序列列。", "主键候选(primary_key_candidates) 'service_point_id' 和 'account_id' 是基于命名约定(包含'_id')推断的。其唯一性和非空性未在GE结果中直接度量,因此这是一个高置信度的猜测。", "表粒度(grain)可能为 'service_point',与推断的主键 'service_point_id' 相对应。", "未根据列名或数据格式识别出时间列。"], "primary_key_candidates": [["service_point_id"], ["account_id"]]} \ No newline at end of file diff --git a/demo/水务/水务-gemini2.5-snippet-alias.json b/demo/水务/水务-gemini2.5-snippet-alias.json new file mode 100644 index 0000000..eaff6cb --- /dev/null +++ b/demo/水务/水务-gemini2.5-snippet-alias.json @@ -0,0 +1,180 @@ +[ + { + "id": "snpt_count-service-points-by-dimension", + "aliases": [ + { + "text": "各个区有多少水表", + "tone": "口语" + }, + { + "text": "按维度统计用水点数", + "tone": "中性" + }, + { + "text": "各维度用水点数量分布", + "tone": "专业" + } + ], + "keywords": [ + "用水点数", + "service_point_count", + "数量", + "统计", + "汇总", + "aggregate", + "维度", + "dimension", + "区域", + "district", + "供水所", + "分组统计", + "水表" + ], + "intent_tags": [ + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_topn-service-points-by-dimension", + "aliases": [ + { + "text": "哪个地方水表最多", + "tone": "口语" + }, + { + "text": "用水点数Top-N排名", + "tone": "中性" + }, + { + "text": "Top-N用水点数维度排行", + "tone": "专业" + } + ], + "keywords": [ + "Top-N", + "top", + "排名", + "排行", + "ranking", + "最多", + "用水点数", + "service_point_count", + "维度", + "dimension", + "站点", + "station", + "水表" + ], + "intent_tags": [ + "topn", + "by_dimension" + ] + }, + { + "id": "snpt_ratio-service-points-by-dimension", + "aliases": [ + { + "text": "各种水表各占多少", + "tone": "口语" + }, + { + "text": "各维度用水点数占比", + "tone": "中性" + }, + { + "text": "用水点维度构成分析", + "tone": "专业" + } + ], + "keywords": [ + "占比", + "percentage", + "百分比", + "ratio", + "构成", + "分布", + "用水点数", + "水表类型", + "meter_type", + "维度", + "dimension", + "水表" + ], + "intent_tags": [ + "ratio", + "by_dimension" + ] + }, + { + "id": "snpt_quality-check-duplicate-spid", + "aliases": [ + { + "text": "有没有重复的水表号", + "tone": "口语" + }, + { + "text": "检查重复的用水点ID", + "tone": "中性" + }, + { + "text": "用水点ID唯一性校验", + "tone": "专业" + } + ], + "keywords": [ + "数据质量", + "quality", + "检查", + "校验", + "重复", + "duplicate", + "唯一性", + "uniqueness", + "用水点ID", + "service_point_id", + "异常检测", + "主键" + ], + "intent_tags": [ + "quality", + "by_dimension" + ] + }, + { + "id": "snpt_sample-filter-service-points-by-dims", + "aliases": [ + { + "text": "给我看城区的机械表", + "tone": "口语" + }, + { + "text": "按多维度筛选用水点", + "tone": "中性" + }, + { + "text": "多维组合条件过滤用水点", + "tone": "专业" + } + ], + "keywords": [ + "筛选", + "过滤", + "filter", + "查询", + "明细", + "列表", + "sample", + "用水点", + "区域", + "district", + "水表类型", + "meter_type", + "条件查询" + ], + "intent_tags": [ + "sample", + "filter" + ] + } +] \ No newline at end of file diff --git a/demo/水务/水务-gemini2.5-snippet.json b/demo/水务/水务-gemini2.5-snippet.json new file mode 100644 index 0000000..33bb92e --- /dev/null +++ b/demo/水务/水务-gemini2.5-snippet.json @@ -0,0 +1,186 @@ +[ + { + "id": "snpt_count-service-points-by-dimension", + "desc": "按指定维度(如区域、供水所)分组,统计各分类下的用水点数量。", + "type": "aggregate", + "title": "按维度统计用水点数", + "examples": [ + "按区域统计用水点数量", + "各个供水所分别有多少个用水点" + ], + "variables": [ + { + "name": "dimension_column", + "type": "column", + "default": "district" + } + ], + "dialect_sql": { + "mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;" + }, + "applicability": { + "constraints": { + "notes": [ + "适用于对水表档案信息进行分类汇总统计。", + "可将变量 ${dimension_column} 替换为任一维度列,如 district, supply_office, station, meter_type 等。" + ], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "用水点数:对 `service_point_id` 进行去重计数,代表一个独立的服务点(通常对应一个水表)。统计粒度为“指定维度”。" + }, + { + "id": "snpt_topn-service-points-by-dimension", + "desc": "按指定维度(如区域、站点)统计用水点数,并展示数量最多的前N个分类。", + "type": "topn", + "title": "Top-N 用水点数维度排名", + "examples": [ + "哪个区域的用水点最多", + "用水点数排名前5的站点是哪些" + ], + "variables": [ + { + "name": "dimension_column", + "type": "column", + "default": "station" + }, + { + "name": "top_n", + "type": "int", + "default": 10 + } + ], + "dialect_sql": { + "mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC\nLIMIT ${top_n};" + }, + "applicability": { + "constraints": { + "notes": [ + "维度 `station` 基数较高 (36),建议 Top-N 查询时结合业务场景合理设置 N 值。" + ], + "fk_join_available": false, + "dim_cardinality_hint": 36 + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "用水点数:对 `service_point_id` 进行去重计数。排名依据为各维度分类下的用水点总数。统计粒度为“指定维度”。" + }, + { + "id": "snpt_ratio-service-points-by-dimension", + "desc": "计算在指定维度下,各分类的用水点数占总用水点数的百分比,以分析其分布构成。", + "type": "ratio", + "title": "各维度用水点数占比", + "examples": [ + "不同水表类型(meter_type)的分布情况", + "各个区域的用水点占比是多少" + ], + "variables": [ + { + "name": "dimension_column", + "type": "column", + "default": "meter_type" + } + ], + "dialect_sql": { + "mysql": "SELECT\n `${dimension_column}`,\n COUNT(DISTINCT service_point_id) AS service_point_count,\n COUNT(DISTINCT service_point_id) * 100.0 / SUM(COUNT(DISTINCT service_point_id)) OVER () AS percentage\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n `${dimension_column}`\nORDER BY\n service_point_count DESC;" + }, + "applicability": { + "constraints": { + "notes": [ + "SQL模板使用了窗口函数 SUM() OVER(),请确保MySQL版本支持(8.0+)。" + ], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "用水点数占比:某分类下的用水点数 / 总用水点数。用水点数以 `service_point_id` 去重计数。统计粒度为“指定维度”。" + }, + { + "id": "snpt_quality-check-duplicate-spid", + "desc": "查找在用水点信息表中存在重复的 `service_point_id`,用于数据质量校验。", + "type": "quality", + "title": "检查重复的用水点ID", + "examples": [ + "检查是否存在重复的水表档案", + "校验用水点ID的唯一性" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT\n service_point_id,\n COUNT(*) AS occurrences\nFROM\n `data-ge.water_meter_info`\nGROUP BY\n service_point_id\nHAVING\n COUNT(*) > 1;" + }, + "applicability": { + "constraints": { + "notes": [ + "预期返回结果为空。若有返回,则表示数据存在一致性问题,`service_point_id` 未能作为唯一主键。" + ], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "重复项:指 `service_point_id` 出现次数大于1的记录。此ID应为表的主键,理论上不应重复。" + }, + { + "id": "snpt_sample-filter-service-points-by-dims", + "desc": "根据区域、水表类型、供水所等多个维度组合条件,筛选出符合条件的用水点明细。", + "type": "sample", + "title": "多维度筛选用水点列表", + "examples": [ + "查询城区的机械表有哪些", + "拉取某个供水所下特定口径水表的列表" + ], + "variables": [ + { + "name": "district_name", + "type": "string", + "default": "城区" + }, + { + "name": "meter_type_name", + "type": "string", + "default": "机械表" + }, + { + "name": "limit_num", + "type": "int", + "default": 100 + } + ], + "dialect_sql": { + "mysql": "SELECT\n service_point_id,\n account_id,\n district,\n supply_office,\n meter_type,\n meter_subtype,\n meter_diameter\nFROM\n `data-ge.water_meter_info`\nWHERE\n district = '${district_name}'\n AND meter_type = '${meter_type_name}'\n -- AND meter_status = '有效' -- 可选:根据画像,该列为常量'有效',可不加\nLIMIT ${limit_num};" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": null, + "required_columns": [ + "service_point_id", + "account_id", + "district", + "supply_office", + "meter_type", + "meter_subtype", + "meter_diameter" + ] + }, + "business_caliber": "返回满足所有筛选条件的用水点明细信息。`meter_status` 列只有一个值 '有效',通常无需作为筛选条件。" + } +] \ No newline at end of file diff --git a/demo/水务/水务-gpt5-ge-desc.json b/demo/水务/水务-gpt5-ge-desc.json new file mode 100644 index 0000000..d0caaf5 --- /dev/null +++ b/demo/水务/水务-gpt5-ge-desc.json @@ -0,0 +1,230 @@ +{ + "role": "dimension", + "time": { + "range": null, + "column": null, + "has_gaps": null, + "granularity": "unknown" + }, + "grain": [ + "service_point_id" + ], + "table": "data-ge.water_meter_info", + "columns": [ + { + "name": "supply_office", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;11 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 11, + "distinct_ratio": 0.03666666666666667, + "pk_candidate_score": 0.05, + "metric_candidate_score": 0.0 + }, + { + "name": "station", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;36 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 36, + "distinct_ratio": 0.12, + "pk_candidate_score": 0.1, + "metric_candidate_score": 0.0 + }, + { + "name": "district", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;13 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 13, + "distinct_ratio": 0.043333333333333335, + "pk_candidate_score": 0.05, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_diameter", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;8 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 8, + "distinct_ratio": 0.02666666666666667, + "pk_candidate_score": 0.03, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_status", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;单一取值(\"有效\")", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 1, + "distinct_ratio": 0.0033333333333333335, + "pk_candidate_score": 0.0, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_subtype", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;9 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 9, + "distinct_ratio": 0.03, + "pk_candidate_score": 0.03, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_type", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;5 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 5, + "distinct_ratio": 0.016666666666666666, + "pk_candidate_score": 0.02, + "metric_candidate_score": 0.0 + }, + { + "name": "installation_position", + "dtype": "string", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "非空;4 个枚举值(GE 约束)", + "enumish": true, + "null_rate": 0.0, + "top_values": [], + "semantic_type": "dimension", + "distinct_count": 4, + "distinct_ratio": 0.013333333333333334, + "pk_candidate_score": 0.02, + "metric_candidate_score": 0.0 + }, + { + "name": "service_point_id", + "dtype": "unknown", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "命名指示标识列;未提供唯一性或非空验证", + "enumish": null, + "null_rate": null, + "top_values": [], + "semantic_type": "id", + "distinct_count": null, + "distinct_ratio": null, + "pk_candidate_score": 0.6, + "metric_candidate_score": 0.05 + }, + { + "name": "account_id", + "dtype": "unknown", + "stats": { + "max": null, + "min": null, + "std": null, + "mean": null, + "skewness": null + }, + "comment": "命名指示账户标识;未提供唯一性或非空验证", + "enumish": null, + "null_rate": null, + "top_values": [], + "semantic_type": "id", + "distinct_count": null, + "distinct_ratio": null, + "pk_candidate_score": 0.5, + "metric_candidate_score": 0.05 + } + ], + "quality": { + "warning_hints": [ + "以下列未设置非空校验:service_point_id, account_id(空值情况未知)", + "未识别到时间列" + ], + "failed_expectations": [] + }, + "row_count": 300, + "fk_candidates": [], + "confidence_notes": [ + "role 判定为 dimension:表内列均为枚举/分类或ID,未发现数值型度量或时间列;34/34 期望均为分类枚举/非空与去重比例。", + "grain 猜测为 service_point_id:仅依据命名启发式,缺少唯一性与非空度量佐证(置信度较低)。", + "未识别时间列:列名与期望均未涉及日期/时间,也无最小/最大时间范围可推断。" + ], + "primary_key_candidates": [] +} \ No newline at end of file diff --git a/demo/水务/水务-gpt5-snippet-alias.json b/demo/水务/水务-gpt5-snippet-alias.json new file mode 100644 index 0000000..ce1942e --- /dev/null +++ b/demo/水务/水务-gpt5-snippet-alias.json @@ -0,0 +1,372 @@ +[ + { + "id": "snpt_topn_station", + "aliases": [ + { + "text": "站点水表排行前N", + "tone": "中性" + }, + { + "text": "哪个站点表最多", + "tone": "口语" + }, + { + "text": "按站点水表TopN", + "tone": "专业" + } + ], + "keywords": [ + "TopN", + "排名", + "排行", + "station", + "站点", + "水表数", + "meter count", + "distinct", + "去重", + "聚合", + "排序", + "榜单" + ], + "intent_tags": [ + "topn", + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_share_district", + "aliases": [ + { + "text": "各辖区水表占比", + "tone": "中性" + }, + { + "text": "哪个辖区占比高", + "tone": "口语" + }, + { + "text": "按辖区水表比例", + "tone": "专业" + } + ], + "keywords": [ + "占比", + "ratio", + "district", + "辖区", + "水表数", + "meter count", + "distinct", + "去重", + "百分比", + "份额", + "聚合", + "排序", + "分布" + ], + "intent_tags": [ + "ratio", + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_dist_diameter", + "aliases": [ + { + "text": "表径水表数分布", + "tone": "中性" + }, + { + "text": "不同口径有多少", + "tone": "口语" + }, + { + "text": "按表径去重计数", + "tone": "专业" + } + ], + "keywords": [ + "分布", + "distribution", + "meter_diameter", + "表径", + "水表数", + "meter count", + "distinct", + "去重", + "聚合", + "类别", + "category", + "条形图", + "饼图", + "排行" + ], + "intent_tags": [ + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_type_subtype_matrix", + "aliases": [ + { + "text": "类型×子类水表数", + "tone": "中性" + }, + { + "text": "看各类型各子类", + "tone": "口语" + }, + { + "text": "类型子类组合统计", + "tone": "专业" + } + ], + "keywords": [ + "类型", + "type", + "子类", + "subtype", + "组合", + "matrix", + "交叉分析", + "cross-tab", + "水表数", + "meter count", + "distinct", + "去重", + "分布", + "聚合", + "维度" + ], + "intent_tags": [ + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_quality_spid_uniq", + "aliases": [ + { + "text": "服务点ID唯一性检", + "tone": "专业" + }, + { + "text": "服务点ID有重复吗", + "tone": "口语" + }, + { + "text": "服务点ID完整性评估", + "tone": "中性" + } + ], + "keywords": [ + "质量检查", + "quality", + "唯一性", + "uniqueness", + "重复", + "duplicate", + "空值", + "NULL", + "完整性", + "integrity", + "service_point_id", + "数据质量", + "统计", + "去重", + "异常检测" + ], + "intent_tags": [ + "quality" + ] + }, + { + "id": "snpt_quality_account_nulls", + "aliases": [ + { + "text": "账户ID缺失明细", + "tone": "中性" + }, + { + "text": "看看哪些账户为空", + "tone": "口语" + }, + { + "text": "account_id空值样本", + "tone": "专业" + } + ], + "keywords": [ + "质量检查", + "缺失", + "missing", + "空值", + "NULL", + "account_id", + "样本", + "sample", + "抽样", + "sampling", + "明细", + "排查", + "过滤", + "WHERE", + "LIMIT" + ], + "intent_tags": [ + "quality", + "sample" + ] + }, + { + "id": "snpt_sample_random_rows", + "aliases": [ + { + "text": "随机抽样水表明细", + "tone": "中性" + }, + { + "text": "随机取几条看看", + "tone": "口语" + }, + { + "text": "RAND()样本抽取", + "tone": "专业" + } + ], + "keywords": [ + "随机", + "random", + "样本", + "sample", + "抽样", + "sampling", + "明细", + "details", + "质检", + "QA", + "RAND()", + "LIMIT", + "抽取", + "数据验证" + ], + "intent_tags": [ + "sample" + ] + }, + { + "id": "snpt_filter_office_type_where", + "aliases": [ + { + "text": "按所与类型过滤有效", + "tone": "专业" + }, + { + "text": "筛选某所的指定类型", + "tone": "中性" + }, + { + "text": "只看这所的这种表", + "tone": "口语" + } + ], + "keywords": [ + "过滤", + "filter", + "WHERE", + "supply_office", + "营业所", + "meter_type", + "类型", + "meter_status", + "有效", + "条件片段", + "筛选", + "查询拼接", + "字段", + "约束" + ], + "intent_tags": [ + "filter" + ] + }, + { + "id": "snpt_office_station_dist", + "aliases": [ + { + "text": "所站组合水表数", + "tone": "中性" + }, + { + "text": "各站在各所有多少", + "tone": "口语" + }, + { + "text": "营业所×站点分布", + "tone": "专业" + } + ], + "keywords": [ + "supply_office", + "营业所", + "station", + "站点", + "层级", + "hierarchy", + "分布", + "distribution", + "水表数", + "meter count", + "distinct", + "去重", + "聚合", + "交叉分析", + "排行" + ], + "intent_tags": [ + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_total_meter_baseline", + "aliases": [ + { + "text": "水表总量基线", + "tone": "中性" + }, + { + "text": "现在有多少水表", + "tone": "口语" + }, + { + "text": "全表去重总数", + "tone": "专业" + } + ], + "keywords": [ + "总量", + "total", + "baseline", + "基线", + "水表总数", + "meter total", + "service_point_id", + "distinct", + "去重", + "分母", + "denominator", + "占比", + "聚合", + "汇总", + "snapshot" + ], + "intent_tags": [ + "aggregate" + ] + } +] \ No newline at end of file diff --git a/demo/水务/水务-gpt5-snippet.json b/demo/水务/水务-gpt5-snippet.json new file mode 100644 index 0000000..682bda5 --- /dev/null +++ b/demo/水务/水务-gpt5-snippet.json @@ -0,0 +1,330 @@ +[ + { + "id": "snpt_topn_station", + "desc": "按站点统计水表数量并取前N", + "type": "topn", + "title": "站点TopN水表数", + "examples": [ + "各站点水表数量排名前10", + "站点水表覆盖情况排行" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 10 + } + ], + "dialect_sql": { + "mysql": "SELECT station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY station\nORDER BY meter_cnt DESC\nLIMIT {{top_n}};" + }, + "applicability": { + "constraints": { + "notes": [ + "TopN建议N<=36", + "以service_point_id去重计数", + "无时间列,无法做趋势" + ], + "fk_join_available": false, + "dim_cardinality_hint": 36 + }, + "time_column": null, + "required_columns": [ + "station", + "service_point_id" + ] + }, + "business_caliber": "水表数=按service_point_id去重计数;粒度=站点。仅统计当前表中的有效记录(不含时间口径)。安全限制:用于分析排名,避免扩大LIMIT造成全量导出。" + }, + { + "id": "snpt_share_district", + "desc": "统计各辖区水表数及其占比", + "type": "ratio", + "title": "辖区水表占比", + "examples": [ + "各辖区水表占比", + "哪个辖区水表最多" + ], + "variables": [], + "dialect_sql": { + "mysql": "WITH by_district AS (\n SELECT district, COUNT(DISTINCT service_point_id) AS meter_cnt\n FROM `data-ge`.`water_meter_info`\n GROUP BY district\n), tot AS (\n SELECT COUNT(DISTINCT service_point_id) AS total_cnt\n FROM `data-ge`.`water_meter_info`\n)\nSELECT b.district,\n b.meter_cnt,\n ROUND(b.meter_cnt / NULLIF(t.total_cnt, 0) * 100, 2) AS pct\nFROM by_district b\nCROSS JOIN tot t\nORDER BY pct DESC, b.district;" + }, + "applicability": { + "constraints": { + "notes": [ + "占比分母为全表service_point_id去重总数", + "service_point_id为空将被忽略" + ], + "fk_join_available": false, + "dim_cardinality_hint": 13 + }, + "time_column": null, + "required_columns": [ + "district", + "service_point_id" + ] + }, + "business_caliber": "水表数=按service_point_id去重计数;粒度=辖区。占比=辖区水表数/全表水表总数。安全限制:仅基于本表,不代表全市/全网口径;无时间维度。" + }, + { + "id": "snpt_dist_diameter", + "desc": "按表径统计水表数量分布", + "type": "aggregate", + "title": "表径分布统计", + "examples": [ + "不同口径水表有多少", + "查看表径分布情况" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT meter_diameter,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_diameter\nORDER BY meter_cnt DESC, meter_diameter;" + }, + "applicability": { + "constraints": { + "notes": [ + "以service_point_id去重计数", + "适合绘制条形图/饼图" + ], + "fk_join_available": false, + "dim_cardinality_hint": 8 + }, + "time_column": null, + "required_columns": [ + "meter_diameter", + "service_point_id" + ] + }, + "business_caliber": "水表数=按service_point_id去重计数;粒度=表径。安全限制:仅用于分布分析,不含时间过滤;避免用于明细导出。" + }, + { + "id": "snpt_type_subtype_matrix", + "desc": "统计水表类型与子类组合的数量", + "type": "aggregate", + "title": "类型子类分布", + "examples": [ + "不同类型与子类的水表数量", + "查看类型与子类的组合分布" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT meter_type,\n meter_subtype,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY meter_type, meter_subtype\nORDER BY meter_cnt DESC, meter_type, meter_subtype;" + }, + "applicability": { + "constraints": { + "notes": [ + "组合基数<=5×9=45", + "以service_point_id去重计数" + ], + "fk_join_available": false, + "dim_cardinality_hint": 45 + }, + "time_column": null, + "required_columns": [ + "meter_type", + "meter_subtype", + "service_point_id" + ] + }, + "business_caliber": "水表数=按service_point_id去重计数;粒度=类型×子类组合。安全限制:仅用于汇总分析,不包含时间或业务状态变化。" + }, + { + "id": "snpt_quality_spid_uniq", + "desc": "评估service_point_id的空值与重复情况", + "type": "quality", + "title": "服务点唯一性检", + "examples": [ + "检查服务点ID是否唯一", + "统计service_point_id空值与重复情况" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT\n COUNT(*) AS total_rows,\n SUM(service_point_id IS NULL) AS null_cnt,\n COUNT(DISTINCT service_point_id) AS distinct_cnt,\n (COUNT(*) - COUNT(DISTINCT service_point_id)) AS duplicate_rows_est,\n (\n SELECT COUNT(*) FROM (\n SELECT service_point_id\n FROM `data-ge`.`water_meter_info`\n GROUP BY service_point_id\n HAVING COUNT(*) > 1\n ) AS dup\n ) AS dup_key_groups\nFROM `data-ge`.`water_meter_info`;" + }, + "applicability": { + "constraints": { + "notes": [ + "用于键完整性检查", + "重复行估算=总行数-去重数" + ], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "质量检查口径:在本表内评估service_point_id的非空与唯一性,不代表跨表全局唯一。安全限制:仅输出汇总指标,不暴露明细重复值。" + }, + { + "id": "snpt_quality_account_nulls", + "desc": "抽取account_id为空的记录用于排查", + "type": "quality", + "title": "账户ID缺失明细", + "examples": [ + "列出account_id为空的水表", + "抽样查看账户缺失的数据行" + ], + "variables": [ + { + "name": "limit_n", + "type": "int", + "default": 50 + } + ], + "dialect_sql": { + "mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nWHERE account_id IS NULL\nLIMIT {{limit_n}};" + }, + "applicability": { + "constraints": { + "notes": [ + "明细仅限小样本抽取", + "建议LIMIT<=100,避免全量导出" + ], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": null, + "required_columns": [ + "account_id" + ] + }, + "business_caliber": "质量抽样:筛出账户ID缺失的水表记录,便于核对。安全限制:仅用于样本排查,不建议在生产中全量导出;如需口径统计请改为COUNT聚合。" + }, + { + "id": "snpt_sample_random_rows", + "desc": "随机抽取水表信息用于人工核验", + "type": "sample", + "title": "随机抽样明细", + "examples": [ + "抽样查看水表信息", + "随机抽取20条做质检" + ], + "variables": [ + { + "name": "sample_size", + "type": "int", + "default": 20 + } + ], + "dialect_sql": { + "mysql": "SELECT *\nFROM `data-ge`.`water_meter_info`\nORDER BY RAND()\nLIMIT {{sample_size}};" + }, + "applicability": { + "constraints": { + "notes": [ + "使用RAND()随机,样本不可复现", + "建议限制样本量" + ], + "fk_join_available": false, + "dim_cardinality_hint": 300 + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "样本抽取:从本表随机返回若干行明细。安全限制:避免扩大LIMIT进行全量下载;如需可复现样本,请改用带种子的随机方法(MySQL不原生支持)。" + }, + { + "id": "snpt_filter_office_type_where", + "desc": "常用WHERE筛选条件片段:按营业所与类型且为有效", + "type": "sample", + "title": "机构类型筛选片", + "examples": [ + "筛选A营业所的机械表", + "仅查看某营业所的指定类型水表" + ], + "variables": [ + { + "name": "supply_office", + "type": "string" + }, + { + "name": "meter_type", + "type": "string" + } + ], + "dialect_sql": { + "mysql": "WHERE supply_office = '{{supply_office}}'\n AND meter_type = '{{meter_type}}'\n AND meter_status = '有效'" + }, + "applicability": { + "constraints": { + "notes": [ + "这是条件片段,可拼接到其他查询", + "meter_status当前为单一值“有效”" + ], + "fk_join_available": false, + "dim_cardinality_hint": 11 + }, + "time_column": null, + "required_columns": [ + "supply_office", + "meter_type", + "meter_status" + ] + }, + "business_caliber": "过滤口径:仅保留指定营业所与指定水表类型、且状态为“有效”的记录。安全限制:为片段用途,需拼接在SELECT…FROM之后使用。" + }, + { + "id": "snpt_office_station_dist", + "desc": "按营业所与站点组合统计水表数", + "type": "aggregate", + "title": "所站层级分布", + "examples": [ + "按营业所查看各站点水表数", + "所站两级的水表分布情况" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT supply_office,\n station,\n COUNT(DISTINCT service_point_id) AS meter_cnt\nFROM `data-ge`.`water_meter_info`\nGROUP BY supply_office, station\nORDER BY supply_office, meter_cnt DESC, station;" + }, + "applicability": { + "constraints": { + "notes": [ + "组合基数<=11×36=396", + "以service_point_id去重计数", + "如结果过长可再按TopN筛选" + ], + "fk_join_available": false, + "dim_cardinality_hint": 396 + }, + "time_column": null, + "required_columns": [ + "supply_office", + "station", + "service_point_id" + ] + }, + "business_caliber": "水表数=按service_point_id去重计数;粒度=营业所×站点。安全限制:结果行数可能较多,建议在可视化端增加筛选或分页。" + }, + { + "id": "snpt_total_meter_baseline", + "desc": "获取全表水表去重总量基线", + "type": "aggregate", + "title": "水表总量基线", + "examples": [ + "当前有多少只水表", + "作为占比分析的分母基线" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT COUNT(DISTINCT service_point_id) AS meter_total\nFROM `data-ge`.`water_meter_info`;" + }, + "applicability": { + "constraints": { + "notes": [ + "作为其他占比/分摊分母基线", + "忽略service_point_id为空的记录" + ], + "fk_join_available": false, + "dim_cardinality_hint": 300 + }, + "time_column": null, + "required_columns": [ + "service_point_id" + ] + }, + "business_caliber": "水表总量=按service_point_id去重计数;基于当前表的全量记录。安全限制:无时间维度,无法反映存量随时间变化。" + } +] \ No newline at end of file diff --git a/demo/水务/水务-qwen3-coder-480b-ge-desc.json b/demo/水务/水务-qwen3-coder-480b-ge-desc.json new file mode 100644 index 0000000..9eef93b --- /dev/null +++ b/demo/水务/水务-qwen3-coder-480b-ge-desc.json @@ -0,0 +1,415 @@ +{ + "role": "dimension", + "time": { + "range": null, + "column": null, + "has_gaps": null, + "granularity": "unknown" + }, + "grain": [ + "account_id", + "service_point_id" + ], + "table": "data-ge.water_meter_info", + "columns": [ + { + "name": "supply_office", + "dtype": "string", + "stats": {}, + "comment": "供水管理所名称,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "宝山供水管理所" + }, + { + "pct": null, + "value": "黄浦供水管理所" + }, + { + "pct": null, + "value": "青东供水管理所" + }, + { + "pct": null, + "value": "虹口供水管理所" + }, + { + "pct": null, + "value": "闸北供水管理所" + }, + { + "pct": null, + "value": "松北供水管理所" + }, + { + "pct": null, + "value": "杨浦供水管理所" + }, + { + "pct": null, + "value": "长宁供水管理所" + }, + { + "pct": null, + "value": "闵行供水管理所" + }, + { + "pct": null, + "value": "徐汇供水管理所" + }, + { + "pct": null, + "value": "普陀供水管理所" + } + ], + "semantic_type": "dimension", + "distinct_count": 11, + "distinct_ratio": 0.03666666666666667, + "pk_candidate_score": 0.11, + "metric_candidate_score": 0.0 + }, + { + "name": "station", + "dtype": "string", + "stats": {}, + "comment": "站点名称,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "新闸站" + }, + { + "pct": null, + "value": "宝杨站" + }, + { + "pct": null, + "value": "江川站" + }, + { + "pct": null, + "value": "长江站" + }, + { + "pct": null, + "value": "市光站" + }, + { + "pct": null, + "value": "徐泾站" + }, + { + "pct": null, + "value": "真北站" + }, + { + "pct": null, + "value": "半淞园站" + }, + { + "pct": null, + "value": "芙蓉江站" + }, + { + "pct": null, + "value": "密云站" + } + ], + "semantic_type": "dimension", + "distinct_count": 36, + "distinct_ratio": 0.12, + "pk_candidate_score": 0.36, + "metric_candidate_score": 0.0 + }, + { + "name": "district", + "dtype": "string", + "stats": {}, + "comment": "行政区划名称,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "普陀区" + }, + { + "pct": null, + "value": "闵行区" + }, + { + "pct": null, + "value": "嘉定区" + }, + { + "pct": null, + "value": "杨浦区" + }, + { + "pct": null, + "value": "徐汇区" + }, + { + "pct": null, + "value": "黄浦区" + }, + { + "pct": null, + "value": "松江区" + }, + { + "pct": null, + "value": "长宁区" + }, + { + "pct": null, + "value": "青浦区" + }, + { + "pct": null, + "value": "虹口区" + } + ], + "semantic_type": "dimension", + "distinct_count": 13, + "distinct_ratio": 0.043333333333333335, + "pk_candidate_score": 0.13, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_diameter", + "dtype": "string", + "stats": {}, + "comment": "水表直径规格,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "20mm" + }, + { + "pct": null, + "value": "15mm" + }, + { + "pct": null, + "value": "25mm" + }, + { + "pct": null, + "value": "40mm" + }, + { + "pct": null, + "value": "150mm" + }, + { + "pct": null, + "value": "100mm" + }, + { + "pct": null, + "value": "80mm" + }, + { + "pct": null, + "value": "50mm" + } + ], + "semantic_type": "dimension", + "distinct_count": 8, + "distinct_ratio": 0.02666666666666667, + "pk_candidate_score": 0.08, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_status", + "dtype": "string", + "stats": {}, + "comment": "水表状态,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "有效" + } + ], + "semantic_type": "dimension", + "distinct_count": 1, + "distinct_ratio": 0.0033333333333333335, + "pk_candidate_score": 0.01, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_subtype", + "dtype": "string", + "stats": {}, + "comment": "水表子类型,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "旋翼半液封式" + }, + { + "pct": null, + "value": "超声波式" + }, + { + "pct": null, + "value": "旋翼湿式(指针式)" + }, + { + "pct": null, + "value": "旋翼湿式(数字指针式)" + }, + { + "pct": null, + "value": "电磁式" + }, + { + "pct": null, + "value": "无直管段要求超声波式" + }, + { + "pct": null, + "value": "无直管段要求电磁式" + }, + { + "pct": null, + "value": "垂直螺翼干式" + }, + { + "pct": null, + "value": "机械容积式" + } + ], + "semantic_type": "dimension", + "distinct_count": 9, + "distinct_ratio": 0.03, + "pk_candidate_score": 0.09, + "metric_candidate_score": 0.0 + }, + { + "name": "meter_type", + "dtype": "string", + "stats": {}, + "comment": "水表类型,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "容积式机械水表" + }, + { + "pct": null, + "value": "速度式机械水表" + }, + { + "pct": null, + "value": "电磁式远传水表" + }, + { + "pct": null, + "value": "速度式机电远传水表" + }, + { + "pct": null, + "value": "超声波式远传水表" + } + ], + "semantic_type": "dimension", + "distinct_count": 5, + "distinct_ratio": 0.016666666666666666, + "pk_candidate_score": 0.05, + "metric_candidate_score": 0.0 + }, + { + "name": "installation_position", + "dtype": "string", + "stats": {}, + "comment": "安装位置,枚举值", + "enumish": true, + "null_rate": 0.0, + "top_values": [ + { + "pct": null, + "value": "嵌墙表" + }, + { + "pct": null, + "value": "管道井表" + }, + { + "pct": null, + "value": "地下表" + }, + { + "pct": null, + "value": "龙头表" + } + ], + "semantic_type": "dimension", + "distinct_count": 4, + "distinct_ratio": 0.013333333333333334, + "pk_candidate_score": 0.04, + "metric_candidate_score": 0.0 + }, + { + "name": "account_id", + "dtype": "string", + "stats": {}, + "comment": "账户ID", + "enumish": false, + "null_rate": null, + "top_values": [], + "semantic_type": "id", + "distinct_count": null, + "distinct_ratio": null, + "pk_candidate_score": 0.95, + "metric_candidate_score": 0.0 + }, + { + "name": "service_point_id", + "dtype": "string", + "stats": {}, + "comment": "服务点ID", + "enumish": false, + "null_rate": null, + "top_values": [], + "semantic_type": "id", + "distinct_count": null, + "distinct_ratio": null, + "pk_candidate_score": 0.95, + "metric_candidate_score": 0.0 + } + ], + "quality": { + "warning_hints": [], + "failed_expectations": [] + }, + "row_count": 300, + "fk_candidates": [], + "confidence_notes": [ + "role判定为dimension,因所有列均为枚举或ID类型,无metric列", + "grain依据account_id和service_point_id为唯一标识推测", + "未发现时间列,因此time字段为null" + ], + "primary_key_candidates": [ + [ + "account_id" + ], + [ + "service_point_id" + ] + ] +} \ No newline at end of file diff --git a/demo/水务/水务-qwen3-coder-480b-snippet-alias.json b/demo/水务/水务-qwen3-coder-480b-snippet-alias.json new file mode 100644 index 0000000..6743b58 --- /dev/null +++ b/demo/水务/水务-qwen3-coder-480b-snippet-alias.json @@ -0,0 +1,286 @@ +[ + { + "id": "snpt_water_meter_top_supply_office", + "aliases": [ + { + "text": "供水所水表排行", + "tone": "中性" + }, + { + "text": "哪个供水所水表最多", + "tone": "口语" + }, + { + "text": "供水管理所水表TopN统计", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "供水管理所", + "排行", + "TopN", + "数量", + "统计", + "count", + "排名", + "前N", + "供水所", + "水表数", + "维度聚合", + "by_dimension", + "topn" + ], + "intent_tags": [ + "topn", + "by_dimension" + ] + }, + { + "id": "snpt_water_meter_top_station", + "aliases": [ + { + "text": "站点水表数量排行", + "tone": "中性" + }, + { + "text": "哪个站点水表最多", + "tone": "口语" + }, + { + "text": "站点维度水表TopN分析", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "站点", + "排行", + "TopN", + "数量", + "统计", + "count", + "排名", + "前N", + "站点数", + "维度聚合", + "by_dimension", + "topn" + ], + "intent_tags": [ + "topn", + "by_dimension" + ] + }, + { + "id": "snpt_water_meter_top_district", + "aliases": [ + { + "text": "区域水表数量排名", + "tone": "中性" + }, + { + "text": "哪个区水表最多", + "tone": "口语" + }, + { + "text": "行政区水表TopN统计", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "区域", + "行政区", + "排行", + "TopN", + "数量", + "统计", + "count", + "排名", + "前N", + "区", + "水表数", + "维度聚合", + "by_dimension", + "topn" + ], + "intent_tags": [ + "topn", + "by_dimension" + ] + }, + { + "id": "snpt_water_meter_share_by_type", + "aliases": [ + { + "text": "水表类型占比", + "tone": "中性" + }, + { + "text": "哪种水表用得最多", + "tone": "口语" + }, + { + "text": "水表类型分布比例", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "类型", + "占比", + "比例", + "ratio", + "分布", + "meter_type", + "百分比", + "分类统计", + "水表类型", + "ratio", + "aggregate", + "by_dimension" + ], + "intent_tags": [ + "ratio", + "by_dimension" + ] + }, + { + "id": "snpt_water_meter_subtype_distribution", + "aliases": [ + { + "text": "水表子类型分布", + "tone": "中性" + }, + { + "text": "各种子类型水表情况", + "tone": "口语" + }, + { + "text": "水表子类型计数与占比", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "子类型", + "分布", + "数量", + "占比", + "meter_subtype", + "统计", + "count", + "百分比", + "分类统计", + "aggregate", + "by_dimension" + ], + "intent_tags": [ + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_water_meter_installation_position_stats", + "aliases": [ + { + "text": "安装位置统计", + "tone": "中性" + }, + { + "text": "哪种位置装表最多", + "tone": "口语" + }, + { + "text": "水表安装位置分布", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "安装位置", + "统计", + "分布", + "installation_position", + "数量", + "count", + "位置", + "安装点", + "aggregate", + "by_dimension" + ], + "intent_tags": [ + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_water_meter_grain_check", + "aliases": [ + { + "text": "主键粒度校验", + "tone": "中性" + }, + { + "text": "数据有没有重复", + "tone": "口语" + }, + { + "text": "数据粒度一致性检查", + "tone": "专业" + } + ], + "keywords": [ + "主键", + "粒度", + "校验", + "质量", + "quality", + "重复", + "唯一性", + "account_id", + "service_point_id", + "数据校验", + "质量检查", + "异常检测" + ], + "intent_tags": [ + "quality" + ] + }, + { + "id": "snpt_water_meter_sample_records", + "aliases": [ + { + "text": "水表数据抽样", + "tone": "中性" + }, + { + "text": "给我看点水表数据", + "tone": "口语" + }, + { + "text": "水表记录样本抽取", + "tone": "专业" + } + ], + "keywords": [ + "水表", + "样本", + "抽样", + "sample", + "随机", + "记录", + "抽查", + "limit", + "数据结构", + "数据示例", + "sample", + "limit_rows" + ], + "intent_tags": [ + "sample" + ] + } +] \ No newline at end of file diff --git a/demo/水务/水务-qwen3-coder-480b-snippet.json b/demo/水务/水务-qwen3-coder-480b-snippet.json new file mode 100644 index 0000000..34e247f --- /dev/null +++ b/demo/水务/水务-qwen3-coder-480b-snippet.json @@ -0,0 +1,235 @@ +[ + { + "id": "snpt_water_meter_top_supply_office", + "desc": "统计各供水管理所下辖水表数量并排序", + "type": "topn", + "title": "供水管理所水表数量排行", + "examples": [ + "列出水表最多的前10个供水管理所", + "各供水所水表数量排名" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 10 + } + ], + "dialect_sql": { + "mysql": "SELECT supply_office AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY supply_office ORDER BY metric_value DESC LIMIT {{top_n}}" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": 11 + }, + "time_column": "nullable", + "required_columns": [ + "supply_office" + ] + }, + "business_caliber": "按供水管理所维度聚合水表总数,粒度=供水管理所" + }, + { + "id": "snpt_water_meter_top_station", + "desc": "统计各个站点下辖水表数量并排序", + "type": "topn", + "title": "站点水表数量排行", + "examples": [ + "列出水表最多的前10个站点", + "各站点水表数量排名" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 10 + } + ], + "dialect_sql": { + "mysql": "SELECT station AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY station ORDER BY metric_value DESC LIMIT {{top_n}}" + }, + "applicability": { + "constraints": { + "notes": [ + "高基数维度建议LIMIT<=50" + ], + "fk_join_available": false, + "dim_cardinality_hint": 36 + }, + "time_column": "nullable", + "required_columns": [ + "station" + ] + }, + "business_caliber": "按站点维度聚合水表总数,粒度=站点" + }, + { + "id": "snpt_water_meter_top_district", + "desc": "统计各区水表数量并排序", + "type": "topn", + "title": "区域水表数量排行", + "examples": [ + "列出各区水表数量排名", + "哪个区的水表最多?" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 10 + } + ], + "dialect_sql": { + "mysql": "SELECT district AS dim_value, COUNT(*) AS metric_value FROM `data-ge.water_meter_info` GROUP BY district ORDER BY metric_value DESC LIMIT {{top_n}}" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": 13 + }, + "time_column": "nullable", + "required_columns": [ + "district" + ] + }, + "business_caliber": "按行政区划维度聚合水表总数,粒度=区" + }, + { + "id": "snpt_water_meter_share_by_type", + "desc": "计算各类水表占总水表的比例", + "type": "ratio", + "title": "水表类型占比分布", + "examples": [ + "各类水表占比是多少?", + "哪种类型的水表使用最广泛?" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT meter_type AS dim_value, COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`) AS ratio_percent FROM `data-ge.water_meter_info` GROUP BY meter_type ORDER BY ratio_percent DESC" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": 5 + }, + "time_column": "nullable", + "required_columns": [ + "meter_type" + ] + }, + "business_caliber": "按水表类型分类计算其占比,粒度=水表类型" + }, + { + "id": "snpt_water_meter_subtype_distribution", + "desc": "展示不同水表子类型的数量及比例", + "type": "aggregate", + "title": "水表子类型分布情况", + "examples": [ + "各种子类型水表的数量和占比", + "哪种子类型水表最多?" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT meter_subtype AS dim_value, COUNT(*) AS count_value, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM `data-ge.water_meter_info`), 2) AS percentage FROM `data-ge.water_meter_info` GROUP BY meter_subtype ORDER BY count_value DESC" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": 9 + }, + "time_column": "nullable", + "required_columns": [ + "meter_subtype" + ] + }, + "business_caliber": "按水表子类型进行计数和百分比统计,粒度=水表子类型" + }, + { + "id": "snpt_water_meter_installation_position_stats", + "desc": "统计不同安装位置下的水表数量", + "type": "aggregate", + "title": "安装位置分布统计", + "examples": [ + "各种安装位置的水表数量", + "哪种安装位置最为常见?" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT installation_position AS dim_value, COUNT(*) AS count_value FROM `data-ge.water_meter_info` GROUP BY installation_position ORDER BY count_value DESC" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": 4 + }, + "time_column": "nullable", + "required_columns": [ + "installation_position" + ] + }, + "business_caliber": "按安装位置对水表进行分组计数,粒度=安装位置" + }, + { + "id": "snpt_water_meter_grain_check", + "desc": "验证 account_id 和 service_point_id 是否构成唯一组合", + "type": "quality", + "title": "主键粒度校验", + "examples": [ + "这张表的数据粒度是否正确?", + "是否存在重复的服务点记录?" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT IF(COUNT(*) = COUNT(DISTINCT account_id, service_point_id), 'PASS', 'FAIL') AS grain_check_result FROM `data-ge.water_meter_info`" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": "nullable", + "required_columns": [ + "account_id", + "service_point_id" + ] + }, + "business_caliber": "检验数据是否符合预期的主键粒度(account_id + service_point_id)" + }, + { + "id": "snpt_water_meter_sample_records", + "desc": "随机抽取部分水表信息用于查看结构", + "type": "sample", + "title": "样本抽取", + "examples": [ + "给我看几条水表数据的例子", + "抽查一些原始数据看看格式" + ], + "variables": [ + { + "name": "limit_rows", + "type": "int", + "default": 5 + } + ], + "dialect_sql": { + "mysql": "SELECT * FROM `data-ge.water_meter_info` ORDER BY RAND() LIMIT {{limit_rows}}" + }, + "applicability": { + "constraints": { + "notes": [], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": "nullable", + "required_columns": [] + }, + "business_caliber": "从全量数据中随机采样若干条记录供参考" + } +] \ No newline at end of file diff --git a/demo/水务/水务-qwen3-next-80b-ge-desc.json b/demo/水务/水务-qwen3-next-80b-ge-desc.json new file mode 100644 index 0000000..702023e --- /dev/null +++ b/demo/水务/水务-qwen3-next-80b-ge-desc.json @@ -0,0 +1 @@ +{"role": "dimension", "time": {"range": null, "column": null, "has_gaps": null, "granularity": "unknown"}, "grain": ["account_id", "service_point_id"], "table": "water_meter_info", "columns": [{"name": "supply_office", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "供应办公室,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.09090909090909093, "value": "宝山供水管理所"}, {"pct": 0.09090909090909093, "value": "黄浦供水管理所"}, {"pct": 0.09090909090909093, "value": "青东供水管理所"}, {"pct": 0.09090909090909093, "value": "虹口供水管理所"}, {"pct": 0.09090909090909093, "value": "闸北供水管理所"}, {"pct": 0.09090909090909093, "value": "松北供水管理所"}, {"pct": 0.09090909090909093, "value": "杨浦供水管理所"}, {"pct": 0.09090909090909093, "value": "长宁供水管理所"}, {"pct": 0.09090909090909093, "value": "闵行供水管理所"}, {"pct": 0.09090909090909093, "value": "徐汇供水管理所"}], "semantic_type": "dimension", "distinct_count": 11, "distinct_ratio": 0.03666666666666667, "pk_candidate_score": 0.1, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "站点名称,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.027777777777777776, "value": "新闸站"}, {"pct": 0.027777777777777776, "value": "宝杨站"}, {"pct": 0.027777777777777776, "value": "江川站"}, {"pct": 0.027777777777777776, "value": "长江站"}, {"pct": 0.027777777777777776, "value": "市光站"}, {"pct": 0.027777777777777776, "value": "徐泾站"}, {"pct": 0.027777777777777776, "value": "真北站"}, {"pct": 0.027777777777777776, "value": "半淞园站"}, {"pct": 0.027777777777777776, "value": "芙蓉江站"}, {"pct": 0.027777777777777776, "value": "密云站"}], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "district", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "行政区,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.07692307692307693, "value": "普陀区"}, {"pct": 0.07692307692307693, "value": "闵行区"}, {"pct": 0.07692307692307693, "value": "嘉定区"}, {"pct": 0.07692307692307693, "value": "杨浦区"}, {"pct": 0.07692307692307693, "value": "徐汇区"}, {"pct": 0.07692307692307693, "value": "黄浦区"}, {"pct": 0.07692307692307693, "value": "松江区"}, {"pct": 0.07692307692307693, "value": "长宁区"}, {"pct": 0.07692307692307693, "value": "青浦区"}, {"pct": 0.07692307692307693, "value": "虹口区"}], "semantic_type": "dimension", "distinct_count": 13, "distinct_ratio": 0.043333333333333335, "pk_candidate_score": 0.043333333333333335, "metric_candidate_score": 0.0}, {"name": "meter_diameter", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "水表直径,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.125, "value": "20mm"}, {"pct": 0.125, "value": "15mm"}, {"pct": 0.125, "value": "25mm"}, {"pct": 0.125, "value": "40mm"}, {"pct": 0.125, "value": "150mm"}, {"pct": 0.125, "value": "100mm"}, {"pct": 0.125, "value": "80mm"}, {"pct": 0.125, "value": "50mm"}], "semantic_type": "dimension", "distinct_count": 8, "distinct_ratio": 0.02666666666666667, "pk_candidate_score": 0.02666666666666667, "metric_candidate_score": 0.0}, {"name": "account_id", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "账户ID,未提供统计,但为关键标识列", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 1.0, "metric_candidate_score": 0.0}, {"name": "service_point_id", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "服务点ID,未提供统计,但为关键标识列", "enumish": null, "null_rate": null, "top_values": [], "semantic_type": "id", "distinct_count": null, "distinct_ratio": null, "pk_candidate_score": 1.0, "metric_candidate_score": 0.0}, {"name": "station", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "站点名称,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.027777777777777776, "value": "新闸站"}, {"pct": 0.027777777777777776, "value": "宝杨站"}, {"pct": 0.027777777777777776, "value": "江川站"}, {"pct": 0.027777777777777776, "value": "长江站"}, {"pct": 0.027777777777777776, "value": "市光站"}, {"pct": 0.027777777777777776, "value": "徐泾站"}, {"pct": 0.027777777777777776, "value": "真北站"}, {"pct": 0.027777777777777776, "value": "半淞园站"}, {"pct": 0.027777777777777776, "value": "芙蓉江站"}, {"pct": 0.027777777777777776, "value": "密云站"}], "semantic_type": "dimension", "distinct_count": 36, "distinct_ratio": 0.12, "pk_candidate_score": 0.12, "metric_candidate_score": 0.0}, {"name": "meter_type", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "水表类型,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.2, "value": "容积式机械水表"}, {"pct": 0.2, "value": "速度式机械水表"}, {"pct": 0.2, "value": "电磁式远传水表"}, {"pct": 0.2, "value": "速度式机电远传水表"}, {"pct": 0.2, "value": "超声波式远传水表"}], "semantic_type": "dimension", "distinct_count": 5, "distinct_ratio": 0.016666666666666666, "pk_candidate_score": 0.016666666666666666, "metric_candidate_score": 0.0}, {"name": "meter_subtype", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "水表子类型,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.1111111111111111, "value": "旋翼半液封式"}, {"pct": 0.1111111111111111, "value": "超声波式"}, {"pct": 0.1111111111111111, "value": "旋翼湿式(指针式)"}, {"pct": 0.1111111111111111, "value": "旋翼湿式(数字指针式)"}, {"pct": 0.1111111111111111, "value": "电磁式"}, {"pct": 0.1111111111111111, "value": "无直管段要求超声波式"}, {"pct": 0.1111111111111111, "value": "无直管段要求电磁式"}, {"pct": 0.1111111111111111, "value": "垂直螺翼干式"}, {"pct": 0.1111111111111111, "value": "机械容积式"}], "semantic_type": "dimension", "distinct_count": 9, "distinct_ratio": 0.03, "pk_candidate_score": 0.03, "metric_candidate_score": 0.0}, {"name": "meter_status", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "水表状态,仅有效,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 1.0, "value": "有效"}], "semantic_type": "dimension", "distinct_count": 1, "distinct_ratio": 0.0033333333333333335, "pk_candidate_score": 0.0033333333333333335, "metric_candidate_score": 0.0}, {"name": "installation_position", "dtype": "string", "stats": {"max": null, "min": null, "std": null, "mean": null, "skewness": null}, "comment": "安装位置,枚举值,非空", "enumish": true, "null_rate": 0.0, "top_values": [{"pct": 0.25, "value": "嵌墙表"}, {"pct": 0.25, "value": "管道井表"}, {"pct": 0.25, "value": "地下表"}, {"pct": 0.25, "value": "龙头表"}], "semantic_type": "dimension", "distinct_count": 4, "distinct_ratio": 0.013333333333333334, "pk_candidate_score": 0.013333333333333334, "metric_candidate_score": 0.0}], "quality": {"warning_hints": [], "failed_expectations": []}, "row_count": 300, "fk_candidates": [], "confidence_notes": ["role判定为dimension:所有列均为低熵枚举型维度字段,无数值型指标列,符合维度表特征。", "grain推测为account_id和service_point_id:二者为唯一标识符,且无其他复合主键信息,结合业务常识推断为粒度核心。", "time列未知:无任何日期/时间列,无时间相关命名或统计,无法推断时间维度。", "primary_key_candidates基于account_id和service_point_id的高唯一性(未发现重复)及非空性推断,置信度高。"], "primary_key_candidates": [["account_id", "service_point_id"]]} \ No newline at end of file diff --git a/demo/水务/水务-qwen3-next-80b-snippet-alias.json b/demo/水务/水务-qwen3-next-80b-snippet-alias.json new file mode 100644 index 0000000..e43186b --- /dev/null +++ b/demo/水务/水务-qwen3-next-80b-snippet-alias.json @@ -0,0 +1,249 @@ +[ + { + "id": "snpt_topn_supply_office_by_account", + "aliases": [ + { + "text": "哪个供水所用户最多?", + "tone": "口语" + }, + { + "text": "按供应办公室统计账户数量", + "tone": "中性" + }, + { + "text": "供应办公室账户数TopN排名", + "tone": "专业" + } + ], + "keywords": [ + "供应办公室", + "账户数", + "TopN", + "排行", + "统计", + "account_id", + "supply_office", + "去重", + "高占比", + "维度分析", + "by_dimension", + "aggregate", + "topn" + ], + "intent_tags": [ + "topn", + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_topn_station_by_account", + "aliases": [ + { + "text": "哪些站点用户最多?", + "tone": "口语" + }, + { + "text": "按站点统计账户分布", + "tone": "中性" + }, + { + "text": "站点账户数Top20排名", + "tone": "专业" + } + ], + "keywords": [ + "站点", + "账户数", + "TopN", + "排行", + "统计", + "station", + "account_id", + "去重", + "高负载", + "维度分析", + "by_dimension", + "aggregate", + "topn" + ], + "intent_tags": [ + "topn", + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_topn_district_by_account", + "aliases": [ + { + "text": "哪个区用户最多?", + "tone": "口语" + }, + { + "text": "按行政区统计账户数量", + "tone": "中性" + }, + { + "text": "行政区账户数全量排名", + "tone": "专业" + } + ], + "keywords": [ + "行政区", + "账户数", + "TopN", + "排行", + "统计", + "district", + "account_id", + "去重", + "区域对比", + "维度分析", + "by_dimension", + "aggregate", + "topn" + ], + "intent_tags": [ + "topn", + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_share_of_meter_type", + "aliases": [ + { + "text": "各类水表占多少比例?", + "tone": "口语" + }, + { + "text": "水表类型占比分析", + "tone": "中性" + }, + { + "text": "水表类型占比分布", + "tone": "专业" + } + ], + "keywords": [ + "水表类型", + "占比", + "比例", + "meter_type", + "account_id", + "去重", + "分布", + "主流类型", + "技术选型", + "ratio", + "aggregate", + "by_dimension" + ], + "intent_tags": [ + "ratio", + "aggregate", + "by_dimension" + ] + }, + { + "id": "snpt_sample_account_service_point", + "aliases": [ + { + "text": "随机看10条账户信息", + "tone": "口语" + }, + { + "text": "抽样账户与服务点明细", + "tone": "中性" + }, + { + "text": "账户-服务点随机抽样验证", + "tone": "专业" + } + ], + "keywords": [ + "抽样", + "随机", + "样本", + "account_id", + "service_point_id", + "数据质量", + "验证", + "唯一性", + "格式检查", + "sample", + "quality" + ], + "intent_tags": [ + "sample", + "quality" + ] + }, + { + "id": "snpt_filter_meter_status_valid", + "aliases": [ + { + "text": "只取有效的水表记录", + "tone": "口语" + }, + { + "text": "筛选有效水表记录", + "tone": "中性" + }, + { + "text": "水表状态有效性过滤", + "tone": "专业" + } + ], + "keywords": [ + "有效", + "过滤", + "筛选", + "meter_status", + "质量检查", + "断言", + "清洗", + "filter", + "quality" + ], + "intent_tags": [ + "filter", + "quality" + ] + }, + { + "id": "snpt_filter_meter_diameter_20mm", + "aliases": [ + { + "text": "找出所有20mm水表用户", + "tone": "口语" + }, + { + "text": "筛选20mm水表记录", + "tone": "中性" + }, + { + "text": "20mm口径水表子集提取", + "tone": "专业" + } + ], + "keywords": [ + "20mm", + "水表直径", + "过滤", + "筛选", + "meter_diameter", + "子集", + "分析", + "住宅用水", + "规格", + "filter", + "by_dimension" + ], + "intent_tags": [ + "filter", + "by_dimension" + ] + } +] \ No newline at end of file diff --git a/demo/水务/水务-qwen3-next-80b-snippet.json b/demo/水务/水务-qwen3-next-80b-snippet.json new file mode 100644 index 0000000..9bf40cc --- /dev/null +++ b/demo/水务/水务-qwen3-next-80b-snippet.json @@ -0,0 +1,227 @@ +[ + { + "id": "snpt_topn_supply_office_by_account", + "desc": "统计各供应办公室对应的账户数量,识别高占比管理所", + "type": "topn", + "title": "按供应办公室统计账户数", + "examples": [ + "哪个供水管理所服务的用户最多?", + "列出前5个账户数最多的供应办公室" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 11 + } + ], + "dialect_sql": { + "mysql": "SELECT supply_office, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY supply_office\nORDER BY account_count DESC\nLIMIT {{top_n}};" + }, + "applicability": { + "constraints": { + "notes": [ + "供应办公室仅11个唯一值,可安全展示全部;建议LIMIT 11避免冗余排序" + ], + "fk_join_available": false, + "dim_cardinality_hint": 11 + }, + "time_column": "nullable", + "required_columns": [ + "supply_office", + "account_id" + ] + }, + "business_caliber": "粒度=供应办公室,指标=去重账户数(account_id),仅统计水表信息表中有效账户,不关联外部表" + }, + { + "id": "snpt_topn_station_by_account", + "desc": "统计各站点服务的账户数量,识别高负载站点", + "type": "topn", + "title": "按站点统计账户分布", + "examples": [ + "哪些站点服务的用户最多?", + "TOP10用户最多的站点是哪些?" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 20 + } + ], + "dialect_sql": { + "mysql": "SELECT station, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY station\nORDER BY account_count DESC\nLIMIT {{top_n}};" + }, + "applicability": { + "constraints": { + "notes": [ + "站点有36个唯一值,建议LIMIT<=20以避免结果过长;高基数维度可能影响查询性能" + ], + "fk_join_available": false, + "dim_cardinality_hint": 36 + }, + "time_column": "nullable", + "required_columns": [ + "station", + "account_id" + ] + }, + "business_caliber": "粒度=站点(station),指标=去重账户数(account_id),基于水表信息表直接聚合,不涉及时间维度" + }, + { + "id": "snpt_topn_district_by_account", + "desc": "统计各行政区的账户数量,辅助区域资源分配分析", + "type": "topn", + "title": "按行政区统计账户分布", + "examples": [ + "哪个区的用水账户最多?", + "列出所有行政区的账户数量排名" + ], + "variables": [ + { + "name": "top_n", + "type": "int", + "default": 13 + } + ], + "dialect_sql": { + "mysql": "SELECT district, COUNT(DISTINCT account_id) AS account_count\nFROM water_meter_info\nGROUP BY district\nORDER BY account_count DESC\nLIMIT {{top_n}};" + }, + "applicability": { + "constraints": { + "notes": [ + "行政区共13个,可完整展示;适合用于区域对比分析" + ], + "fk_join_available": false, + "dim_cardinality_hint": 13 + }, + "time_column": "nullable", + "required_columns": [ + "district", + "account_id" + ] + }, + "business_caliber": "粒度=行政区(district),指标=去重账户数(account_id),基于水表信息表聚合,反映各区域用户规模" + }, + { + "id": "snpt_share_of_meter_type", + "desc": "计算各类水表类型在总账户中的占比,识别主流类型", + "type": "ratio", + "title": "水表类型占比分析", + "examples": [ + "各类水表在用户中的占比是多少?", + "电磁式远传水表占总用户比例多少?" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT meter_type, \n COUNT(DISTINCT account_id) AS account_count,\n ROUND(COUNT(DISTINCT account_id) * 100.0 / SUM(COUNT(DISTINCT account_id)) OVER (), 2) AS percentage\nFROM water_meter_info\nGROUP BY meter_type\nORDER BY account_count DESC;" + }, + "applicability": { + "constraints": { + "notes": [ + "水表类型仅5种,适合计算占比;可直接展示全量分布" + ], + "fk_join_available": false, + "dim_cardinality_hint": 5 + }, + "time_column": "nullable", + "required_columns": [ + "meter_type", + "account_id" + ] + }, + "business_caliber": "粒度=水表类型(meter_type),指标=去重账户数占比,分母为全表去重账户总数,反映技术选型分布" + }, + { + "id": "snpt_sample_account_service_point", + "desc": "随机抽取部分账户与服务点ID的原始记录,用于数据质量核查", + "type": "sample", + "title": "抽样账户与服务点明细", + "examples": [ + "随机查看10条账户与服务点的详细信息", + "抽样检查水表信息是否符合预期格式" + ], + "variables": [ + { + "name": "sample_size", + "type": "int", + "default": 10 + } + ], + "dialect_sql": { + "mysql": "SELECT account_id, service_point_id, supply_office, station, district, meter_diameter, meter_type, meter_subtype, installation_position\nFROM water_meter_info\nORDER BY RAND()\nLIMIT {{sample_size}};" + }, + "applicability": { + "constraints": { + "notes": [ + "主键组合为account_id+service_point_id,适合抽样验证唯一性;建议样本量≤100" + ], + "fk_join_available": false, + "dim_cardinality_hint": null + }, + "time_column": "nullable", + "required_columns": [ + "account_id", + "service_point_id" + ] + }, + "business_caliber": "粒度=单条水表记录,抽取样本用于验证account_id与service_point_id的组合唯一性及维度字段完整性" + }, + { + "id": "snpt_filter_meter_status_valid", + "desc": "过滤出水表状态为'有效'的记录,用于后续分析", + "type": "quality", + "title": "筛选有效水表记录", + "examples": [ + "只取状态为有效的水表记录", + "确认所有水表是否均为有效状态" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_status = '有效';" + }, + "applicability": { + "constraints": { + "notes": [ + "meter_status仅存在'有效'值,此条件恒成立;可用于数据清洗流程的显式过滤" + ], + "fk_join_available": false, + "dim_cardinality_hint": 1 + }, + "time_column": "nullable", + "required_columns": [ + "meter_status" + ] + }, + "business_caliber": "仅保留水表状态为'有效'的记录,因全表均为有效值,此过滤为冗余但可作为数据质量校验的显式断言" + }, + { + "id": "snpt_filter_meter_diameter_20mm", + "desc": "筛选水表直径为20mm的记录,用于特定口径设备分析", + "type": "quality", + "title": "筛选20mm水表记录", + "examples": [ + "找出所有使用20mm水表的用户", + "20mm水表分布在哪些站点?" + ], + "variables": [], + "dialect_sql": { + "mysql": "SELECT *\nFROM water_meter_info\nWHERE meter_diameter = '20mm';" + }, + "applicability": { + "constraints": { + "notes": [ + "水表直径共8种枚举值,20mm为常见规格;可作为子集分析的起点" + ], + "fk_join_available": false, + "dim_cardinality_hint": 8 + }, + "time_column": "nullable", + "required_columns": [ + "meter_diameter" + ] + }, + "business_caliber": "粒度=单条水表记录,筛选条件为meter_diameter='20mm',用于分析标准住宅用水表的分布特征" + } +] \ No newline at end of file diff --git a/file/ecommerce_orders.sql b/file/ecommerce_orders.sql new file mode 100644 index 0000000..a5c3271 --- /dev/null +++ b/file/ecommerce_orders.sql @@ -0,0 +1,21 @@ +CREATE TABLE `ecommerce_orders` ( + `order_id` char(36) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'UUID from CSV', + `customer_id` int NOT NULL, + `product_id` int NOT NULL, + `category` varchar(64) COLLATE utf8mb4_unicode_ci NOT NULL, + `price` decimal(10,2) NOT NULL, + `quantity` int NOT NULL, + `order_date` datetime(6) NOT NULL, + `shipping_date` datetime(6) NOT NULL, + `delivery_status` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + `payment_method` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + `device_type` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + `channel` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + `shipping_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL, + `billing_address` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL, + `customer_segment` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, + PRIMARY KEY (`order_id`), + KEY `idx_customer` (`customer_id`), + KEY `idx_product` (`product_id`), + KEY `idx_order_date` (`order_date`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; \ No newline at end of file