测试用例和结果

This commit is contained in:
zhaoawd
2025-10-29 23:43:06 +08:00
parent 59c9efa5d8
commit a78c8b9446
4 changed files with 137 additions and 1 deletions

41
deepseek-result.json Normal file
View File

@ -0,0 +1,41 @@
{
"provider": "deepseek",
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
}
}
],
"raw": {
"id": "67f3cc80-38bc-4bb7-b336-48d4886722c4",
"object": "chat.completion",
"created": 1761752207,
"model": "deepseek-chat",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "```json\n{\n \"table_name\": \"national_brand_sales\",\n \"description\": \"全国品牌系统外销售数据\",\n \"columns\": [\n {\n \"original_name\": \"品牌\",\n \"standard_name\": \"brand\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(50)\",\n \"java_type\": \"string\",\n \"nullable\": true,\n \"distinct_count_sample\": 5,\n \"null_ratio_sample\": 0.4,\n \"is_enum_candidate\": false,\n \"description\": \"品牌名称\",\n \"date_format\": null\n },\n {\n \"original_name\": \"产品价类\",\n \"standard_name\": \"price_category\",\n \"data_type\": \"string\",\n \"db_type\": \"varchar(10)\",\n \"java_type\": \"string\",\n \"nullable\": false,\n \"distinct_count_sample\": 3,\n \"null_ratio_sample\": 0.0,\n \"is_enum_candidate\": true,\n \"description\": \"产品价格分类(一类/二类/三类)\",\n \"date_format\": null\n },\n {\n \"original_name\": \"是否重点品牌"
},
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 1078,
"completion_tokens": 256,
"total_tokens": 1334,
"prompt_tokens_details": {
"cached_tokens": 1024
},
"prompt_cache_hit_tokens": 1024,
"prompt_cache_miss_tokens": 54
},
"system_fingerprint": "fp_ffc7281d48_prod0820_fp8_kvcache"
}
}

View File

@ -0,0 +1,43 @@
"""Demonstrates calling /v1/chat/completions with the DeepSeek provider."""
from __future__ import annotations
import asyncio
import json
import httpx
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment before the rest of the script runs.
load_dotenv()
# Chat-completions endpoint this demo posts to; it expects a server
# listening on localhost:8000.
API_URL = "http://localhost:8000/v1/chat/completions"
async def main() -> None:
    """Post the import-analysis prompt to the DeepSeek provider and print the reply.

    Builds a chat-completions request (system prompt describing the expected
    JSON report plus a user message carrying the CSV sample), sends it to
    ``API_URL`` and pretty-prints the decoded JSON response on stdout.

    A warning is written to stderr when the provider reports that the
    completion was cut off by the token limit, because the printed content is
    then an incomplete JSON document.

    Raises:
        httpx.HTTPStatusError: if the endpoint answers with a 4xx/5xx status.
    """
    import sys  # local import: only needed for the truncation warning

    system_prompt = "角色你是一名数据分析导入助手Data Ingestion Analyst擅长从原始数据抽取结构化元数据、推断字段类型、识别维度/事实属性并输出导入建模建议Table + JSON\n\n任务目标对提供的数据含表头或table schema与若干行样本数据进行解析生成一份导入分析与处理报告指导如何将其导入为标准化表结构及 JSON 元数据定义,不要省略任何字段信息,全量输出。\n\n请从以下两个方向进行思考:\n\n方向 1元数据识别与整理\n解析表明根据表头、Origin Table Name、Orign File Name生成表名表名需要有意义\n解析列名生成标准化字段名snake_case 或小驼峰),并给出原始列名与标准字段名映射。\n为每个字段写出中文/英文注释(若无法确定,给出“待确认”并附可能解释)。\n\n方向 2字段数据类型与格式推断\n针对每列:输出推断数据类型(如 varchar(n) / int / bigint / tinyint / float / double / decimal(p,s) / date / datetime / text\n说明推断依据:样本值分布、长度范围、格式正则、是否存在空值、是否数值但含前导零等。\n指出数据质量初步观察:缺失率、是否有异常/离群值(简单规则即可)、是否需标准化(如去空格、去重、枚举值归一)。\n给出“建议处理动作”:如 trim、cast_float、cast_int、cast_double、cast_date、cast_time、cast_datetime适用于将样本数据转换成数据库表字段兼容的格式。\n若为“可能是枚举”的字段,列出候选枚举值及占比。\n\n最终内容都输出为一个json对象格式为字段级与表级定义字段含\n{\n \"table_name\": \"标准化后的表名\",\n \"description\": \"表简短描述\",\n \"columns\": [{\n \"original_name\": \"原始名称\",\n \"standard_name\": \"标准化后的名称: 下划线命名,大小写字母、数字、下划线\",\n \"data_type\": \"数据类型限制为number/string/datetime\",\n \"db_type\": \"数据库字段类型\",\n \"java_type\": \"java字段类型限制为: int/long/double/string/date\",\n \"nullable\": true/false,\n \"distinct_count_sample\": number,\n \"null_ratio_sample\": 0.x,\n \"is_enum_candidate\": true/false,\n \"description\": \"字段简短描述\",\n \"date_format\": \"转换成Date类型的pattern\"\n }]\n}\n\n约束与风格:\n\n若信息不足,请显式指出“信息不足”并给出补充数据需求清单。\n避免武断结论,用“可能 / 候选 / 建议”字样。\n不要捏造样本未出现的值。"
    user_prompt = "导入记录ID: demo-import-001\n\n表头信息:\n- 品牌\n- 产品价类\n- 是否重点品牌\n- 系统外销售量(箱)\n- 系统外销售金额(万元)\n- 同期系统外销售量(箱)\n- 同期系统外销售金额(万元)\n\n示例数据:\nCSV样本预览:\n品牌,产品价类,是否重点品牌,系统外销售量(箱),系统外销售金额(万元),同期系统外销售量(箱),同期系统外销售金额(万元)\r\n白沙,一类,重点品牌,3332.406875,64283.5593333333,3123.693375,61821.7986666667\r\nnan,二类,重点品牌,1094.4707375,3859.69366666667,869.65725,3067.00966666667\r\nnan,三类,重点品牌,3965.0457375,8388.306,4401.6714875,8802.132\r\n宝岛,一类,否,39.934375,301.617666666667,30.5975,249.399666666667\r\n长白山,一类,重点品牌,2666.53775,12360.8306666667,1916.252,9051.672\r\nnan,二类,重点品牌,2359.910025,7671.26233333333,2335.2480875,7590.791\r\nnan,三类,重点品牌,1263.293875,2826.665,1590.750875,3503.083\r\n大前门,一类,否,81.5806875,343.721333333333,114.1179875,480.809333333333\r\nnan,三类,否,226.445225,319.975666666667,254.6595125,359.894\r\n大青山,二类,否,60.73525,209.415,60.2415,207.712666666667\n\n附加结构信息:\n{\n \"source\": \"excel\",\n \"file_name\": \"全国品牌.xlsx\",\n \"sheet_name\": \"Sheet1\"\n}"

    payload = {
        "provider": "deepseek",
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.2,
        # The prompt asks for a complete per-column JSON report. A budget of
        # 256 tokens ends the reply inside the third column
        # (finish_reason == "length"), leaving unparseable JSON, so allow
        # enough room for all seven columns.
        "max_tokens": 4096,
    }

    # A longer completion takes longer to generate; give the request more
    # headroom than the 20 s that sufficed for a 256-token reply.
    async with httpx.AsyncClient(timeout=httpx.Timeout(120.0)) as client:
        response = await client.post(API_URL, json=payload)
        response.raise_for_status()
        data = response.json()

    print(json.dumps(data, ensure_ascii=False, indent=2))

    # Surface truncation explicitly. The warning goes to stderr so that stdout
    # stays a clean JSON document when redirected to a file.
    # NOTE(review): assumes the provider's original response is echoed under
    # "raw" with OpenAI-style choices[].finish_reason - confirm against the server.
    raw = data.get("raw") if isinstance(data, dict) else None
    raw_choices = raw.get("choices") if isinstance(raw, dict) else None
    if isinstance(raw_choices, list) and any(
        isinstance(choice, dict) and choice.get("finish_reason") == "length"
        for choice in raw_choices
    ):
        print(
            "warning: completion was truncated by max_tokens; "
            "the returned content is incomplete",
            file=sys.stderr,
        )
# Run the demo only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())

View File

@ -7,7 +7,9 @@ from pathlib import Path
import httpx import httpx
import pandas as pd import pandas as pd
from dotenv import load_dotenv
load_dotenv()
API_URL = "http://localhost:8000/v1/import/analyze" API_URL = "http://localhost:8000/v1/import/analyze"
CALLBACK_URL = "http://localhost:8000/__mock__/import-callback" CALLBACK_URL = "http://localhost:8000/__mock__/import-callback"
@ -26,9 +28,15 @@ async def main() -> None:
payload = { payload = {
"import_record_id": "demo-import-001", "import_record_id": "demo-import-001",
"rows": rows, "rows": rows,
"struce": headers, "headers": headers,
"table_schema": {
"source": "excel",
"file_name": EXCEL_PATH.name,
"sheet_name": sheet_name,
},
"llm_model": "deepseek:deepseek-chat", "llm_model": "deepseek:deepseek-chat",
"temperature": 0.2, "temperature": 0.2,
"max_output_tokens": 256,
"callback_url": CALLBACK_URL, "callback_url": CALLBACK_URL,
} }

View File

@ -0,0 +1,44 @@
"""Quick demo call against the unified chat endpoint using the OpenRouter provider."""
from __future__ import annotations
import asyncio
import httpx
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment before the rest of the script runs.
load_dotenv()
# Unified chat-completions endpoint this demo posts to; it expects a server
# listening on localhost:8000.
API_URL = "http://localhost:8000/v1/chat/completions"
async def main() -> None:
    """Send one small chat request through the OpenRouter provider and print the outcome.

    The HTTP status code is always printed. The body is printed as decoded
    JSON when it parses, otherwise as raw text. No exception is raised for
    non-2xx responses.
    """
    conversation = [
        {
            "role": "system",
            "content": "You are an API assistant that writes concise JSON only.",
        },
        {
            "role": "user",
            "content": "Return a JSON object describing this test invocation.",
        },
    ]
    request_body = {
        "provider": "openrouter",
        "model": "anthropic/claude-3.5-sonnet",
        "messages": conversation,
        "temperature": 0.1,
        "max_tokens": 300,
    }
    request_timeout = httpx.Timeout(15.0)

    async with httpx.AsyncClient(timeout=request_timeout) as session:
        reply = await session.post(API_URL, json=request_body)
        print("Status:", reply.status_code)
        try:
            decoded = reply.json()
        except ValueError:
            # Body is not valid JSON (e.g. an HTML error page): show it verbatim.
            print("Raw Body:", reply.text)
        else:
            print("Body:", decoded)
# Run the demo only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())