# Reference tool-call sequences: one inner list per evaluation example, each
# entry a dict with the tool's name and the input it was called with.
reference_trajectory = [
    # example 1
    [
        {
            "tool_name": "set_device_info",
            "tool_input": {"device_id": "device_2", "updates": {"status": "OFF"}},
        }
    ],
    # example 2
    [
        {
            "tool_name": "get_user_preferences",
            "tool_input": {"user_id": "user_y"},
        },
        {
            "tool_name": "set_temperature",
            "tool_input": {"location": "Living Room", "temperature": 23},
        },
    ],
]

# Predicted tool-call sequences, index-aligned with reference_trajectory.
# They differ from the reference only in the first call's argument
# ("device_3" vs "device_2", "user_z" vs "user_y").
predicted_trajectory = [
    # example 1
    [
        {
            "tool_name": "set_device_info",
            "tool_input": {"device_id": "device_3", "updates": {"status": "OFF"}},
        }
    ],
    # example 2
    [
        {
            "tool_name": "get_user_preferences",
            "tool_input": {"user_id": "user_z"},
        },
        {
            "tool_name": "set_temperature",
            "tool_input": {"location": "Living Room", "temperature": 23},
        },
    ],
]

# One row per example; each cell holds that example's full list of tool calls.
eval_dataset = pd.DataFrame(
    {
        "predicted_trajectory": predicted_trajectory,
        "reference_trajectory": reference_trajectory,
    }
)
导入评估数据集
您可以使用以下格式导入数据集:
存储在 Cloud Storage 中的 JSONL 或 CSV 文件
BigQuery 表
Pandas DataFrame
Gen AI Evaluation Service 提供了一些公开的示例数据集,用于演示如何评估智能体。以下代码展示了如何从 Cloud Storage 存储桶导入这些公开数据集:
# dataset name to be importeddataset="on-device"# Alternatives: "customer-support", "content-creation"# copy the tools and dataset file!gcloudstoragecpgs://cloud-ai-demo-datasets/agent-eval-datasets/{dataset}/tools.py.!gcloudstoragecpgs://cloud-ai-demo-datasets/agent-eval-datasets/{dataset}/eval_dataset.json.# load the dataset examplesimportjsoneval_dataset=json.loads(open('eval_dataset.json').read())# run the tools file%run-itools.py
# Prompt template for a pointwise metric with a single criterion: does the
# agent's response follow from the trajectory it took? Ratings are "1"
# (follows) or "0" (does not follow); the template is filled from the
# "prompt" and "predicted_trajectory" input variables.
response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(
    criteria={
        "Follows trajectory": (
            "Evaluate whether the agent's response logically follows from the "
            "sequence of actions it took. Consider these sub-points:\n"
            " - Does the response reflect the information gathered during the trajectory?\n"
            " - Is the response consistent with the goals and constraints of the task?\n"
            " - Are there any unexpected or illogical jumps in reasoning?\n"
            "Provide specific examples from the trajectory and response to support your evaluation."
        )
    },
    rating_rubric={
        "1": "Follows trajectory",
        "0": "Does not follow trajectory",
    },
    input_variables=["prompt", "predicted_trajectory"],
)

# Metric object that wraps the template above under the name
# "response_follows_trajectory".
response_follows_trajectory_metric = PointwiseMetric(
    metric="response_follows_trajectory",
    metric_prompt_template=response_follows_trajectory_prompt_template,
)